In [1]:
! pip install googletrans
! pip install google


from googletrans import Translator
from googlesearch import search
import requests
from bs4 import BeautifulSoup
from typing import List
import re
import json
import subprocess
import numpy as np
import string

class marcoSearch:
    """Multilingual web search whose hits are ranked by embedding similarity.

    A query is translated into every configured language and searched with
    ``googlesearch``. The ``<p>`` text of each hit is scraped (and translated
    to English for non-English hits), and the hits are ordered by the cosine
    similarity between the query vector and the page vector, both obtained
    from the ``api.msturing.org/gen/encode`` endpoint.
    """

    # Number of hits requested from the search engine for each language.
    RESULTS_PER_LANGUAGE = 10
    # Seconds a page download may stall before requests raises a Timeout.
    REQUEST_TIMEOUT = 30
    # Total number of characters sent to the encoder in one call.
    MAX_ENCODE_CHARS = 1000
    ENCODE_URL = 'https://api.msturing.org/gen/encode'
    # Environment variable that, when set, supplies the subscription key.
    API_KEY_ENV_VAR = 'MSTURING_SUBSCRIPTION_KEY'
    # FIXME(security): this key is committed in the notebook. It is kept only
    # as a fallback so existing runs keep working; rotate it and rely on the
    # environment variable instead.
    _LEGACY_API_KEY = 'd52c40fc4c2c4fcfb768ce18a7d1bafc'
    # Punctuation removed by clean_text; sentence punctuation is preserved.
    _STRIPPED_PUNCTUATION = frozenset(string.punctuation) - {'.', '?', ',', '!'}
    # One tag at a time (non-greedy), so text between two tags survives.
    _TAG_PATTERN = re.compile(r'<[^>]*>')

    def __init__(self, languages: List[str]) -> None:
        """Create a searcher for the given language codes.

        Args:
            languages: Language codes to search in. ``'en'`` is added when
                missing. The caller's list is not modified.
        """
        self.translator = Translator()
        # Copy first: appending 'en' must not leak into the caller's list.
        self.languages = list(languages)
        if 'en' not in self.languages:
            self.languages.append('en')

    def translate_string(self, list_of_strings: List[str] , destination_language: str) -> List[str]:
        """Translate one string or a list of strings.

        Args:
            list_of_strings: A list of strings; any non-list value (e.g. a
                single string) is treated as a one-element list.
            destination_language: Target language code.

        Returns:
            The translated texts, in input order. An empty list yields an
            empty list without contacting the translator.
        """
        if not isinstance(list_of_strings, list):
            list_of_strings = [list_of_strings]
        if not list_of_strings:
            return []
        translations = self.translator.translate(list_of_strings, dest=destination_language)
        return [item.text for item in translations]

    def search_single_query(self, query: str) -> dict:
        """Search for ``query`` once per configured language.

        The query is translated into each language before searching.

        Returns:
            A dict mapping language code to the list of result URLs.
        """
        url_dict = {}
        for lang in self.languages:
            translated_query = self.translate_string(query, destination_language=lang)[0]
            url_dict[lang] = list(search(
                query=translated_query,
                tld='com',
                lang=lang,
                num=self.RESULTS_PER_LANGUAGE,
                stop=self.RESULTS_PER_LANGUAGE,
            ))
        return url_dict

    def parse_page(self, url: str, translate: bool = True) -> List[str]:
        """Download a page and return the cleaned text of its ``<p>`` elements.

        Args:
            url: Page to download.
            translate: When true, the paragraphs are translated to English.

        Returns:
            The non-empty cleaned paragraphs, translated if requested.

        Raises:
            requests.RequestException: If the download fails or stalls for
                longer than ``REQUEST_TIMEOUT`` seconds.
        """
        # Without a timeout a single unresponsive host blocks the whole search.
        page = requests.get(url=url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.text, 'html.parser')
        cleaned = (self.clean_text(node.get_text()) for node in soup.find_all('p'))
        texts = [text for text in cleaned if text != '']
        if translate:
            return self.translate_string(texts, destination_language='en')
        return texts

    def search_query(self, query: str) -> tuple:
        """Search in every language and scrape every hit.

        Returns:
            ``(docs, url_dict)``: ``url_dict`` maps language to result URLs
            and ``docs`` maps language to one paragraph list per URL, in the
            same order. Non-English pages are translated to English.
        """
        print('Retrieving URLs')
        url_dict = self.search_single_query(query)
        # English pages are used as scraped, so translation is skipped.
        docs = {'en': [self.parse_page(url=u, translate=False) for u in url_dict['en']]}
        print('Translating pages')
        for lang, urls in url_dict.items():
            if lang != 'en':
                docs[lang] = [self.parse_page(url=u) for u in urls]
        return docs, url_dict

    @staticmethod
    def clean_text(html_text: str) -> str:
        """Strip tag-like spans and most punctuation from ``html_text``.

        Every ``<...>`` span is removed individually, then all characters of
        ``string.punctuation`` except ``. ? , !`` are dropped.
        """
        without_tags = marcoSearch._TAG_PATTERN.sub('', html_text)
        return ''.join(
            char for char in without_tags
            if char not in marcoSearch._STRIPPED_PUNCTUATION
        )

    @staticmethod
    def get_marco_vectors(strings_list: List[str]) -> List[dict]:
        """Send strings to the encode endpoint and return the parsed response.

        The strings share a budget of ``MAX_ENCODE_CHARS`` characters: each
        string is cut to what is left of the budget and strings after the
        budget is exhausted are not sent. Callers in this class pass a single
        string and read ``result[0]['vector']``.

        Args:
            strings_list: Texts to encode.

        Returns:
            The decoded JSON response, or ``[]`` when ``strings_list`` is
            empty (no request is made).

        Raises:
            subprocess.CalledProcessError: If curl exits with an error.
            FileNotFoundError: If curl is not installed.
            json.JSONDecodeError: If the response is not valid JSON.
        """
        import os  # local import: the notebook's import cell does not provide os

        if not strings_list:
            return []

        budget = marcoSearch.MAX_ENCODE_CHARS
        queries = []
        for text in strings_list:
            if budget <= 0:
                break
            piece = text[:budget]
            queries.append(piece)
            budget -= len(piece)

        # json.dumps escapes quotes and backslashes, so arbitrary page text
        # always produces a well-formed request body.
        payload = json.dumps({'queries': queries})
        api_key = os.environ.get(marcoSearch.API_KEY_ENV_VAR, marcoSearch._LEGACY_API_KEY)
        # An argument list (no shell) keeps scraped text from being
        # interpreted as shell syntax.
        command = [
            'curl', marcoSearch.ENCODE_URL,
            '-H', 'Ocp-Apim-Subscription-Key: ' + api_key,
            '--data', payload,
        ]
        result = subprocess.check_output(command)
        return json.loads(result)

    def get_sorted_tuples(self, query: str) -> List[tuple]:
        """Run the full search and rank every hit against the query.

        Returns:
            A list of ``(url, score, page_text)`` tuples sorted by descending
            score, where ``score`` is the cosine similarity between the query
            vector and the page vector. A page whose vector cannot be
            obtained, or whose vector has zero norm, gets a score of 0.
        """
        docs, url_dict = self.search_query(query)
        query_vector = np.array(self.get_marco_vectors([query])[0]['vector'])
        query_norm = np.linalg.norm(query_vector)
        output_list = []
        for lang in self.languages:
            # docs[lang] and url_dict[lang] are parallel lists.
            for url, paragraphs in zip(url_dict[lang], docs[lang]):
                page_text = ' '.join(paragraphs)
                try:
                    vector = np.array(self.get_marco_vectors([page_text])[0]['vector'])
                    denominator = np.linalg.norm(vector) * query_norm
                    if denominator == 0:
                        # A zero vector would give NaN, which breaks sorting.
                        score = 0
                    else:
                        score = np.dot(query_vector, vector) / denominator
                except Exception:
                    # Best effort: one failing page must not abort the ranking.
                    score = 0
                output_list.append((url, score, page_text))
        output_list.sort(key=lambda item: item[1], reverse=True)
        return output_list

Collecting googletrans
  Downloading https://files.pythonhosted.org/packages/fd/f0/a22d41d3846d1f46a4f20086141e0428ccc9c6d644aacbfd30990cf46886/googletrans-2.4.0.tar.gz
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
  Created wheel for googletrans: filename=googletrans-2.4.0-cp36-none-any.whl size=15776 sha256=183c765c95c3c913c9e347710fc6f3c394a421f65e72c20274f8d4cef02a3e19
  Stored in directory: /root/.cache/pip/wheels/50/d6/e7/a8efd5f2427d5eb258070048718fa56ee5ac57fd6f53505f95
Successfully built googletrans
Installing collected packages: googletrans
Successfully installed googletrans-2.4.0


In [2]:
# Languages to search in; marcoSearch adds 'en' itself when it is missing.
languages = ['en', 'ru']
ms = marcoSearch(languages= languages)

# Runs the whole pipeline (search, scrape, translate, embed). `out` is a list
# of (url, similarity score, page text) tuples sorted by score, best first.
out = ms.get_sorted_tuples('st petersburg mayor')

Retrieving URLs
Translating pages


In [3]:
# Report the five best-ranked hits; in each tuple field 0 is the URL and
# field 1 is the similarity score.
report_template = 'Similarity score: {}, Website : {}'
for tup in out[:5]:
  print(report_template.format(tup[1], tup[0]))

Similarity score: 0.6292185334262044, Website : https://ru.wikipedia.org/wiki/%D0%93%D1%83%D0%B1%D0%B5%D1%80%D0%BD%D0%B0%D1%82%D0%BE%D1%80_%D0%A1%D0%B0%D0%BD%D0%BA%D1%82-%D0%9F%D0%B5%D1%82%D0%B5%D1%80%D0%B1%D1%83%D1%80%D0%B3%D0%B0
Similarity score: 0.6292185334262044, Website : https://ru.wikipedia.org/wiki/%D0%93%D1%83%D0%B1%D0%B5%D1%80%D0%BD%D0%B0%D1%82%D0%BE%D1%80_%D0%A1%D0%B0%D0%BD%D0%BA%D1%82-%D0%9F%D0%B5%D1%82%D0%B5%D1%80%D0%B1%D1%83%D1%80%D0%B3%D0%B0#%D0%9F%D0%BE%D0%BB%D0%BD%D0%BE%D0%BC%D0%BE%D1%87%D0%B8%D1%8F_%D0%93%D1%83%D0%B1%D0%B5%D1%80%D0%BD%D0%B0%D1%82%D0%BE%D1%80%D0%B0_%D0%A1%D0%B0%D0%BD%D0%BA%D1%82-%D0%9F%D0%B5%D1%82%D0%B5%D1%80%D0%B1%D1%83%D1%80%D0%B3%D0%B0
Similarity score: 0.6292185334262044, Website : https://ru.wikipedia.org/wiki/%D0%93%D1%83%D0%B1%D0%B5%D1%80%D0%BD%D0%B0%D1%82%D0%BE%D1%80_%D0%A1%D0%B0%D0%BD%D0%BA%D1%82-%D0%9F%D0%B5%D1%82%D0%B5%D1%80%D0%B1%D1%83%D1%80%D0%B3%D0%B0#%D0%98%D1%81%D1%82%D0%BE%D1%80%D0%B8%D1%8F
Similarity score: 0.6292185334262044, Websit

In [4]:
# Content for the top suggested link: field 2 of the best-ranked tuple is the
# page's paragraph text joined with spaces (translated to English for
# non-English pages).
print(out[0][2])

The Governor of St. Petersburg is the highest official of St. Petersburg, heading the executive branch and the Government of St. Petersburg. The governor, elected by citizens of the Russian Federation residing in St. Petersburg and possessing, in accordance with federal law, active suffrage, on the basis of universal equal and direct suffrage by secret ballot for a term of 5 years. The powers of the governor include a representative office of the city, legislative initiative in the city parliament, the signing and promulgation of the laws of St. Petersburg, the formation of the Government and the provision of reports on its activities, determining the structure of the executive bodies of state power of the city of St. Petersburg and a number of other issues. The legal status of the governor is determined by the Charter of St. Petersburg, adopted on January 14, 1998. Since October 3, 2018, the duties of the Governor of St. Petersburg have been performed by Alexander Beglov1. According t