In [1]:
import os
import bs4
import json
import time
import lxml
import deepl
import requests
import cloudscraper
from pathlib import Path
from random import randint
from collections import Counter
from googletrans import Translator
from fake_useragent import UserAgent
from profanity_filter import ProfanityFilter
from IPython.display import clear_output as clear

In [2]:
# Shared client objects used by the scraping/translation helpers in this notebook.
ua = UserAgent()            # source of random User-Agent strings
pf = ProfanityFilter()      # used to discard censored candidates
translator = Translator()   # googletrans client

# The DeepL auth key is a credential and must not live in the notebook:
# it is read from the environment so the file can be shared/committed safely.
_deepl_auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not _deepl_auth_key:
    raise RuntimeError(
        "DeepL API key not found: set the DEEPL_AUTH_KEY environment variable before running this notebook."
    )
dl_translator = deepl.Translator(_deepl_auth_key)

In [3]:
# Working directory of the notebook with every 'src' substring stripped from the path.
path = os.path.abspath('').replace('src', '')

# Results are written to an 'out' folder under that directory.
out_dir = os.path.join(path, 'out')

# Conjugation data is read from a 'conjugations' folder located under the parent directory.
top_dir = Path(path).parent.absolute()
_conjugations_parts = ('conjugations', 'data', 'language-specific')
conjugations_data_dir = os.path.join(top_dir, *_conjugations_parts)

In [4]:
# Browser-like request headers sent with the plain `requests` calls in this notebook.
# Built from ordered pairs so the header order is explicit.
headers = dict([
    ("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"),
    ("accept-encoding", "gzip, deflate, br"),
    ("accept-language", "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5"),
    ("cache-control", "max-age=0"),
    ("cookie", "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446"),
    ("referer", "https://hidemy.name/en/proxy-list/?start=64"),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", "macOS"),
    ("sec-fetch-dest", "document"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-site", "same-origin"),
    ("sec-fetch-user", "?1"),
    ("upgrade-insecure-requests", "1"),
    ("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"),
])

In [5]:
def getDeepLTranslations(verb, language):
    """Collect English verb translations of `verb` from DeepL.

    Two sources are combined:
      * the dict.deepl.com dictionary page, scraped for featured entries whose
        headword equals `verb` and whose word type mentions "verb";
      * the DeepL translation API (`dl_translator`), whose answer is the `primary`.

    Args:
        verb: the word to look up, in the source language.
        language: lowercase English name of the source language as used in the
            dictionary URL (e.g. 'spanish', 'portuguese').

    Returns:
        A two-item list `[translations, primary]`. `translations` is the list of
        scraped translations (with `primary` moved to the front when it is one of
        them); `primary` is the text returned by the translation API.
    """
    # DeepL API source-language codes. The source language used to be hard-coded
    # to "ES", which mistranslated every non-Spanish verb.
    source_codes = {
        'spanish': 'ES',
        'french': 'FR',
        'german': 'DE',
        'italian': 'IT',
        'portuguese': 'PT',
    }

    # Applied in order: drop optional-object markers, then spell out the remaining abbreviations.
    substitutions = (
        ('(sth.)', ''),
        ('(sb.)', ''),
        ('sb./sth.', ''),
        ('sth./sb.', ''),
        ('sth.', 'something'),
        ('sb.', 'somebody'),
        ('()', ''),
    )

    url = f'https://dict.deepl.com/{language}-english/search?ajax=1&source={language}&onlyDictEntries=1&translator=dnsof7h3k2lgh3gda&kind=full&eventkind=change&forleftside=true&il=en'
    page = requests.post(url, data = {'query': verb}, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    translations = []

    # Languages missing from the table fall back to DeepL's auto-detection (source_lang=None).
    primary = dl_translator.translate_text(
        verb, source_lang = source_codes.get(language), target_lang = "EN-US"
    ).text

    for block in soup.select('.lemma.featured'):

        # Skip entries that lack the expected markup instead of raising IndexError.
        titles = block.select('.lemma_desc .tag_lemma')
        if not titles:
            continue

        title_words = titles[0].select('.dictLink')
        title_types = titles[0].select('.tag_wordtype')
        if not (title_words and title_types):
            continue

        if title_words[0].getText() != verb or 'verb' not in title_types[0].getText().lower():
            continue

        for line in block.select('.lemma_content .translation_lines .translation.featured'):

            line_titles = line.select('.translation_desc .tag_trans')
            if not line_titles:
                continue

            line_links = line_titles[0].select('.dictLink')
            line_types = line_titles[0].select('.tag_type')
            if not (line_links and line_types):
                continue

            # Only lines tagged exactly 'v' are kept.
            if line_types[0].getText().lower().strip() != 'v':
                continue

            translation = line_links[0].getText()
            for old, new in substitutions:
                translation = translation.replace(old, new)
            translation = translation.strip()

            # A variant is skipped when its bare form (without the object placeholder)
            # is already listed, and also when the exact same string is already listed
            # (the latter check was missing, which let exact duplicates through).
            bare = translation.replace('something', '').replace('somebody', '').strip()
            if bare not in translations and translation not in translations:
                translations.append(translation)

    # Promote the API's answer to the front when the dictionary also lists it.
    if primary in translations and primary != translations[0]:
        translations.remove(primary)
        translations.insert(0, primary)

    return [translations, primary]

In [6]:
def extractText(string):
    """Return `string` with every parenthesised span removed.

    Nested parentheses are handled: text is kept only while no '(' is open.
    The result is stripped and runs of spaces left behind by removed spans are
    collapsed to a single space.

    Args:
        string: the text to clean.

    Returns:
        The text outside of any parentheses.
    """
    depth = 0
    kept = []

    for char in string:
        if char == '(':
            depth += 1
        elif char == ')':
            # A stray ')' must not drive the depth negative; that used to
            # silently drop everything that followed it.
            depth = max(depth - 1, 0)
        elif depth == 0:
            kept.append(char)

    text = ''.join(kept).strip()

    # A single replace pass leaves double spaces when several spans are removed
    # next to each other, so repeat until none remain.
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text

In [7]:
def getCollinsTranslations(verb, language):
    """Scrape English verb translations of `verb` from the Collins bilingual dictionary.

    The page is fetched through cloudscraper. Up to five attempts are made; an
    attempt is retried when the page title mentions Cloudflare or when anything
    in the attempt raises.

    Args:
        verb: the word to look up.
        language: lowercase English language name used in the URL (e.g. 'spanish').

    Returns:
        * a list of translations (leading 'to ' removed) when any were found;
        * otherwise `equivalent`: the text of a cross-reference link whose markup
          contains 'Translation of' ('' when there is none) — note this is a str,
          not a list;
        * [] when all five attempts fail.
    """
    max_attempts = 5
    equivalent = ''

    for _ in range(max_attempts):
        try:
            url = f'https://www.collinsdictionary.com/dictionary/{language}-english/{verb}'
            scraper = cloudscraper.create_scraper(
                browser={
                    'browser': 'firefox',
                    'platform': 'windows',
                    'mobile': False
                })
            # NOTE(review): this sends a header literally named "useragent", not
            # "User-Agent" — confirm whether that is intended.
            response = scraper.get(url, headers = {"useragent": f"{ua.random}"})
            soup = bs4.BeautifulSoup(response.text, 'lxml')

            # A Cloudflare page means the request was blocked: try again.
            if 'Cloudflare' in soup.select('title')[0].getText():
                continue

            translations = []

            for section in soup.select('.page .dictionary .hom'):
                # The part of speech may be marked up in either of two places.
                pos = section.select('.hi.rend-sc .pos') or section.select('.gramGrp .pos')

                if not pos:
                    # No part of speech: the section may only point at another entry.
                    refs = section.select('.sense .xr a.ref')
                    if refs and 'Translation of' in str(refs[0]):
                        equivalent = refs[0].getText()
                    continue

                section_type = pos[0].getText().lower()

                # Keep plain verb sections only.
                if 'verb' not in section_type or 'adverb' in section_type or 'reflexive' in section_type:
                    continue

                for entry in section.select('.sense:not(.type-example)>.cit.type-translation .quote'):

                    translation = entry.getText().strip()

                    # "x or y" alternatives: keep the first one.
                    if entry.select('.or.i'):
                        translation = translation.split(' or ')[0]

                    # Only infinitives written as "to ..." are accepted.
                    if translation[:3] != 'to ':
                        continue

                    translation = translation[3:]

                    if translation.replace(' ','').isalpha() and translation not in translations:
                        translations.append(translation.strip())

            return translations if translations else equivalent

        except Exception:
            # Best-effort scraping: any network/parsing failure counts as a failed
            # attempt. `Exception` (not a bare except) so Ctrl-C still stops the run.
            continue

    return []

In [8]:
def getCambridgeTranslations(verb, language):
    """Scrape English verb translations of `verb` from the Cambridge bilingual dictionary.

    Two page layouts are handled for every '.pr.dictionary' block:
      1. entries with a '.pos.dpos' part of speech equal to 'verb', whose
         '.trans.dtrans' items start with 'to ';
      2. entries with an 'h2.c_hh' heading, whose definition text has the form
         'generic [word class] detail; detail; ...'.
    Parsing is best-effort: an entry with unexpected markup is skipped.

    Args:
        verb: the word to look up.
        language: lowercase English language name used in the URL (e.g. 'spanish').

    Returns:
        An order-preserving, de-duplicated list of alphabetic translations.
    """
    def expand(text):
        # Spell out the dictionary's object abbreviations.
        return text.replace(' sth', ' something').replace(' sb', ' somebody')

    url = f'https://dictionary.cambridge.org/dictionary/{language}-english/{verb}'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    translations = []
    dictionaries = soup.select('.pr.dictionary')

    for main in dictionaries:
        try:
            if main.select('.pos.dpos')[0].getText() == 'verb':
                body = main.select('.di-body.normal-entry-body .pr:not(.phrase-block)')[0]

                for item in body.select('.trans.dtrans:not(.hdb)'):
                    translation = item.getText()

                    # Drop a single trailing "(...)" qualifier. endswith() is used so an
                    # empty string no longer raises and aborts the rest of the entry.
                    if translation.endswith(')') and translation.count('(') == 1:
                        translation = translation.split('(')[0].strip()

                    if translation[:3] == 'to ' and translation[3:].replace(' ','').isalpha() and translation[3:] not in translations:
                        translations.append(expand(translation[3:]))
        except Exception:
            # Best-effort: unexpected markup in this entry is ignored.
            # `Exception` rather than a bare except so Ctrl-C still propagates.
            pass

        # NOTE(review): this second scan walks *all* dictionary blocks and is repeated
        # once per outer iteration; repeats only add duplicates, which the final
        # de-duplication removes. Kept in place because moving it would change the
        # order of the results.
        try:
            for entry in dictionaries:
                if entry.select('h2.c_hh'):
                    for block in entry.select('.di-body.normal-entry-body .pr:not(.phrase-block)'):

                        item = block.select('.def-body.ddef_b.ddef_b-t')[0]
                        text = item.getText()
                        word_class = text.split('[')[1].split(']')[0].strip()

                        if word_class == 'verb':

                            # Text before the "[word class]" marker.
                            generic = text.split('[')[0].strip()

                            if generic.replace(' ','').isalpha() and generic not in translations:
                                translations.append(expand(generic))

                            # Text after the marker: ';'-separated alternatives.
                            detail = text.split(']')[1].strip()

                            for translation in detail.split(';'):
                                formatted = extractText(translation.strip())

                                if formatted[:3] == 'to ':
                                    candidate = formatted[3:]

                                    # A single-word infinitive ends the scan of this definition.
                                    if len(candidate.split(' ')) == 1:
                                        translations.append(expand(candidate).strip())
                                        break

                                    # "x or y": keep both words.
                                    if ' or ' in candidate and len(candidate.split(' ')) == 3:
                                        for c in candidate.split(' or '):
                                            if c not in translations:
                                                translations.append(expand(c).strip())
        except Exception:
            # Best-effort: the first block with unexpected markup stops this scan.
            pass

    # De-duplicate while preserving first-seen order; keep alphabetic entries only.
    return list({t: '' for t in translations if t.replace(' ','').isalpha()}.keys())

In [9]:
def getSpanishDictTranslations(verb, language):
    """Scrape English verb translations of `verb` from spanishdict.com.

    Only runs for `language == 'spanish'`; any other language yields [].
    Candidates are read from three groups of elements and kept when they start
    with 'to ', are alphabetic (spaces allowed) and are not already listed.

    Args:
        verb: the Spanish word to look up.
        language: lowercase English language name.

    Returns:
        A list of translations with the leading 'to ' removed.
    """
    translations = []

    def collect(nodes):
        # Shared acceptance rule for every group of candidate elements.
        for node in nodes:
            entry = node.getText()

            if entry[0:3] == "to " and entry[3:] not in translations:
                if entry[3:].replace(' ','').isalpha():
                    translations.append(entry[3:].strip())

    if language != 'spanish':
        return translations

    request = requests.get(f"https://www.spanishdict.com/translate/{verb}")
    soup = bs4.BeautifulSoup(request.text, "lxml")

    # Pages containing this element are skipped entirely.
    # NOTE(review): the class name is obfuscated; its meaning is not visible here — confirm.
    if soup.select('._25QSB23Y'):
        return translations

    # Each selector is evaluated once (previously it was re-run on every loop iteration).
    collect(soup.select("._2vd6M2gR"))
    collect(soup.select(".YR6epHeU"))

    for block in soup.select(".gram_cat"):
        # Ignore blocks that mention the verb with a 'se' suffix.
        if verb + 'se' not in block.getText():
            collect(block.select(".tran_main"))

    return translations

In [10]:
def getGoogleTranslations(verb, language):
    """Return the verb translations Google Translate lists for `verb`.

    The 'all-translations' extra data of the googletrans response is searched for
    the group labelled 'verb'. Any failure (unknown language, request error,
    unexpected response shape) results in an empty list.

    Args:
        verb: the word to translate.
        language: lowercase English name of the source language.

    Returns:
        A non-empty list of strings, or [] when nothing usable was found.
    """
    iso_codes = {
        'spanish': 'es',
        'french': 'fr',
        'german': 'de',
        'italian': 'it',
        'portuguese': 'pt'
    }

    try:
        response = translator.translate(verb, dest = 'en', src = iso_codes[language])
        candidates = response.extra_data['all-translations']

        if candidates is None:
            candidates = []
        else:
            # Each group is indexed as (part of speech, translations, ...).
            for group in candidates:
                if group[0] == 'verb':
                    candidates = group[1]
                    break

    except Exception:
        candidates = []

    # When no 'verb' group exists `candidates` still holds the raw groups,
    # which are not strings and are therefore rejected here.
    usable = bool(candidates) and isinstance(candidates, list) and isinstance(candidates[0], str)

    return candidates if usable else []

In [11]:
def weightTranslations(translations):
    """Score translations gathered from several sources and order them by score.

    Each source is a list ordered by preference. A translation earns
    `10 - position` points per source (never less than 1); the sum is then
    multiplied by the total number of times the translation occurs across all
    sources. Finally every pair "X" / "X to" is merged into whichever of the two
    scored higher ("X" wins ties), which receives the combined score.

    Args:
        translations: iterable of lists of translation strings, one per source.

    Returns:
        A dict mapping translation -> score, ordered from highest to lowest score.
    """
    weighted = {}
    occurrences = Counter()

    for source in translations:
        for position, translation in enumerate(source):
            # Positions 0..9 score 10..1; anything later still scores 1.
            weighted[translation] = weighted.get(translation, 0) + max(10 - position, 1)

        occurrences.update(source)

    for translation in weighted:
        weighted[translation] *= occurrences[translation]

    # Iterate over a snapshot so entries can be removed while merging. The previous
    # loop stopped after the first merged pair, leaving later pairs unmerged.
    for translation in list(weighted):
        variant = f'{translation} to'

        if translation not in weighted or variant not in weighted:
            continue

        if weighted[translation] >= weighted[variant]:
            weighted[translation] += weighted.pop(variant)
        else:
            weighted[variant] += weighted.pop(translation)

    # Ascending stable sort then reversed: equal scores end up in reverse insertion order.
    ordered = dict(sorted(weighted.items(), key = lambda item: item[1])[::-1])

    return ordered

In [12]:
def filter(translations):
    """Return the entries of `translations` that pass both acceptance checks.

    An entry is kept when it consists of letters and spaces only and the
    profanity filter `pf` leaves it without any '*' after censoring. The
    profanity check is only evaluated for entries that pass the alphabetic check.
    """
    return [
        candidate
        for candidate in translations
        if candidate.replace(' ','').isalpha() and '*' not in pf.censor(candidate)
    ]

In [13]:
def translate(verb, language):
    """Gather, filter and rank English translations of `verb` from every source.

    Sources are queried in this order: DeepL, Collins, Cambridge, Google,
    SpanishDict. Each result is passed through `filter`, the combined results are
    scored with `weightTranslations`, and agreement statistics are computed from
    the first ("primary") translation of every non-empty source.

    Args:
        verb: the word to translate.
        language: lowercase English name of the source language.

    Returns:
        A dict with the per-source lists ('deepl', 'collins', 'cambridge',
        'google', 'spanishdict'), the scored dict ('weighted') and 'metadata':
          * principal: highest-scored translation (None when nothing was found);
          * flag: share of primaries that differ from the principal;
          * consensus: most frequent primary (None when nothing was found);
          * agreement: share of primaries equal to the consensus;
          * primaries: first translation of every non-empty source.
    """
    # Translate
    [deepl_list, deepl_primary] = getDeepLTranslations(verb, language)

    collins_raw = getCollinsTranslations(verb, language)
    if isinstance(collins_raw, str):
        # Collins can return a cross-reference string instead of a list; iterating
        # it would feed single characters into the ranking as "translations".
        # NOTE(review): the cross-reference is dropped here — confirm whether it
        # should be looked up instead.
        collins_raw = []
    collins = filter(collins_raw)

    cambridge = filter(getCambridgeTranslations(verb, language))
    google = filter(getGoogleTranslations(verb, language))
    spanishdict = filter(getSpanishDictTranslations(verb, language))

    # Change deepl primary if necessary: when DeepL's API answer is missing from its
    # dictionary list but another source ranks it first, put it at the front.
    other_primaries = [source[0] for source in (collins, cambridge, google, spanishdict) if source]

    if deepl_list and deepl_primary != deepl_list[0] and deepl_primary in other_primaries:
        deepl_list.insert(0, deepl_primary)

    deepl_list = filter(deepl_list)

    # Weight and order
    sources = [deepl_list, collins, cambridge, google, spanishdict]
    ordered = weightTranslations(sources)
    primaries = [source[0] for source in sources if source]

    if primaries:
        principal = next(iter(ordered))
        disagreements = sum(1 for first in primaries if first != principal)
        flag = round(disagreements / len(primaries), 2)
        # most_common() breaks ties by first occurrence, so the result is the same
        # on every run (max() over a set depended on hash ordering).
        consensus, votes = Counter(primaries).most_common(1)[0]
        agreement = round(votes / len(primaries), 2)
    else:
        # No source returned anything: report that instead of raising
        # ZeroDivisionError / IndexError and aborting a long scraping run.
        principal = None
        flag = 0.0
        consensus = None
        agreement = 0.0

    return {
        'metadata': {
            'principal': principal,
            'flag': flag,
            'consensus': consensus,
            'agreement': agreement,
            'primaries': primaries
        },
        'weighted': ordered,
        'deepl': deepl_list,
        'collins': collins,
        'cambridge': cambridge,
        'google': google,
        'spanishdict': spanishdict
    }

In [14]:
# Source language for this run. The lowercase English name is used in the
# dictionary URLs, in the conjugations input filename and in the output filename.
language = 'portuguese'

In [15]:
# Load the conjugation data for the selected language; its keys are the verbs to translate.
conjugations_path = os.path.join(conjugations_data_dir, f'conjugations_{language}.json')

with open(conjugations_path, 'r', encoding = 'utf8') as file:
    conjugations = json.load(file)

In [16]:
# In-memory accumulator: verb -> result of translate().
# Re-running this cell discards everything collected so far in this kernel.
translations = {}

In [27]:
# Index, in the key order of `conjugations`, of the first verb the loop below processes.
start = 2055

In [28]:
# Take the verb order once instead of rebuilding the key list on every iteration,
# and never run past the end of the available verbs.
verbs = list(conjugations.keys())
end = min(2100, len(verbs))

os.makedirs(out_dir, exist_ok = True)
out_file = os.path.join(out_dir, f'translations_{language}_v2.json')


def save_translations():
    """Write everything collected so far to the output JSON file."""
    with open(out_file, 'w', encoding = 'utf8') as file:
        json.dump(translations, file, indent = 4, ensure_ascii = False)


try:
    for i in range(start, end):

        verb = verbs[i]
        translations[verb] = translate(verb, language)

        # Checkpoint every 100 verbs.
        if (i + 1) % 100 == 0:
            save_translations()

        clear(); print(f"{verb} ({conjugations[verb]['rank']}) – complete")

        # Pause between verbs to avoid hammering the scraped sites.
        time.sleep(8)
finally:
    # Persist progress even when the run stops between checkpoints
    # (error, interrupt, or an end index that is not a multiple of 100).
    if translations:
        save_translations()

engrenar (2100) – complete
