In [1]:
import os
import bs4
import json
import time
import lxml
import requests
import cloudscraper
from pathlib import Path
from random import randint
from fake_useragent import UserAgent
from IPython.display import clear_output as clear

In [2]:
# Shared fake_useragent generator; `ua.random` is read by getCollinsTranslations
# each time it builds the headers for a Collins request.
ua = UserAgent()

In [3]:
# Directory layout, derived from the notebook's current working directory.
# Every 'src' substring is removed from the absolute path (not only a trailing
# component), e.g. '/proj/translations/src' becomes '/proj/translations/'.
path = os.path.abspath('').replace('src', '')

# Scraped translations are written below `path`.
out_dir = os.path.join(path, 'out')

# Conjugation tables are read from a sibling tree one level above `path`.
top_dir = Path(path).parent.absolute()
conjugations_data_dir = os.path.join(
    top_dir,
    'conjugations',
    'data',
    'language-specific',
)

In [4]:
# Browser-like request headers sent with the Cambridge dictionary request
# (see getCambridgeTranslations).
# NOTE(review): the cookie and referer are hard-coded values, and the referer
# points at hidemy.name rather than a dictionary site - confirm they are still wanted.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
    "cache-control": "max-age=0",
    "cookie": "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446",
    "referer": "https://hidemy.name/en/proxy-list/?start=64",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "macOS",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
}

In [5]:
def extractText(string):
    """Return ``string`` with every parenthesised span removed.

    Nested parentheses are handled by tracking the nesting depth: characters are
    kept only while the depth is zero. The result is stripped and runs of spaces
    left behind by the removed spans are collapsed to a single space.

    Args:
        string: the text to clean.

    Returns:
        str: the text outside any parentheses, e.g.
        ``'to eat (food) quickly'`` -> ``'to eat quickly'``.
    """
    depth = 0
    kept = []

    for char in string:
        if char == '(':
            depth += 1
        elif char == ')':
            # A stray ')' must not push the depth below zero, otherwise every
            # character after it would be treated as "inside parentheses".
            if depth > 0:
                depth -= 1
        elif depth == 0:
            kept.append(char)

    output = ''.join(kept).strip()

    # A single replace() only halves each run of spaces; repeat until no
    # double space is left (several removed groups can leave 3+ spaces).
    while '  ' in output:
        output = output.replace('  ', ' ')

    return output

In [6]:
def getCollinsTranslations(verb):
    """Scrape English translations of ``verb`` from the Collins bilingual dictionary.

    The dictionary is chosen by the module-level ``language`` variable, so that
    name must be defined (the driver loop sets it) before this function is called.

    Only sections whose part-of-speech label contains 'verb' (and neither
    'adverb' nor 'reflexive') are read, and only translations that start with
    'to ' are kept, with that prefix removed.

    Args:
        verb: the headword to look up.

    Returns:
        list[str]: the unique translations found, in page order; or
        str: the link text of a "Translation of" cross-reference when no
            translation was found but the page points at another entry; or
        []: when nothing was found, or when all 5 attempts either raised or
            returned a page whose title mentions Cloudflare.
    """
    max_attempts = 5

    # Kept outside the loop so a cross-reference seen on an earlier, partially
    # parsed attempt is still available to a later one.
    equivalent = ''

    for _ in range(max_attempts):
        try:
            url = f'https://www.collinsdictionary.com/dictionary/{language}-english/{verb}'
            scraper = cloudscraper.create_scraper(
                browser={
                    'browser': 'firefox',
                    'platform': 'windows',
                    'mobile': False
                })
            # NOTE(review): "useragent" is not the standard "User-Agent" header name,
            # so the random agent is sent as a custom header and cloudscraper's own
            # User-Agent stays in effect. Left unchanged on purpose - confirm intent.
            response = scraper.get(url, headers = {"useragent": f"{ua.random}"})
            soup = bs4.BeautifulSoup(response.text, 'lxml')

            # A Cloudflare challenge page counts as a failed attempt.
            if 'Cloudflare' in soup.select('title')[0].getText():
                continue

            translations = []

            for section in soup.select('.page .dictionary .hom'):
                labels = section.select('.hi.rend-sc .pos') or section.select('.gramGrp .pos')

                if not labels:
                    # No part-of-speech label: the section may only cross-reference
                    # another headword, which the caller can look up instead.
                    references = section.select('.sense .xr a.ref')
                    if references and 'Translation of' in str(references[0]):
                        equivalent = references[0].getText()
                    continue

                section_type = labels[0].getText().lower()

                if 'verb' not in section_type or 'adverb' in section_type or 'reflexive' in section_type:
                    continue

                for entry in section.select('.sense:not(.type-example)>.cit.type-translation .quote'):
                    translation = entry.getText().strip()

                    # "x or y" alternatives: keep only the first one.
                    if entry.select('.or.i'):
                        translation = translation.split(' or ')[0]

                    if translation[:3] != 'to ':
                        continue

                    translation = translation[3:]

                    if translation.replace(' ','').isalpha() and translation not in translations:
                        translations.append(translation)

            if translations:
                return translations

            # Return an empty list rather than '' when there is no cross-reference,
            # so callers that follow string results never look up an empty headword.
            return equivalent if equivalent else []

        except Exception:
            # Network and parsing errors are retried. KeyboardInterrupt and
            # SystemExit are deliberately not caught so the cell can be stopped.
            continue

    return []

In [7]:
def getCambridgeTranslations(verb):
    """Scrape English translations of ``verb`` from the Cambridge bilingual dictionary.

    The dictionary is chosen by the module-level ``language`` variable and the
    request is sent with the module-level ``headers``.

    The page is read in two passes over its '.pr.dictionary' entries:

    1. Entries whose '.pos.dpos' label is exactly 'verb': every
       '.trans.dtrans:not(.hdb)' item of the first sense block that starts
       with 'to ' is kept (without the prefix).
    2. Entries that carry an 'h2.c_hh' heading: each block's definition text is
       split into ``generic [word class] detail``; for the 'verb' word class the
       generic part and the short 'to ...' alternatives of the detail are kept.

    The second pass runs once, after the first pass has finished. Every result
    has ' sth' / ' sb' expanded to ' something' / ' somebody' and is
    de-duplicated on that expanded form.

    Args:
        verb: the headword to look up.

    Returns:
        list[str]: at most 10 unique translations, in discovery order.
    """

    def remember(text):
        # Expand the 'sth'/'sb' shorthand first so duplicates are detected on
        # the exact string that ends up in the result.
        text = text.replace(' sth', ' something').replace(' sb', ' somebody')
        if text not in translations:
            translations.append(text)

    url = f'https://dictionary.cambridge.org/dictionary/{language}-english/{verb}'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    translations = []
    entries = soup.select('.pr.dictionary')

    # Pass 1: entries explicitly labelled as verbs.
    for main in entries:
        try:
            if main.select('.pos.dpos')[0].getText() == 'verb':
                first_block = main.select('.di-body.normal-entry-body .pr:not(.phrase-block)')[0]

                for item in first_block.select('.trans.dtrans:not(.hdb)'):
                    translation = item.getText()

                    # Drop a single trailing "(...)" qualifier; endswith() is safe
                    # on an empty string, unlike indexing with [-1].
                    if translation.endswith(')') and translation.count('(') == 1:
                        translation = translation.split('(')[0].strip()

                    if translation[:3] == 'to ' and translation[3:].replace(' ','').isalpha():
                        remember(translation[3:])
        except Exception:
            # An entry without the expected structure is skipped.
            pass

    # Pass 2: entries with an 'h2.c_hh' heading. The first block that does not
    # have the expected structure ends this pass.
    try:
        for main in entries:
            if not main.select('h2.c_hh'):
                continue

            for block in main.select('.di-body.normal-entry-body .pr:not(.phrase-block)'):
                item = block.select('.def-body.ddef_b.ddef_b-t')[0]
                text = item.getText()
                word_class = text.split('[')[1].split(']')[0].strip()

                if word_class != 'verb':
                    continue

                generic = text.split('[')[0].strip()

                if generic.replace(' ','').isalpha():
                    remember(generic)

                detail = text.split(']')[1].strip()

                for translation in detail.split(';'):
                    formatted = extractText(translation.strip())

                    if formatted[:3] != 'to ':
                        continue

                    candidate = formatted[3:]

                    # A single-word infinitive is taken and ends the scan of this detail.
                    if len(candidate.split(' ')) == 1:
                        remember(candidate)
                        break

                    # "a or b": keep both alternatives.
                    if ' or ' in candidate and len(candidate.split(' ')) == 3:
                        for alternative in candidate.split(' or '):
                            remember(alternative)
    except Exception:
        pass

    return translations[:10]

In [8]:
def getSpanishDictTranslations(verb):
    """Scrape English infinitives for ``verb`` from spanishdict.com.

    Nothing is collected when the page contains an element with class
    '_25QSB23Y' (presumably a marker for a page without a usable entry -
    TODO confirm against the live site). Otherwise the text of every
    '._2vd6M2gR' element is considered, followed by the '.tran_main' elements
    of each '.gram_cat' block whose text does not contain ``verb + 'se'``.

    A text is kept when it starts with 'to ', the remainder consists only of
    letters and spaces, and the remainder has not been collected already.

    Returns:
        list[str]: the collected translations without their 'to ' prefix.
    """
    response = requests.get(f"https://www.spanishdict.com/translate/{verb}")
    soup = bs4.BeautifulSoup(response.text, "lxml")

    found = []

    def collect(nodes):
        # Append each qualifying infinitive once, preserving page order.
        for node in nodes:
            text = node.getText()
            remainder = text[3:]

            if text[0:3] != "to " or remainder in found:
                continue

            if remainder.replace(' ','').isalpha():
                found.append(remainder)

    if soup.select('._25QSB23Y'):
        return found

    collect(soup.select("._2vd6M2gR"))

    for block in soup.select(".gram_cat"):
        if verb + 'se' in block.getText():
            continue

        collect(block.select(".tran_main"))

    return found

In [9]:
# language = 'portuguese'
# current = 1
# data = {}

In [10]:
# try:
#     with open(os.path.join(path, 'out', f'translations_{language}.json'), 'r', encoding = 'utf8') as file:
#         data = json.loads(file.read())
# except:
#     pass

In [11]:
# Driver: for each language, resume from any previous output, look up every
# still-missing verb on Collins and Cambridge, and write the merged result.
# `language` is also read as a global by the scraping functions.
for language in ['french', 'german', 'italian', 'portuguese']:

    data = {}

    # Resume from the output of an earlier run when there is one.
    try:
        with open(os.path.join(path, 'out', f'translations_{language}.json'), 'r', encoding = 'utf8') as file:
            data = json.loads(file.read())
    except FileNotFoundError:
        # First run for this language: start from scratch.
        pass
    except (OSError, ValueError) as ex:
        # Unreadable or corrupt file: still start from scratch, but say so
        # instead of silently discarding the previous results.
        data = {}
        print(f'could not load previous translations for {language} ({ex}) - starting empty')

    with open(os.path.join(conjugations_data_dir, f'conjugations_{language}.json'), 'r', encoding = 'utf8') as file:
        conjugations = json.loads(file.read())

    for verb in conjugations:

        # Look a verb up only if it has no translations yet AND either its rank is
        # within the number of verbs already recorded or fewer than 2100 verbs
        # have a non-empty result so far.
        if (verb not in data.keys() or len(data[verb]) < 1) and (conjugations[verb]['rank'] < len(data) + 1 or len([known for known in data if data[known] != []]) - 100 < 2000):

            try:
                collins = getCollinsTranslations(verb)

                if isinstance(collins, str):
                    # A string is a cross-reference to another headword: follow it
                    # once. An empty string means there is nothing to follow.
                    collins = getCollinsTranslations(collins) if collins else []

                if isinstance(collins, str):
                    # The referenced entry pointed elsewhere again; do not chase it
                    # further and do not treat the string as a list of translations.
                    collins = []

                cambridge = getCambridgeTranslations(verb)

                if collins:
                    output = collins

                    for translation in cambridge:
                        if translation not in output:
                            output.append(translation)

                else:
                    output = cambridge

                data[verb] = output[:10]

                clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - complete ({len(data[verb])})')

            except Exception as ex:
                data[verb] = []
                clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - error ({ex})')

            # Pause 0.1-0.2 s between verbs.
            time.sleep(randint(1000, 2000) / 10000)

    with open(os.path.join(out_dir, f'translations_{language}.json'), 'w', encoding = 'utf8') as file:
        json.dump(data, file, indent = 4, ensure_ascii = False)

    # Extra pass for Spanish only (not reached unless 'spanish' is added to the
    # list above): SpanishDict results come first and the translations collected
    # above are merged into them.
    if language == 'spanish':

        with open(os.path.join(out_dir, 'translations_spanish.json'), 'r', encoding = 'utf8') as file:
            data = json.loads(file.read())

        combined = {}

        for verb in data:
            spanishdict = getSpanishDictTranslations(verb)
            translations = data[verb][::-1]

            output = spanishdict

            for n in range(0, len(translations)):

                if translations[n] not in output:
                    # Walk back from n to index 0 looking for a translation that is
                    # already in the output, and insert the new one right after it.
                    x = n

                    while True:
                        if translations[x] in output:
                            output.insert(output.index(translations[x]) + 1, translations[n])
                            break

                        if x == 0:
                            break

                        x -= 1

                # No anchor was found: add it at the end.
                if translations[n] not in output:
                    output.append(translations[n])

            combined[verb] = output[:10]

            clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - complete ({len(combined[verb])})')

        with open(os.path.join(out_dir, 'translations_spanish.json'), 'w', encoding = 'utf8') as file:
            json.dump(combined, file, indent = 4, ensure_ascii = False)

apensar (2238) - complete (0)


In [12]:
# with open(os.path.join(conjugations_data_dir, f'conjugations_{language}.json'), 'r', encoding = 'utf8') as file:
#     conjugations = json.loads(file.read())

# for verb in conjugations:

#     if (verb not in data.keys() or len(data[verb]) < 2) and conjugations[verb]['rank'] < 2101:

#         retries = 1

#         while True:
#             try:
#                 url = f'https://www.collinsdictionary.com/dictionary/{language}-english/{verb}'
#                 scraper = cloudscraper.create_scraper(
#                     browser={
#                         'browser': 'firefox',
#                         'platform': 'windows',
#                         'mobile': False
#                     })
#                 soup = bs4.BeautifulSoup(scraper.get(url, headers = {"useragent": f"{ua.random}"}).text, 'lxml')

#                 if 'Cloudflare' not in soup.select('title')[0].getText():
#                     translations = []

#                     for section in soup.select('.page .dictionary .hom'):
#                         if section.select('.hi.rend-sc.pos'):
#                             section_type = section.select('.hi.rend-sc.pos')[0]
                            
#                             if 'verb' in section_type.getText() and 'adverb' not in section_type.getText():
#                                 for entry in section.select('.sense:not(.type-example)>.cit.type-translation .quote'):
#                                     translation = entry.getText()
                                    
#                                     if translation[:3] == 'to ':
#                                         translation = translation[3:]
                                    
#                                     if translation.replace(' ','').isalpha() and translation not in translations:
#                                         translations.append(translation)

#                     cambridge = getCambridgeTranslations(verb)

#                     output = translations

#                     for n in range(0, len(cambridge)):
                        
#                         if cambridge[n] not in output:
#                             iterate = True
#                             x = n

#                             while iterate:
#                                 if cambridge[x] in output:
#                                     for i in range(0, len(output)):
#                                         if output[i] == cambridge[x]:
#                                             output.insert(i + 1, cambridge[n])
#                                             iterate = False
#                                             break
#                                 else:
#                                     if x == 0:
#                                         iterate = False
#                                     else:
#                                         x -= 1
                        
#                         if cambridge[n] not in output:
#                             output.append(cambridge[n])

#                     data[verb] = output[:10]

#                     clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - complete ({len(data[verb])})')
#                     break

#                 else:
#                     clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - retrying ({retries})')
#                     retries += 1

#             except Exception as ex:
#                 data[verb] = []
#                 clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - error ({ex})')
#                 break

#         time.sleep(randint(1000, 2000) / 10000)

#     current += 1

# with open(os.path.join(out_dir, f'translations_{language}.json'), 'w', encoding = 'utf8') as file:
#     json.dump(data, file, indent = 4, ensure_ascii = False)

# if language == 'spanish':

#     with open(os.path.join(out_dir, f'translations_spanish.json'), 'r', encoding = 'utf8') as file:
#         data = json.loads(file.read())

#     combined = {}

#     for verb in data:
#         spanishdict = getSpanishDictTranslations(verb)
#         collins = data[verb][::-1]

#         output = spanishdict

#         for n in range(0, len(collins)):
            
#             if collins[n] not in output:
#                 iterate = True
#                 x = n

#                 while iterate:
#                     if collins[x] in output:
#                         for i in range(0, len(output)):
#                             if output[i] == collins[x]:
#                                 output.insert(i + 1, collins[n])
#                                 iterate = False
#                                 break
#                     else:
#                         if x == 0:
#                             iterate = False
#                         else:
#                             x -= 1
            
#             if collins[n] not in output:
#                 output.append(collins[n])

#         combined[verb] = output[:10]
        
#         clear(); print(f'{verb} ({conjugations[verb]["rank"]}) - complete ({len(collins)} -> {len(output)})')

#     with open(os.path.join(out_dir, f'translations_spanish.json'), 'w', encoding = 'utf8') as file:
#         json.dump(combined, file, indent = 4, ensure_ascii = False)