In [2]:
import os
import requests
import bs4
import lxml
import json
import re
import time
import uuid
from googletrans import Translator
from unidecode import unidecode
from collections import defaultdict
from IPython.display import clear_output as clear

In [3]:
# Project root: the current working directory with every occurrence of the
# substring 'scripts' removed.
# NOTE(review): presumably the notebook is launched from a `scripts/` folder next to
# `data/`; str.replace also strips 'scripts' anywhere else in the path — confirm layout.
path = os.path.abspath('').replace('scripts', '')
# Directory that the checkpoint and output JSON files are read from / written to.
data_dir = os.path.join(path, 'data')

In [4]:
class nestedDict(dict):
    """A dict that creates nested levels on demand.

    Looking up a key that is absent stores a new, empty instance of the same
    class under that key and returns it, so chained assignments such as
    ``d['a']['b']['c'] = 1`` work without creating the intermediate levels first.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is absent.
        child = type(self)()
        self[key] = child
        return child

In [5]:
# Shared googletrans client; getGoogleTranslations() reads this module-level name.
translator = Translator()

In [6]:
#---HEADERS FOR WEB SCRAPING---#
# Browser-like request headers passed to requests.get() by the scraping helpers
# and cells that use `headers = headers`.
# NOTE(review): the "cookie" and "referer" values are hard-coded and look like they
# were copied from a browser session on another site — confirm they are needed and
# contain nothing sensitive before sharing the notebook.
headers = {"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
           "accept-encoding": "gzip, deflate, br",
           "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
           "cache-control": "max-age=0",
           "cookie": "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446",
           "referer": "https://hidemy.name/en/proxy-list/?start=64",
           "sec-ch-ua-mobile": "?0",
           "sec-ch-ua-platform": "macOS",
           "sec-fetch-dest": "document",
           "sec-fetch-mode": "navigate",
           "sec-fetch-site": "same-origin",
           "sec-fetch-user": "?1",
           "upgrade-insecure-requests": "1",
           "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}

In [7]:
def _extractLinkedVerbs(text, section_pattern):
    """Return the purely alphabetic link texts found in one section of `text`.

    `section_pattern` must contain one capture group delimiting the section.
    Returns an empty list when the section is absent.
    """
    section = re.search(section_pattern, text)
    if section is None:
        return []

    # Links in the response are preceded by an escaped quote (backslash + '"').
    link_texts = re.findall(r'\\">(.*?)</a>', section.group(1))
    return [x for x in link_texts if x.isalpha()]


def findAdditionalVerbs(verb):
    """Collect verbs that the Verbix response for `verb` lists as related.

    Four sections of the response are scanned: verbs conjugated alike, verbs
    with a separable prefix, verbs sharing the base verb, and synonyms. A
    section that is missing contributes nothing.

    Parameters:
        verb: infinitive inserted into the Verbix conjugator URL.

    Returns:
        list of unique verb strings, in no particular order.
    """
    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/1/101/{verb}'
    page = requests.get(url, headers = headers)

    section_patterns = (
        r'Verbs conjugated like(.*?)<h3>',
        r'Other Verbs with Separable Prefix(.*?)<h3>',
        r'Other Verbs with the same Base Verb(.*?)<h3>',
        r'<h4>Synonyms</h4>(.*?)<h3>',
    )

    found = set()
    for pattern in section_patterns:
        found.update(_extractLinkedVerbs(page.text, pattern))

    return list(found)

In [8]:
def getNgramData(verb, language, years):
    """Return the mean Google Books n-gram frequency of `verb` over a year range.

    Parameters:
        verb: the word to look up.
        language: one of 'spanish', 'french', 'italian', 'german' (KeyError otherwise).
        years: (start_year, end_year) pair.

    Returns:
        float mean of the returned time series, or the string 'ngram not found'
        when the service answers with an empty list, or the string 'error' when
        the returned n-gram is not exactly `verb` or its time series is empty.

    Raises:
        Whatever requests / json raise on network failures or non-JSON bodies;
        the calling loop relies on that to back off and retry.
    """
    corpus = {'spanish':32, 'french':19, 'italian':22, 'german':20}
    syear,eyear = years

    # Let requests build and escape the query string instead of formatting it by hand.
    query = {'content': verb,
             'year_start': syear,
             'year_end': eyear,
             'corpus': corpus[language],
             'smoothing': 0}
    raw = requests.get('https://books.google.com/ngrams/json', params = query, headers = headers)

    if raw.text == '[]':
        return 'ngram not found'

    data = json.loads(raw.text)[0]
    if data['ngram'] != verb:
        return 'error'

    values = data['timeseries']
    if not values:
        # An empty series used to raise ZeroDivisionError, which the caller retried forever.
        return 'error'

    return sum(values)/len(values)

In [9]:
def checkRegularity(page):
    """Classify a conjugator response by the markers found in ``page.text``.

    Returns the code of the first marker present, checked in this order:
    'x' for 'NOTRECOGVERB', 'i' for the "irregular" class marker, 'sc' for the
    "orto" class marker; 'r' when none of them occurs.
    """
    text = page.text

    # (marker, code) pairs in priority order; the class markers contain
    # literal backslashes because the quotes are escaped in the response.
    markers = (
        ('NOTRECOGVERB', 'x'),
        ("class=\\\"irregular\\\">", 'i'),
        ("class=\\\"orto\\\">", 'sc'),
    )

    for marker, code in markers:
        if marker in text:
            return code

    return 'r'

In [10]:
def formatResults(string):
    """Extract the six conjugated forms from one tense table.

    Parameters:
        string: object exposing a ``.text`` attribute in which each form
            follows a literal backslash-n plus its pronoun and is terminated
            by the next backslash.

    Returns:
        list of six strings in the order yo, tú, él, nosotros, vosotros, ellos.
        Parenthesised remarks are removed and only the first ';'-separated
        alternative is kept.

    Raises:
        AttributeError: when a pronoun is not found in the text.
    """
    forms = []

    for pronoun in ('yo', 'tú', 'él', 'nosotros', 'vosotros', 'ellos'):
        raw_form = re.search(r'\\n' + pronoun + r'(.*?)\\', string.text).group(1)
        # Raw string: the previous non-raw '[\(]' literal is an invalid escape sequence.
        forms.append(re.sub(r'[\(].*?[\)]', '', raw_form).split(";")[0])

    return forms

In [11]:
def getEstar():
    """Fetch the conjugation of 'estar' and return its simple indicative tenses.

    Returns:
        nestedDict mapping tense ('present', 'preterite', 'imperfect',
        'conditional', 'future') -> pronoun -> conjugated form. 'el', 'ella'
        and 'usted' share one form, as do 'ellos', 'ellas' and 'ustedes'.
    """
    url = 'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/1/101/estar'
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'lxml')
    tables = soup.select('table')

    # (tense, index of its table in the response); parsed once rather than once per pronoun.
    tense_tables = [('present', 0), ('preterite', 4), ('imperfect', 2), ('conditional', 16), ('future', 6)]
    forms = {tense: formatResults(tables[index]) for tense, index in tense_tables}

    # Index into the six parsed forms for each of the ten output pronouns.
    subjects_indicies = [0, 1, 2, 2, 2, 3, 4, 5, 5, 5]
    subjects_pronouns = ['yo', 'tu', 'el', 'ella', 'usted', 'nosotros', 'vosotros', 'ellos', 'ellas', 'ustedes']

    dictionary = nestedDict()

    for i,p in zip(subjects_indicies, subjects_pronouns):
        for tense, _ in tense_tables:
            dictionary[tense][p] = forms[tense][i]

    return dictionary

In [12]:
def formatImperative(string, n = ''):
    """Extract the five imperative forms from one imperative table.

    Parameters:
        string: object exposing a ``.text`` attribute.
        n: any truthy value selects the negative layout, where the forms are
            the first five backslash-n separated chunks containing 'no'.
            Otherwise the forms are located by pronoun, as in formatResults.

    Returns:
        list of five strings in the order tú, usted, nosotros, vosotros,
        ustedes. Parenthesised remarks are removed and only the first
        ';'-separated alternative is kept.

    Raises:
        IndexError: negative layout with fewer than five matching chunks.
        AttributeError: affirmative layout with a pronoun missing.
    """
    if n:
        negimp = [x for x in string.text.split('\\n') if 'no' in x]
        # Explicit indexing keeps the IndexError on short input that callers rely on.
        raw_forms = [negimp[k] for k in range(5)]

    else:
        # usted / ustedes are read from the él / ellos rows of the table.
        raw_forms = [re.search(r'\\n' + pronoun + r'(.*?)\\', string.text).group(1)
                     for pronoun in ('tú', 'él', 'nosotros', 'vosotros', 'ellos')]

    # Raw string: the previous non-raw '[\(]' literal is an invalid escape sequence.
    return [re.sub(r'[\(].*?[\)]', '', form).split(";")[0] for form in raw_forms]

In [13]:
#---SCRAPE REVERSO FOR TRANSLATIONS---#
def getReversoTranslations(verb):
    """Scrape Reverso Context for English translations of a Spanish verb.

    Parameters:
        verb: Spanish infinitive inserted into the Reverso URL.

    Returns:
        list of unique translation strings consisting only of letters and
        spaces, in page order. Empty when the page has no <title> or its title
        does not contain `verb`.
    """
    translations = []

    page = requests.get(f'https://context.reverso.net/translation/spanish-english/{verb}', headers = headers)
    # Name the parser explicitly, as the other scrapers in this notebook do.
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # A response without a <title> used to raise IndexError here.
    titles = soup.select('title')
    if not titles or verb not in titles[0].getText():
        return translations

    # Remove every element carrying the "mobile-hidden" class before reading entries.
    for hidden in soup.find_all('div', {"class": "mobile-hidden"}):
        hidden.extract()

    for hidden in soup.find_all('a', {"class": "mobile-hidden"}):
        hidden.extract()

    # Run the selector once instead of on every loop iteration.
    for node in soup.select('#translations-content .translation.ltr.dict.v'):
        entry = node.getText().replace('\n\n\n\r\n          ','').replace('\n','')

        if entry.replace(' ','').isalpha() and entry not in translations:
            translations.append(entry)

    return translations

In [14]:
#---USE GOOGLETRANS PACKAGE TO FIND TRANSLATIONS---#
def getGoogleTranslations(verb):
    """Return googletrans' verb-sense English translations of a Spanish word.

    Reads ``extra_data['all-translations']`` from the translation result and
    returns the second element of the first group whose first element is 'verb'.

    Parameters:
        verb: Spanish word to translate.

    Returns:
        The translations of the 'verb' group, or an empty list when there is
        no such group, no extra data, or the lookup fails for any reason
        (best effort: failures are deliberately not raised).
    """
    try:
        all_translations = translator.translate(verb, dest = 'en', src = 'es').extra_data['all-translations']

        for group in all_translations or []:
            if group[0] == 'verb':
                return group[1]

    except Exception:
        return []

    # No 'verb' group: previously the raw nested all-translations list leaked out here.
    return []

In [15]:
#---USE SPANISHDICT TO FIND TRANSLATIONS---#
def getSpanishDictTranslations(verb):
    """Scrape SpanishDict for English infinitive translations of `verb`.

    Entries are kept only when they start with "to "; that prefix is dropped
    and duplicates are skipped. Nothing is collected when the page contains an
    element matching '._25QSB23Y'. '.gram_cat' blocks whose text contains
    `verb` + 'se' are ignored.

    Returns:
        list of translation strings in page order (possibly empty).
    """
    output = []

    response = requests.get(f"https://www.spanishdict.com/translate/{verb}")
    soup = bs4.BeautifulSoup(response.text, "lxml")

    if soup.select('._25QSB23Y'):
        return output

    def collect(nodes):
        # Append each new "to ..." entry, without its "to " prefix.
        for node in nodes:
            entry = node.getText()
            if entry.startswith("to ") and entry[3:] not in output:
                output.append(entry[3:])

    collect(soup.select("._2vd6M2gR"))

    for block in soup.select(".gram_cat"):
        if verb + 'se' not in block.getText():
            collect(block.select(".tran_main"))

    return output

In [16]:
def conjugate(verb_data):
    """Fetch and assemble the full conjugation record for one verb.

    Parameters:
        verb_data: (infinitive, rank) pair.

    Returns:
        None when rank is 50000 or more (no request is made). Otherwise a
        nestedDict with the keys 'infinitive', 'rank', 'regularity',
        'translations' (empty list), 'participle' ({'present', 'past'}),
        and the tense trees 'simple', 'compound' and 'progressive', each
        mapping mood -> tense -> pronoun -> form. 'simple' also holds
        'imperative' -> 'affirmative' / 'negative' -> pronoun; those values
        are all '' when the imperative tables cannot be parsed.

    Raises:
        AttributeError / IndexError when the response lacks an expected table,
        pronoun or participle; callers treat that as "skip this verb".

    Reads the module-level `estar` lookup for the progressive tenses.
    """
    verb,rank = verb_data

    # Decide before touching the network; the request used to be made regardless.
    if rank >= 50000:
        return None

    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/1/101/{verb}'
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'lxml')
    tables = soup.select('table')
    body_text = soup.select('body')[0].getText()

    dictionary = nestedDict()

    dictionary['infinitive'] = verb
    dictionary['rank'] = rank
    dictionary['regularity'] = checkRegularity(page)

    dictionary['translations'] = []

    gerund = re.search(r'Gerund: (.*?)\\', body_text).group(1)
    dictionary['participle'] = {'present': gerund,
                                'past': re.search(r'Participle: (.*?)Gerund', body_text).group(1)}

    # (form, mood, tense, index of the table in the response).
    # NOTE(review): every other compound tense uses the table right after its simple
    # counterpart; compound subjunctive imperfect uses 12 rather than 11 — confirm
    # against the response layout. Kept as in the original.
    table_layout = [
        ('simple', 'indicative', 'present', 0),
        ('simple', 'indicative', 'preterite', 4),
        ('simple', 'indicative', 'imperfect', 2),
        ('simple', 'indicative', 'conditional', 16),
        ('simple', 'indicative', 'future', 6),
        ('simple', 'subjunctive', 'present', 8),
        ('simple', 'subjunctive', 'imperfect', 10),
        ('simple', 'subjunctive', 'future', 14),
        ('compound', 'indicative', 'present', 1),
        ('compound', 'indicative', 'preterite', 5),
        ('compound', 'indicative', 'imperfect', 3),
        ('compound', 'indicative', 'conditional', 17),
        ('compound', 'indicative', 'future', 7),
        ('compound', 'subjunctive', 'present', 9),
        ('compound', 'subjunctive', 'imperfect', 12),
        ('compound', 'subjunctive', 'future', 15),
    ]

    # Parse each table once; previously every table was re-parsed for every pronoun.
    parsed = [formatResults(tables[index]) for (_, _, _, index) in table_layout]

    # Index into the six parsed forms for each of the ten output pronouns.
    subjects_indicies = [0, 1, 2, 2, 2, 3, 4, 5, 5, 5]
    subjects_pronouns = ['yo', 'tu', 'el', 'ella', 'usted', 'nosotros', 'vosotros', 'ellos', 'ellas', 'ustedes']

    subjects_indicies_imp = [0, 1, 2, 3, 4]
    subjects_pronouns_imp = ['tu', 'usted', 'nosotros', 'vosotros', 'ustedes']

    for i,p in zip(subjects_indicies, subjects_pronouns):

        #simple and compound (perfect) tenses
        for (form, mood, tense, _), forms in zip(table_layout, parsed):
            dictionary[form][mood][tense][p] = forms[i]

        #progressive: conjugated 'estar' followed by the gerund
        for tense in ('present', 'preterite', 'imperfect', 'conditional', 'future'):
            dictionary['progressive']['indicative'][tense][p] = f"{estar[tense][p]} {gerund}"

    #imperative (once per verb; it used to sit inside the pronoun loop and ran ten times)
    try:
        affirmative = formatImperative(tables[18])
        negative = formatImperative(tables[19], 'negative')
    except Exception:
        # Best effort: verbs without usable imperative tables get empty strings.
        affirmative = [''] * len(subjects_pronouns_imp)
        negative = [''] * len(subjects_pronouns_imp)

    for i,p in zip(subjects_indicies_imp, subjects_pronouns_imp):
        dictionary['simple']['imperative']['affirmative'][p] = affirmative[i]
        dictionary['simple']['imperative']['negative'][p] = negative[i]

    return dictionary

In [19]:
# Build the module-level `estar` lookup that conjugate() reads for the progressive
# tenses, then smoke-test conjugate() on one [infinitive, rank] pair.
estar = getEstar()
conjugate(['aportar', 318])

{'infinitive': 'aportar',
 'rank': 318,
 'regularity': 'r',
 'translations': [],
 'participle': {'present': 'aportando', 'past': 'aportado'},
 'simple': {'indicative': {'present': {'yo': 'aporto',
    'tu': 'aportas',
    'el': 'aporta',
    'ella': 'aporta',
    'usted': 'aporta',
    'nosotros': 'aportamos',
    'vosotros': 'aportáis',
    'ellos': 'aportan',
    'ellas': 'aportan',
    'ustedes': 'aportan'},
   'preterite': {'yo': 'aporté',
    'tu': 'aportaste',
    'el': 'aportó',
    'ella': 'aportó',
    'usted': 'aportó',
    'nosotros': 'aportamos',
    'vosotros': 'aportasteis',
    'ellos': 'aportaron',
    'ellas': 'aportaron',
    'ustedes': 'aportaron'},
   'imperfect': {'yo': 'aportaba',
    'tu': 'aportabas',
    'el': 'aportaba',
    'ella': 'aportaba',
    'usted': 'aportaba',
    'nosotros': 'aportábamos',
    'vosotros': 'aportabais',
    'ellos': 'aportaban',
    'ellas': 'aportaban',
    'ustedes': 'aportaban'},
   'conditional': {'yo': 'aportaría',
    'tu': 'apo

In [None]:
#---SCRAPE UNRANKED VERBS FROM COOLJUGATOR---#
# Collect the first word of every list item whose text has exactly three
# space-separated parts, is alphabetic, and ends in 'r' or 'se'.
page = requests.get('https://cooljugator.com/es/list/all', headers = headers)
soup = bs4.BeautifulSoup(page.text, 'lxml')
coolverbs = []

for item in soup.select('.ui.segment.stacked .item'):
    parts = item.getText().split(' ')
    verb = parts[0]

    if len(parts) != 3 or not verb.isalpha():
        continue

    if verb.endswith('r') or verb.endswith('se'):
        coolverbs.append(verb)

In [None]:
#---SCRAPE UNRANKED VERBS FROM WIKTIONARY---#
# Walk the "Spanish verbs" category page by page. Each request starts from `last`,
# the most recently collected verb made only of a-z (so it is safe in the URL).
wikiverbs = []
urlsafe = set('abcdefghijklmnopqrstuvwxyz')
last = 'ababillarse'
run = True

while run:

    url = f'https://en.wiktionary.org/w/index.php?title=Category:Spanish_verbs&pagefrom={last}'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')
    list_items = soup.select('.mw-content-ltr .mw-category li a')

    # The first 18 matched links are skipped.
    # NOTE(review): presumably these are not verb entries — confirm against the page.
    for anchor in list_items[18:]:
        verb = anchor.getText()

        # Compare in lower case, since that is the form that gets stored.
        if verb.lower() not in wikiverbs and ' ' not in verb and verb.isalpha() and (verb[-1] == 'r' or verb[-2:] == 'se'):
            wikiverbs.append(verb.lower())
            clear(); print(wikiverbs[-1])

    # Pick the next page anchor. Stop when the newest URL-safe verb is the one this
    # page was requested with (no progress), or when there is no URL-safe verb at
    # all — the latter case previously re-requested the same page forever.
    run = False
    for verb in reversed(wikiverbs):
        if set(verb) <= urlsafe:
            if verb != last:
                last = verb
                run = True
            break

In [None]:
#---COMBINE VERBS FROM COOLJUGATOR AND WIKTIONARY---#
# De-duplicate, then sort: bare set order varies between kernel sessions, which made
# the processing order (and the rank of verbs with equal scores) irreproducible.
all_verbs = sorted(set(coolverbs + wikiverbs))

In [None]:
#---SET VARIABLES FOR NGRAM SCORES---#
# Kept in their own cell so the scoring loop below can be re-run after an
# interruption and continue from the current `i` without losing `verbs`.
verbs = []          # [verb, mean n-gram score] pairs collected so far
infinitives = []    # [verb, rank] pairs, filled by the ranking cell
rank = 1            # next rank to hand out
i = 0               # index into all_verbs of the next verb to score

In [None]:
#---ITERATE THROUGH ALL VERBS AND FIND NGRAM SCORES---#
# Scores one verb per second; only float results are kept. Any exception pauses
# for two minutes and retries the same verb.
# NOTE(review): the retry is unbounded, so a verb that always raises stalls the loop.
while i < len(all_verbs):
    try:
        verb = all_verbs[i]
        value = getNgramData(verb, 'spanish', (1980, 2005))

        if isinstance(value, float):
            verbs.append([verb, value])
            clear(wait = True)
            print(f"{verbs[-1]} ({round(i*100/len(all_verbs), 3)}%)")

        i += 1
        time.sleep(1)

    except Exception as ex:
        clear(wait = True)
        print(f"Error ({ex}) – Sleeping for 2 minutes ({round(i*100/len(all_verbs), 3)}%)")
        time.sleep(120)

In [None]:
#---RANK VERBS ACCORDING TO NGRAM SCORES---#
# Highest score first. The ascending sort is reversed (rather than sorting with
# reverse=True) so that verbs with equal scores keep the original tie order.
ascending = sorted(verbs, key = lambda pair: pair[1])

for verb in reversed(ascending):
    infinitives.append([verb[0], rank])
    rank += 1

# From here on `verbs` holds [verb, rank] pairs and `infinitives` is unbound.
verbs = infinitives
del infinitives

In [None]:
#---FILTER DUPLICATES AND RERANK VERBS---#
# Two verbs are duplicates when their unidecode() transliterations are equal;
# the first (best-ranked) spelling wins and ranks are renumbered from 1.
decoded = []
unique = []
rank = 1

for verb in verbs:
    plain = unidecode(verb[0])

    if plain in decoded:
        continue

    decoded.append(plain)
    unique.append([verb[0], rank])
    rank += 1

print(f"{len(verbs)} -> {len(unique)}")
infinitives = unique
del unique

In [None]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNEL TIMEOUT---#
# Writes the ranked [verb, rank] list to data_dir so the notebook can resume
# from the next cell without repeating the n-gram scoring.
f = os.path.join(data_dir, 'infinitives_spanish_intermediate.json')

# ensure_ascii = False keeps accented characters readable in the file.
with open(f, "w", encoding = 'utf8') as file:
    json.dump(infinitives, file, indent = 4, ensure_ascii = False)

In [None]:
#---OPEN CHECKPOINT FILE---#
# Reload the ranked [verb, rank] list written by the checkpoint cell.
with open(os.path.join(data_dir, 'infinitives_spanish_intermediate.json'), "r", encoding = 'utf8') as file:
    infinitives = json.load(file)

In [None]:
#---GENERATE CONJUGATIONS---#
# Conjugate every ranked infinitive. Verbs for which conjugate() returns None,
# raises, or that are listed in `exclude` are recorded in `skipped`; the rest
# are stored under their infinitive and ranked consecutively from 1.
estar = getEstar()
conjugations = defaultdict()
defective = set()
skipped = []
rank = 1
exclude = ['']

for i, infinitive in enumerate(infinitives):
    try:
        c = conjugate(infinitive)

        if c is None or infinitive[0] in exclude:
            clear(wait = True)
            print(f"{infinitive[0]} – skipped ({round(i*100/len(infinitives), 3)}%)")
            skipped.append(infinitive)

        else:
            conjugations[c['infinitive']] = c
            conjugations[infinitive[0]]['rank'] = rank
            rank += 1

            clear(wait = True)
            print(f"{infinitive[0]} – complete ({round(i*100/len(infinitives), 3)}%)")

    except Exception as ex:
        clear(wait = True)
        print(f"{infinitive[0]} – skipped – {ex} ({round(i*100/len(infinitives), 3)}%)")
        skipped.append(infinitive)

# Plain dict from here on, so it can be dumped to JSON.
conjugations = dict(conjugations)

In [None]:
#---REMOVE VERBS WITH UNKNOWN REGULARITIES---#
# Verbs whose regularity code is 'x' are dropped; the remaining verbs are
# re-ranked consecutively from 1 in their current dictionary order.
remove = []
rank = 1

for verb in conjugations:
    if conjugations[verb]['regularity'] == 'x':
        remove.append(verb)
    else:
        conjugations[verb]['rank'] = rank
        rank += 1

print(f"{len(conjugations)} -> {len(conjugations) - len(remove)}")

# Deletion happens after the scan because a dict cannot change size while iterated.
for verb in remove:
    # Was `conjugations.pop[verb]`, which subscripts the method and raises TypeError.
    conjugations.pop(verb)

In [None]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNEL TIMEOUT---#
# Writes the conjugations (translations still empty) to data_dir.
# ensure_ascii = False keeps accented characters readable in the file.
with open(os.path.join(data_dir, 'conjugations_spanish_intermediate.json'), "w", encoding = 'utf8') as file:
    json.dump(conjugations, file, indent = 4, ensure_ascii = False)

In [10]:
#---SET INDEX VARIABLE TO 1 BY DEFAULT---#
# `i` is the lowest rank the translation loop will process. It must be 1 for a
# fresh run: with the leftover resume value (9191) every lower-ranked verb was
# never translated and then deleted by the "no translations" clean-up.
# To resume an interrupted run, temporarily set it to the last rank reported.
i = 1

In [11]:
#---OPEN CHECKPOINT FILE---#
# Prefer the partially translated checkpoint when a previous translation run left
# one behind; otherwise start from the untranslated conjugations checkpoint. The
# translated file is only written by the translation loop, so on a fresh run it
# does not exist yet and opening it unconditionally raised FileNotFoundError.
translated_checkpoint = os.path.join(data_dir, 'conjugations_translated_spanish_intermediate.json')
untranslated_checkpoint = os.path.join(data_dir, 'conjugations_spanish_intermediate.json')

if os.path.exists(translated_checkpoint):
    checkpoint = translated_checkpoint
else:
    checkpoint = untranslated_checkpoint

with open(checkpoint, "r", encoding = 'utf8') as file:
    conjugations = json.loads(file.read())

In [12]:
#---ADD TRANSLATIONS---#
# For every verb ranked at or above `i` that has no translations yet, try
# SpanishDict, then Reverso, then googletrans, stopping at the first source that
# returns anything. Progress is saved to the translated checkpoint whenever a
# processed verb's rank is a multiple of 200.
for verb in conjugations:

    if conjugations[verb]['rank'] > i - 1 and conjugations[verb]['translations'] == []:

        spanish_dict_translations = getSpanishDictTranslations(verb)

        translations = spanish_dict_translations

        # Reverso is queried only as a fallback; an extra unconditional call whose
        # result was never used doubled the requests sent to it.
        if not translations:
            translations = getReversoTranslations(verb)

        if not translations:
            translations = getGoogleTranslations(verb)
            # Extra pause after using googletrans.
            time.sleep(9)

        conjugations[verb]['translations'] = translations

        if conjugations[verb]['rank'] % 200 == 0:
            with open(os.path.join(data_dir, 'conjugations_translated_spanish_intermediate.json'), "w", encoding = 'utf8') as file:
                json.dump(conjugations, file, indent = 4, ensure_ascii = False)

        clear(wait = True)
        print(f"{verb} ({conjugations[verb]['rank']}) – complete ({round(i*100/len(conjugations), 3)}%)")

        # Remember how far we got so an interrupted run can resume from this rank.
        i = conjugations[verb]['rank']

        time.sleep(1)

wasapear (10679) – complete (99.991%)


In [None]:
#---REMOVE VERBS WITH NO TRANSLATIONS---#
# Verbs whose translation list is empty are dropped; the remaining verbs are
# re-ranked consecutively from 1 in their current dictionary order.
remove = []
rank = 1

for verb, entry in conjugations.items():
    if entry['translations'] != []:
        entry['rank'] = rank
        rank += 1
    else:
        remove.append(verb)

# Deletion happens after the scan because a dict cannot change size while iterated.
for verb in remove:
    del conjugations[verb]

In [None]:
#---SAVE CONJUGATIONS TO JSON---#
# Final output: the translated, re-ranked conjugations keyed by infinitive.
f = os.path.join(data_dir, 'conjugations_spanish.json')
# ensure_ascii = False keeps accented characters readable in the file.
with open(f, "w", encoding = 'utf8') as file:
    json.dump(conjugations, file, indent = 4, ensure_ascii = False)

In [None]:
#---PARSE INFINITIVES AND SAVE TO JSON---#
# One [infinitive, rank, regularity] triple per verb, in dictionary order.
infinitives = [[verb, entry['rank'], entry['regularity']]
               for verb, entry in conjugations.items()]

f = os.path.join(data_dir, 'infinitives_spanish.json')
with open(f, "w", encoding = 'utf8') as file:
    json.dump(infinitives, file, indent = 4, ensure_ascii = False)