In [52]:
import os
import requests
import bs4
import lxml
import json
import re
import time
import uuid
import copy
from googletrans import Translator
from unidecode import unidecode
from collections import defaultdict
from IPython.display import clear_output as clear

In [2]:
path = os.path.abspath('').replace('scripts', '')
data_dir = os.path.join(path, 'data')

In [3]:
class nestedDict(dict):
    def __missing__(self, key):
        value = self[key] = type(self)()
        return value

In [4]:
translator = Translator()

In [5]:
#---HEADERS FOR WEB SCRAPING---#
headers = {"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
           "accept-encoding": "gzip, deflate, br",
           "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
           "cache-control": "max-age=0",
           "cookie": "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446",
           "referer": "https://hidemy.name/en/proxy-list/?start=64",
           "sec-ch-ua-mobile": "?0",
           "sec-ch-ua-platform": "macOS",
           "sec-fetch-dest": "document",
           "sec-fetch-mode": "navigate",
           "sec-fetch-site": "same-origin",
           "sec-fetch-user": "?1",
           "upgrade-insecure-requests": "1",
           "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}

In [6]:
def findAdditionalVerbs(verb):

    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/4/104/{verb}'
    page = requests.get(url, headers = headers)
    
    try: 
        similar_raw = re.search(r'Verbs conjugated like(.*?)<h3>', page.text).group(1)
        similar_parsed = re.findall(r'\\">(.*?)</a>', similar_raw)
        similar_verbs = [x for x in similar_parsed if x.isalpha()]
    except: similar_verbs = []

    try: 
        prefix_raw = re.search(r'Other Verbs with Separable Prefix(.*?)<h3>', page.text).group(1)
        prefix_parsed = re.findall(r'\\">(.*?)</a>', prefix_raw)
        prefix_verbs = [x for x in prefix_parsed if x.isalpha()]
    except: prefix_verbs = []

    try: 
        base_raw = re.search(r'Other Verbs with the same Base Verb(.*?)<h3>', page.text).group(1)
        base_parsed = re.findall(r'\\">(.*?)</a>', base_raw)
        base_verbs = [x for x in base_parsed if x.isalpha()]
    except: base_verbs = []

    try: 
        synonyms_raw = re.search(r'<h4>Synonyms</h4>(.*?)<h3>', page.text).group(1)
        synonyms_parsed = re.findall(r'\\">(.*?)</a>', synonyms_raw)
        synonyms_verbs = [x for x in synonyms_parsed if x.isalpha()]
    except: synonyms_verbs = []

    return list(set(similar_verbs + prefix_verbs + base_verbs + synonyms_verbs))

In [7]:
def getNgramData(verb, language, years):
    corpus = {'spanish':32, 'french':19, 'italian':22, 'german':20}
    syear,eyear = years
    raw = requests.get(f'https://books.google.com/ngrams/json?content={verb}&year_start={syear}&year_end={eyear}&corpus={corpus[language]}&smoothing=0', headers = headers)
        
    if raw.text != '[]': data = json.loads(raw.text)[0]
    else: return 'ngram not found'
    
    if data['ngram'] == verb:
        values = data['timeseries']
        return sum(values)/len(values)
    else: return 'error'

In [8]:
def checkRegularity(verb, page):

    if 'NOTRECOGVERB' in page.text:
        return 'x'
    elif "class=\\\"irregular\\\">" in page.text:
        return 'i'
    elif "class=\\\"orto\\\">" in page.text:
        return 'sc'
    else:
        return 'r'

In [9]:
def formatResults(string):
    io = re.search(r'\\nio(.*?)\\', string.text).group(1)
    tu = re.search(r'\\ntu(.*?)\\', string.text).group(1)
    lui = re.search(r'\\nlui(.*?)\\', string.text).group(1)
    noi = re.search(r'\\nnoi(.*?)\\', string.text).group(1)
    voi = re.search(r'\\nvoi(.*?)\\', string.text).group(1)
    loro = re.search(r'\\nloro(.*?)\\', string.text).group(1)

    io = re.sub('[\(].*?[\)]', '', io).split(";")[0]
    tu = re.sub('[\(].*?[\)]', '', tu).split(";")[0]
    lui = re.sub('[\(].*?[\)]', '', lui).split(";")[0]
    noi = re.sub('[\(].*?[\)]', '', noi).split(";")[0]
    voi = re.sub('[\(].*?[\)]', '', voi).split(";")[0]
    loro = re.sub('[\(].*?[\)]', '', loro).split(";")[0]

    collated = [io, tu, lui, noi, voi, loro]

    for i in range(0, len(collated)):
        if ";" in collated[i]:
            if len(collated[i].split(" ")) == 4:
                collated[i] = collated[i].split("; ")[1]
            else:
                collated[i] = collated[i].split("; ")[0]

    return collated

In [10]:
def getStare():
    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/4/104/stare'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml') 

    dictionary = nestedDict()

    subjects_indicies = [0, 1, 2, 2, 3, 4, 5]
    subjects_pronouns = ['io', 'tu', 'lui', 'lei', 'noi', 'voi', 'loro']

    for i,p in zip(subjects_indicies, subjects_pronouns):

        dictionary['present'][p] = formatResults(soup.select('table')[0])[i]
        dictionary['imperfect'][p] = formatResults(soup.select('table')[2])[i]

    return dictionary

In [11]:
def formatImperative(string, infinitive = ''):
    tu = re.search(r'\\ntu(.*?)\\', string.text).group(1)
    lui = re.search(r'\\nlui(.*?)\\', string.text).group(1)
    noi = re.search(r'\\nnoi(.*?)\\', string.text).group(1)
    voi = re.search(r'\\nvoi(.*?)\\', string.text).group(1)
    loro = re.search(r'\\nloro(.*?)\\', string.text).group(1)

    tu = re.sub('[\(].*?[\)]', '', tu).split(";")[0]
    lui = re.sub('[\(].*?[\)]', '', lui).split(";")[0]
    noi = re.sub('[\(].*?[\)]', '', noi).split(";")[0]
    voi = re.sub('[\(].*?[\)]', '', voi).split(";")[0]
    loro = re.sub('[\(].*?[\)]', '', loro).split(";")[0]

    if not infinitive:
        return [tu, lui, noi, voi, loro]
    else:
        return ["non " + x for x in [infinitive, lui, noi, voi, loro]]

In [12]:
#---SCRAPE REVERSO FOR TRANSLATIONS---#
def getReversoTranslations(verb):

    translations = []

    page = requests.get(f'https://context.reverso.net/translation/italian-english/{verb}', headers = headers)
    soup = bs4.BeautifulSoup(page.text)

    if verb in soup.select('title')[0].getText():

        while len(soup.find_all('div', {"class": "mobile-hidden"})) > 0:
            soup.find_all('div', {"class": "mobile-hidden"})[0].extract()
        
        while len(soup.find_all('a', {"class": "mobile-hidden"})) > 0:  
            soup.find_all('a', {"class": "mobile-hidden"})[0].extract()

        for i in range(0, len(soup.select('#translations-content .translation.ltr.dict.v'))):
            translations.append(soup.select('#translations-content .translation.ltr.dict.v')[i].getText().replace('\n\n\n\r\n          ','').replace('\n',''))

    return translations

In [13]:
#---USE GOOGLETRANS PACKAGE TO FIND TRANSLATONS---#
def getGoogleTranslations(verb):
    
    try: 
        translations = translator.translate(verb, dest = 'en', src = 'it').extra_data['all-translations']

        if translations != None:
            for x in range(0, len(translations)):
                if translations[x][0] == 'verb':
                    translations = translations[x][1]
                    break

        else:
            translations = []
                    
    except Exception as ex:
            translations = []
    
    return translations

In [14]:
def conjugate(verb_data):
    
    verb,rank = verb_data

    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/4/104/{verb}'
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'lxml') 

    if rank < 50000:

        dictionary = nestedDict()

        dictionary['infinitive'] = verb
        dictionary['rank'] = rank
        dictionary['regularity'] = checkRegularity(verb, page)

        dictionary['translations'] = []

        dictionary['participle'] = {'present': re.search(r'Participio presente: (.*?)\\', soup.select('body')[0].getText()).group(1),
                                    'past': re.search(r'Participio passato: (.*?)\\', soup.select('body')[0].getText()).group(1)}

        gerund = re.search(r'Gerundio: (.*?)\\', soup.select('body')[0].getText()).group(1) 

        subjects_indicies = [0, 1, 2, 2, 3, 4, 5]
        subjects_pronouns = ['io', 'tu', 'lui', 'lei', 'noi', 'voi', 'loro']

        subjects_indicies_imp = [0, 1, 2, 3, 4]
        subjects_pronouns_imp = ['tu', 'lei', 'noi', 'voi', 'loro']

        for i,p in zip(subjects_indicies, subjects_pronouns):

            #indicative
            dictionary['simple']['indicative']['present'][p] = formatResults(soup.select('table')[0])[i]
            dictionary['simple']['indicative']['preterite'][p] = formatResults(soup.select('table')[6])[i]
            dictionary['simple']['indicative']['imperfect'][p] = formatResults(soup.select('table')[2])[i]
            dictionary['simple']['indicative']['future'][p] = formatResults(soup.select('table')[4])[i]

            #subjunctive
            dictionary['simple']['subjunctive']['present'][p] = formatResults(soup.select('table')[8])[i]
            dictionary['simple']['subjunctive']['imperfect'][p] = formatResults(soup.select('table')[10])[i]

            #conditional
            dictionary['simple']['conditional']['conditional'][p] = formatResults(soup.select('table')[12])[i]

            #perfect indicative
            dictionary['compound']['indicative']['present'][p] = formatResults(soup.select('table')[1])[i]
            dictionary['compound']['indicative']['preterite'][p] = formatResults(soup.select('table')[7])[i]
            dictionary['compound']['indicative']['imperfect'][p] = formatResults(soup.select('table')[3])[i]
            dictionary['compound']['indicative']['future'][p] = formatResults(soup.select('table')[5])[i]

            #perfect subjunctive
            dictionary['compound']['subjunctive']['present'][p] = formatResults(soup.select('table')[9])[i]
            dictionary['compound']['subjunctive']['imperfect'][p] = formatResults(soup.select('table')[11])[i]

            #perfect conditional
            dictionary['compound']['conditional']['conditional'][p] = formatResults(soup.select('table')[13])[i]

            #progressive
            dictionary['progressive']['indicative']['present'][p] = f"{stare['present'][p]} {gerund}"
            dictionary['progressive']['indicative']['imperfect'][p] = f"{stare['imperfect'][p]} {gerund}"

        #imperative
            try:
                for i,p in zip(subjects_indicies_imp, subjects_pronouns_imp):
                    dictionary['simple']['imperative']['affirmative'][p] = formatImperative(soup.select('table')[14])[i]
                    dictionary['simple']['imperative']['negative'][p] = formatImperative(soup.select('table')[14], verb)[i]
            except:
                for i,p in zip(subjects_indicies_imp, subjects_pronouns_imp):
                    dictionary['simple']['imperative']['affirmative'][p] = ''
                    dictionary['simple']['imperative']['negative'][p] = ''
    
        return dictionary

In [15]:
#---SCRAPE UNRANKED VERBS FROM COOLJUGATOR---#
page = requests.get('https://cooljugator.com/it/list/all', headers = headers)
soup = bs4.BeautifulSoup(page.text, 'lxml')
coolverbs = []

for item in soup.select('.ui.segment.stacked .item'):
    verb = item.getText().split(' ')[0]
    if len(item.getText().split(' ')) == 3 and verb.isalpha() and (verb[-2:] == 're' or verb[-2:] == 'si'):
        coolverbs.append(verb)

In [17]:
#---SCRAPE UNRANKED VERBS FROM WIKIPEDIA---#
wikiverbs = []
urlsafe = set([x for x in 'abcdefghijklmnopqrstuvwxyz'])
last = 'abalienare'
run = True

while run:

    url = f'https://en.wiktionary.org/w/index.php?title=Category:Italian_verbs&pagefrom={last}'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml') 
    list_items = soup.select('.mw-content-ltr .mw-category li a')

    for i in range(18, len(list_items)):
        verb = list_items[i].getText()

        if verb not in wikiverbs and ' ' not in verb and verb.isalpha() and (verb[-2:] == 're' or verb[-2:] == 'si'):
            wikiverbs.append(verb.lower())
            clear(); print(wikiverbs[-1])
    
    for verb in reversed(wikiverbs):
        split = set([x for x in verb])  
        if len(split - urlsafe) == 0:
            if verb != last:
                last = verb
                break
            else:
                run = False

zuppare


In [18]:
#---COMBINE VERBS FROM COOLJUGATOR AND WIKIPEDIA---#
all_verbs = list(set(coolverbs + wikiverbs))

In [None]:
#---SET DEFAULT VARIABLES FOR ADDITIONAL VERBS---#
total = all_verbs
new_verbs = []
initlen = len(all_verbs)

In [20]:
#---SET DEFAULT INDEX FOR ADDITIONAL VERBS---#
start = 5102

In [21]:
#---FIND ADDITIONAL VERBS USING VERBIX---#
for i in range(start, len(all_verbs)):
    
    additional = findAdditionalVerbs(all_verbs[i])
    
    for verb in additional:
        if verb not in total:
            total.append(verb)
            new_verbs.append(verb)
    
    clear(wait = True); print(f"{all_verbs[i]} – {initlen} -> {len(total)} | {len(new_verbs)} ({round(i*100/initlen, 3)}%)")

all_verbs = total; del total

battersela – 12857 -> 13960 | 1103 (105.849%)


In [29]:
#---SET VARIABLES FOR NGRAM SCORES---#
verbs = []

In [40]:
#---SET START INDEX FOR NGRAM SCORING---#
start = 12002

In [41]:
#---ITERATE THROUGH ALL VERBS AND FIND NGRAM SCORES---#
for i in range(start, len(all_verbs)):
    try:
        verb = all_verbs[i]
        value = getNgramData(verb, 'italian', (1980, 2005))

        if isinstance(value, float):
            verbs.append([verb, value])
            clear(wait = True); print(f"{verb} ({i}) ({round(i*100/len(all_verbs), 3)}%)")
        else:
            clear(wait = True); print(f"{verb} ({i}) not found")
        
        i += 1; time.sleep(1)
        
    except Exception as ex:
        clear(wait = True); print(f"Error ({ex}) – Sleeping for 2 minutes ({round(i*100/len(all_verbs), 3)}%)")
        time.sleep(120)

codazzo (13959) (99.993%)


In [42]:
#---RANK VERBS ACCORDING TO NGRAM SCORES---#
infinitives = []
rank = 1

for verb in sorted(verbs, key = lambda x: x[1])[::-1]:
    infinitives.append([verb[0], rank])
    rank += 1

verbs = infinitives; del infinitives

In [43]:
#---FILTER DUPLICATES AND RERANK VERBS---#
decoded = []
unique = []
rank = 1

for verb in verbs:
    if unidecode(verb[0]) not in decoded:
        decoded.append(unidecode(verb[0]))
        unique.append([verb[0], rank])
        rank += 1

print(f"{len(verbs)} -> {len(unique)}")

infinitives = unique; del unique

11253 -> 11250


In [51]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNAL TIMEOUT---#
f = os.path.join(data_dir, 'infinitives_italian_intermediate.json')

with open(f, "w", encoding = 'utf8') as file:
    json.dump(infinitives, file, indent = 4, ensure_ascii = False)

In [53]:
temp = copy.deepcopy(infinitives)

for verb in temp:
    if (verb[0][-2:] == 're' or verb[0][-2:] == 'si') == False:
        infinitives.remove(verb)

In [54]:
#---SET VARIABLES FOR CONJUGATIONS---#
stare = getStare()
conjugations = defaultdict()
defective = set()
skipped = []
rank = 1
exclude = ['']

In [58]:
#---SET INDEX AT WHICH TO START CONJUGATIONS---#
start = 1590

In [59]:
#---GENERATE CONJUGATIONS---#
for i in range(start, len(infinitives)):

    infinitive = infinitives[i]
    try:
        c = conjugate(infinitive)

        if c != None and infinitive[0] not in exclude: 
            conjugations[c['infinitive']] = c
            conjugations[infinitive[0]]['rank'] = rank
            rank += 1

            clear(wait = True), print(f"{infinitive[0]} – complete ({round(i*100/len(infinitives), 3)}%)")
        
        else:
            clear(wait = True), print(f"{infinitive[0]} – skipped ({round(i*100/len(infinitives), 3)}%)")
            skipped.append(infinitive)

    except Exception as ex:
        clear(wait = True), print(f"{infinitive[0]} – skipped – {ex} ({round(i*100/len(infinitives), 3)}%)")
        skipped.append(infinitive)

conjugations = dict(conjugations)

soprarrivare – complete (99.991%)


In [60]:
#---REMOVE VERBS WITH UNKNOWN REGULARITIES---#
remove = []
rank = 1

for verb in conjugations:
    if conjugations[verb]['regularity'] == 'x':
        remove.append(verb)
    else:
        conjugations[verb]['rank'] = rank
        rank += 1

print(f"{len(conjugations)} -> {len(conjugations) - len(remove)}")

for verb in remove:
    conjugations.pop[verb]

8071 -> 8071


In [61]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNAL TIMEOUT---#
with open(os.path.join(data_dir, 'conjugations_italian_intermediate.json'), "w", encoding = 'utf8') as file:
    json.dump(conjugations, file, indent = 4, ensure_ascii = False)

In [62]:
#---SET INDEX VARIABLE TO 1 BY DEFAULT---#
i = 0

In [64]:
#---ADD TRANSLATIONS---#
for verb in conjugations:

    if conjugations[verb]['rank'] > i - 1 and conjugations[verb]['translations'] == []:
        
        translations = getReversoTranslations(verb)

        if not translations:
            translations = getGoogleTranslations(verb)
            time.sleep(9)

        conjugations[verb]['translations'] = translations

        if conjugations[verb]['rank'] % 200 == 0:
            with open(os.path.join(data_dir, 'conjugations_translated_italian_intermediate.json'), "w", encoding = 'utf8') as file:
                json.dump(conjugations,file, indent = 4, ensure_ascii = False)

        clear(wait = True), print(f"{verb} ({i}) – complete ({round(i*100/len(conjugations), 3)}%)")

        i += 1

        time.sleep(1)

soprarrivare (8071) – complete (100.0%)


In [65]:
#---REMOVE VERBS WITH NO TRANSLATIONS---#
remove = []
rank = 1

for verb in conjugations:
    if conjugations[verb]['translations'] == []:
        remove.append(verb)
    else:
        conjugations[verb]['rank'] = rank
        rank += 1

for verb in remove:
    conjugations.pop(verb)

In [66]:
#---SAVE CONJUGATIONS TO JSON---#
f = os.path.join(data_dir, 'conjugations_italian.json')
with open(f, "w", encoding = 'utf8') as file:
    json.dump(conjugations, file, indent = 4, ensure_ascii = False)

In [67]:
#---PARSE INFINITIVES AND SAVE TO JSON---#
infinitives = []

for verb in conjugations:
    infinitives.append([verb, conjugations[verb]['rank'], conjugations[verb]['regularity']])

f = os.path.join(data_dir, 'infinitives_italian.json')
with open(f, "w", encoding = 'utf8') as file:
    json.dump(infinitives, file, indent = 4, ensure_ascii = False)