In [None]:
import os
import requests
import bs4
import lxml
import json
import re
import time
import uuid
import copy
from googletrans import Translator
from unidecode import unidecode
from collections import defaultdict
from IPython.display import clear_output as clear

In [None]:
# Directory layout used by the rest of the notebook.
path = os.path.abspath('')  # absolute form of the empty path, i.e. resolved against the current working directory
top_dir = path.replace('.corrections', '')  # `path` with every occurrence of the substring '.corrections' removed
data_dir = os.path.join(path, 'out')  # the final 'reconjugated_italian.json' is written here
conjugations_dir = os.path.join(top_dir, 'conjugations', 'data', 'language-specific')  # the infinitive list and the previous conjugations are read from here

In [None]:
class nestedDict(dict):
    """A dict that creates nested levels on first access.

    Looking up a key that does not exist stores a new, empty instance of the
    same class under that key and returns it, so chained assignments such as
    ``d['a']['b']['c'] = 1`` work without creating the intermediate levels
    by hand.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ only when `key` is absent.
        child = type(self)()
        self[key] = child
        return child

In [None]:
#---HEADERS FOR WEB SCRAPING---#
# Browser-like request headers passed to requests.get() when fetching the
# conjugation pages. Long values are split across adjacent string literals,
# which Python joins into a single string.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
    "cache-control": "max-age=0",
    # NOTE(review): hard-coded cookie and referer values captured from a
    # browser session — confirm they are still required by the target site.
    "cookie": (
        "t=238707487; "
        "_ga=GA1.2.1376835774.1641262578; "
        "_gid=GA1.2.1482423077.1641262578; "
        "_fbp=fb.1.1641262579526.851471446"
    ),
    "referer": "https://hidemy.name/en/proxy-list/?start=64",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "macOS",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
}

In [None]:
def checkRegularity(verb, page):
    """Classify a verb from marker substrings found in a response body.

    Args:
        verb: The infinitive being checked (not used by the lookup itself).
        page: Any object exposing the response body as ``page.text``.

    Returns:
        'x'  if the body contains 'NOTRECOGVERB',
        'i'  if it contains the backslash-escaped ``class="irregular">`` marker,
        'sc' if it contains the backslash-escaped ``class="orto">`` marker,
        'r'  otherwise.
        The checks are applied in that order; the first match wins.
    """
    body = page.text

    if 'NOTRECOGVERB' in body:
        return 'x'

    # The markers include literal backslashes before the quotes, i.e. they
    # match HTML that has been escaped inside a quoted string.
    markers = (
        ("class=\\\"irregular\\\">", 'i'),
        ("class=\\\"orto\\\">", 'sc'),
    )
    for needle, code in markers:
        if needle in body:
            return code

    return 'r'

In [None]:
# One-off fetch of the Reverso page for 'durare', parsed with the lxml backend.
# The resulting `page` / `soup` are handy for inspecting the page layout;
# conjugate() and getStare() fetch their own pages and do not read them.
url = 'https://conjugator.reverso.net/conjugation-italian-verb-durare.html'
page = requests.get(url, headers = headers)
soup = bs4.BeautifulSoup(page.text, 'lxml')

In [None]:
def formatResults(string):
    """Extract the conjugated forms from one tense box.

    Args:
        string: A parsed element whose ``select('li')`` yields one item per
            conjugated person; each item must provide ``getText()``.

    Returns:
        A six-item list ``[io, tu, third-singular, noi, voi, loro]``.
        When the box lists 'lui' and 'lei' on separate rows, the third item
        is ``'<lui form>__<lei form>'``; when it has a combined 'lei/lui'
        row, the third item is that single form.

    Raises:
        IndexError: if an expected row or its pronoun prefix is missing.
    """
    items = string.select('li')

    # Count the rows that precede the first item rendered as <li v="2" ...>
    # (all rows if there is no such item).
    # NOTE(review): presumably v="2" marks the start of an alternative set of
    # forms — confirm against the page markup.
    lines = next(
        (position for position, li in enumerate(items) if 'li v="2"' in str(li)),
        len(items),
    )

    def form(index, pronoun):
        # Text after the pronoun, cut at the first '/' or ';' (alternative
        # spellings), with surrounding whitespace removed.
        return items[index].getText().split(pronoun)[1].split('/')[0].split(';')[0].strip()

    if lines == 6:
        # Six rows: third person singular is a combined 'lei/lui' row.
        combined_layout = ['io ', 'tu ', 'lei/lui ', 'noi ', 'voi ', 'loro ']
        return [form(index, pronoun) for index, pronoun in enumerate(combined_layout)]

    # Any other count is treated as the seven-row layout with separate
    # 'lui' and 'lei' rows, which are joined with '__'.
    split_layout = ['io ', 'tu ', 'lui ', 'lei ', 'noi ', 'voi ', 'loro ']
    io, tu, lui, lei, noi, voi, loro = [
        form(index, pronoun) for index, pronoun in enumerate(split_layout)
    ]
    return [io, tu, lui + '__' + lei, noi, voi, loro]

In [None]:
def formatImperative(string, n = False, infinitive = ''):
    """Extract the five imperative forms from the imperative box.

    Args:
        string: A parsed element whose ``select('li .verbtxt')`` yields at
            least five items providing ``getText()``.
        n: When truthy, build the negative imperative instead.
        infinitive: Used in place of the 'tu' form for the negative
            imperative.

    Returns:
        ``[tu, lei, noi, voi, loro]``. For the negative, every entry is
        prefixed with ``'non '`` and the 'tu' entry is built from
        ``infinitive``.

    Raises:
        IndexError: if fewer than five forms are present.
    """
    cells = string.select('li .verbtxt')

    # Keep only the first variant of each form (cut at '/' or ';').
    forms = [
        cells[position].getText().split('/')[0].split(';')[0].strip()
        for position in range(5)
    ]

    if not n:
        return forms

    return ["non " + form for form in [infinitive] + forms[1:]]

In [None]:
def getStare():
    """Fetch the present and imperfect indicative of 'stare' from Reverso.

    Returns:
        A nestedDict of the form ``{'present': {...}, 'imperfect': {...}}``
        where each inner mapping is keyed by 'io', 'tu', 'lui', 'lei', 'noi',
        'voi', 'loro'. 'lui' and 'lei' both receive the third-singular entry
        returned by formatResults().

    Raises:
        IndexError: if the page does not contain the expected tense boxes.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    url = 'https://conjugator.reverso.net/conjugation-italian-verb-stare.html'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # Parse each tense box once instead of once per pronoun.
    boxes = soup.select('.blue-box-wrap')
    present_forms = formatResults(boxes[0])
    imperfect_forms = formatResults(boxes[1])

    dictionary = nestedDict()

    # formatResults() returns six entries; 'lui' and 'lei' share index 2.
    subjects_indicies = [0, 1, 2, 2, 3, 4, 5]
    subjects_pronouns = ['io', 'tu', 'lui', 'lei', 'noi', 'voi', 'loro']

    for i,p in zip(subjects_indicies, subjects_pronouns):
        dictionary['present'][p] = present_forms[i]
        dictionary['imperfect'][p] = imperfect_forms[i]

    return dictionary

In [None]:
def conjugate(verb_data):
    """Scrape and assemble the conjugation table for one Italian verb.

    The forms come from the verb's Reverso page; the regularity flag is
    derived from the Verbix API response via checkRegularity(). The
    progressive tenses are built from the module-level ``stare`` table plus
    the verb's gerund.

    Args:
        verb_data: A two-item sequence ``(verb, rank)``.

    Returns:
        A nestedDict with the keys 'infinitive', 'rank', 'regularity',
        'translations', 'participle', 'simple', 'compound' and 'progressive',
        or None when ``rank`` is not below 50000.

    Raises:
        IndexError: if the Reverso page lacks an expected tense box or row.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    verb,rank = verb_data

    # Decide before touching the network: verbs outside the rank window are
    # discarded anyway, so there is no point in fetching their pages.
    if not rank < 50000:
        return None

    url = f'https://conjugator.reverso.net/conjugation-italian-verb-{verb}.html'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # The regularity markers checked by checkRegularity() belong to the Verbix
    # response, so the Verbix URL (not the Reverso one) must be requested.
    verbix_url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/4/104/{verb}'
    verbix_page = requests.get(verbix_url)

    # Select the tense boxes once; every lookup below indexes into this list.
    boxes = soup.select('.blue-box-wrap')

    dictionary = nestedDict()

    dictionary['infinitive'] = verb
    dictionary['rank'] = rank
    dictionary['regularity'] = checkRegularity(verb, verbix_page)

    dictionary['translations'] = []

    dictionary['participle'] = {'present': boxes[18].select('li')[0].getText(),
                                'past': boxes[19].select('li')[0].getText()}

    gerund = boxes[15].select('li')[0].getText()

    # formatResults() returns six entries; 'lui' and 'lei' share index 2.
    subjects_indicies = [0, 1, 2, 2, 3, 4, 5]
    subjects_pronouns = ['io', 'tu', 'lui', 'lei', 'noi', 'voi', 'loro']

    subjects_indicies_imp = [0, 1, 2, 3, 4]
    subjects_pronouns_imp = ['tu', 'lei', 'noi', 'voi', 'loro']

    # (aspect, mood, tense, index of the tense box on the page), listed in the
    # order in which the keys are inserted into the result.
    tense_boxes = [
        ('simple', 'indicative', 'present', 0),
        ('simple', 'indicative', 'preterite', 2),
        ('simple', 'indicative', 'imperfect', 1),
        ('simple', 'indicative', 'future', 3),
        ('simple', 'subjunctive', 'present', 8),
        ('simple', 'subjunctive', 'imperfect', 10),
        ('simple', 'conditional', 'conditional', 12),
        ('compound', 'indicative', 'present', 4),
        ('compound', 'indicative', 'preterite', 6),
        ('compound', 'indicative', 'imperfect', 5),
        ('compound', 'indicative', 'future', 7),
        ('compound', 'subjunctive', 'present', 9),
        ('compound', 'subjunctive', 'imperfect', 11),
        ('compound', 'conditional', 'conditional', 13),
    ]

    for aspect, mood, tense, box_index in tense_boxes:
        # Parse each box once, then distribute its forms over the pronouns.
        forms = formatResults(boxes[box_index])
        for i,p in zip(subjects_indicies, subjects_pronouns):
            dictionary[aspect][mood][tense][p] = forms[i]

    #progressive
    for p in subjects_pronouns:
        dictionary['progressive']['indicative']['present'][p] = f"{stare['present'][p]} {gerund}"
        dictionary['progressive']['indicative']['imperfect'][p] = f"{stare['imperfect'][p]} {gerund}"

    #imperative
    # Best effort: when the imperative box cannot be parsed, the forms are
    # left as empty strings rather than failing the whole verb.
    try:
        affirmative = formatImperative(boxes[14])
        negative = formatImperative(boxes[14], n = True, infinitive = verb)
    except Exception:
        affirmative = [''] * len(subjects_pronouns_imp)
        negative = [''] * len(subjects_pronouns_imp)

    for i,p in zip(subjects_indicies_imp, subjects_pronouns_imp):
        dictionary['simple']['imperative']['affirmative'][p] = affirmative[i]
        dictionary['simple']['imperative']['negative'][p] = negative[i]

    return dictionary

In [None]:
#---SET VARIABLES FOR CONJUGATIONS---#
# Module-level state shared by the scraping cells that follow.
stare = getStare()  # forms of 'stare'; conjugate() reads this global to build the progressive tenses
conjugations = defaultdict()  # created without a default_factory, so missing keys still raise KeyError
defective = set()  # NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed
skipped = []  # [verb, rank] entries that the generation loop could not conjugate
rank = 1  # running rank assigned to each verb that is conjugated successfully
exclude = ['']  # infinitives that are skipped even when conjugate() succeeds

In [None]:
#---SET INDEX AT WHICH TO START CONJUGATIONS---#
# Position in `infinitives` at which the generation loop begins
# (it iterates over range(start, len(infinitives))).
start = 0

In [None]:
#---OPEN CHECKPOINT FILE---#
# Load the list of infinitives to conjugate.
# Alternative input (disabled):
# f = os.path.join(data_dir, 'missing_selected_italian.json')
f = os.path.join(conjugations_dir, 'infinitives_italian.json')

with open(f, "r", encoding = 'utf8') as file:
    infinitives = json.load(file)

# Keep only the first two fields of every record, as a fresh two-item list.
infinitives = [[record[0], record[1]] for record in infinitives]

In [None]:
#---GENERATE CONJUGATIONS---#
# Conjugate every infinitive from `start` onwards. Successful verbs are stored
# in `conjugations` with a running rank; verbs that return nothing, are listed
# in `exclude`, or raise an error are collected in `skipped`.
for i in range(start, len(infinitives)):

    infinitive = infinitives[i]
    name = infinitive[0]
    percent = round(i*100/len(infinitives), 3)

    try:
        c = conjugate(infinitive)

        if c is not None and name not in exclude:
            conjugations[c['infinitive']] = c
            conjugations[name]['rank'] = rank
            rank += 1

            clear(wait = True)
            print(f"{name} – complete ({percent}%)")

        else:
            clear(wait = True)
            print(f"{name} – skipped ({percent}%)")
            skipped.append(infinitive)

    except Exception as ex:
        clear(wait = True)
        print(f"{name} – skipped – {ex} ({percent}%)")
        skipped.append(infinitive)

# Freeze the result into a plain dict for the following cells.
conjugations = dict(conjugations)

In [None]:
#---FIX MISSING---#
# Rebuild `conjugations` in the order of `infinitives`, retrying every verb
# that the generation loop did not produce. The rank is the 1-based position
# of the verb in `infinitives`. Verbs that still cannot be conjugated are
# collected in `still_missing` and are NOT added to the result.
new_conjugations = nestedDict()
still_missing = []
rank = 1

for item in infinitives:

    verb,old_rank = item

    if verb in conjugations:
        entry = conjugations[verb]
    else:
        try:
            entry = conjugate(item)
        except Exception:
            entry = None

    # conjugate() returns None for verbs it declines to process; storing that
    # None would break every later cell that indexes into the entry.
    if entry is None:
        still_missing.append(item)
    else:
        entry['rank'] = rank
        new_conjugations[verb] = entry

    rank += 1

conjugations = dict(new_conjugations)

In [None]:
# formatResults() encodes separate 'lui' and 'lei' rows as 'X__Y', and
# conjugate() stores that combined string under both pronouns. Split it so
# that 'lui' keeps the first part and 'lei' the second.
#
# The tense tables live under conjugations[verb][aspect][mood][tense]; only
# the 'simple' and 'compound' aspects store formatResults() output directly.
# dict.get() is used for every level so that no empty keys are auto-created
# on nestedDict entries.
for verb, entry in conjugations.items():
    if not isinstance(entry, dict):
        continue

    for aspect in ['simple', 'compound']:
        moods = entry.get(aspect)
        if not isinstance(moods, dict):
            continue

        for mood in ['indicative', 'subjunctive', 'conditional']:
            tenses = moods.get(mood)
            if not isinstance(tenses, dict):
                continue

            for tense, forms in tenses.items():
                lui = forms.get('lui')
                lei = forms.get('lei')

                # Each pronoun is checked on its own, so re-running the cell
                # after a split is harmless.
                if isinstance(lui, str) and '__' in lui:
                    forms['lui'] = lui.split('__')[0]
                if isinstance(lei, str) and '__' in lei:
                    forms['lei'] = lei.split('__')[1]

In [None]:
#---REMOVE VERBS WITH UNKNOWN REGULARITIES---#
# Drop every verb whose regularity is 'x' (the value checkRegularity() returns
# for unrecognised verbs) and renumber the ranks of the remaining verbs
# consecutively from 1, in their current order.
remove = []
rank = 1

for verb in conjugations:
    if conjugations[verb]['regularity'] == 'x':
        remove.append(verb)
    else:
        conjugations[verb]['rank'] = rank
        rank += 1

print(f"{len(conjugations)} -> {len(conjugations) - len(remove)}")

# Deletion happens after the scan because a dict must not change size while
# it is being iterated.
for verb in remove:
    conjugations.pop(verb)

In [None]:
# Build the final output in the order of `infinitives`. Freshly scraped
# entries take precedence; verbs that are missing from them fall back to the
# previously saved conjugations. The rank is the 1-based position in
# `infinitives`, so verbs found in neither source leave a gap in the ranks.
f = os.path.join(conjugations_dir, 'conjugations_italian.json')
with open(f, "r", encoding = 'utf8') as file:
    old_conjugations = json.load(file)

rank = 1
output = {}

for item in infinitives:
    verb = item[0]

    if verb in conjugations:
        source = conjugations
    elif verb in old_conjugations:
        source = old_conjugations
    else:
        source = None

    if source is not None:
        output[verb] = source[verb]
        output[verb]['rank'] = rank

    rank += 1

In [None]:
#---SAVE CONJUGATIONS TO JSON---#
# Write `output` as UTF-8 JSON, indented by four spaces, with non-ASCII
# characters kept as-is rather than escaped.
f = os.path.join(data_dir, 'reconjugated_italian.json')
with open(f, "w", encoding = 'utf8') as file:
    serialized = json.dumps(output, indent = 4, ensure_ascii = False)
    file.write(serialized)