In [1]:
import os
import bs4
import json
import time
import lxml
import deepl
import requests
import cloudscraper
from pathlib import Path
from copy import deepcopy
from googletrans import Translator
from difflib import SequenceMatcher
from fake_useragent import UserAgent
from IPython.display import clear_output as clear

In [2]:
# Resolve the project directories from the notebook's working directory.
# Every path component named exactly 'src' is dropped (presumably the notebook
# lives in a `src` folder - confirm against the repository layout). Filtering
# whole components avoids the pitfall of str.replace('src', ''), which would
# also mangle unrelated directory names that merely contain those letters.
path = str(Path(*[part for part in Path.cwd().parts if part != 'src']))
top_dir = Path(path).parent.absolute()
conjugations_data_dir = os.path.join(top_dir, 'conjugations', 'data', 'language-specific')
out_dir = os.path.join(path, 'out')

In [3]:
# Target language; it is interpolated into the names of the translations,
# conjugations and corrections files read and written by this notebook.
language = 'portuguese'

In [4]:
# Read the graded translations and the conjugation data for the selected language.
translations_path = os.path.join(out_dir, 'graded', f'translations_{language}.json')
conjugations_path = os.path.join(conjugations_data_dir, f'conjugations_{language}.json')

with open(translations_path, encoding = 'utf8') as file:
    translations = json.load(file)

with open(conjugations_path, encoding = 'utf8') as file:
    conjugations = json.load(file)

In [5]:
# Special case: the Italian data set must not contain 'ridefinire'
# (the reason is not recorded in this notebook).
if language == "italian":
    translations.pop("ridefinire", None)

In [6]:
# One independent, initially empty result dict per verb; normalize() fills them in.
output = dict((verb, {}) for verb in translations)

In [7]:
# English modal verbs; translations equal to one of these are stripped from
# the weighted candidates further down.
modal_verbs = [
    'can',
    'could',
    'may',
    'might',
    'must',
    'shall',
    'should',
    'will',
    'would',
]

In [8]:
# Collect every (verb, translation) pair whose translation is an English modal,
# logging each hit together with the verb's full weighted candidates.
modals = []

for verb, entry in translations.items():
    weighted = entry["weighted"]
    for translation in weighted:
        if translation not in modal_verbs:
            continue
        print(verb, translation, weighted)
        modals.append( (verb, translation) )

# Deletion happens in a second pass so the dicts are not resized while iterated.
for verb, translation in modals:
    translations[verb]["weighted"].pop(translation)

ir will {'go': 160, 'leave': 9, 'be': 9, 'walk': 9, 'head': 9, 'run': 8, 'do': 8, 'ride': 8, 'wend': 7, 'manage': 7, 'travel': 7, 'prosper': 6, 'will': 5}
poder can {'can': 40, 'may': 36, 'afford': 8, 'might': 8}
poder may {'can': 40, 'may': 36, 'afford': 8, 'might': 8}
poder might {'can': 40, 'may': 36, 'afford': 8, 'might': 8}
saber can {'know': 160, 'have': 9, 'can': 9, 'find out': 9, 'determine something': 9, 'savvy': 8}
dever shall {'owe': 136, 'shall': 69, 'must': 38, 'ought': 30, 'have': 16, 'need': 10, 'be obliged': 5}
dever must {'owe': 136, 'shall': 69, 'must': 38, 'ought': 30, 'have': 16, 'need': 10, 'be obliged': 5}
querer will {'want': 152, 'like': 32, 'love': 30, 'desire': 28, 'care': 10, 'will': 9, 'please': 9, 'wish': 9, 'be fond of': 8, 'feel like': 5, 'list': 4}
decidir will {'decide': 160, 'resolve': 36, 'conclude': 9, 'settle': 8, 'adjudicate': 7, 'will': 6, 'award': 5, 'govern': 4, 'fix': 3, 'overrule': 2, 'find': 1, 'liquidate': 1, 'ordain': 1}
desejar will {'desi

In [9]:
def normalize(infinitive, similarity_threshold = 0.75):
    """Normalize the weighted translations of one verb and store them in `output`.

    Reads the module-level `translations[infinitive]` and writes
    `output[infinitive]['principal']` and `output[infinitive]['weighted']`.

    Steps:
      1. Make sure the principal translation carries the highest weight: if it
         is not already the heaviest candidate it is given `heaviest + 1`.
         This value is also written back into `translations[infinitive]['weighted']`.
      2. Order candidates by descending weight with the principal first.
      3. Drop every candidate that contains, or is contained in, a candidate
         that was kept earlier (its weight is discarded, not merged).
      4. Merge candidates whose SequenceMatcher ratio exceeds
         `similarity_threshold` into the one encountered first, summing their
         weights. If a merged weight exceeds the current maximum, that
         candidate becomes the principal.
      5. Divide all weights by the maximum and round to 4 decimals.

    Parameters:
        infinitive: key of the verb in `translations` / `output`.
        similarity_threshold: ratio strictly above which two candidates are
            treated as the same translation (default 0.75).

    Returns:
        None. Results are stored in the module-level `output` dict.

    Raises:
        KeyError: if `infinitive` is missing from `translations` or `output`,
            or the entry lacks the 'metadata' / 'principal' / 'weighted' keys.
    """
    entry = translations[infinitive]
    weighted = entry['weighted']
    principal = entry['metadata']['principal']

    # Use the true maximum instead of trusting that the first key is the
    # heaviest; for a dict already sorted by descending weight the two agree.
    # An empty dict (e.g. every candidate was a modal) starts from weight 0.
    heaviest_verb = max(weighted, key = weighted.get) if weighted else None
    heaviest_weight = weighted[heaviest_verb] if weighted else 0

    if principal != heaviest_verb:
        heaviest_weight += 1
        # Written into the shared `translations` structure on purpose.
        weighted[principal] = heaviest_weight

    # Ascending sort followed by a reversal (not reverse = True), so candidates
    # with equal weight end up in reversed insertion order.
    by_weight = dict(sorted(weighted.items(), key = lambda item: item[1])[::-1])

    # Principal first, everything else in weight order.
    formatted = { principal: by_weight[principal] }
    formatted.update((translation, weight) for translation, weight in by_weight.items() if translation != principal)

    # Step 3: substring de-duplication against the candidates kept so far.
    unique = []
    for candidate in list(formatted):
        if any(candidate in kept or kept in candidate for kept in unique):
            del formatted[candidate]
        else:
            unique.append(candidate)

    # Step 4: fuzzy merge. Only values change while iterating; removal of the
    # absorbed keys is deferred until both loops have finished.
    discarded = set()
    for c1 in formatted:
        if c1 in discarded:
            continue
        for c2 in formatted:
            if c2 == c1 or c2 in discarded:
                continue
            if SequenceMatcher(None, c1, c2).ratio() > similarity_threshold:
                formatted[c1] += formatted[c2]

                if formatted[c1] > heaviest_weight:
                    principal = c1
                    heaviest_weight = formatted[c1]

                discarded.add(c2)

    for absorbed in discarded:
        del formatted[absorbed]

    formatted = dict(sorted(formatted.items(), key = lambda item: item[1])[::-1])

    output[infinitive]['principal'] = principal
    output[infinitive]['weighted'] = { translation: round(weight / heaviest_weight, 4) for translation, weight in formatted.items() }

In [10]:
# Verbs whose principal translation differs from the consensus one, paired
# with the verb's rank taken from the conjugation data.
disagree = []
for verb, entry in translations.items():
    metadata = entry['metadata']
    if metadata['principal'] != metadata['consensus']:
        disagree.append( (verb, conjugations[verb]['rank']) )

In [11]:
# Load the per-verb principal overrides for the selected language.
corrections_path = os.path.join(out_dir, 'corrections', f'corrections_{language}.json')

with open(corrections_path, encoding = 'utf8') as file:
    corrections = json.load(file)

In [12]:
# Disagreeing verbs that have no entry in the corrections file stay unresolved.
removed = [ verb for verb, _rank in disagree if verb not in corrections ]

# Every correction replaces the stored principal translation of its verb.
for infinitive, corrected_principal in corrections.items():
    translations[infinitive]['metadata']['principal'] = corrected_principal

print(removed)

['aviar', 'dactilografar']


In [13]:
# Number of disagreeing verbs that received no correction.
len(removed)

2

In [14]:
# Normalize every verb except the unresolved ones, which are dropped from the
# output entirely.
for verb in translations:
    if verb in removed:
        # pop() with a default keeps this cell re-runnable: executing it a
        # second time no longer raises KeyError for verbs already dropped.
        output.pop(verb, None)
    else:
        normalize(verb)

In [15]:
# Sanity check: list any verb whose output entry is still empty.
for verb, normalized in output.items():
    if normalized:
        continue
    print(verb)

In [16]:
# Persist the normalized translations; non-ASCII characters are written as-is.
normalized_path = os.path.join(out_dir, 'normalized', f'translations_{language}.json')

with open(normalized_path, mode = 'w', encoding = 'utf-8') as file:
    json.dump(output, file, ensure_ascii = False, indent = 8)