In [None]:
import os
import requests
import bs4
import lxml
import json
import re
import time
import uuid
from googletrans import Translator
from unidecode import unidecode
from collections import defaultdict
from IPython.display import clear_output as clear

In [None]:
# Project root: the absolute working directory with every 'scripts' substring removed.
_cwd = os.path.abspath('')
path = ''.join(_cwd.split('scripts'))

# All JSON inputs/outputs of this notebook live under <root>/data.
data_dir = os.path.join(path, 'data')

In [None]:
class nestedDict(dict):
    """Dictionary that creates nested levels on demand.

    Reading a missing key stores a new, empty instance of the same class
    under that key and returns it, so chained assignments such as
    ``d['a']['b']['c'] = 1`` work without initialising the intermediate levels.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is absent.
        child = type(self)()
        self[key] = child
        return child

In [None]:
# Shared googletrans client; getGoogleTranslations() calls translator.translate() on it.
translator = Translator()

In [None]:
#---HEADERS FOR WEB SCRAPING---#
# Browser-like request headers passed as `headers=` to most requests.get() calls in this notebook.
# NOTE(review): "cookie" and "referer" look like values captured from a browser session on a
# different site (hidemy.name) - confirm they are needed before sharing or committing this notebook.
headers = {"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
           "accept-encoding": "gzip, deflate, br",
           "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
           "cache-control": "max-age=0",
           "cookie": "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446",
           "referer": "https://hidemy.name/en/proxy-list/?start=64",
           "sec-ch-ua-mobile": "?0",
           "sec-ch-ua-platform": "macOS",
           "sec-fetch-dest": "document",
           "sec-fetch-mode": "navigate",
           "sec-fetch-site": "same-origin",
           "sec-fetch-user": "?1",
           "upgrade-insecure-requests": "1",
           "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}

In [None]:
def _extractSectionVerbs(text, section_marker):
    """Return the purely alphabetic link texts found in one section of a verbix response.

    The section is the text between `section_marker` (a regex fragment) and the
    next ``<h3>``. Returns an empty list when the section is absent.
    """
    match = re.search(section_marker + r'(.*?)<h3>', text)
    if match is None:
        return []

    # Link texts follow an escaped quote (\">) in the response body.
    link_texts = re.findall(r'\\">(.*?)</a>', match.group(1))
    return [x for x in link_texts if x.isalpha()]


def findAdditionalVerbs(verb):
    """Collect verbs that verbix lists as related to `verb`.

    Fetches the verbix conjugator page for `verb` and gathers the link texts of
    four sections: verbs conjugated alike, verbs with a separable prefix, verbs
    with the same base verb, and synonyms.

    Args:
        verb: infinitive, interpolated into the request URL as-is.

    Returns:
        A list of unique verbs in unspecified order; empty when no section is found.

    Raises:
        Whatever requests.get raises on network failure (not caught here).
    """
    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/2/102/{verb}'
    page = requests.get(url, headers = headers)
    text = page.text

    section_markers = (
        r'Verbs conjugated like',
        r'Other Verbs with Separable Prefix',
        r'Other Verbs with the same Base Verb',
        r'<h4>Synonyms</h4>',
    )

    found = set()
    for marker in section_markers:
        found.update(_extractSectionVerbs(text, marker))

    return list(found)

In [None]:
def checkRegularity(verb, page):
    """Classify a verb from markers in a verbix response body.

    Args:
        verb: unused; kept for the existing call signature.
        page: object exposing the response body as ``page.text``.

    Returns:
        'x' if the body contains NOTRECOGVERB, 'i' if it contains an
        "irregular" class marker, 'sc' if it contains an "orto" class marker,
        otherwise 'r'. The first matching marker wins, in that order.
    """
    body = page.text

    # (marker, code) pairs in priority order; quotes appear backslash-escaped in the body.
    markers = (
        ('NOTRECOGVERB', 'x'),
        ("class=\\\"irregular\\\">", 'i'),
        ("class=\\\"orto\\\">", 'sc'),
    )

    for marker, code in markers:
        if marker in body:
            return code

    return 'r'

In [None]:
def formatResults(string, participle = ''):
    """Extract the six conjugated forms from one conjugation box.

    Args:
        string: element whose ``select('li')`` yields at least six items, each
            exposing ``getText()``.
        participle: when non-empty, that many trailing characters are removed
            from each form and ``' ' + participle`` is appended instead.

    Returns:
        [eu, tu, ele, nos, vos, eles] as strings.
    """
    rows = string.select('li')

    # Optional leading conjunction, detected on the first row only.
    first_row = rows[0].getText()
    skip = 0
    for conjunction in ('que', 'se', 'quando'):
        if first_row.startswith(conjunction + 'eu'):
            skip = len(conjunction)

    tail = ' ' + participle if participle else ''
    drop = len(participle)

    # Characters to cut from the start of each row - presumably the lengths of the
    # pronoun labels ('eu', 'tu', 'ele/ela/você', ...); confirm against the page markup.
    label_widths = (2, 2, 12, 3, 3, 15)

    forms = []
    for index, width in enumerate(label_widths):
        body = rows[index].getText()[skip + width:]
        forms.append(body[:max(len(body) - drop, 0)] + tail)

    return forms

In [None]:
def getEstar():
    """Scrape the Reverso conjugation of 'estar' for the tenses used in progressive forms.

    Returns:
        nestedDict mapping tense ('present', 'preterite', 'imperfect', 'future',
        'conditional') -> pronoun -> conjugated form.

    Raises:
        IndexError if the page has fewer conjugation boxes than expected;
        whatever requests.get raises on network failure.
    """
    url = 'https://conjugator.reverso.net/conjugation-portuguese-verb-estar.html'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # Position of each tense among the '.blue-box-wrap' elements of the page.
    tense_boxes = {'present': 0, 'preterite': 1, 'imperfect': 2, 'future': 7, 'conditional': 15}

    # Select and parse each box once instead of once per pronoun.
    boxes = soup.select('.blue-box-wrap')
    parsed = {tense: formatResults(boxes[index]) for tense, index in tense_boxes.items()}

    # Each pronoun maps to one of the six forms returned by formatResults.
    subjects_indicies = [0, 1, 2, 2, 2, 3, 4, 5, 5, 5]
    subjects_pronouns = ['eu', 'tu', 'ele', 'ela', 'você', 'nós', 'vós', 'eles', 'elas', 'vocês']

    dictionary = nestedDict()

    for i,p in zip(subjects_indicies, subjects_pronouns):
        for tense in tense_boxes:
            dictionary[tense][p] = parsed[tense][i]

    return dictionary

In [None]:
def formatImperative(string, negative = ''):
    """Extract the five imperative forms from one conjugation box.

    Args:
        string: element whose ``select('li')`` yields at least five items, each
            exposing ``getText()``.
        negative: any truthy value selects the negative layout, where the first
            three characters of each item are replaced by 'não ' (with a space).

    Returns:
        [tu, voce, nos, vos, voces] as strings.
    """
    rows = string.select('li')

    lead = 'não ' if negative else ''
    # Negative items lose their first three characters; affirmative items are kept whole.
    cut = 3 if negative else 0

    return [lead + rows[index].getText()[cut:] for index in range(5)]

In [None]:
#---SCRAPE REVERSO FOR TRANSLATIONS---#
def getReversoTranslations(verb):
    """Scrape English translations of `verb` from Reverso Context.

    Args:
        verb: Portuguese infinitive, interpolated into the request URL as-is.

    Returns:
        List of translation strings in page order; empty when the page title
        is missing or does not mention `verb`, or when no entries are found.

    Raises:
        Whatever requests.get raises on network failure (not caught here).
    """
    translations = []

    page = requests.get(f'https://context.reverso.net/translation/portuguese-english/{verb}', headers = headers)
    # Name the parser explicitly, as everywhere else in this notebook.
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # A page without a <title>, or whose title lacks the verb, has nothing to extract.
    title = soup.select_one('title')
    if title is None or verb not in title.getText():
        return translations

    # Drop elements marked "mobile-hidden" before reading the translation entries.
    for tag_name in ('div', 'a'):
        for hidden in soup.find_all(tag_name, {"class": "mobile-hidden"}):
            hidden.extract()

    for entry in soup.select('#translations-content .translation.ltr.dict.v'):
        translations.append(entry.getText().replace('\n\n\n\r\n          ','').replace('\n',''))

    return translations

In [None]:
#---USE GOOGLETRANS PACKAGE TO FIND TRANSLATIONS---#
def getGoogleTranslations(verb):
    """Return the verb-sense English translations of `verb` via googletrans.

    Reads ``extra_data['all-translations']`` from the translation result and
    returns the second element of the first entry whose first element is 'verb'.

    Args:
        verb: Portuguese word to translate.

    Returns:
        The list attached to the 'verb' entry, or [] when the lookup fails,
        no translations are reported, or no 'verb' entry is present.
    """
    try:
        all_translations = translator.translate(verb, dest = 'en', src = 'pt').extra_data['all-translations']

        if all_translations is None:
            return []

        for entry in all_translations:
            if entry[0] == 'verb':
                return entry[1]

    except Exception:
        # Best effort: any lookup or parsing failure is reported as "no translations".
        return []

    # No 'verb' entry: return an empty list rather than the raw nested structure.
    return []

In [None]:
def conjugate(verb_data):
    """Build the full conjugation record for one verb.

    Scrapes the Reverso conjugator for the verb's forms and the verbix API for
    its regularity, then combines them with the module-level `estar` table to
    produce progressive forms.

    Args:
        verb_data: sequence whose first two items are (infinitive, rank).
            Extra trailing items are ignored.

    Returns:
        nestedDict with keys 'infinitive', 'rank', 'regularity', 'translations'
        (empty list), 'participle', 'simple', 'compound' and 'progressive';
        or None when rank is 50000 or higher.

    Raises:
        IndexError if the page has fewer conjugation boxes than expected;
        whatever requests.get raises on network failure.
    """
    verb, rank = verb_data[0], verb_data[1]

    # Check the rank cut-off before any network access.
    if rank >= 50000:
        return None

    url = f'https://conjugator.reverso.net/conjugation-portuguese-verb-{verb}.html'
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    verbix_page = requests.get(f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/2/102/{verb}')

    # Select the conjugation boxes once; every table below is addressed by position.
    boxes = soup.select('.blue-box-wrap')

    dictionary = nestedDict()

    dictionary['infinitive'] = verb
    dictionary['rank'] = rank
    dictionary['regularity'] = checkRegularity(verb, verbix_page)

    dictionary['translations'] = []

    dictionary['participle'] = {'present': boxes[17].getText(),
                                'past': boxes[21].getText()}
    present_participle = dictionary['participle']['present']
    past_participle = dictionary['participle']['past']

    subjects_indicies = [0, 1, 2, 2, 2, 3, 4, 5, 5, 5]
    subjects_pronouns = ['eu', 'tu', 'ele', 'ela', 'você', 'nós', 'vós', 'eles', 'elas', 'vocês']

    subjects_indicies_imp = [0, 1, 2, 3, 4]
    subjects_pronouns_imp = ['tu', 'você', 'nós', 'vós', 'vocês']

    # (mood, tense, box position) for the simple tenses.
    simple_layout = (
        ('indicative', 'present', 0),
        ('indicative', 'preterite', 1),
        ('indicative', 'imperfect', 2),
        ('indicative', 'pluperfect', 3),
        ('indicative', 'future', 7),
        ('subjunctive', 'present', 9),
        ('subjunctive', 'imperfect', 11),
        ('subjunctive', 'future', 13),
        ('conditional', 'conditional', 15),
    )

    # (mood, tense, box position) for the compound (perfect) tenses.
    compound_layout = (
        ('indicative', 'present', 4),
        ('indicative', 'imperfect', 5),
        ('indicative', 'future', 8),
        ('subjunctive', 'present', 10),
        ('subjunctive', 'imperfect', 12),
        ('subjunctive', 'future', 14),
        ('conditional', 'conditional', 16),
    )

    # (mood, tense, key into `estar`) for the progressive tenses.
    progressive_layout = (
        ('indicative', 'present', 'present'),
        ('indicative', 'preterite', 'preterite'),
        ('indicative', 'imperfect', 'imperfect'),
        ('indicative', 'future', 'future'),
        ('conditional', 'conditional', 'conditional'),
    )

    # Parse every box exactly once rather than once per pronoun.
    simple_forms = [(mood, tense, formatResults(boxes[index]))
                    for mood, tense, index in simple_layout]
    compound_forms = [(mood, tense, formatResults(boxes[index], past_participle))
                      for mood, tense, index in compound_layout]

    for i,p in zip(subjects_indicies, subjects_pronouns):

        #Simple
        for mood, tense, forms in simple_forms:
            dictionary['simple'][mood][tense][p] = forms[i]

        #perfect indicative / subjunctive / conditional
        for mood, tense, forms in compound_forms:
            dictionary['compound'][mood][tense][p] = forms[i]

        #progressive
        for mood, tense, estar_key in progressive_layout:
            dictionary['progressive'][mood][tense][p] = f"{estar[estar_key][p]} {present_participle}"

    #imperative
    try:
        affirmative = formatImperative(boxes[19])
        negative = formatImperative(boxes[20], 'negative')
    except Exception:
        # Boxes without a full imperative are recorded with empty forms.
        affirmative = [''] * len(subjects_pronouns_imp)
        negative = [''] * len(subjects_pronouns_imp)

    for i,p in zip(subjects_indicies_imp, subjects_pronouns_imp):
        dictionary['simple']['imperative']['affirmative'][p] = affirmative[i]
        dictionary['simple']['imperative']['negative'][p] = negative[i]

    return dictionary

In [None]:
#---SCRAPE UNRANKED VERBS FROM COOLJUGATOR---#
# Keeps the first word of every three-word list item when it is alphabetic and ends in 'r' or 'se'.
page = requests.get('https://cooljugator.com/pt/list/all', headers = headers)
soup = bs4.BeautifulSoup(page.text, 'lxml')
coolverbs = []

for item in soup.select('.ui.segment.stacked .item'):
    words = item.getText().split(' ')
    verb = words[0]

    if len(words) != 3 or not verb.isalpha():
        continue

    if verb.endswith('r') or verb.endswith('se'):
        coolverbs.append(verb)

In [None]:
#---SCRAPE UNRANKED VERBS FROM WIKTIONARY---#
# Pages through the Wiktionary "Portuguese verbs" category, resuming each request from the
# most recently collected verb that can be placed in the URL unchanged.
wikiverbs = []
# Characters allowed in the `pagefrom` cursor: plain lowercase ASCII letters only.
urlsafe = set([x for x in 'abcdefghijklmnopqrstuvwxyz'])
# Cursor for the first request.
last = 'abacharelar'
run = True

while run:

    url = f'https://en.wiktionary.org/w/index.php?title=Category:Portuguese_verbs&pagefrom={last}'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml') 
    list_items = soup.select('.mw-content-ltr .mw-category li a')

    # The first 18 links are skipped - presumably non-verb entries such as subcategories; TODO confirm.
    for i in range(18, len(list_items)):
        verb = list_items[i].getText()

        # Keep single-word alphabetic entries ending in 'r' or 'se'.
        # NOTE(review): the duplicate check uses the original casing but the stored value is lowercased.
        if verb not in wikiverbs and ' ' not in verb and verb.isalpha() and (verb[-1] == 'r' or verb[-2:] == 'se'):
            wikiverbs.append(verb.lower())
            clear(); print(wikiverbs[-1])
    
    # Pick the next cursor: the newest collected verb made only of URL-safe letters.
    # If that verb equals the current cursor, nothing new was found and the scrape stops.
    # NOTE(review): after `run = False` the scan continues to older verbs, and if no collected
    # verb is URL-safe neither `last` nor `run` changes, so the same page would be requested again.
    for verb in reversed(wikiverbs):
        split = set([x for x in verb])  
        if len(split - urlsafe) == 0:
            if verb != last:
                last = verb
                break
            else:
                run = False

In [None]:
#---COMBINE VERBS FROM COOLJUGATOR AND WIKTIONARY---#
# Union of both scraped lists with duplicates removed; the resulting order is unspecified.
all_verbs = list(set(coolverbs).union(wikiverbs))

In [None]:
#---FIND ADDITIONAL VERBS USING VERBIX---#
# Ask verbix for verbs related to each known verb and append the unseen ones to all_verbs.
# Only the verbs present at the start are queried; newly found verbs are not expanded further.
initlen = len(all_verbs)
new_verbs = []
# Set for constant-time membership checks; all_verbs itself is not mutated while iterating.
seen_verbs = set(all_verbs)

for i, seed_verb in enumerate(all_verbs):

    for verb in findAdditionalVerbs(seed_verb):
        if verb not in seen_verbs:
            seen_verbs.add(verb)
            new_verbs.append(verb)

    clear(wait = True)
    print(f"{seed_verb} – {initlen} -> {initlen + len(new_verbs)} | {len(new_verbs)} ({round(i*100/initlen, 3)}%)")

# Original verbs first, followed by the discoveries in the order they were found.
all_verbs = all_verbs + new_verbs

In [None]:
#---ITERATE THROUGH ALL VERBS AND RANK ACCORDING TO FREQUENCY---#
# Walk the frequency list in order and give each known verb a rank at its first occurrence.
infinitives = []
rank = 1

# Sets give constant-time lookups; `ranked_entries` records verbs already ranked.
# (`infinitives` holds [entry, rank] pairs, so testing a bare string against it never matches.)
verb_lookup = set(all_verbs)
ranked_entries = set()

with open("../frequency/portuguese.txt", encoding = 'utf8') as file:
    frequency_list = file.read().split('\n')

for i, line in enumerate(frequency_list):

    # The word is the first space-separated field of the line.
    entry = line.split(' ')[0]

    if len(entry) > 1 and (entry[-1] == 'r' or entry[-2:] == 'se'):
        if entry in verb_lookup and entry not in ranked_entries and entry != 'por':
            ranked_entries.add(entry)
            infinitives.append([entry, rank])
            rank += 1

            clear(wait = True)
            print(f"{entry} | {len(infinitives)} ({round(i*100/len(frequency_list), 3)}%)")

In [None]:
#---FILTER DUPLICATES AND RERANK VERBS---#
# Two verbs count as duplicates when their unidecode() transliterations are equal;
# the first occurrence is kept and ranks are reassigned consecutively from 1.
decoded = []
decoded_seen = set()   # same contents as `decoded`, for constant-time membership checks
unique = []
rank = 1

for verb in infinitives:
    plain = unidecode(verb[0])
    if plain not in decoded_seen:
        decoded_seen.add(plain)
        decoded.append(plain)
        unique.append([verb[0], rank])
        rank += 1

print(f"{len(infinitives)} -> {len(unique)}")

infinitives = unique; del unique

In [None]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNEL TIMEOUT---#
# Persist the ranked infinitives so the slow scraping steps above need not be repeated.
f = os.path.join(data_dir, 'infinitives_portuguese_intermediate.json')

with open(f, mode = "w", encoding = 'utf8') as handle:
    json.dump(infinitives, handle, indent = 4, ensure_ascii = False)

In [None]:
#---OPEN CHECKPOINT FILE---#
# Replace `infinitives` with the list stored on disk.
# NOTE(review): this reads data/language-specific/infinitives_portuguese.json, not the
# intermediate checkpoint written by the previous cell - confirm which file is intended.
f = os.path.join(data_dir, 'language-specific', 'infinitives_portuguese.json')

with open(f, mode = "r", encoding = 'utf8') as handle:
    infinitives = json.load(handle)

In [None]:
#---SET VARIABLES FOR CONJUGATIONS---#
# Conjugation table of 'estar', read by conjugate() to build progressive forms (performs a web request).
estar = getEstar()
# infinitive -> conjugation record; no default factory is given, so this behaves like a plain dict.
conjugations = defaultdict()
# Not referenced by any other cell shown in this notebook.
defective = set()
# Entries from `infinitives` that could not be conjugated or were excluded.
skipped = []
# Next rank to assign to a successfully conjugated verb.
rank = 1
# Infinitives that are never added to `conjugations`.
exclude = ['por']

In [None]:
#---SET INDEX AT WHICH TO START CONJUGATIONS---#
# Position in `infinitives` where the conjugation loop begins (e.g. raise it to continue a partial run).
start = 0

In [None]:
#---GENERATE CONJUGATIONS---#
# Conjugate every infinitive from index `start` onward. Verbs that are excluded, return
# no record, or raise during scraping are collected in `skipped`.
total_infinitives = len(infinitives)

for i in range(start, total_infinitives):

    infinitive = infinitives[i]
    progress = round(i*100/total_infinitives, 3)

    # Excluded verbs are skipped before any web request is made for them.
    if infinitive[0] in exclude:
        clear(wait = True)
        print(f"{infinitive[0]} – skipped ({progress}%)")
        skipped.append(infinitive)
        continue

    try:
        c = conjugate(infinitive)
    except Exception as ex:
        # Best effort: one failing verb must not abort the whole run.
        clear(wait = True)
        print(f"{infinitive[0]} – skipped – {ex} ({progress}%)")
        skipped.append(infinitive)
        continue

    if c is None:
        clear(wait = True)
        print(f"{infinitive[0]} – skipped ({progress}%)")
        skipped.append(infinitive)
        continue

    # Ranks are reassigned consecutively over the verbs that were actually conjugated.
    c['rank'] = rank
    conjugations[c['infinitive']] = c
    rank += 1

    clear(wait = True)
    print(f"{infinitive[0]} – complete ({progress}%)")

conjugations = dict(conjugations)

In [None]:
#---REMOVE VERBS WITH UNKNOWN REGULARITIES---#
# Drop verbs whose regularity is 'x' and renumber the remaining ones consecutively from 1.
remove = []
rank = 1

for verb in conjugations:
    if conjugations[verb]['regularity'] == 'x':
        # Collected first; the dict cannot be resized while it is being iterated.
        remove.append(verb)
    else:
        conjugations[verb]['rank'] = rank
        rank += 1

print(f"{len(conjugations)} -> {len(conjugations) - len(remove)}")

for verb in remove:
    # pop is a method call; subscripting it (pop[verb]) raises TypeError.
    conjugations.pop(verb)

In [None]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNEL TIMEOUT---#
# Persist the conjugations before the translation pass starts.
checkpoint_path = os.path.join(data_dir, 'conjugations_portuguese_intermediate.json')

with open(checkpoint_path, mode = "w", encoding = 'utf8') as handle:
    json.dump(conjugations, handle, indent = 4, ensure_ascii = False)

In [None]:
#---SET INDEX VARIABLE TO 1 BY DEFAULT---#
# Lowest rank the translation loop will process. Verbs ranked below it keep an empty
# translation list and are removed by the later clean-up cell, so a fresh run must start at 1.
# Raise it only to continue an interrupted translation pass.
i = 1

In [None]:
#---ADD TRANSLATIONS---#
# Fill in translations for every verb ranked `i` or higher that has none yet.
# Reverso is tried first; googletrans is the fallback. `i` advances once per translated verb.
translations_checkpoint = os.path.join(data_dir, 'conjugations_translated_portuguese_intermediate.json')

for verb in conjugations:

    entry = conjugations[verb]

    if entry['rank'] > i - 1 and entry['translations'] == []:

        translations = getReversoTranslations(verb)

        if not translations:
            translations = getGoogleTranslations(verb)
            # Longer pause after each googletrans lookup.
            time.sleep(9)

        entry['translations'] = translations

        # Write a checkpoint whenever a verb with a rank divisible by 200 is processed.
        if entry['rank'] % 200 == 0:
            with open(translations_checkpoint, "w", encoding = 'utf8') as file:
                json.dump(conjugations, file, indent = 4, ensure_ascii = False)

        clear(wait = True)
        print(f"{verb} ({i}) – complete ({round(i*100/len(conjugations), 3)}%)")

        i += 1

        # Short pause between verbs.
        time.sleep(1)

In [None]:
#---REMOVE VERBS WITH NO TRANSLATIONS---#
# Verbs that kept an empty translation list are dropped; the rest are renumbered from 1.
remove = []
rank = 1

for verb, entry in conjugations.items():
    if entry['translations'] != []:
        entry['rank'] = rank
        rank += 1
        continue
    # Deleted after the loop; the dict cannot be resized while it is being iterated.
    remove.append(verb)

for verb in remove:
    del conjugations[verb]

In [None]:
#---SAVE CONJUGATIONS TO JSON---#
# Final output: the full conjugation table, written as UTF-8 with non-ASCII characters kept as-is.
f = os.path.join(data_dir, 'conjugations_portuguese.json')

with open(f, mode = "w", encoding = 'utf8') as handle:
    json.dump(conjugations, handle, indent = 4, ensure_ascii = False)

In [None]:
#---PARSE INFINITIVES AND SAVE TO JSON---#
# One [infinitive, rank, regularity] triple per conjugated verb, in dictionary order.
infinitives = [
    [verb, entry['rank'], entry['regularity']]
    for verb, entry in conjugations.items()
]

f = os.path.join(data_dir, 'infinitives_portuguese.json')

with open(f, mode = "w", encoding = 'utf8') as handle:
    json.dump(infinitives, handle, indent = 4, ensure_ascii = False)