In [1]:
import os
import requests
import bs4
import lxml
import json
import re
import time
import uuid
from googletrans import Translator
from unidecode import unidecode
from collections import defaultdict
from IPython.display import clear_output as clear

In [2]:
# Absolute path of the current working directory (abspath('') resolves against os.getcwd()).
path = os.path.abspath('')
# Same path with every occurrence of '.corrections' removed.
# NOTE(review): presumably the project root when the notebook runs from a '.corrections' folder — confirm.
top_dir = path.replace('.corrections', '')
# 'out' folder under the working directory; the results file is written there.
data_dir = os.path.join(path, 'out')
# Folder from which the list of infinitives is read.
conjugations_dir = os.path.join(top_dir, 'conjugations', 'data', 'language-specific')

In [3]:
class nestedDict(dict):
    """Dictionary that creates nested levels on first access.

    Looking up a missing key with ``d[key]`` stores a new, empty instance of
    the same class under that key and returns it, so chained assignments such
    as ``d['a']['b']['c'] = 1`` work without building the intermediate levels
    by hand. Lookups that do not go through ``__getitem__`` (``get``, ``in``)
    do not create anything.
    """

    def __missing__(self, key):
        # Invoked by dict.__getitem__ when `key` is absent.
        child = type(self)()
        self[key] = child
        return child

In [4]:
#---HEADERS FOR WEB SCRAPING---#
# Browser-like request headers passed to requests.get() for the Reverso page.
# NOTE(review): the cookie and referer values look like they were copied from a
# browser session — confirm they are still required and safe to keep in the notebook.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
    "cache-control": "max-age=0",
    "cookie": "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446",
    "referer": "https://hidemy.name/en/proxy-list/?start=64",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "macOS",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
}

In [5]:
def checkRegularity(verb, page):
    """Classify a verb from the raw text of a conjugator response.

    Parameters
    ----------
    verb
        Not used by the function; kept in the signature for existing callers.
    page
        Object exposing the response body as ``page.text``.

    Returns
    -------
    str
        'x'  if the body contains ``NOTRECOGVERB``,
        'i'  if it contains the backslash-escaped ``class=\\"irregular\\">`` marker,
        'sc' if it contains the backslash-escaped ``class=\\"orto\\">`` marker,
        'r'  otherwise.
        The markers are tested in that order and the first hit wins.
    """
    body = page.text

    # (marker, code) pairs in priority order. The markers contain literal
    # backslashes before the quotes, i.e. they match escaped HTML.
    markers = (
        ('NOTRECOGVERB', 'x'),
        ("class=\\\"irregular\\\">", 'i'),
        ("class=\\\"orto\\\">", 'sc'),
    )

    for marker, code in markers:
        if marker in body:
            return code

    return 'r'

In [6]:
def formatResults(string):
    """Extract the six conjugated forms from one conjugation box.

    Parameters
    ----------
    string
        Parsed element offering ``select('li')``; its first six ``li`` items
        are read and their text is taken with ``getText()``.

    Returns
    -------
    list of str
        ``[ich, du, er, wir, ihr, Sie]``: the text of each row with a fixed
        number of leading characters removed and surrounding whitespace
        stripped.

    Raises
    ------
    IndexError
        If the element has fewer than six ``li`` items.
    """
    # Leading characters dropped from rows 0..5.
    # NOTE(review): presumably the lengths of "ich ", "du ", "er/sie/es ",
    # "wir ", "ihr " and "Sie " — confirm against the page markup.
    prefix_lengths = (4, 3, 10, 4, 4, 4)

    rows = string.select('li')
    return [rows[pos].getText()[cut:].strip() for pos, cut in enumerate(prefix_lengths)]

In [7]:
def formatImperative(string, n = ''):
    """Extract the four imperative forms from the imperative box.

    Parameters
    ----------
    string
        Parsed element offering ``select('li .verbtxt')``; its first four
        matches are read with ``getText()`` and stripped.
    n
        Any truthy value requests the negative form, built by appending
        ``' nicht'`` to every entry. Defaults to ``''`` (affirmative).

    Returns
    -------
    list of str
        ``[du, wir, ihr, Sie]``.

    Raises
    ------
    IndexError
        If fewer than four ``li .verbtxt`` elements are found.
    """
    forms = string.select('li .verbtxt')
    suffix = ' nicht' if n else ''

    return [forms[pos].getText().strip() + suffix for pos in range(4)]

In [8]:
def conjugate(verb_data, timeout = 30):
    """Scrape the conjugation table of one German verb into a nested dict.

    The Reverso conjugator page supplies every verb form; the Verbix API
    response is only passed to ``checkRegularity`` to classify the verb.

    Parameters
    ----------
    verb_data
        Two-item sequence ``(verb, rank)``.
    timeout
        Seconds handed to each ``requests.get`` call so that a stalled
        connection raises instead of blocking the whole run.

    Returns
    -------
    nestedDict or None
        ``None`` when ``rank`` is not below 50000 (no request is made in that
        case). Otherwise a dict with the keys ``infinitive``, ``rank``,
        ``regularity``, ``participle``, ``simple`` and ``compound``; the last
        two are laid out as ``[mood][tense][pronoun]``. If the imperative box
        cannot be read, every imperative entry is the empty string.

    Raises
    ------
    IndexError
        If the page does not contain the expected ``.blue-box-wrap`` boxes or
        rows (for example for an unknown verb).
    requests.RequestException
        On connection problems or when ``timeout`` expires.
    """
    verb, rank = verb_data

    # Decide before any network traffic: verbs at rank 50000 or above are not conjugated.
    if not rank < 50000:
        return None

    url = f'https://conjugator.reverso.net/conjugation-german-verb-{verb}.html'
    page = requests.get(url, headers = headers, timeout = timeout)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    verbix_url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/13/113/{verb}'
    verbix_page = requests.get(verbix_url, timeout = timeout)

    # Query the page once; every tense below is addressed by its position in this list.
    boxes = soup.select('.blue-box-wrap')

    dictionary = nestedDict()

    dictionary['infinitive'] = verb
    dictionary['rank'] = rank
    dictionary['regularity'] = checkRegularity(verb, verbix_page)

    dictionary['participle'] = {'present': boxes[15].select('li')[0].getText(),
                                'past': boxes[16].select('li')[0].getText()}

    # formatResults returns six rows; 'er', 'sie' and 'es' all share row 2.
    subject_rows = [0, 1, 2, 2, 2, 3, 4, 5]
    subject_pronouns = ['ich', 'du', 'er', 'sie', 'es', 'wir', 'ihr', 'Sie']

    imperative_rows = [0, 1, 2, 3]
    imperative_pronouns = ['du', 'wir', 'ihr', 'Sie']

    # (form, mood, tense, index of the box on the page). The order of this
    # table fixes the key order of the resulting dict.
    tense_boxes = (
        ('simple', 'indicative', 'present', 0),
        ('simple', 'indicative', 'imperfect', 1),
        ('simple', 'indicative', 'future', 2),
        ('simple', 'subjunctive', 'present', 6),
        ('simple', 'subjunctive', 'imperfect', 10),
        ('simple', 'subjunctive', 'future', 7),
        ('simple', 'subjunctive', 'conditional', 11),
        ('compound', 'indicative', 'present', 3),
        ('compound', 'indicative', 'imperfect', 4),
        ('compound', 'indicative', 'future', 5),
        ('compound', 'subjunctive', 'present', 8),
        ('compound', 'subjunctive', 'imperfect', 9),
        ('compound', 'subjunctive', 'future', 12),
        ('compound', 'subjunctive', 'conditional', 13),
    )

    for form, mood, tense, box_index in tense_boxes:
        # Parse each box once and reuse the rows for all eight pronouns.
        forms = formatResults(boxes[box_index])
        for row, pronoun in zip(subject_rows, subject_pronouns):
            dictionary[form][mood][tense][pronoun] = forms[row]

    # Imperative: best effort. If the box is missing or incomplete, store
    # empty strings rather than failing the whole verb.
    try:
        affirmative = formatImperative(boxes[14])
        negative = formatImperative(boxes[14], 'negative')
    except Exception:
        affirmative = [''] * len(imperative_rows)
        negative = [''] * len(imperative_rows)

    for row, pronoun in zip(imperative_rows, imperative_pronouns):
        dictionary['simple']['imperative']['affirmative'][pronoun] = affirmative[row]
        dictionary['simple']['imperative']['negative'][pronoun] = negative[row]

    return dictionary

In [9]:
#---OPEN CHECKPOINT FILE---#
# Alternative input file, swap it in to read that list instead:
# f = os.path.join(data_dir, 'missing_selected_german.json')
f = os.path.join(conjugations_dir, 'infinitives_german.json')

# Keep only the first two fields of every entry; each resulting pair is what
# conjugate() later unpacks as (verb, rank).
with open(f, "r", encoding = 'utf8') as file:
    infinitives = [[entry[0], entry[1]] for entry in json.load(file)]

In [10]:
#---SET VARIABLES FOR CONJUGATIONS---#
# Results keyed by infinitive. No default_factory is given, so a missing key
# still raises KeyError exactly like a plain dict.
conjugations = defaultdict()
# Not referenced by any other visible cell.
defective = set()
# Entries from `infinitives` that produced no conjugation (error, None result, or excluded).
skipped = []
# Rank given to the next successfully conjugated verb; the generation loop increments it.
rank = 1
# Infinitives that must not be stored even if they conjugate; they end up in `skipped`.
exclude = []

In [11]:
#---SET INDEX AT WHICH TO START CONJUGATIONS---#
# Index into `infinitives` of the first entry the generation loop processes
# (the loop iterates range(start, len(infinitives))).
start = 0

In [12]:
#---GENERATE CONJUGATIONS---#
# Conjugates every entry of `infinitives` from index `start` onwards.
# Successful verbs are stored in `conjugations` and re-ranked consecutively;
# everything else (excluded verbs, None results, any exception) is appended to
# `skipped`. The single status line is redrawn after every verb.
total = len(infinitives)

for i in range(start, total):

    infinitive = infinitives[i]
    verb = infinitive[0]
    progress = f"({round(i*100/total, 3)}%)"

    try:
        # Check the exclusion list first so excluded verbs cost no web requests.
        c = None if verb in exclude else conjugate(infinitive)

        if c is not None:
            # Overwrite the rank from the input file with the running counter,
            # so ranks stay consecutive across skipped verbs.
            c['rank'] = rank
            conjugations[c['infinitive']] = c
            rank += 1
            status = f"{verb} – complete {progress}"
        else:
            skipped.append(infinitive)
            status = f"{verb} – skipped {progress}"

    except Exception as ex:
        # Keep the run going: record the verb and show the reason.
        skipped.append(infinitive)
        status = f"{verb} – skipped – {ex} {progress}"

    clear(wait = True)
    print(status)

# Downstream cells only need a plain dict.
conjugations = dict(conjugations)

zumengen – complete (99.983%)


In [14]:
# Display the entries that the generation loop appended to `skipped`.
skipped

[['beinhalten', 419],
 ['übernachten', 1775],
 ['regnen', 1842],
 ['zurückwichen', 3073],
 ['anschlugen', 3133],
 ['aufblühten', 3237],
 ['hageln', 3534],
 ['ausruhten', 3592],
 ['nieseln', 4547],
 ['bergsteigen', 4676],
 ['mattsetzen', 4837],
 ['beregnen', 4972],
 ['sonnenbaden', 5294],
 ['verhageln', 5324],
 ['googeln', 5539],
 ['generalüberholen', 5635],
 ['tempern', 5753],
 ['entkuppeln', 5931],
 ['anfauchten', 5984]]

In [13]:
#---SAVE CONJUGATIONS---#
# Create the output folder if needed, so the write cannot fail with
# FileNotFoundError after the long scraping run.
os.makedirs(data_dir, exist_ok = True)
f = os.path.join(data_dir, 'reconjugations_german.json')

with open(f, "w", encoding = 'utf8') as file:
    # ensure_ascii = False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(conjugations, file, indent = 4, ensure_ascii = False)