In [None]:
import os
import requests
import bs4
import lxml
import json
import re
import time
import uuid
from googletrans import Translator
from unidecode import unidecode
from collections import defaultdict
from IPython.display import clear_output as clear

In [None]:
# Resolve the project root: when the notebook runs from inside a folder named
# `scripts`, step up one level; otherwise use the working directory itself.
# (Removing the substring 'scripts' from the whole path would also corrupt any
# parent folder whose name merely contains that text.)
cwd = os.path.abspath('')
path = os.path.dirname(cwd) if os.path.basename(cwd) == 'scripts' else cwd
data_dir = os.path.join(path, 'data')

In [None]:
class nestedDict(dict):
    """Dictionary that creates nested levels on first access.

    Looking up a missing key stores a new, empty instance of the same class
    under that key and returns it, so `d['a']['b'] = 1` works on an empty `d`.
    """

    def __missing__(self, key):
        # Build the child with type(self) so subclasses nest their own type.
        child = type(self)()
        self[key] = child
        return child

In [None]:
# Shared googletrans client; getGoogleTranslations calls translator.translate on it.
translator = Translator()

In [None]:
#---HEADERS FOR WEB SCRAPING---#
# Browser-like request headers passed as `headers=` to most requests.get calls in
# this notebook (conjugate() is the exception and sends none).
# NOTE(review): the cookie and referer values look like they were captured from a
# hidemy.name browser session, yet they are sent to every scraped site — confirm
# they are actually needed, and avoid committing live session cookies.
headers = {"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
           "accept-encoding": "gzip, deflate, br",
           "accept-language": "en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,en-US;q=0.6,eu;q=0.5",
           "cache-control": "max-age=0",
           "cookie": "t=238707487; _ga=GA1.2.1376835774.1641262578; _gid=GA1.2.1482423077.1641262578; _fbp=fb.1.1641262579526.851471446",
           "referer": "https://hidemy.name/en/proxy-list/?start=64",
           "sec-ch-ua-mobile": "?0",
           "sec-ch-ua-platform": "macOS",
           "sec-fetch-dest": "document",
           "sec-fetch-mode": "navigate",
           "sec-fetch-site": "same-origin",
           "sec-fetch-user": "?1",
           "upgrade-insecure-requests": "1",
           "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}

In [None]:
def findAdditionalVerbs(verb):
    """Return the verbs that the Verbix page for `verb` lists as related.

    Four sections of the response are scanned ("Verbs conjugated like",
    "Other Verbs with Separable Prefix", "Other Verbs with the same Base Verb"
    and "Synonyms"). Within each, the text of every link is collected and only
    purely alphabetic entries are kept. A section that is absent from the
    response contributes nothing.

    Parameters:
        verb: infinitive inserted into the Verbix request URL.

    Returns:
        A list of unique related verbs (in no particular order).
    """
    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/13/113/{verb}'
    page = requests.get(url, headers = headers)

    # Text that opens each section of interest; every section runs up to the next <h3>.
    section_markers = (
        r'Verbs conjugated like',
        r'Other Verbs with Separable Prefix',
        r'Other Verbs with the same Base Verb',
        r'<h4>Synonyms</h4>',
    )

    related = set()
    for marker in section_markers:
        section = re.search(marker + r'(.*?)<h3>', page.text)
        if section is None:
            # Section missing for this verb: skip it instead of relying on a bare except.
            continue
        # Link text sits between an escaped closing quote (\">) and </a>.
        linked = re.findall(r'\\">(.*?)</a>', section.group(1))
        related.update(x for x in linked if x.isalpha())

    return list(related)

In [None]:
def getNgramData(verb, language, years):
    """Return the mean Google Books Ngram value of `verb` over a span of years.

    Parameters:
        verb: the ngram to look up.
        language: one of 'spanish', 'french', 'italian', 'german'; selects the
            corpus id sent to the service (any other value raises KeyError).
        years: (start_year, end_year) pair.

    Returns:
        The arithmetic mean of the returned timeseries (a float), or the string
        'ngram not found' when the service returns an empty result, or the
        string 'error' when the first result is for a different ngram or has
        an empty timeseries.

    Raises:
        Whatever `json.loads` raises when the response body is not JSON.
    """
    corpus = {'spanish':32, 'french':19, 'italian':22, 'german':20}
    syear,eyear = years

    # Let requests build and encode the query string rather than formatting it by hand.
    query = {'content': verb,
             'year_start': syear,
             'year_end': eyear,
             'corpus': corpus[language],
             'smoothing': 0}
    raw = requests.get('https://books.google.com/ngrams/json', params = query, headers = headers)

    results = json.loads(raw.text)
    if not results:
        return 'ngram not found'
    data = results[0]

    if data['ngram'] != verb:
        return 'error'

    values = data['timeseries']
    if not values:
        # Avoid dividing by zero on an empty series.
        return 'error'
    return sum(values)/len(values)

In [None]:
def checkRegularity(verb, page):
    """Classify a response by the markers present in `page.text`.

    Returns 'x' when the text contains NOTRECOGVERB, otherwise 'i' when it
    contains an element with class "irregular", otherwise 'sc' when it contains
    an element with class "orto", and 'r' when none of these is found.
    `verb` is accepted but not used.
    """
    text = page.text

    if 'NOTRECOGVERB' in text:
        return 'x'

    # Checked in priority order; the quotes in the response are backslash-escaped.
    for marker, code in (("class=\\\"irregular\\\">", 'i'),
                         ("class=\\\"orto\\\">", 'sc')):
        if marker in text:
            return code

    return 'r'

In [None]:
def formatResults(string):
    """Extract the six conjugated forms from one tense table.

    Parameters:
        string: object whose `.text` holds the table text, in which lines are
            separated by a literal backslash followed by "n".

    Returns:
        [ich, du, er/sie/es, wir, ihr, sie/Sie] — for each pronoun label, the
        text between the label and the next backslash, with parenthesised
        parts removed and only the first ';'-separated alternative kept.

    Raises:
        AttributeError: when a pronoun label is not found in the text.
    """
    text = string.text
    labels = ('ich', 'du', 'er;sie;es', 'wir', 'ihr', 'sie;Sie')

    forms = []
    for label in labels:
        captured = re.search(r'\\n' + label + r'(.*?)\\', text).group(1)
        # Raw string: '\(' inside a normal literal is an invalid escape sequence.
        forms.append(re.sub(r'[\(].*?[\)]', '', captured).split(";")[0])

    return forms

In [None]:
def formSubjunctiveFuture(verb, past_participle = ''):
    """Build six forms by prefixing the subjunctive forms of "werden".

    Each result is "<werden form> <verb>", or "<werden form> <verb>
    <past_participle>" when a non-empty past participle is given.
    """
    # NOTE(review): the compound variant appends the participle after the
    # infinitive (e.g. "werde gehen gegangen") — confirm this is the intended form.
    tail = f"{verb} {past_participle}" if past_participle else verb
    return [f"{aux} {tail}" for aux in ('werde', 'werdest', 'werde', 'werden', 'werdet', 'werden')]

In [None]:
def formatImperative(string, n = ''):
    """Extract or derive the four imperative forms from the imperative table.

    The "du" and "ihr" forms are read from `string.text`; the "wir" and "Sie"
    forms are derived from the "ihr" form by replacing its last character with
    "en wir" / "en Sie".

    Parameters:
        string: object whose `.text` holds the table text, in which lines are
            separated by a literal backslash followed by "n".
        n: any truthy value appends " nicht" to every form (negative imperative).

    Returns:
        [du, wir, ihr, Sie]

    Raises:
        AttributeError: when the "du" or "ihr" label is not found in the text.
    """
    text = string.text

    def clean(label):
        # Text after the label up to the next backslash, minus parenthesised
        # parts, keeping only the first ';'-separated alternative.
        captured = re.search(r'\\n' + label + r'(.*?)\\', text).group(1)
        return re.sub(r'[\(].*?[\)]', '', captured).split(";")[0]

    du = clean('du')
    ihr = clean('ihr')

    # Derive from the cleaned form: deriving from the raw capture dropped the
    # "en wir"/"en Sie" suffix whenever the capture held several alternatives.
    wir = ihr[:-1] + 'en wir'
    Sie = ihr[:-1] + 'en Sie'

    forms = [du, wir, ihr, Sie]
    if n:
        return [form + ' nicht' for form in forms]
    return forms

In [None]:
#---SCRAPE REVERSO FOR TRANSLATIONS---#
def getReversoTranslations(verb):
    """Scrape English translations of a German verb from Reverso Context.

    Parameters:
        verb: German infinitive inserted into the Reverso URL.

    Returns:
        A list of translation strings taken from the verb entries of the
        translations box; an empty list when the page has no <title> or the
        title does not contain `verb`.
    """
    page = requests.get(f'https://context.reverso.net/translation/german-english/{verb}', headers = headers)
    # Name the parser explicitly (as the rest of the notebook does) so the
    # result does not depend on which parser bs4 picks by default.
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    title = soup.select('title')
    if not title or verb not in title[0].getText():
        return []

    # Drop elements marked "mobile-hidden" before reading the translations.
    for tag_name in ('div', 'a'):
        for hidden in soup.find_all(tag_name, {"class": "mobile-hidden"}):
            hidden.extract()

    return [entry.getText().replace('\n\n\n\r\n          ','').replace('\n','')
            for entry in soup.select('#translations-content .translation.ltr.dict.v')]

In [None]:
#---USE GOOGLETRANS PACKAGE TO FIND TRANSLATIONS---#
def getGoogleTranslations(verb):
    """Return the English verb translations googletrans reports for `verb`.

    The German-to-English result's `extra_data['all-translations']` is searched
    for the first entry whose first element is 'verb'; that entry's second
    element is returned.

    Returns:
        The verb translations, or an empty list when there is no 'verb' entry,
        no translation data at all, or the lookup fails for any reason.
    """
    try:
        result = translator.translate(verb, dest = 'en', src = 'de')

        for entry in result.extra_data['all-translations'] or []:
            if entry[0] == 'verb':
                return entry[1]

    except Exception:
        # Best-effort lookup: any failure is treated as "no translations".
        return []

    # No 'verb' entry: return an empty list rather than the raw nested structure.
    return []

In [None]:
def conjugate(verb_data):
    """Build the nested conjugation record for one ranked verb from Verbix.

    Parameters:
        verb_data: (verb, rank) pair.

    Returns:
        A nestedDict with the keys 'infinitive', 'rank', 'regularity',
        'translations' (empty list), 'participle', 'simple' and 'compound',
        or None when `rank` is not below 50000.

    Raises:
        IndexError / AttributeError when the response lacks an expected table,
        participle or pronoun line. Only the imperative is optional: if it
        cannot be parsed, its forms are stored as empty strings.
    """
    verb,rank = verb_data

    # Nothing is built for these ranks, so do not spend a request on them.
    if not rank < 50000:
        return None

    url = f'https://api.verbix.com/conjugator/iv1/ab8e7bb5-9ac6-11e7-ab6a-00089be4dcbc/1/13/113/{verb}'
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # Query the document once instead of once per pronoun and tense.
    body_text = soup.select('body')[0].getText()
    tables = soup.select('table')

    dictionary = nestedDict()

    dictionary['infinitive'] = verb
    dictionary['rank'] = rank
    dictionary['regularity'] = checkRegularity(verb, page)

    dictionary['translations'] = []

    dictionary['participle'] = {'present': re.search(r'Present participle: (.*?)\\', body_text).group(1),
                                'past': re.search(r'Past participle: (.*?)\\', body_text).group(1)}

    # er, sie and es all share the third form returned by formatResults.
    subjects_indicies = [0, 1, 2, 2, 2, 3, 4, 5]
    subjects_pronouns = ['ich', 'du', 'er', 'sie', 'es', 'wir', 'ihr', 'Sie']

    subjects_pronouns_imp = ['du', 'wir', 'ihr', 'Sie']

    # (group, mood, tense, six forms). Even-numbered tables hold the simple
    # tenses and odd-numbered tables the compound ones; listed in the order
    # the keys should appear in the record.
    tenses = [
        ('simple', 'indicative', 'present', formatResults(tables[0])),
        ('simple', 'indicative', 'imperfect', formatResults(tables[2])),
        ('simple', 'indicative', 'future', formatResults(tables[4])),
        ('simple', 'subjunctive', 'present', formatResults(tables[6])),
        ('simple', 'subjunctive', 'imperfect', formatResults(tables[8])),
        ('simple', 'subjunctive', 'future', formSubjunctiveFuture(verb)),
        ('simple', 'subjunctive', 'conditional', formatResults(tables[10])),
        ('compound', 'indicative', 'present', formatResults(tables[1])),
        ('compound', 'indicative', 'imperfect', formatResults(tables[3])),
        ('compound', 'indicative', 'future', formatResults(tables[5])),
        ('compound', 'subjunctive', 'present', formatResults(tables[7])),
        ('compound', 'subjunctive', 'imperfect', formatResults(tables[9])),
        ('compound', 'subjunctive', 'future', formSubjunctiveFuture(verb, dictionary['participle']['past'])),
        ('compound', 'subjunctive', 'conditional', formatResults(tables[11])),
    ]

    for group, mood, tense, forms in tenses:
        for i,p in zip(subjects_indicies, subjects_pronouns):
            dictionary[group][mood][tense][p] = forms[i]

    # Imperative: parsed once, outside the pronoun loop. A missing or
    # unparsable imperative table yields empty strings instead of an error.
    try:
        affirmative = formatImperative(tables[12])
        negative = formatImperative(tables[12], 'negative')
    except Exception:
        affirmative = negative = [''] * len(subjects_pronouns_imp)

    for i,p in enumerate(subjects_pronouns_imp):
        dictionary['simple']['imperative']['affirmative'][p] = affirmative[i]
        dictionary['simple']['imperative']['negative'][p] = negative[i]

    return dictionary

In [None]:
#---SCRAPE UNRANKED VERBS FROM COOLJUGATOR---#
page = requests.get('https://cooljugator.com/de/list/all', headers = headers)
soup = bs4.BeautifulSoup(page.text, 'lxml')
coolverbs = []

# Keep an item when its text splits into exactly three space-separated parts
# and the first part is purely alphabetic and ends in "n".
for item in soup.select('.ui.segment.stacked .item'):
    parts = item.getText().split(' ')
    verb = parts[0]
    if len(parts) != 3 or not verb.isalpha():
        continue
    if verb[-1] == 'n':
        coolverbs.append(verb)

In [None]:
#---SCRAPE UNRANKED VERBS FROM WIKTIONARY---#
wikiverbs = []
wikiverbs_seen = set()    # lower-cased verbs already collected, for fast duplicate checks
urlsafe = set('abcdefghijklmnopqrstuvwxyz')
last = 'aalen'
run = True

# Walk the category listing page by page: each request starts from `last`,
# the most recently collected verb that is safe to put in the URL.
while run:

    url = f'https://en.wiktionary.org/w/index.php?title=Category:German_verbs&pagefrom={last}'
    page = requests.get(url, headers = headers)
    soup = bs4.BeautifulSoup(page.text, 'lxml')
    list_items = soup.select('.mw-content-ltr .mw-category li a')

    # The first 18 matched links are skipped — presumably they are not verb
    # entries (TODO confirm against the page layout).
    for link in list_items[18:]:
        # Lower-case before the duplicate check so it compares like with like.
        verb = link.getText().lower()

        if verb not in wikiverbs_seen and ' ' not in verb and verb.isalpha() and verb[-1] == 'n':
            wikiverbs_seen.add(verb)
            wikiverbs.append(verb)
            clear(wait = True); print(wikiverbs[-1])

    # Continue only if this page produced a new URL-safe verb to start from.
    # Stopping by default also ends the loop when nothing could be scraped at
    # all, instead of requesting the same page forever.
    run = False
    for verb in reversed(wikiverbs):
        if set(verb) <= urlsafe:
            if verb != last:
                last = verb
                run = True
            break

In [None]:
#---COMBINE VERBS FROM COOLJUGATOR AND WIKTIONARY---#
# Sorted so the list order is reproducible between kernel sessions; a later cell
# resumes work from a positional index into this list, which is only meaningful
# if the order is stable (plain set iteration order is not).
all_verbs = sorted(set(coolverbs + wikiverbs))

In [None]:
#---INITIALISE VARIABLES FOR STORING ADDITIONAL VERBS---#
# Copy rather than alias, so appending newly found verbs to `total` does not
# also grow the list that is being iterated over.
total = list(all_verbs)
new_verbs = []
initlen = len(all_verbs)

In [None]:
#---SET INDEX AT WHICH TO START FINDING ADDITIONAL VERBS---#
# NOTE(review): a non-zero value makes the loop that follows skip every verb
# before this index; presumably left over from resuming an interrupted run —
# confirm, and use 0 for a full pass.
start = 6663

In [None]:
#---FIND ADDITIONAL VERBS USING VERBIX---#
# Set mirror of `total`, so each candidate is checked in constant time instead
# of scanning the whole list.
known_verbs = set(total)

for i in range(start, len(all_verbs)):

    additional = findAdditionalVerbs(all_verbs[i])

    for verb in additional:
        if verb not in known_verbs:
            known_verbs.add(verb)
            total.append(verb)
            new_verbs.append(verb)

    # Progress: current verb, initial size -> current size | verbs added (percent done)
    clear(wait = True); print(f"{all_verbs[i]} – {initlen} -> {len(total)} | {len(new_verbs)} ({round(i*100/initlen, 3)}%)")

all_verbs = total; del total

In [None]:
#---SAVE ADDITIONAL VERBS---#
f = os.path.join(data_dir, 'infinitives_german_intermediate_additional.json')

# Serialise first, then write the finished text in one go.
serialised = json.dumps(all_verbs, indent = 4, ensure_ascii = False)
with open(f, 'w', encoding = 'utf8') as file:
    file.write(serialised)

In [44]:
#---OPEN CHECKPOINT FILE---#
f = os.path.join(data_dir, 'infinitives_german_intermediate_additional.json')

# Reload the combined verb list written by the save cell.
with open(f, 'r', encoding = 'utf8') as file:
    all_verbs = json.load(file)

In [45]:
#---SET VARIABLES FOR NGRAM SCORES---#
# verbs: collected [verb, score] pairs; i: index of the next verb to process
verbs, i = [], 0

In [46]:
#---OPEN CHECKPOINT FILE---#
f = os.path.join(data_dir, 'infinitives_german_intermediate.json')

# On a first run no checkpoint exists yet; keep the current `verbs` in that case.
# NOTE(review): a later cell writes [verb, rank] pairs to this same file, so
# reloading it here after a full run would presumably feed ranks in as scores —
# confirm.
if os.path.exists(f):
    with open(f, 'r', encoding = 'utf8') as file:
        verbs = json.load(file)

# Verbs that already have an entry; a set keeps the membership checks cheap.
found = {x[0] for x in verbs}

In [47]:
#---ITERATE THROUGH ALL VERBS AND FIND NGRAM SCORES---#
# Build the checkpoint path from data_dir like every other cell, instead of a
# path relative to the current working directory.
checkpoint_path = os.path.join(data_dir, 'infinitives_german_intermediate.json')

# NOTE(review): `i` only advances when an iteration succeeds, so a verb that
# fails every time is retried indefinitely (two minutes apart).
while i < len(all_verbs):
    try:
        verb = all_verbs[i]

        if verb not in found:
            value = getNgramData(verb, 'german', (1980, 2005))

            # getNgramData returns a float on success and a message string otherwise.
            if isinstance(value, float):
                verbs.append([verb, value])
                clear(wait = True); print(f"{verbs[-1]} ({round(i*100/len(all_verbs), 3)}%)")

            # Periodic checkpoint of everything collected so far.
            if i % 250 == 0:
                with open(checkpoint_path, 'w', encoding = 'utf8') as file:
                    json.dump(verbs, file, indent = 4, ensure_ascii = False)

            time.sleep(1)

        i += 1

    except Exception as ex:
        clear(wait = True); print(f"error ({ex}) – sleeping for 2 minutes ({round(i*100/len(all_verbs), 3)}%)")
        time.sleep(120)

['emporklettern', 1.5519067125397285e-08] (99.992%)


In [48]:
#---RANK VERBS ACCORDING TO NGRAM SCORES---#
infinitives = []
rank = 1

# Sort ascending by score and walk the result backwards, so rank 1 goes to the
# highest score.
for verb in reversed(sorted(verbs, key = lambda pair: pair[1])):
    infinitives += [[verb[0], rank]]
    rank = rank + 1

verbs = infinitives; del infinitives

In [49]:
#---FILTER DUPLICATES AND RERANK VERBS---#
# Two verbs count as duplicates when their unidecode() transliterations match;
# the first one encountered is kept.
# NOTE(review): presumably this also merges distinct verbs that differ only by
# an umlaut — confirm that this is intended.
decoded = []
decoded_seen = set()    # same content as `decoded`, for constant-time lookups
unique = []
rank = 1

for verb in verbs:
    key = unidecode(verb[0])
    if key not in decoded_seen:
        decoded_seen.add(key)
        decoded.append(key)
        unique.append([verb[0], rank])
        rank += 1

print(f"{len(verbs)} -> {len(unique)}")

infinitives = unique; del unique

11879 -> 11695


In [50]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNEL TIMEOUT---#
f = os.path.join(data_dir, 'infinitives_german_intermediate.json')

# Serialise first, then write the finished text in one go.
serialised = json.dumps(infinitives, indent = 4, ensure_ascii = False)
with open(f, "w", encoding = 'utf8') as file:
    file.write(serialised)

In [None]:
#---OPEN CHECKPOINT FILE---#
f = os.path.join(data_dir, 'infinitives_german_intermediate.json')

# Reload the ranked infinitives from the checkpoint file.
with open(f, "r", encoding = 'utf8') as file:
    infinitives = json.load(file)

In [51]:
#---SET VARIABLES FOR CONJUGATIONS---#
exclude = ['einen', 'deutschen', 'alten']    # infinitives that are never stored
rank = 1                                     # rank given to the next stored verb
skipped = []                                 # entries that were not stored
defective = set()
conjugations = defaultdict()                 # infinitive -> conjugation record

In [52]:
#---SET INDEX AT WHICH TO START CONJUGATIONS---#
# Position in `infinitives` where the conjugation loop begins; 0 processes all.
start = 0

In [53]:
#---GENERATE CONJUGATIONS---#
for i in range(start, len(infinitives)):

    infinitive = infinitives[i]

    # Excluded entries are never stored, so skip them before any request is made.
    if infinitive[0] in exclude:
        clear(wait = True); print(f"{infinitive[0]} – skipped ({round(i*100/len(infinitives), 3)}%)")
        skipped.append(infinitive)
        continue

    try:
        c = conjugate(infinitive)

        if c is not None:
            conjugations[c['infinitive']] = c
            # Re-rank consecutively over the verbs that were actually stored.
            conjugations[infinitive[0]]['rank'] = rank
            rank += 1

            clear(wait = True); print(f"{infinitive[0]} – complete ({round(i*100/len(infinitives), 3)}%)")

        else:
            clear(wait = True); print(f"{infinitive[0]} – skipped ({round(i*100/len(infinitives), 3)}%)")
            skipped.append(infinitive)

    except Exception as ex:
        # Any failure while fetching or parsing marks the verb as skipped.
        clear(wait = True); print(f"{infinitive[0]} – skipped – {ex} ({round(i*100/len(infinitives), 3)}%)")
        skipped.append(infinitive)

conjugations = dict(conjugations)

ramifizieren – complete (99.991%)


In [54]:
#---REMOVE VERBS WITH UNKNOWN OR CONFLICTING REGULARITIES---#
remove = []
rank = 1

# Regularity 'x' is what checkRegularity returns for an unrecognised verb;
# every other verb is re-ranked consecutively.
for verb in conjugations:
    if conjugations[verb]['regularity'] == 'x':
        remove.append(verb)
    else:
        conjugations[verb]['rank'] = rank
        rank += 1

print(f"{len(conjugations)} -> {len(conjugations) - len(remove)}")

for verb in remove:
    # pop is a method: call it (subscripting it raised TypeError as soon as
    # there was anything to remove).
    conjugations.pop(verb)

8945 -> 8945


In [55]:
#---CREATE INTERMEDIATE CHECKPOINT FILE AS CONTINGENCY IN CASE OF KERNEL TIMEOUT---#
# Serialise first, then write the finished text in one go.
checkpoint = json.dumps(conjugations, indent = 4, ensure_ascii = False)
with open(os.path.join(data_dir, 'conjugations_german_intermediate.json'), "w", encoding = 'utf8') as file:
    file.write(checkpoint)

In [56]:
#---SET INDEX VARIABLE TO 0 BY DEFAULT---#
i = 0

In [None]:
#---OPEN CHECKPOINT FILE---#
translated_checkpoint = os.path.join(data_dir, 'conjugations_translated_german_intermediate.json')

# This file is only written by the translation loop, so it is absent on a first
# run; in that case keep the conjugations already in memory.
if os.path.exists(translated_checkpoint):
    with open(translated_checkpoint, "r", encoding = 'utf8') as file:
        conjugations = json.load(file)

In [63]:
#---ADD TRANSLATIONS---#
for verb in conjugations:

    entry = conjugations[verb]

    # Only handle verbs at or beyond the current index that have no translations yet.
    if not (entry['rank'] > i - 1 and entry['translations'] == []):
        continue

    translations = getReversoTranslations(verb)

    # Fall back to googletrans when Reverso gives nothing.
    if not translations:
        translations = getGoogleTranslations(verb)
        time.sleep(9)

    entry['translations'] = translations

    # Checkpoint whenever the verb's rank is a multiple of 200.
    if entry['rank'] % 200 == 0:
        with open(os.path.join(data_dir, 'conjugations_translated_german_intermediate.json'), "w", encoding = 'utf8') as file:
            json.dump(conjugations, file, indent = 4, ensure_ascii = False)

    clear(wait = True)
    print(f"{verb} ({i}) – complete ({round(i*100/len(conjugations), 3)}%)")

    i += 1

    time.sleep(1)

ramifizieren (8945) – complete (100.0%)


In [64]:
#---REMOVE VERBS WITH NO TRANSLATIONS---#
remove = []
rank = 1

# Verbs that have translations are re-ranked consecutively; the rest are
# collected first and deleted afterwards, so the dict is not changed while it
# is being iterated.
for verb, entry in conjugations.items():
    if entry['translations'] != []:
        entry['rank'] = rank
        rank += 1
        continue
    remove.append(verb)

for verb in remove:
    del conjugations[verb]

In [65]:
#---SAVE TO JSON---#
f = os.path.join(data_dir, 'conjugations_german.json')

# Serialise first, then write the finished text in one go.
serialised = json.dumps(conjugations, indent = 4, ensure_ascii = False)
with open(f, "w", encoding = 'utf8') as file:
    file.write(serialised)

In [66]:
#---PARSE INFINITIVES AND SAVE TO JSON---#
infinitives = []

# One [verb, rank, regularity] row per conjugated verb.
for verb, entry in conjugations.items():
    infinitives.append([verb, entry['rank'], entry['regularity']])

f = os.path.join(data_dir, 'infinitives_german.json')
serialised = json.dumps(infinitives, indent = 4, ensure_ascii = False)
with open(f, "w", encoding = 'utf8') as file:
    file.write(serialised)