In [1]:
import pandas as pd
import requests

In [2]:
INPUT_FILE = "./data/processed/freqs_dict.csv"

VERB = "VERB"
NOUN = "NOUN"
ADJ = "ADJ"
ADV = "ADV"
MODIFIER = "MODIFIER"

POSES_TO_KEEP = {NOUN, VERB, ADJ, ADV}

WORD = "word"
LEMMA = "lemma"
POS = "pos"
FREQUENCY = "frequency"

In [3]:
df = pd.read_csv(INPUT_FILE, index_col=None, header=None, skiprows=1, names=[LEMMA, POS, FREQUENCY])

# Exploration

In [4]:
print(f"Total of {df.shape[0]} instances containing {df[LEMMA].nunique()} distinct lemmas with {df[POS].nunique()} POS(s) were loaded.\n")

display(df.groupby(POS).count())

df.sample(10)

Total of 995789 instances containing 943965 distinct lemmas with 17 POS(s) were loaded.



Unnamed: 0_level_0,lemma,frequency
pos,Unnamed: 1_level_1,Unnamed: 2_level_1
ADJ,46542,46542
ADP,790,790
ADV,19319,19319
AUX,399,399
CCONJ,136,136
DET,740,741
INTJ,30,30
NOUN,412404,412407
NUM,314404,314406
PART,56,56


Unnamed: 0,lemma,pos,frequency
728399,Mchirps,NOUN,1
545836,nnimmt,VERB,1
382637,270303,NUM,3
825741,Wasserarea,PROPN,1
837517,Einbau-Mikrowell,NOUN,1
678255,Burenfarme,NOUN,1
981197,Absenderkonto,NOUN,1
440062,Telekom-Konzern,NOUN,2
897043,Einsteigerset-Drumset,NOUN,1
208702,102494,NUM,3


## Missing values

In [5]:
df.isna().sum()

lemma        14
pos           0
frequency     0
dtype: int64

In [6]:
df[df.isna().any(axis="columns")]

Unnamed: 0,lemma,pos,frequency
5255,,NUM,155
30288,,NOUN,16
54021,,PROPN,8
423085,,PROPN,3
442342,,PROPN,2
452809,,PRON,2
473836,,PROPN,2
490181,,PROPN,2
566239,,NOUN,1
617170,,X,1


## Duplicate lemma-pos pairs

In [7]:
df[df.duplicated(subset=[LEMMA, POS], keep=False)].sort_values(by=[LEMMA, POS])

Unnamed: 0,lemma,pos,frequency
30288,,NOUN,16
566239,,NOUN,1
758022,,NOUN,1
5255,,NUM,155
643141,,NUM,1
54021,,PROPN,8
423085,,PROPN,3
442342,,PROPN,2
473836,,PROPN,2
490181,,PROPN,2


# Processing

## Missing values and duplicates

In [8]:
# Rmoving missing values will take care of duplicate lemma-pos pairs as well
df = df.dropna()
print(f"Total missing values after imputation: {df.isna().sum().sum()}")
print(f"Total duplicated lemma-pos pairs: {df[df.duplicated(subset=[LEMMA, POS])].shape[0]}")

Total missing values after imputation: 0
Total duplicated lemma-pos pairs: 0


## Invalid lemmas

In [9]:
# Removing all-caps lemmas
df = df[~df[LEMMA].str.isupper()]

# Keeping ONLY alphbetic while accepting hyphens
df = df[df[LEMMA].str.match(r'^(?=.*[A-Za-zÄÖÜäöüß])[A-Za-zÄÖÜäöüß]+(-[A-Za-zÄÖÜäöüß]+)*$')]

# Removing the leading and trailing hyphens
df[LEMMA] = df[LEMMA].str.strip("-")

df

Unnamed: 0,lemma,pos,frequency
0,der,DET,1128730
1,in,ADP,307254
2,und,CCONJ,286005
3,ein,DET,269419
4,sein,AUX,227051
...,...,...,...
995775,CIK-Zelle,NOUN,1
995776,mononukleär,ADJ,1
995777,Zzt,NOUN,1
995778,Beuth,ADJ,1


## Cleaning up POSes

In [10]:
all_poses = df.groupby(by=[POS])[LEMMA].count()
all_poses

pos
ADJ       43598
ADP         489
ADV       17784
AUX         302
CCONJ        93
DET         544
INTJ         28
NOUN     390814
NUM         313
PART         39
PRON        599
PROPN    132764
PUNCT       110
SCONJ       105
SPACE         1
VERB      21640
X         12628
Name: lemma, dtype: int64

In [11]:
df = df[df[POS].isin(POSES_TO_KEEP)]

## Cleaning up lemmas

In [12]:
df_verb = df[df[POS] == VERB]

df_verb.sort_values(FREQUENCY, ascending=False)

Unnamed: 0,lemma,pos,frequency
34,geben,VERB,25024
47,kommen,VERB,16581
53,finden,VERB,15318
57,gehen,VERB,13723
60,stehen,VERB,13375
...,...,...,...
995474,herantragen,VERB,1
995616,runtergenudeln,VERB,1
995648,vagabundieren,VERB,1
995670,vertrags,VERB,1


In [13]:
# Verb lemmas that do not end in "n"
df_verb_lemmas_not_ending_in_n = df_verb[
    ~((df_verb[LEMMA].str.endswith("n")) | (df_verb[LEMMA].str.endswith("N")))]


df_verb_lemmas_not_ending_in_n.sort_values(FREQUENCY, ascending=False)

Unnamed: 0,lemma,pos,frequency
332,lässt,VERB,2808
1383,sodass,VERB,774
1644,kannst,VERB,656
1663,schloss,VERB,646
3139,zusammengefasst,VERB,307
...,...,...,...
994684,Las,VERB,1
995078,Literaturpädagogik,VERB,1
995092,Gebraucht,VERB,1
995281,Tolerantia,VERB,1


In [14]:
df = df.drop(df_verb_lemmas_not_ending_in_n.index)

In [15]:
# Remove the duplicate POS-LEMMA pairs
not_noun_filter = df[POS] != NOUN
df.loc[not_noun_filter, LEMMA] = df.loc[not_noun_filter, LEMMA].str.lower()

df[FREQUENCY] = df.groupby([POS, LEMMA])[FREQUENCY].transform("sum")

max_freq_indexes = df.groupby([POS, LEMMA])[FREQUENCY].idxmax()
df = df.loc[max_freq_indexes].sort_values(by=[POS, LEMMA], ascending=False).reset_index(drop=True)

df

Unnamed: 0,lemma,pos,frequency
0,ühren,VERB,1
1,übrigbleiben,VERB,13
2,überzusiedeln,VERB,1
3,überzulaufen,VERB,1
4,überzugehen,VERB,10
...,...,...,...
463608,aalglatt,ADJ,3
463609,aalener,ADJ,5
463610,aalartig,ADJ,2
463611,aachenmünchener,ADJ,1


## Filtering the most frequents

In [16]:
df.groupby(POS)[LEMMA].count()

pos
ADJ      41720
ADV      16173
NOUN    390814
VERB     14906
Name: lemma, dtype: int64

In [21]:
TOP_PERCENTILES = {
    VERB: 98,
    NOUN: 99.5,
    ADJ: 99,
    ADV: 99,
}

output_df = pd.DataFrame(columns=df.columns)
for pos in POSES_TO_KEEP:
    df_pos = df[df[POS] == pos]

    percentile = TOP_PERCENTILES[pos]
    top_quantile = df_pos[FREQUENCY].quantile(percentile/100)
    df_pos = df_pos[df_pos[FREQUENCY] >= top_quantile]

    output_df = pd.concat([output_df, df_pos])
    
for pos in POSES_TO_KEEP:
    print(f"{pos} -> {len(output_df[output_df[POS] == pos])}")

NOUN -> 1955
ADJ -> 418
ADV -> 163
VERB -> 299


# Anki analysis

In [23]:
# NOTE: Before executing this cell, Anki should be running with AnkiConnect addon active.

ANKI_URL = "http://127.0.0.1:8765"
ANKI_NOTETYPES = {
    "My-German-Verb": VERB,
    "My-German-Noun": NOUN,
    "My-German-Modifier": MODIFIER,
}
ANKI_QUERY = " OR ".join([f"note:{notetype}" for notetype in ANKI_NOTETYPES.keys()])

def extract_main_word(word: str) -> str:
    word_parts = word.split()
    return word_parts[len(word_parts) // 2]

# Retreieving note IDs from Anki
note_ids = requests.post(
    ANKI_URL, timeout=(0.5, 3), json={"action": "findNotes", "version": 6, "params": {"query": ANKI_QUERY}}
).json()["result"]

# Retreieving note info from Anki

note_info = requests.post(
    ANKI_URL, timeout=(0.5, 3), json={"action": "notesInfo", "version": 6, "params": {"notes": note_ids}}
).json()["result"]

anki_notetypes = []
anki_words = []
anki_lemmas = []
for note in note_info:
    anki_notetypes.append(ANKI_NOTETYPES[note["modelName"]])
    word = note["fields"]["Deutsch"]["value"]
    anki_words.append(word)
    anki_lemmas.append(extract_main_word(word))

df_anki = pd.DataFrame(
    {
        POS: anki_notetypes,
        WORD: anki_words,
        LEMMA: anki_lemmas,
    }
)

df_anki

Unnamed: 0,pos,word,lemma
0,NOUN,der Familienname,Familienname
1,NOUN,die Frage,Frage
2,NOUN,der Nachname,Nachname
3,NOUN,der Name,Name
4,NOUN,der Vorname,Vorname
...,...,...,...
2868,MODIFIER,zwingend,zwingend
2869,MODIFIER,äußerst,äußerst
2870,MODIFIER,öfter,öfter
2871,MODIFIER,überraschend,überraschend
