In [1]:
import pandas as pd

from configs import (
    FREQDICT_FILE, ANKIFREQ_FILE,
    RANK_COL, LEMMA_COL, FREQUENCY_COL, POS_COL, WORD_COL,
    MAX_RANK_TO_ADD, MAX_RANK_TO_KEEP,
    ANKI_WORD_FIELD, ANKI_NOTETYPE_COL,
    ANKI_REPORT_PATH, ANKI_TO_KEEP_FILE_PREFIX, ANKI_TO_ADD_FILE_PREFIX,
    ANKI_FLAG_COL, ANKI_FLAG_VALUE
)

In [2]:
df_freqs = pd.read_csv(FREQDICT_FILE)

df_freqs

Unnamed: 0,word,frequency,lemma,morph,pos,rank
0,die,1900392,der,Case=Nom|Definite=Def|Gender=Fem|Number=Sing|P...,DET,1
1,der,1696311,der,Case=Nom|Definite=Def|Gender=Masc|Number=Sing|...,DET,2
2,und,1415926,und,,CCONJ,3
3,in,925244,in,,ADP,4
4,das,662237,der,Case=Nom|Definite=Def|Gender=Neut|Number=Sing|...,DET,5
...,...,...,...,...,...,...
1152561,aaalen,1,aaalen,VerbForm=Inf,VERB,497920
1152562,aaahs,1,aaah,Case=Gen|Gender=Masc|Number=Sing,PROPN,497920
1152563,aaahhhhh,1,aaahhhhh,Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbF...,VERB,497920
1152564,aaahh,1,aaahh,Case=Nom|Gender=Fem|Number=Sing,NOUN,497920


In [3]:
df_anki = pd.read_csv(ANKIFREQ_FILE)

df_anki

Unnamed: 0,notetype,Deutsch,word,lemma,frequency,rank
0,My-German-Noun,der Familienname,der,der,1900392,1
1,My-German-Noun,die Frage,die,der,1900392,1
2,My-German-Noun,das Land,das,der,1900392,1
3,My-German-PragmaticExpression,"Mir geht's gut, danke! Und dir?",und,und,1415926,3
4,My-German-Preposition,in,in,in,925244,4
...,...,...,...,...,...,...
2622,My-German-Noun,der Ski,ski,--,0,0
2623,My-German-Noun,die Vetterin,vetterin,vetterin,0,0
2624,My-German-PragmaticExpression,O là là!,là,là,0,0
2625,My-German-Adverb,dorther,dorther,dorther,0,0


# Filtering the top ranking lemmas

In [4]:
# Row masks over the Anki frame, named once and reused by the selections below.
anki_has_frequency = df_anki[FREQUENCY_COL] > 0
anki_rank_within_add = df_anki[RANK_COL] <= MAX_RANK_TO_ADD
anki_rank_within_keep = df_anki[RANK_COL] <= MAX_RANK_TO_KEEP
anki_rank_above_add = df_anki[RANK_COL] > MAX_RANK_TO_ADD

# Frequency-list rows with rank <= MAX_RANK_TO_ADD and a positive frequency.
freq_rank_within_add = df_freqs[RANK_COL] <= MAX_RANK_TO_ADD
freq_has_frequency = df_freqs[FREQUENCY_COL] > 0
df_top_lemmas = df_freqs.loc[freq_rank_within_add & freq_has_frequency]

# Every distinct lemma present in the Anki export, without any rank filter.
anki_lemmas = df_anki[LEMMA_COL].unique()

# Anki lemmas with rank <= MAX_RANK_TO_ADD and a positive frequency.
anki_lemmas_in_top = df_anki.loc[anki_rank_within_add & anki_has_frequency, LEMMA_COL].unique()

# Anki lemmas ranked in the band (MAX_RANK_TO_ADD, MAX_RANK_TO_KEEP].
anki_lemmas_in_acceptable = df_anki.loc[anki_rank_within_keep & anki_rank_above_add, LEMMA_COL].unique()

# Anki lemmas with rank <= MAX_RANK_TO_KEEP and a positive frequency.
anki_lemmas_to_keep = df_anki.loc[anki_rank_within_keep & anki_has_frequency, LEMMA_COL].unique()

In [5]:
# Print the two rank thresholds and the lemma counts computed in the filtering cell.
print("Summary of lemma comparison between Anki and top frequency list:")
# Label fixed: it previously read "Optiomal rank".
print(f"Optimal rank: {MAX_RANK_TO_ADD}")
print(f"Acceptable rank: {MAX_RANK_TO_KEEP}")
print(f"Total top lemmas: {df_top_lemmas[LEMMA_COL].nunique()}")
print(f"Total Anki lemmas: {len(anki_lemmas)}")
print(f"Total Anki lemmas of optimal rank: {len(anki_lemmas_in_top)}")
print(f"Total Anki lemmas of acceptable rank: {len(anki_lemmas_in_acceptable)}")

Summary of lemma comparison between Anki and top frequency list:
Optiomal rank: 5000
Acceptable rank: 10000
Total top lemmas: 3440
Total Anki lemmas: 2551
Total Anki lemmas of optimal rank: 1338
Total Anki lemmas of acceptable rank: 413


In [6]:
# Anki rows whose lemma is in anki_lemmas_to_keep.
lemma_is_kept = df_anki[LEMMA_COL].isin(anki_lemmas_to_keep)
anki_keep = df_anki.loc[lemma_is_kept]

# Top-frequency rows whose lemma does not appear anywhere in the Anki export.
lemma_already_in_anki = df_top_lemmas[LEMMA_COL].isin(anki_lemmas)
anki_add = df_top_lemmas.loc[~lemma_already_in_anki]

# Lemmas to keep

In [7]:
# Reduce the keep list to the four report columns, ordered by note type and
# then by the Anki word field.
report_columns = [ANKI_NOTETYPE_COL, ANKI_WORD_FIELD, LEMMA_COL, RANK_COL]
report_order = [ANKI_NOTETYPE_COL, ANKI_WORD_FIELD]
anki_keep = anki_keep.loc[:, report_columns].sort_values(by=report_order)

print(f"Total Anki lemmas to keep: {len(anki_lemmas_to_keep)}")
anki_keep

Total Anki lemmas to keep: 1751


Unnamed: 0,notetype,Deutsch,lemma,rank
658,My-German-Adjective,allgemein,allgemein,1262
360,My-German-Adjective,alt,alt,560
23,My-German-Adjective,am besten,an,24
88,My-German-Adjective,am besten,gut,119
1054,My-German-Adjective,amerikanisch,amerikanisch,2808
...,...,...,...,...
710,My-German-Verb,öffnen,öffnen,1432
1224,My-German-Verb,üben,üben,3771
1174,My-German-Verb,überlegen,überlegen,3448
1792,My-German-Verb,überqueren,überqueren,9453


## Exporting the output

In [None]:
# Write one headerless, index-free CSV per Anki note type. Each file holds the
# ANKI_WORD_FIELD values of that note type plus a constant ANKI_FLAG_COL column.
postypes = anki_keep[ANKI_NOTETYPE_COL].unique()

for pos in postypes:
    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    file_suffix = pos.replace("-", "_")
    report_file = f"{ANKI_REPORT_PATH}/{ANKI_TO_KEEP_FILE_PREFIX}{file_suffix}.csv"

    # Explicit copy so the flag column is added to an independent frame rather
    # than to a chained-indexing slice (which can raise SettingWithCopyWarning).
    df_tosave = anki_keep.loc[anki_keep[ANKI_NOTETYPE_COL] == pos, [ANKI_WORD_FIELD]].copy()
    df_tosave[ANKI_FLAG_COL] = ANKI_FLAG_VALUE

    # header=False is the documented way to omit the header row.
    df_tosave.to_csv(report_file, index=False, header=False)

# Lemmas to add

In [9]:
anki_add.groupby(POS_COL)[LEMMA_COL].nunique()

pos
ADJ      190
ADP       26
ADV      305
AUX        9
CCONJ      5
DET       13
NOUN     931
NUM       14
PART       2
PRON      36
PROPN    259
SCONJ      7
VERB     338
X         66
Name: lemma, dtype: int64

In [10]:
def filter_by_pos(df, pos):
    """Return the rows of ``df`` whose POS_COL equals ``pos``, sorted by WORD_COL.

    Args:
        df: DataFrame containing the POS_COL and WORD_COL columns.
        pos: Value to match against POS_COL.

    Returns:
        A new DataFrame with the matching rows; original index labels are kept.
    """
    pos_matches = df[POS_COL].eq(pos)
    return df.loc[pos_matches].sort_values(by=WORD_COL)

def filter_by_single_pos(df, pos):
    """Return rows tagged ``pos`` whose lemma carries no other POS tag in ``df``.

    The result has an extra ``POS_count`` column holding the number of distinct
    POS_COL values found for the row's lemma; only rows where that count is 1
    are returned. ``df`` itself is not modified.

    Args:
        df: DataFrame containing the LEMMA_COL, POS_COL and WORD_COL columns.
        pos: Value to match against POS_COL.

    Returns:
        A new DataFrame with the matching rows, sorted by WORD_COL.
    """
    distinct_pos_per_lemma = df.groupby(LEMMA_COL)[POS_COL].transform('nunique')
    annotated = df.assign(POS_count=distinct_pos_per_lemma)
    has_requested_pos = annotated[POS_COL].eq(pos)
    lemma_has_one_pos = annotated['POS_count'].eq(1)
    return annotated.loc[has_requested_pos & lemma_has_one_pos].sort_values(by=WORD_COL)

## Pos: X

In [11]:
filter_by_single_pos(anki_add, 'X')

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
4453,big,977,big,Foreign=Yes,X,4453,1
2688,business,1763,business,Foreign=Yes,X,2687,1
2520,campus,1892,campus,Foreign=Yes,X,2520,1
4257,caritas,1033,caritas,Foreign=Yes,X,4254,1
1906,cm,2526,cm,,X,1907,1
...,...,...,...,...,...,...,...
2327,windows,2054,windows,Foreign=Yes,X,2328,1
4155,wm,1067,wm,,X,4154,1
1640,x,2916,x,,X,1641,1
2045,z,2358,z,,X,2046,1


In [12]:
# Keep only the rows of anki_add whose pos is something other than 'X'.
pos_is_not_x = anki_add[POS_COL].ne('X')
anki_add = anki_add.loc[pos_is_not_x]

## Pos: SCONJ

In [13]:
filter_by_single_pos(anki_add, 'SCONJ')

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
4462,anstatt,976,anstatt,,SCONJ,4458,1
2679,daß,1768,daß,,SCONJ,2678,1
1312,sofern,3635,sofern,,SCONJ,1313,1
353,warum,11973,warum,PronType=Int,SCONJ,354,1
200,wo,20666,wo,PronType=Int,SCONJ,201,1
4006,woher,1111,woher,PronType=Int,SCONJ,4005,1
3382,worauf,1346,worauf,PronType=Int,SCONJ,3381,1


## Pos: AUX

In [14]:
filter_by_single_pos(anki_add, 'AUX')

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
3771,hab,1182,hab,Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbF...,AUX,3771,1
607,kannst,7289,könnsten,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,AUX,608,1
99,muss,45426,mussen,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,AUX,100,1
3074,musst,1515,musst,Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbF...,AUX,3074,1
497,musste,8908,musste,Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbF...,AUX,498,1
981,mussten,4670,mussten,Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbF...,AUX,982,1
3492,möchtest,1299,möchtest,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,AUX,3492,1
2908,solltest,1609,solltest,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,AUX,2908,1
3962,wirst,1122,wersten,Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbF...,AUX,3963,1


In [15]:
# Keep only the rows of anki_add whose pos is something other than 'AUX'.
pos_is_not_aux = anki_add[POS_COL].ne('AUX')
anki_add = anki_add.loc[pos_is_not_aux]

## Pos: DET

In [16]:
filter_by_single_pos(anki_add, 'DET')

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
860,all,5231,all,PronType=Ind,DET,861,1
1089,dein,4254,dein,Case=Nom|Gender=Neut|Number=Sing|Poss=Yes|Pron...,DET,1090,1
664,deine,6696,dein,Case=Nom|Gender=Fem|Number=Sing|Poss=Yes|PronT...,DET,665,1
2477,deinem,1923,dein,Case=Dat|Gender=Masc|Number=Sing|Poss=Yes|Pron...,DET,2478,1
1803,deinen,2668,dein,Case=Acc|Gender=Masc|Number=Sing|Poss=Yes|Pron...,DET,1803,1
1812,deiner,2655,dein,Case=Dat|Gender=Fem|Number=Sing|Poss=Yes|PronT...,DET,1813,1
3995,euer,1114,euer,Case=Nom|Gender=Masc|Number=Sing|Poss=Yes|Pron...,DET,3995,1
1925,eure,2508,eure,Case=Nom|Gender=Fem|Number=Sing|Poss=Yes|PronT...,DET,1926,1
2894,jene,1622,jener,Case=Nom|Gender=Fem|Number=Sing|PronType=Dem,DET,2893,1
3006,keinerlei,1551,keinerlei,PronType=Ind,DET,3004,1


In [17]:
# Index labels of anki_add rows that must survive the DET clean-up below.
ids_to_keep = [3006]

# Drop every row whose lemma occurs with pos 'DET' anywhere in anki_add
# (whatever that row's own pos is), except rows whose index label is listed
# in ids_to_keep.
det_lemmas = filter_by_pos(anki_add, 'DET')[LEMMA_COL].unique()
lemma_is_not_det = ~anki_add[LEMMA_COL].isin(det_lemmas)
row_is_whitelisted = anki_add.index.isin(ids_to_keep)
anki_add = anki_add[lemma_is_not_det | row_is_whitelisted]

## Pos: ADP

In [18]:
filter_by_single_pos(anki_add, 'ADP')

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
1348,angesichts,3535,angesichts,,ADP,1349,1
1247,anhand,3814,anhand,,ADP,1248,1
2659,anlässlich,1785,anlässlich,,ADP,2660,1
346,aufgrund,12206,aufgrund,,ADP,347,1
3168,bezüglich,1463,bezüglich,,ADP,3167,1
4383,binnen,1000,binnen,,ADP,4383,1
2224,einschließlich,2159,einschließlich,,ADP,2225,1
1632,entgegen,2931,entgegen,,ADP,1632,1
1688,entlang,2836,entlang,,ADP,1688,1
1972,hinsichtlich,2440,hinsichtlich,,ADP,1972,1


## Pos: NUM

In [19]:
filter_by_single_pos(anki_add, 'NUM')

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
669,acht,6651,acht,,NUM,670,1
128,drei,33778,drei,,NUM,129,1
1684,elf,2844,elf,,NUM,1684,1
286,fünf,14765,fünf,,NUM,287,1
1173,neun,4011,neun,,NUM,1174,1
419,sechs,10308,sechs,,NUM,420,1
675,sieben,6601,sieben,,NUM,676,1
217,vier,18639,vier,,NUM,218,1
389,zehn,10895,zehn,,NUM,390,1
92,zwei,48622,zwei,,NUM,93,1


In [20]:
filter_by_pos(anki_add, 'NUM')

Unnamed: 0,word,frequency,lemma,morph,pos,rank
669,acht,6651,acht,,NUM,670
128,drei,33778,drei,,NUM,129
1684,elf,2844,elf,,NUM,1684
286,fünf,14765,fünf,,NUM,287
3860,halben,1153,halb,,NUM,3861
3079,hundert,1510,hundert,,NUM,3080
1173,neun,4011,neun,,NUM,1174
419,sechs,10308,sechs,,NUM,420
675,sieben,6601,sieben,,NUM,676
4198,tausend,1051,tausend,,NUM,4196


In [21]:
# Drop every row whose lemma occurs with pos 'NUM' anywhere in anki_add,
# including rows of that lemma tagged with a different pos.
num_lemmas = filter_by_pos(anki_add, 'NUM')[LEMMA_COL].unique()
lemma_is_num = anki_add[LEMMA_COL].isin(num_lemmas)
anki_add = anki_add[~lemma_is_num]

## Pos: PART

In [22]:
filter_by_single_pos(anki_add, 'PART').sort_values(by=LEMMA_COL)

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
4555,allzu,953,allzu,,PART,4555,1
1709,nein,2786,nein,,PART,1710,1


## Pos: PRON

In [23]:
filter_by_single_pos(anki_add, 'PRON').sort_values(by=LEMMA_COL)

Unnamed: 0,word,frequency,lemma,morph,pos,rank,POS_count
3008,diejenigen,1547,derjenige,Case=Nom|Number=Plur|PronType=Dem,PRON,3008,1
425,dich,10177,dich,Case=Acc|Number=Sing|Person=2|PronType=Prs|Ref...,PRON,426,1
2044,eins,2360,einer,Case=Nom|Gender=Neut|Number=Sing|PronType=Ind,PRON,2045,1
38,er,157523,er,Case=Nom|Gender=Masc|Number=Sing|Person=3|Pron...,PRON,39,1
4562,etliche,951,etlicher,Case=Nom|Number=Plur|PronType=Ind,PRON,4562,1
470,euch,9335,euch,Case=Dat|Number=Plur|Person=2|PronType=Prs,PRON,471,1
4149,hessischen,1068,hessisch,Case=Gen|PronType=Dem,PRON,4150,1
248,ihm,17166,ihm,Case=Dat|Gender=Masc|Number=Sing|Person=3|Pron...,PRON,249,1
211,ihn,19477,ihn,Case=Acc|Gender=Masc|Number=Sing|Person=3|Pron...,PRON,212,1
111,ihnen,39439,ihnen,Case=Dat|Number=Plur|Person=3|PronType=Prs,PRON,112,1


In [24]:
# Index labels of anki_add rows that must survive the PRON clean-up below.
ids_to_keep = [3008, 4562, 4680, 791, 306, 1997, 1715, 3945, 3313, 850, 1081, 314, 146, 33]

# Drop every row whose lemma occurs with pos 'PRON' anywhere in anki_add
# (whatever that row's own pos is), except rows whose index label is listed
# in ids_to_keep.
pron_lemmas = filter_by_pos(anki_add, 'PRON')[LEMMA_COL].unique()
lemma_is_not_pron = ~anki_add[LEMMA_COL].isin(pron_lemmas)
row_is_whitelisted = anki_add.index.isin(ids_to_keep)
anki_add = anki_add[lemma_is_not_pron | row_is_whitelisted]

## Exporting the output

In [26]:
# Write one index-free CSV (header row included) per pos value left in
# anki_add; the pos column is dropped because the file name already carries it.
postypes = anki_add[POS_COL].unique()

for pos in postypes:
    rows_for_pos = anki_add.loc[anki_add[POS_COL] == pos]
    target_csv = f"{ANKI_REPORT_PATH}/{ANKI_TO_ADD_FILE_PREFIX}{pos}.csv"
    rows_for_pos.drop(columns=[POS_COL]).to_csv(target_csv, index=False)