In [1]:
import pandas as pd

In [2]:
original_df = pd.read_csv("input/new_interslavic_words_list_words.csv")
noun_tag__pattern = r"(?=\bm\.|\banim\.|\bf\.|\bn\.|\bsg\.|\bpl\.|\bindecl\.)"
nouns_df = original_df[original_df["partOfSpeech"].str.contains(noun_tag__pattern, na=False)]
del(original_df)

columns_to_lower = ["en", "pl", "ru", "cs"]
for col in columns_to_lower:
    nouns_df[f"{col}_lower"] = nouns_df[col].str.lower()

nouns_df.head()

Unnamed: 0,id,isv,addition,partOfSpeech,type,en,sameInLanguages,genesis,ru,be,...,de,nl,eo,frequency,intelligibility,using_example,en_lower,pl_lower,ru_lower,cs_lower
2,24020,abak,,m.,1,abacus,v z j,I,"счёты, абак, абака",абак,...,Abakus,telraam,abako,1504,!,,abacus,"abak, abakus","счёты, абак, абака","počítadlo, abakus"
3,6114,abažur,,m.,1,lampshade,,F,абажур,абажур,...,Lampenschirm,lampenkap,lampoŝirmilo,2030,!,,lampshade,abażur,абажур,"stínidlo, stínítko, abažúr (zastarale)"
4,35658,abdikacija,,f.,1,abdication,,I,"отречение, абдикация, сложение полномочий","абдыкацыя, вырачэнне, адрачэнне, адпінанне, вы...",...,Abdankung,"aftreden, abdicatie",abdiko,2586,!,,abdication,abdykacja,"отречение, абдикация, сложение полномочий",abdikace
6,35691,abeceda,,f.,2,alphabet (usually Latin),,,алфавит (обычно латиница),"абэцэда, абяцадла, азбука (лацінская)",...,Alphabet (gewöhnlich das lateinische),"alfabet (meestal Latijns), abc",aboco,4344,!,,alphabet (usually latin),abecadło,алфавит (обычно латиница),abeceda
7,24026,Abhaz,,m.anim.,1,Abkhaz,v z j,,"абхаз, абхазец",абхаз,...,Abchase,Abchaziër,abĥazo,830,!,,abkhaz,abchaz,"абхаз, абхазец",abcházec


In [3]:
input_wordlists = [
     "input/codenames_wordlist_cz.txt",
     "input/codenames_wordlist_en.txt",
     "input/codenames_wordlist_pl.txt",
     "input/codenames_wordlist_ru.txt"
]

wordlist_cz, wordlist_en, wordlist_pl, wordlist_ru = [
    pd.read_csv(input_wordlist, header=None) for input_wordlist in input_wordlists
]

wordlists = [wordlist_cz, wordlist_en, wordlist_pl, wordlist_ru]

for wordlist in wordlists:
    wordlist[0] = wordlist[0].str.lower()
    wordlist.rename(columns={0: "wordlist"}, inplace=True)

wordlist_cz, wordlist_en, wordlist_pl, wordlist_ru = wordlists

In [4]:
nouns_from_cz = pd.merge(nouns_df, wordlist_cz, left_on='cs_lower', right_on='wordlist')
nouns_from_pl = pd.merge(nouns_df, wordlist_pl, left_on='pl_lower', right_on='wordlist')
nouns_from_en = pd.merge(nouns_df, wordlist_en, left_on='en_lower', right_on='wordlist')
nouns_from_ru = pd.merge(nouns_df, wordlist_ru, left_on='ru_lower', right_on='wordlist')

In [5]:
nouns_combined = pd.concat([nouns_from_cz, nouns_from_pl, nouns_from_en, nouns_from_ru], ignore_index=True)
nouns_combined.drop(columns=["wordlist", "cs_lower", "pl_lower", "en_lower", "ru_lower"], inplace=True)
nouns_combined = nouns_combined.reset_index(drop=True).drop_duplicates()

# all_isv_nouns = list(nouns_combined["isv"])
# for noun in all_isv_nouns:
#     if "đ" in noun:
#         print(noun)

In [6]:
def unetymologize_spelling(word):
    replacement_dict = {
      "ȯ": "o",
      "ŕ": "r",
      "å": "a",
      "ė": "e",
      "ų": "u",
      "ź": "z",
      "ć": "č",
      "đ": "dž",
      "ę": "e",
      "ś": "s",
      "ń": "n",
      "#": ""
    }
    for k, v in replacement_dict.items():
        word = word.replace(k, v)
    return(word)

nouns_combined["standard_spelling_isv"] = nouns_combined["isv"].apply(unetymologize_spelling)

cols = list(nouns_combined.columns)
cols.insert(1, cols.pop(cols.index("standard_spelling_isv")))
nouns_combined = nouns_combined[cols]

nouns_combined = nouns_combined[~nouns_combined["isv"].str.contains(" ", na=False)]

In [7]:
nouns_combined.sort_values(by="pl").tail(50)

Unnamed: 0,id,standard_spelling_isv,isv,addition,partOfSpeech,type,en,sameInLanguages,genesis,ru,...,sr,mk,bg,cu,de,nl,eo,frequency,intelligibility,using_example
271,19179,veriga,veriga,,f.,2,chain,v j,,цепь,...,"верига, ланац, синџир","верига, синџир, ланец",верига,верига,Kette,"ketting, keten",ĉeno,5404,!,
1396,17723,lancuh,lancuh,,m.,2,chain,ub pl,,цепь,...,ланац,"ланец, синџир, верига","ланец, верига",верига,Kette,"ketting, keten",ĉeno,5074,!,
1397,17724,lanec,#lanėc,,m.,3,chain,sh bm,,цепь,...,ланац,"ланец, синџир, верига","ланец, верига",верига,Kette,"ketting, keten",ĉeno,5074,!,
578,27309,losos,losoś,,m.anim.,1,salmon,,,лосось,...,лосос,лосос,сьомга,!,!Lachs,zalm,!salmo,5375,!,
130,386,los,loś,,m.anim.,1,"elk, moose",,,лось,...,лос,лос,лос,!,"!Elch, Elch",eland,"!alko, alko",7022,!,
585,36646,lukostrělec,lųkostrělėc,,m.anim.,1,archer,v z j,,"лучник, стрелок из лука",...,стрелац (лук и стрела),стрелец (со стрели),стрелец,!,!Bogenschütze,boogschutter,pafarkisto,4535,!,
131,1893,ložica,lȯžica,,f.,1,spoon,,,ложка,...,"кашика, жлица",лажица,лъжица,!,!Löffel,lepel,kulero,5552,!,
132,353,ložka,lȯžka,,f.,1,spoon,,,ложка,...,"кашика, жлица",лажица,лъжица,!,!Löffel,lepel,kulero,5560,!,
1407,2886,lodka,lodka,,f.,1,boat,,,лодка,...,"лађа, чамац, чунак","лотка, кајче, чун, чамец",лодка,!,Boot,boot,!boato,4841,!,
373,25037,čoln,čȯln,,m.,1,boat,v cs yu bm,,"чёлн, челнок, лодка",...,чамац,"чун, чамец",лодка,!,Boot,boot,!boato,6165,!,


In [8]:
nouns_combined.to_csv("output/nouns_info.csv")

In [9]:
nouns_combined = nouns_combined.groupby('standard_spelling_isv', as_index=False).first()

In [10]:
def merge_synonyms(df, by_lang, other_cols):
    agg_dict = {col: 'first' for col in other_cols if col not in ['standard_spelling_isv','isv']}
    agg_dict['standard_spelling_isv'] = lambda x: ' / '.join(x.dropna().unique())
    agg_dict['isv'] = lambda x: ' / '.join(x.dropna().unique())
    merged = df.groupby(by_lang, as_index=False).agg(agg_dict)
    cols = ['standard_spelling_isv', 'isv'] + [c for c in merged.columns if c not in ['standard_spelling_isv', 'isv']]
    return merged[cols]

other_columns = ['id', 'addition', 'partOfSpeech', 'type', 'en', 'sameInLanguages', 'genesis', 'ru', 'be', 'uk', 'pl', 'cs', 'sk', 'sl', 'hr', 'sr', 'mk', 'bg', 'cu', 'de', 'nl', 'eo', 'frequency', 'intelligibility', 'using_example']

langs_to_merge_by = ["pl", "cs", "ru"]  # exclude English, as treats "raz" as a synonym of "čas"

for lang in langs_to_merge_by:
    nouns_combined = merge_synonyms(nouns_combined, lang, other_columns)

In [11]:
nouns_combined.sort_values(by="pl").tail(50)

Unnamed: 0,standard_spelling_isv,isv,id,addition,partOfSpeech,type,en,sameInLanguages,genesis,ru,...,sr,mk,bg,cu,de,nl,eo,frequency,intelligibility,using_example
706,směna,směna,20703,,f.,1,shift (factory),,,смена (на заводе),...,"смена (у фабрици), шихта",смена,!промяна (фабрика),!,!shift (Fabrik),ploeg (bij ploegendienst),!movo (fabriko),5628,!,
789,izniščeńje / razrušeńje / uniščeńje,izniščeńje / råzrušeńje / uniščeńje,35758,,n.,1,"destruction, annihilation",ub z yu mk,,"уничтожение, истребление, разрушение",...,уништење,уништување,"унищожение, разрушение",!,"!Zerstörung, Vernichtung","vernietiging, vernieling, verwoesting, destructie","!detruo, ekstermo",6329,!,
253,zombi,zombi,36297,,m.anim.indecl.,1,zombie,v z j,I,зомби,...,зомби,зомби,зомби,!,Zombie,zombie,zombio,5824,!,
743,polivka / supa,polivka / supa,20605,,f.,2,soup,z sh,,"суп, похлёбка",...,супа,супа,!супа,!,!Suppe,soep,!supo,5818,!,
226,životina,životina,1464,,f.,1,animal,ru uk sh bm,,животное,...,"животиња, звер",животно,животно,!,Tier,dier,"animalo, besto",6336,!,
254,zub,zųb,1848,,m.,1,tooth,,,зуб,...,зуб,заб,зъб,!,Zahn,tand,!denton,6862,!,
252,zlato,zlåto,3046,,n.sg.,1,gold,v z j,,золото,...,злато,злато,злато,!,Gold,goud,!oro,7599,!,
27,Latvija,Latvija,879,,f.sg.,1,Latvia,,,Латвия,...,Летонија,"Латвија, Летонија",Латвия,!,!Lettland,Letland,!Latvio,5338,!,
387,labeď,labęď,1303,,m.anim.,1,swan,,,лебедь,...,лабуд,лебед,лебед,!,!Schwan,zwaan,!cigno,5620,!,
386,lapa,lapa,502,,f.,1,paw,,,лапа,...,шапа,шепа,лапа,!,!Pfote,poot,!piedon,3832,!,


In [12]:
etymological_nouns = nouns_combined["isv"]
standard_nouns = nouns_combined["standard_spelling_isv"]

In [13]:
etymological_nouns.to_csv("output/nouns_etymological_spelling_with_synonyms.txt", index=False, encoding="utf-8")
standard_nouns.to_csv("output/nouns_standard_spelling_with_synonyms.txt", index=False, encoding="utf-8")

In [14]:
nouns_combined['isv_no_synonyms'] = nouns_combined['isv'].apply(lambda x: x.split(" / ")[0] if pd.notna(x) else x)
nouns_combined['standard_spelling_isv_no_synonyms'] = nouns_combined['standard_spelling_isv'].apply(lambda x: x.split(" / ")[0] if pd.notna(x) else x)

etymological_nouns_no_synonyms = nouns_combined['isv_no_synonyms']
standard_nouns_no_synonyms = nouns_combined['standard_spelling_isv_no_synonyms']

In [15]:
etymological_nouns_no_synonyms.to_csv("output/nouns_etymological_spelling_no_synonyms.txt", index=False, encoding="utf-8")
standard_nouns_no_synonyms.to_csv("output/nouns_standard_spelling_no_synonyms.txt", index=False, encoding="utf-8")