# Preprocessing of Compound Words

## 0. Requirements

In [None]:
# install requirements 
#!pip3 install german-nouns

In [1]:
# load libraries
import pandas as pd
from german_nouns.lookup import Nouns
# shared lookup object; the get_* helper functions further down index into it by word
nouns = Nouns()

## 1. Load Input

In this step, we load the list of glossary terms and split each compound into its two components, i.e. "Klima" and "X".

In [3]:
# load wordlist as list of strings, one compound per line
# (explicit UTF-8 so that umlauts are decoded the same way on every platform;
# surrounding whitespace is stripped and blank lines are skipped)
with open('../files/wordlist.txt', encoding="utf-8") as f:
    wordlist = [line.strip() for line in f.read().splitlines() if line.strip()]

# every entry has to contain "Klima", otherwise it cannot be split below;
# fail with a readable message instead of an IndexError from the split
not_splittable = [noun for noun in wordlist if "Klima" not in noun]
if not_splittable:
    raise ValueError(f"wordlist entries without 'Klima': {not_splittable}")

# get second part (noun) of climate compounds, i.e. everything after the
# first occurrence of "Klima", and save to new list
second_nouns = [noun.split("Klima", 1)[1] for noun in wordlist]

# create dictionary from wordlist and second part of compounds
d = {"original": wordlist, "second_part": second_nouns}

# create dataframe containing both information,
# the original compounds and the second part of the compounds
compounds = pd.DataFrame(d, columns=["original", "second_part"])

## 2. Preprocessing 

In the following code, we define helper functions that retrieve the declension forms, the lemma and the genus (grammatical gender) of the second noun of each compound word.

In [4]:
# functions retrieve word forms, lemmas and genus from german-nouns library 
# for the second noun part of our climate change compound nouns 

def get_forms(word):

    """
    This function retrieves the word forms (declension forms) of a noun,
    here the second part of a compound word, from the german-nouns lookup.
    Only the first entry that the lookup returns for the word is used.
    Arg:
        word: The word we want to retrieve the word forms (declension forms) for.
    Returns: The distinct noun forms as a sorted list if available, else None.
    """

    try:
        # de-duplicate the forms, then sort them so the order is the same on
        # every run (plain set iteration order of strings is not stable)
        return sorted(set(nouns[word][0]["flexion"].values()))
    except Exception:
        # best-effort lookup: a word without a usable entry yields None;
        # unlike a bare except, KeyboardInterrupt/SystemExit still propagate
        return None
    
def get_lemma(word):

    """
    This function retrieves the lemma form of a noun, here the second part
    of a compound word, from the german-nouns lookup.
    Only the first entry that the lookup returns for the word is used.
    Arg:
        word: The word for which we want to retrieve the lemma form.
    Returns: The lemma form of the noun if available, else None.
    """

    try:
        return nouns[word][0]["lemma"]  # retrieve lemma form
    except Exception:
        # best-effort lookup: a word without a usable entry yields None;
        # unlike a bare except, KeyboardInterrupt/SystemExit still propagate
        return None
    
def get_genus(word):

    """
    This function retrieves the genus of a noun, here the second part of a
    compound word, from the german-nouns lookup.
    Only the first entry that the lookup returns for the word is used.
    Arg:
        word: The word for which we want to retrieve the genus.
    Returns: The genus of the noun if available, else None.
    """

    try:
        return nouns[word][0]["genus"]  # retrieve genus
    except Exception:
        # best-effort lookup: a word without a usable entry yields None;
        # unlike a bare except, KeyboardInterrupt/SystemExit still propagate
        return None

In [5]:
# look up forms, lemma and genus for the second part of every compound
# and store each result in its own column
for column, lookup in (
    ("noun_forms", get_forms),
    ("lemma", get_lemma),
    ("genus", get_genus),
):
    compounds[column] = compounds["second_part"].apply(lookup)

compounds

Unnamed: 0,original,second_part,noun_forms,lemma,genus
0,Klimaabzockerei,abzockerei,,,
1,Klimaaktivismus,aktivismus,,,
2,Klimaaktivist,aktivist,"[Aktivisten, Aktivist]",Aktivist,m
3,Klimaaktivistin,aktivistin,"[Aktivistinnen, Aktivistin]",Aktivistin,f
4,Klimaalarm,alarm,"[Alarms, Alarm, Alarmen, Alarme, Alarmes]",Alarm,m
...,...,...,...,...,...
243,Klimazipfel,zipfel,"[Zipfeln, Zipfel, Zipfels]",Zipfel,m
244,Klimazirkus,zirkus,"[Zirkusse, Zirkus, Zirkussen, Zirkusses]",Zirkus,m
245,Klimazunft,zunft,"[Zunft, Zünfte, Zünften]",Zunft,f
246,Klimazwang,zwang,"[Zwange, Zwängen, Zwangs, Zwanges, Zwang, Zwänge]",Zwang,m


Since the `german-nouns` library did not provide information for 15 of the compound words, we collect these words and manually look up the missing information on the Duden website (https://www.duden.de).

In [8]:
# nouns for which the german-nouns lookup produced no lemma;
# these are being declined manually using DUDEN website
to_check = compounds.loc[compounds["lemma"].isna(), "second_part"].tolist()

# save nouns that have to be checked to txt file, one noun per line
# (the context manager closes the file even if a write fails; explicit
# UTF-8 keeps umlauts intact regardless of the platform default encoding)
with open("../evaluation/to_check.txt", "w", encoding="utf-8") as file:
    for element in to_check:
        file.write(element + "\n")

We re-load the retrieved word forms into Python and have a look at the data frame.

In [9]:
# read the manually collected noun forms and genus
# (semicolon-separated file without a header row)
manual_columns = ["second_part", "noun_forms", "genus"]
to_check_df = pd.read_csv(
    "../evaluation/to_check_manual.csv",
    sep=";",
    header=None,
    names=manual_columns,
)

# the second noun of the composite is used as its lemma form
to_check_df = to_check_df.assign(lemma=to_check_df["second_part"])

to_check_df

Unnamed: 0,second_part,noun_forms,genus,lemma
0,glaubenslehre,"glaubenslehre, glaubenslehren",f,glaubenslehre
1,verdummung,verdummung,f,verdummung
2,notstandsregierung,"notstandsregierung, notstandsregierungen",f,notstandsregierung
3,kasteiung,"kasteiung, kasteiungen",f,kasteiung
4,bremser,"bremser, bremsers, bremsern",m,bremser
5,besoffenheit,besoffenheit,f,besoffenheit
6,verblödung,verblödung,f,verblödung
7,alarmist,"alarmist, alarmisten",m,alarmist
8,hysteriker,"hysteriker, hysterikers, hysterikern",m,hysteriker
9,donna,"donna, donnas, donnen",f,donna


Now, to merge the manually retrieved information back into the complete data frame of the compound words, we convert the `noun_forms` column into lists and use the `update` function of `pandas` to merge the information.

In [10]:
# convert into correct format to be able to merge it to data frame:
# split the comma-separated forms of each noun into a list and drop the
# blank that follows each comma
noun_forms_df = to_check_df.noun_forms.values.tolist()

noun_forms_list = [
    [form.strip() for form in element.split(",")]
    for element in noun_forms_df
]
to_check_df["noun_forms"] = noun_forms_list

# put new information into original data frame containing all composites;
# both frames are aligned on the second part of the compound
keys = ["second_part"]
manual_info = to_check_df.set_index(keys)
compounds = compounds.set_index(keys)
for column in ["noun_forms", "lemma", "genus"]:
    # update a copy of the column and assign it back explicitly: calling
    # update() directly on compounds[column] is a chained in-place
    # modification that does not reach the frame under pandas Copy-on-Write
    updated = compounds[column].copy()
    updated.update(manual_info[column])
    compounds[column] = updated
compounds = compounds.reset_index()

# change order of columns
compounds = compounds[["original", "second_part", "noun_forms", "lemma", "genus"]]

### 2.1 Normalization
We now perform a few normalization steps to generate a final compound data frame which we can use as an input for upcoming text mining applications:
- lowering of all strings in the data frame
- re-adding of the "klima" prefix to the second constituents to generate complete compound forms (saved in column `compound_forms`)

In [11]:
# pull the declension forms out of the data frame as plain Python lists
noun_forms = list(compounds["noun_forms"])

# per compound: remove blanks and lower-case every declension form, then
# prepend "klima" to rebuild the complete compound word forms
lower = []
compound_forms = []
for forms in noun_forms:
    cleaned = [form.replace(" ", "").lower() for form in forms]
    lower.append(cleaned)
    compound_forms.append(["klima" + form for form in cleaned])

# lower-case the lemma and the original compound as well
for column in ("lemma", "original"):
    compounds[column] = compounds[column].str.lower()

# write the complete compound forms and the normalized noun forms
# back to the data frame
compounds["compound_forms"] = compound_forms
compounds["noun_forms"] = lower

## 3. Export Output
Then we save this final compound data frame including all the information we retrieved in the steps above to the `compounds_info.csv` file which we can then use for the upcoming implementation methods. 

In [13]:
# write the final data frame to disk as csv, without the row index
output_path = "../files/compounds_info.csv"
compounds.to_csv(output_path, index=False)