# Definition Phrasing

## 0. Requirements

In [330]:
#!pip3 install requests

In [1]:
import pandas as pd
from ast import literal_eval
from collections import Counter
import numpy as np
import requests
import definition_strings as ds
import random

## 1. Load Knowledge Base
We load the knowledge base that was created in the `textmining.ipynb` notebook and drop the unnecessary columns.

In [72]:
compounds = pd.read_csv("../output/knowledge_base.csv")

to_drop = [ 'noun_forms', 'related_words', 'hypernyms', 'roots', 'en_hypernyms', 'path', 'wup', 'stem_cistem', 'stem_porter',
       'stem_lancaster', 'stem_snowball', 'share_cistem', 'share_porter', 'share_lancaster', 'share_snowball', 'dist_stemmer']

compounds = compounds.drop(to_drop, axis=1)

In [73]:
compounds.columns

Index(['original', 'second_part', 'lemma', 'genus', 'compound_forms',
       'concept', 'definition', 'similar_words', 'PERS_pro', 'ORG_pro',
       'PERS_con', 'ORG_con', 'pro_mods', 'con_mods', 'pro_sarcasm',
       'con_sarcasm', 'pro_attr', 'con_attr', 'tf_pro', 'tf_con', 'tfidf_pro',
       'tfidf_con', 'pro_colls', 'con_colls', 'manual_sentiment'],
      dtype='object')

# 2. Preprocessing of the information
When loading the csv file via `pandas` oftentimes requires the re-evaluation of the literals contained in the data frame. Accordingly, we run the following code to make sure all cell types are evaluated correctly. 

In [74]:
compounds['compound_forms'] = compounds.compound_forms.apply(lambda x: literal_eval(str(x)))
#compounds['related_words'] = compounds.related_words.apply(lambda x: literal_eval(str(x)))
#compounds['hypernyms'] = compounds.hypernyms.apply(lambda x: literal_eval(str(x)))
#compounds['en_hypernyms'] = compounds.en_hypernyms.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['definition'] = compounds.definition.apply(lambda x: literal_eval(str(x)))
compounds['PERS_pro'] = compounds.PERS_pro.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['PERS_con'] = compounds.PERS_con.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['ORG_pro'] = compounds.ORG_pro.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['ORG_con'] = compounds.ORG_con.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['similar_words'] = compounds.similar_words.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['pro_mods'] = compounds.pro_mods.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds['con_mods'] = compounds.con_mods.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else x)
compounds["PERS_pro"] = compounds.PERS_pro.apply(lambda x: Counter(x) if(str(x) != 'nan') else x)
compounds["PERS_con"] = compounds.PERS_con.apply(lambda x: Counter(x) if(str(x) != 'nan') else x)
compounds["ORG_pro"] = compounds.ORG_pro.apply(lambda x: Counter(x) if(str(x) != 'nan') else x)
compounds["ORG_con"] = compounds.ORG_con.apply(lambda x: Counter(x) if(str(x) != 'nan') else x)
compounds["pro_attr"] = compounds.pro_attr.apply(lambda x: "".join(literal_eval(str(x))) if(str(x) != 'nan') else x)
compounds["con_attr"] = compounds.con_attr.apply(lambda x: "".join(literal_eval(str(x))) if(str(x) != 'nan') else x)
compounds["pro_colls"] = compounds.pro_colls.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else list())
compounds["con_colls"] = compounds.con_colls.apply(lambda x: literal_eval(str(x)) if(str(x) != 'nan') else list())

Next we apply some preprocessing steps to translate and reduce some of the pieces of information that are contained in the knowledge base:
- translate polarity labels into German counterparts
- translate attribution tags into German counterparts
- count modifiers
- replace unavailable TF-IDF scores with 0
- retrieve definite articles from genus column

In [75]:
# replace sentiment with the German counterparts
compounds["manual_sentiment"] = compounds.manual_sentiment.replace("negative", "negativ")
compounds["manual_sentiment"] = compounds.manual_sentiment.replace("positive", "positiv")

# replace attribution tag with German counterpart
compounds["pro_attr"] = compounds.pro_attr.replace("Self", "Selbstzuschreibung")
compounds["pro_attr"] = compounds.pro_attr.replace("External", "Fremdzuschreibung")
compounds["con_attr"] = compounds.con_attr.replace("Self", "Selbstzuschreibung")
compounds["con_attr"] = compounds.con_attr.replace("External", "Fremdzuschreibung")

# count modifiers
compounds["pro_mods"] = compounds.pro_mods.apply(lambda x: Counter(x) if(str(x) != 'nan') else x)
compounds["con_mods"] = compounds.con_mods.apply(lambda x: Counter(x) if(str(x) != 'nan') else x)

# replace nan values in TF-IDF columnd with 0
compounds["tfidf_pro"] = compounds.tfidf_pro.replace(np.nan, 0)
compounds["tfidf_con"] = compounds.tfidf_con.replace(np.nan, 0)

# change genus to article information
# create a list of our conditions
conditions = [(compounds['genus'] == "f"),(compounds['genus'] == "m"), (compounds['genus'] == "n")]

# create a list of the values we want to assign for each condition
values = ['die', 'der', 'das']

# create a new column and use np.select to assign values to it using our lists as arguments
compounds['article'] = np.select(conditions, values)

In [340]:
import urllib.request, json

In [349]:
import urllib.request, json 
with urllib.request.urlopen("https://www.dwds.de/api/wb/snippet/?q=Haus") as url:
    data = json.load(url)
    print(data)

#url = "https://www.dwds.de/api/wb/snippet/?q=Haus"
#response = urllib.urlopen(url)
#data = json.loads(response.read())

[{'lemma': 'Haus', 'url': 'https://www.dwds.de/wb/Haus', 'wortart': 'Substantiv', 'input': 'Haus'}]


In [353]:
data[0]["lemma"]

'Haus'

## 2. Create Placeholders
Next we will load the definition strings and place holders into Python.

In [None]:
import definition_strings as ds

In [77]:
### INITIATE STRING BASES FOR EACH CATEGORY

str_base_info = """{COMPOUND}, {ARTICLE}\n"""

str_base_pers = """Der Begriff {COMPOUND} bezeichnet eine Person, die in einer gewissen Beziehung zum Klimawandel steht. Der Begriff wird in unserem Korpus {CON_FREQ} Mal von den Klimaforschungsskeptikern und {PRO_FREQ} Mal von den Klimaforschungsvertretern verwendet. Auf den gesamten Korpus gesehen, entspricht das einer relativen Häufigkeit (TF-IDF) von {CON_TFIDF} für die Skeptiker und {PRO_TFIDF} für die Vertreter.""" 

str_base_loc = """Der Begriff {COMPOUND} bezeichnet eine Lokalität im Bezug auf den Klimawandel. Der Begriff wird in unserem Korpus {CON_FREQ} Mal von den Klimaforschungsskeptikern und {PRO_FREQ} Mal von den Klimaforschungsvertretern verwendet. Auf den gesamten Korpus gesehen, entspricht das einer relativen Häufigkeit (TF-IDF) von {CON_TFIDF} für die Skeptiker und {PRO_TFIDF} für die Vertreter.""" 

str_base_action = """Der Begriff {COMPOUND} bezeichnet eine Aktion im Bezug auf den Klimawandel. Der Begriff wird in unserem Korpus {CON_FREQ} Mal von den Klimaforschungsskeptikern und {PRO_FREQ} Mal von den Klimaforschungsvertretern verwendet. Auf den gesamten Korpus gesehen, entspricht das einer relativen Häufigkeit (TF-IDF) von {CON_TFIDF} für die Skeptiker und {PRO_TFIDF} für die Vertreter.""" 

str_base_abstract = """Der Begriff {COMPOUND} bezeichnet ein Konzept in Relation zum Klimawandel. Der Begriff wird in unserem Korpus {CON_FREQ} Mal von den Klimaforschungsskeptikern und {PRO_FREQ} Mal von den Klimaforschungsvertretern verwendet. Auf den gesamten Korpus gesehen, entspricht das einer relativen Häufigkeit (TF-IDF) von {CON_TFIDF} für die Skeptiker und {PRO_TFIDF} für die Vertreter.""" 

str_base_group = """Der Begriff {COMPOUND} bezeichnet einen Zusammenschluss von Personen im Bezug auf den Klimawandel. Der Begriff wird in unserem Korpus {CON_FREQ} Mal von den Klimaforschungsskeptikern und {PRO_FREQ} Mal von den Klimaforschungsvertretern verwendet. Auf den gesamten Korpus gesehen, entspricht das einer relativen Häufigkeit (TF-IDF) von {CON_TFIDF} für die Skeptiker und {PRO_TFIDF} für die Vertreter.""" 

# INITIATE STRINGS THAT ARE EQUAL FOR ALL CATEGORIES

str_sent =  """ In unserem Korpus Sample ist der Begriff meist {SENTIMENT} konnotiert."""

#str_attr_con = """ Hierbei wird „{COMPOUND}“ von Seiten der Skeptiker im Sinne einer {CON_ATTRIBUTION}""" #(verwendet)
#str_attr_pro = """ und von Vertretern als {PRO_ATTRIBUTION} verwendet."""

str_attr = """Verwendet wird "{COMPOUND}" hierbei im Sinne einer {ATTRIBUTION}"""
str_sarcasm = """ In {SARCASM}"""

str_mods_pro = """ Im Subdiskurs der Klimaforschungsvertreter wird der Begriff von Wörtern wie {PRO_MODS} modifiziert.""" 
str_mods_con = """ Modifizierer wie {CON_MODS} treten häufig auf, um den Begriff im Subdiskurs der Klimaforschungsskeptiker näher zu beschreiben.""" 

#str_pers_con = """ Personen, die im Zusammenhang mit dem Begriff erwähnt werden sind {CON_PERS} (Skeptiker)"""
str_pers_con = """ Im Zusammenhang mit dem Begriff erwähnt der Skeptiker Korpus die Person(en) {CON_PERS}"""
#str_pers_pro = """ und {PRO_PERS} (Vertreter)."""
str_pers_pro = """ und der Vertreter Diskurs die Person(en) {PRO_PERS}."""

str_org_con = """ Außerdem werden im Kontext von "{COMPOUND}" folgende Organisationen genannt: {CON_ORG} (Skeptiker Korpus)"""
str_org_pro = """ und {PRO_ORG} (Vertreter Korpus)."""  

str_colls = """\nHäufige Kollokationen: {CON_COLLS} {PRO_COLLS}"""
str_simwords = """\n\nSiehe auch: {SIMILAR_WORDS}"""

In [78]:
con_mods =  " und ".join(["'"+el[0]+"'" for el in compounds['con_mods'].iloc[2].most_common(2) if el[1] > 1])

con_mods

"'bekannt' und 'weltweit'"

## 3. Fill Placeholders
Next, for each compound word, we will generate a final combination of strings (according to the unique information pieces that we have for this compound) and fill the place holders (denoted in swift brackets) with these information pieces. 

In [81]:
# for each compound word 
for word in compounds.original:
    
    # set index to this compound word 
    idx = compounds.index[compounds['original'] == word][0]
        
    ### BASE INFORMATION ####
    
    # initiate base info string (i.e. compound + genus)
    text = str_base_info
    
    # retrieve base information from knowledge base 
    compound = word.capitalize() # capitalized version of compound
    article = compounds['article'].iloc[idx] # definite article
    con_freq = compounds['tf_con'].iloc[idx] # term frequency C2022
    pro_freq = compounds['tf_pro'].iloc[idx] # term frequency P2022
    con_tfidf = round(compounds['tfidf_con'].iloc[idx],2) # TF-IDF C2022
    pro_tfidf = round(compounds['tfidf_pro'].iloc[idx],2) # TF-IDF P2022        
    
    # if category of compound is "person"
    if compounds['concept'].iloc[idx] == "person":
        text += str_base_pers # add "person" string to base string
        
        ### ATTRIBUTION AND SARCASM ###
        
        text += str_attr # add "attribution" string to base string
        text += str_sarcasm # add "sarcasm" string to base string 
        
        # retrieve attribution information from knowledge base 
        pro_attr = compounds['pro_attr'].iloc[idx]
        con_attr = compounds['con_attr'].iloc[idx]
        
        # retrieve sarcasm information from knowledge base
        pro_sarc = compounds['pro_sarcasm'].iloc[idx]
        con_sarc = compounds['con_sarcasm'].iloc[idx]
        
        # if we have attribution info for both corpora
        if str(compounds['con_attr'].iloc[idx]) != "nan" and str(compounds['pro_attr'].iloc[idx]) != "nan":
            
            # compose attribution filler
            attr = con_attr + "von Seiten der Skeptiker und als " + pro_attr + " im Vetreter Korpus."
                        
            # compose sarcasm filler
            sarcasm += str(int(pro_sarc*100)) + " % (Vertreter) und " + str(int(con_sarc*100)) + " % (Skeptiker) der Fälle wird die Verwendung als sarkatisch eingestuft."
          
        # if we only have attribution info for C2022
        elif str(compounds['con_attr'].iloc[idx]) != "nan" and str(compounds['pro_attr'].iloc[idx]) == "nan":
            
            # compose attribution filler
            attr = con_attr + "von Seiten der Skeptiker."
                        
            # compose sarcasm filler
            sarcasm += str(int(con_sarc*100)) + " % der Fälle wird die Verwendung als sarkatisch eingestuft."

        # if we only have attribution info for P2022
        elif str(compounds['con_attr'].iloc[idx]) == "nan" and str(compounds['pro_attr'].iloc[idx]) != "nan":
            
            # compose attribution filler
            attr = pro_attr + "von Seiten der Vertreter."
            
            # compose sarcasm filler
            sarcasm += str(int(pro_sarc*100)) + " % der Fälle wird die Verwendung als sarkatisch eingestuft."
            
    # if category of compound is "location"
    elif compounds['concept'].iloc[idx] == "location":
        text += str_base_loc # add "location" string to base string 
        attr = "" # attribution not available
        sarcasm = "" # sarcasm not available
    
    # if category of compound is "group"
    elif compounds['concept'].iloc[idx] == "group":
        text += str_base_group # add "group" string to base string 
        
        ### ATTRIBUTION AND SARCASM ###
        
        text += str_attr # add "attribution" string to base string
        text += str_sarcasm # add "sarcasm" string to base string
        
        # retrieve attribution information from knowledge base 
        pro_attr = compounds['pro_attr'].iloc[idx]
        con_attr = compounds['con_attr'].iloc[idx]
        
        # retrieve sarcasm information from knowledge base 
        pro_sarc = compounds['pro_sarcasm'].iloc[idx]
        con_sarc = compounds['con_sarcasm'].iloc[idx]
  
        
        # if we have attribution info for both corpora
        if str(compounds['con_attr'].iloc[idx]) != "nan" and str(compounds['pro_attr'].iloc[idx]) != "nan":

            # compose attribution filler
            attr = con_attr + "von Seiten der Skeptiker und als " + pro_attr + " im Vetreter Korpus."
  
            # compose sarcasm filler
            sarcasm += str(int(pro_sarc*100)) + " % (Vertreter) und " + str(int(con_sarc*100)) + " % (Skeptiker) der Fälle wird die Verwendung als sarkatisch eingestuft."
 
        # if we only have attribution info for C2022
        elif str(compounds['con_attr'].iloc[idx]) != "nan" and str(compounds['pro_attr'].iloc[idx]) == "nan":
            
            # compose attribution filler
            attr = con_attr + "von Seiten der Skeptiker." 
            
            # compose sarcasm filler
            sarcasm += str(int(con_sarc*100)) + " % der Fälle wird die Verwendung als sarkatisch eingestuft."

        # if we only have attribution info for P2022
        elif str(compounds['con_attr'].iloc[idx]) == "nan" and str(compounds['pro_attr'].iloc[idx]) != "nan":
            
            # compose attribution filler
            attr = pro_attr + "von Seiten der Vertreter." 
                        
            # compose sarcasm filler
            sarcasm += str(int(pro_sarc*100)) + " % der Fälle wird die Verwendung als sarkatisch eingestuft."
            
    # if category of compound is "abstraction"        
    elif compounds['concept'].iloc[idx] == "abstraction":
        text += str_base_abstract # add "abstraction" string to base string 
        attr = "" # attribution not available
        sarcasm = "" # sarcasm not available
    
    # if category of compound is "action"
    elif compounds['concept'].iloc[idx] == "action":
        text += str_base_action # add "action" string to base string 
        attr = "" # attribution not available
        sarcasm = "" # sarcasm not available
    
    ### CONNOTATION ### 
        
    # add sentiment string
    text += str_sent 
    
    sentiment = compounds['manual_sentiment'].iloc[idx] # retrieve connotation label 
        
    ### MODIFIERS ###    
    
    # if we have at least one P2022 modifier for compound
    if compounds['pro_mods'].isna().iloc[idx] == False:
        try:
            # try to retrieve two most common modifiers and connect with conjunction
            pro_mods =  " und ".join(["'"+el[0]+"'" for el in compounds['pro_mods'].iloc[idx].most_common(2) if el[1] > 1])
            
            # if modifier string is not empty 
            if pro_mods != "":
                text += str_mods_pro # add "pro modifier" string to base string 
        
        except:
            # if only one modifier available retrieve this one
            pro_mods = "".join(["'"+el[0]+"'" for el in compounds['pro_mods'].iloc[idx].most_common(1) if el[1] > 1])
            
            # if modifier string is not empty
            if pro_mods != "":
                text += str_mods_pro # add "pro modifier" string to base string 
    else:
        print("NO PRO MOD FOUND")
        pro_mods = "" # else, no P2022 modifier available 
            
    # if we have at least one C2022 modifier for compound
    if compounds['con_mods'].isna().iloc[idx] == False:
        try:
            # try to retrieve two most common modifiers and connect with conjunction
            con_mods =  " und ".join(["'"+el[0]+"'" for el in compounds['con_mods'].iloc[idx].most_common(2) if el[1] > 1])
            
            # if modifier string is not empty
            if con_mods != "":
                text += str_mods_con # add "con modifier" string to base string 
        
        except:
            # if only one modifier available retrieve this one
            con_mods = "".join(["'"+el[0]+"'" for el in compounds['con_mods'].iloc[idx].most_common(1) if el[1] > 1])
            
            # if modifier string is not emtpy 
            if con_mods != "": 
                text += str_mods_con # add "con modifier" string to base string 
    else:
        print("NO CON MOD FOUND")
        con_mods = "" # else, no C2022 modifier available 
            
    
    ### PERSON ENTITIES ###
        
    # if we have at least one person in C2022
    if compounds['PERS_con'].isna().iloc[idx] == False:
        try:
            # try to retrieve two most common persons and connect with conjunction
            con_pers = " und ".join([el[0] for el in compounds['PERS_con'].iloc[idx].most_common(2)])
            text += str_pers_con # add "con person" string to base string 
            
        except:
            # if only one person available retrieve this one
            con_pers = compounds['PERS_con'].iloc[idx].most_common(1)[0]
            text += str_pers_con # add "con person" string to base string 
    
    else:
        print("NO PERSON FOUND")
        con_pers = "" 
            
    # if we have at least one person in P2022
    if compounds['PERS_pro'].isna().iloc[idx] == False:
        try:
            # try to retrieve two most common persons and connect with conjunction
            pro_pers = " und ".join([el[0] for el in compounds['PERS_pro'].iloc[idx].most_common(2)])
            text +=  str_pers_pro # add "pro person" string to base string 


        except:
            # if only one person available retrieve this one
            pro_pers = compounds['PERS_pro'].iloc[idx].most_common(1)[0]
            text += str_pers_pro # add "pro person" string to base string 
                
    else:
        pro_pers = ""
        print("NO PERSON FOUND")
        
    ### ORGANISATION ENTITIES ### 
            
    # if we have at least one organisation in C2022
    if compounds['ORG_con'].isna().iloc[idx] == False:
        try:
            # try to retrieve two most common organisations and connect with comma
            con_org =  ", ".join([el[0] for el in compounds['ORG_con'].iloc[idx].most_common(2)])
            text += str_org_con # add "con organisation" string to base string 
        except:
            # if only one organisation available retrieve this one
            con_org = compounds['ORG_con'].iloc[idx].most_common(1)[0]
            text += str_org_con # add "con organisation" string to base string  
                
    else:
        con_org = "" 
            
            
    # if we have at least one organisation in P2022
    if compounds['ORG_pro'].isna().iloc[idx] == False:
        try:
            # try to retrieve two most common organisations and connect with comma
            pro_org =  ", ".join([el[0] for el in compounds['ORG_pro'].iloc[idx].most_common(2)])
            text += str_org_pro # add "pro organisation" string to base string 
            
        except:
            
            # if only one organisation available retrieve this one
            pro_org = compounds['ORG_pro'].iloc[idx].most_common(1)[0]
            text += str_org_pro # add "pro organisation" string to base string 
                
    else:
        pro_org = ""
        text += "." # add final stop to definition text
            
            
    ### SIMILAR COMPOUNDS ###
    
    # if we have at least one similar word 
    if len(compounds['similar_words'].iloc[idx]) != 0:
        
        # retrieve the words and re-append the prefix "Klima" to the words
        similar_words = set(["Klima"+x for x in compounds['similar_words'].iloc[idx] if "Klima"+x != compound])
        
        # connect the words with a comma 
        similar_words = ", ".join(similar_words)
        
        text += str_simwords # add "similar words" string to base string
            
            
    else:
        print("NO SIMILAR WORDS")
        similar_words = ""
        
    ### COLLOCATIONS ###
        
    # if we have collocations for C2022 and P2022
    if len(compounds['con_colls'].iloc[idx]) != 0 and len(compounds['pro_colls'].iloc[idx]) != 0:
        print("COLLS FOR BOTH")
        try:
            # try to retrieve two random collocations from C2022 and connect with comma and compose string
            con_colls = ", ".join(random.sample(compounds['con_colls'].iloc[idx], 2)) + " (Skeptiker)"
        except:
            # if only one collocation from C2022 available retrieve this one and compose string
            con_colls = ", ".join(random.sample(compounds['con_colls'].iloc[idx], 1)) + " (Skeptiker)"

        try:
            # try to retrieve two random collocations from P2022 and connect with comma and compose string 
            pro_colls = "und " + ", ".join(random.sample(compounds['pro_colls'].iloc[idx], 2)) + " (Vertreter)"
        except:
            # if only one collocation from P2022 available retrieve this one
            pro_colls = "und " + ", ".join(random.sample(compounds['pro_colls'].iloc[idx], 1)) + " (Vertreter)"

        text += str_colls # add "collocations" string to base string 
        
    # if we only have collocations for C2022
    elif len(compounds['con_colls'].iloc[idx]) != 0:
        try:
            # try to retrieve two random collocations from C2022 and connect with comma and compose string
            con_colls = ", ".join(random.sample(compounds['con_colls'].iloc[idx], 2)) + " (Skeptiker)"
        except:
            # if only one collocation from C2022 available retrieve this one and compose string
            con_colls = ", ".join(random.sample(compounds['con_colls'].iloc[idx], 1)) + " (Skeptiker)"


        pro_colls = " " # no collocations for P2022 available 
        text += str_colls # add "collocations" string to base string
        
    # if we only have collocations for P2022
    elif len(compounds['pro_colls'].iloc[idx]) != 0:
        try:
            # try to retrieve two random collocations from P2022 and connect with comma and compose string 
            pro_colls = ", ".join(random.sample(compounds['pro_colls'].iloc[idx], 2)) + " (Skeptiker)"
        except:
            # if only one collocation from P2022 available retrieve this one
            pro_colls = ", ".join(random.sample(compounds['pro_colls'].iloc[idx], 1)) + " (Skeptiker)"


        con_colls = " " # no collocations for C2022 available
        text += str_colls # add "collocations" string to base string 
        
    else:
        pro_colls = ""
        con_colls = ""

    ### FINAL DEFINITION ###
            
    # assign the fillers to the according place holders in the final definition string    
    full_definition = text.format(COMPOUND= compound, ARTICLE = article, CON_FREQ= con_freq, PRO_FREQ = pro_freq, 
                                  CON_TFIDF = con_tfidf, PRO_TFIDF = pro_tfidf, SENTIMENT= sentiment, CON_PERS = con_pers, 
                                  PRO_PERS = pro_pers, CON_ORG = con_org, PRO_ORG = pro_org, SIMILAR_WORDS= similar_words, 
                                  PRO_MODS = pro_mods, CON_MODS = con_mods, PRO_COLLS = pro_colls, CON_COLLS = con_colls, 
                                  ATTRIBUTION = attr, SARCASM = sarcasm)
        
        
    #print(full_definition)
    #print("_"*50)
                
    # save to column "full_definition"
    compounds.at[idx, 'full_definition'] = full_definition

NO PRO MOD FOUND
NO PERSON FOUND
NO SIMILAR WORDS
COLLS FOR BOTH
COLLS FOR BOTH
NO PRO MOD FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO CON MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO CON MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO CON MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO PERSON FOUND
NO SIMILAR WORDS
NO PRO MOD FOUND
NO PERSON FOUND
NO PRO MOD FOUND
NO PERSON FO

In [87]:
print(compounds.full_definition[2])

Klimaaktivist, der
Der Begriff Klimaaktivist bezeichnet eine Person, die in einer gewissen Beziehung zum Klimawandel steht. Der Begriff wird in unserem Korpus 66 Mal von den Klimaforschungsskeptikern und 61 Mal von den Klimaforschungsvertretern verwendet. Auf den gesamten Korpus gesehen, entspricht das einer relativen Häufigkeit (TF-IDF) von 0.61 für die Skeptiker und 0.16 für die Vertreter.Verwendet wird "Klimaaktivist" hierbei im Sinne einer Fremdzuschreibungvon Seiten der Skeptiker und als Selbstzuschreibung im Vetreter Korpus. In 0 % (Vertreter) und 1 % (Skeptiker) der Fälle wird die Verwendung als sarkatisch eingestuft. In unserem Korpus Sample ist der Begriff meist neutral konnotiert. Im Subdiskurs der Klimaforschungsvertreter wird der Begriff von Wörtern wie 'lieb' und 'jung' modifiziert. Modifizierer wie 'bekannt' und 'weltweit' treten häufig auf, um den Begriff im Subdiskurs der Klimaforschungsskeptiker näher zu beschreiben. Im Zusammenhang mit dem Begriff erwähnt der Skeptike

#def fill_persons(df):
    
for word in compounds.original:
    #print(compound)
    idx = compounds.index[compounds['original'] == word][0]
    
    if compounds['concept'].iloc[idx] == "person":
        #print(compound)
        
        #text = """Der Begriff {COMPOUND} bezeichnet eine/n {DEFINITION} im Bezug auf den Klimawandel.\nDer Begriff wird in unserem Korpus {CON_FREQ} Mal von den Klimaskeptikern und {PRO_FREQ} Mal von den Klimaaktivisten verwendet und ist {SENTIMENT} konnotiert. """
        text = str_base_pers
        
        # retrieve according information
        compound = word.capitalize()
        con_freq = compounds['tf_con'].iloc[idx]
        pro_freq = compounds['tf_pro'].iloc[idx]
        sentiment = compounds['manual_sentiment'].iloc[idx]
        
        try:
            definition = compounds['definition'].iloc[idx][0] # try [0] for definition
        except:
            #print("NO DEFINITION AVAILABLE")
            definition = " "
            
        
        # retrieve number of persons. if 0 then do nothing
        
        # if we have at least one person in PERS_con
        if compounds['PERS_con'].isna().iloc[idx] == False:
            try:
                con_pers =  " und ".join([el[0] for el in compounds['PERS_con'].iloc[idx].most_common(2)])
                #text += """\nPersonen, die im Zusammenhang mit dem Begriff erwähnt werden sind {CON_PERS} (Con Corpus) """
                text += str_pers_con
            except:
                con_pers = compounds['PERS_con'].iloc[idx].most_common(1)[0]
                #text += """\nPersonen, die im Zusammenhang mit dem Begriff erwähnt werden sind {CON_PERS} (Con Corpus) """
                text += str_pers_con
        else:
          #  print("NO PERSON FOUND")
            #con_pers = " " 
            
        # if we have at least one person in PERS_con
        if compounds['PERS_pro'].isna().iloc[idx] == False:
            try:
                #print("2 PERSONS FOUND")
                pro_pers = " und ".join([el[0] for el in compounds['PERS_pro'].iloc[idx].most_common(2)])
                #text +=  """und {PRO_PERS} (Pro Corpus). """
                text += str_pers_pro


            except:
                print("ONLY 1 PERSON FOUND")
                pro_pers = compounds['PERS_pro'].iloc[idx].most_common(1)[0]
                #text +=  """und {PRO_PERS} (Pro Corpus). """
                text += str_pers_pro
                
        else:
            pro_pers = " "
            #print("NO PERSON FOUND")
            
        if compounds['ORG_con'].isna().iloc[idx] == False:
            try:
                con_org =  ", ".join([el[0] for el in compounds['ORG_con'].iloc[idx].most_common(2)])
                #text +=  """\nAußerdem werden im Kontext von {COMPOUND} folgende Organisationen genannt: {CON_ORG} (C2022)"""
                text += str_org_con
            except:
                con_org = compounds['ORG_con'].iloc[idx].most_common(1)[0]
                #text +=  """\nAußerdem werden im Kontext von {COMPOUND} folgende Organisationen genannt: {CON_ORG} (C2022)"""
                text += str_org_con 
                
        else:
            con_org = " " 
            
        # if we have at least one person in PERS_con
        if compounds['ORG_pro'].isna().iloc[idx] == False:
            try:
                pro_org =  ", ".join([el[0] for el in compounds['ORG_pro'].iloc[idx].most_common(2)])
                #text +=  """und {PRO_ORG} (P2022). """
                text += str_org_pro
            except:
                pro_org = compounds['ORG_pro'].iloc[idx].most_common(1)[0]
                #text +=  """und {PRO_ORG} (P2022). """   
                text += str_org_pro
                
        else:
            pro_org = " "
            #text += "."
            
        if len(compounds['similar_words'].iloc[idx]) != 0:
            #print(len(compounds['similar_words'].iloc[idx]))
            similar_words = set(["Klima"+x for x in compounds['similar_words'].iloc[idx] if "Klima"+x != compound])
            similar_words = ", ".join(similar_words)
           # similar_words = ", ".join(compounds['similar_words'].iloc[idx])
            #text += """\nSiehe auch: {SIMILAR_WORDS}"""
            text += str_simwords
            
            
        else:
           # print("NO SIMILAR WORDS")
            
        full_definition = text.format(COMPOUND= compound, DEFINITION= definition, CON_FREQ= con_freq,
                        PRO_FREQ = pro_freq, SENTIMENT= sentiment, CON_PERS = con_pers, PRO_PERS = pro_pers, 
                        CON_ORG = con_org, PRO_ORG = pro_org, SIMILAR_WORDS= similar_words)
        
        
       # full_definition = persons_text.format(COMPOUND= compound, DEFINITION= definition, CON_FREQ= con_freq,
        #                 PRO_FREQ = pro_freq, SENTIMENT= sentiment, CON_PERS = con_pers, PRO_PERS = pro_pers, 
         #                CON_ORG = con_org, PRO_ORG = pro_org, SIMILAR_WORDS= similar_words)
        
        #print(full_definition)
        #print("_"*50)
                
        # save to column "full_definition"
        
        
        