In [1]:
# Install spaCy.
!pip install spacy
# Upgrade spaCy to the newest available release (quiet output).
# NOTE(review): the plain install above looks redundant next to this upgrade - confirm before removing.
!pip install -U spacy --quiet

# Download the small Polish spaCy language model.
!python -m spacy download pl_core_news_sm --quiet

import spacy
import pandas as pd
import numpy as np
# Show up to 400 characters per column when displaying dataframes.
pd.set_option("max_colwidth", 400)
# Turn off pandas' chained-assignment warnings for the whole notebook.
pd.options.mode.chained_assignment = None
import warnings
# Silence NumPy's VisibleDeprecationWarning for the whole notebook.
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 


# Load the Polish spaCy pipeline; lemmatization() below reads it through the global name `nlp`.
nlp = spacy.load('pl_core_news_sm')

[K     |████████████████████████████████| 6.0 MB 5.2 MB/s 
[K     |████████████████████████████████| 181 kB 54.8 MB/s 
[K     |████████████████████████████████| 42 kB 1.1 MB/s 
[K     |████████████████████████████████| 10.1 MB 19.7 MB/s 
[K     |████████████████████████████████| 628 kB 40.4 MB/s 
[K     |████████████████████████████████| 451 kB 42.5 MB/s 
[K     |████████████████████████████████| 58.6 MB 1.1 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_sm')


In [2]:
def load_data(file_path, indx = True, indx_col = 0):
  '''Load an excel (.xlsx) or csv (.csv) file into a pandas dataframe.

  Parameters:
  file_path: str or path-like - path to your excel or csv file with data
  (the extension is matched case-insensitively),

  indx: boolean - whether there is index column in your file (usually it is the first column) --> default is True
  
  indx_col: int - if your file has index column, specify column number here --> default is 0 (first column)

  Returns:
  dataframe with the content of the file.

  Raises:
  ValueError - if the file extension is neither .xlsx nor .csv.
  '''
  import os

  # Pick the pandas reader from the file extension instead of enumerating
  # every (indx, extension) combination.
  extension = os.path.splitext(os.fspath(file_path))[1].lower()
  readers = {".xlsx": pd.read_excel, ".csv": pd.read_csv}
  if extension not in readers:
    # Previously this case fell through every branch and crashed with an
    # UnboundLocalError on the return statement.
    raise ValueError("Unsupported file type %r: expected a .xlsx or .csv file" % extension)

  reader = readers[extension]
  if indx:
    return reader(file_path, index_col = indx_col)
  return reader(file_path)

In [10]:
def lemmatization(dataframe, text_column):
  '''Return a copy of the dataframe with an extra column of lemmas.

  Parameters:
  dataframe: dataframe with your data (it is copied, not modified),

  text_column: name of a column in your dataframe where text is located

  Returns:
  copy of the dataframe with a new column named text_column + "_lemmatized";
  each cell holds the list of token lemmas of the corresponding text.
  '''
  result = dataframe.copy()
  # Every cell is converted with str() before it is passed to the spaCy pipeline.
  texts = result[text_column].apply(str)
  result[text_column +"_lemmatized"] = [
    [token.lemma_ for token in doc]
    for doc in nlp.pipe(texts)
  ]
  return result

In [4]:
def find_emotive_words(dataframe, content_lemmatized_column, db_words, uniq_words=False, database = "nawl", db_path = None):
  '''Retrieve the words of each lemmatized text that are listed in an affective database.

  Parameters: 
  dataframe: dataframe with your data (the column "Emotive_words" is added to it in place),

  content_lemmatized_column: str - name of a column in your dataframe where lemmatized text is located
  (each cell is expected to hold a list of lemmas),
    
  db_words: str - name of a column in affective database where words are listed,
  
  uniq_words: boolean - True if you want to retrieve only unique emotive words from your text data
  (listed in order of first appearance in the text),
  False if you want to retrieve every emotive word (thus, there can be duplicated words;
  repetitions of a word are grouped together at the position of its first appearance),
  --> *by default it is set to False

  database: str - name of an affective database you want to analyse your data with --> type "nawl" or "emean"

  db_path: str - optional path to the excel file of the chosen database
  --> by default the Google Drive location used by this notebook is read

  Returns:
  the same dataframe with the new column "Emotive_words".

  Raises:
  ValueError - if database is neither "nawl" nor "emean".
  '''
  from collections import Counter

  default_paths = {
    "NAWL": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/NAWL_full_db.xlsx",
    "EMEAN": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/uniq_lemma_Emean.xlsx",
  }
  database = database.upper()
  if database not in default_paths:
    # Previously an unknown name crashed later with an UnboundLocalError.
    raise ValueError('Unknown database %r: type "nawl" or "emean"' % database)

  affective_database = pd.read_excel(default_paths[database] if db_path is None else db_path, index_col=0)

  # Built once, outside the loop: a set gives constant-time membership tests.
  known_words = set(affective_database[db_words])

  all_emotive_words = []
  for lemmas_list in dataframe[content_lemmatized_column]:
    if uniq_words:
      # dict.fromkeys removes duplicates while keeping first-appearance order,
      # so the result does not depend on set iteration order.
      emotive_words = [word for word in dict.fromkeys(lemmas_list) if word in known_words]
    else:
      lemma_counts = Counter(lemmas_list)
      emotive_words = [word for word, count in lemma_counts.items() if word in known_words for _ in range(count)]
    all_emotive_words.append(emotive_words)
  
  dataframe["Emotive_words"] = all_emotive_words
  return dataframe

In [5]:
def average(dataframe, emotive_words_column, database = "nawl", db_path = None):
  '''Compute mean emotion scores of the emotive words found in each text.

  For every emotion measured by the chosen database two columns are added to the
  dataframe (in place): "<Emotion>" with the mean score of the row's emotive words
  (rounded to 5 decimal places, NaN when the row has no emotive words) and
  "<Emotion>_individual_values" with the list of the individual word scores.
  NAWL provides Happiness, Anger, Sadness, Fear, Disgust, Valence and Arousal;
  EMEAN additionally provides Surprise, Trust and Anticipation.

  Parameters: 
  dataframe: dataframe with your data,

  emotive_words_column: str - name of a column in your dataframe where emotive words are listed
  (each cell is expected to hold a list of words present in the database),
  
  database: str - name of an affective database you want to analyse your data with --> type "nawl" or "emean"

  db_path: str - optional path to the excel file of the chosen database
  --> by default the Google Drive location used by this notebook is read

  Returns:
  the same dataframe with the new columns.

  Raises:
  ValueError - if database is neither "nawl" nor "emean"
  (previously the dataframe was silently returned unchanged).
  '''
  import warnings

  # One description per database: file location, word column and the
  # (score column in the database, output column prefix) pairs in output order.
  database_specs = {
    "NAWL": {
      "path": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/NAWL_full_db.xlsx",
      "words": "NAWL_word",
      "emotions": [
        ("hap_M_all", "Happiness"), ("ang_M_all", "Anger"), ("sad_M_all", "Sadness"),
        ("fea_M_all", "Fear"), ("dis_M_all", "Disgust"), ("val_M_all", "Valence"),
        ("aro_M_all", "Arousal"),
      ],
    },
    "EMEAN": {
      "path": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/uniq_lemma_Emean.xlsx",
      "words": "lemma",
      "emotions": [
        ("HAP M", "Happiness"), ("ANG M", "Anger"), ("SAD M", "Sadness"),
        ("FEA M", "Fear"), ("DIS M", "Disgust"), ("VAL M", "Valence"),
        ("ARO M", "Arousal"), ("SUR M", "Surprise"), ("TRU M", "Trust"),
        ("ANT M", "Anticipation"),
      ],
    },
  }

  database = database.upper()
  if database not in database_specs:
    raise ValueError('Unknown database %r: type "nawl" or "emean"' % database)
  spec = database_specs[database]

  # Only the requested database is read (both files used to be loaded on every call).
  source = pd.read_excel(spec["path"] if db_path is None else db_path, index_col=0)
  score_columns = [column for column, _ in spec["emotions"]]
  # set_index returns a new frame, so no in-place change is made on a column slice.
  affective_database = source[[spec["words"]] + score_columns].set_index(spec["words"])

  mean_scores = {label: [] for _, label in spec["emotions"]}
  individual_scores = {label: [] for _, label in spec["emotions"]}

  # Rows without emotive words produce an empty slice; the resulting numpy warning
  # is silenced only while this function runs instead of changing global settings.
  with warnings.catch_warnings(), np.errstate(divide='ignore'):
    warnings.filterwarnings(action='ignore', message='Mean of empty slice')
    for emotive_words in dataframe[emotive_words_column]:
      # Look the words up once per row and reuse the rows for every emotion.
      word_rows = affective_database.loc[emotive_words]
      for column, label in spec["emotions"]:
        individual = word_rows[column].to_numpy(dtype=np.float32).flatten()
        individual_scores[label].append(list(individual))
        mean_scores[label].append(round(np.nanmean(individual), 5))

  # Mean columns first, then the per-word value columns (same order as before).
  for label, values in mean_scores.items():
    dataframe[label] = values
  for label, values in individual_scores.items():
    dataframe[label + "_individual_values"] = values

  return dataframe

In [6]:
def emotion_category(dataframe, emotive_words_column, db_words, db_emotion_category, database = "nawl", db_path = None):
  '''Assign an emotion category from an affective database to every emotive word.

  Parameters: 
  dataframe: dataframe with your data (the column "Emotion_categories" is added to it in place),

  emotive_words_column: str - name of a column in your dataframe where emotive words are located
  (each cell is expected to hold a list of words),
    
  db_words: str - name of a column in affective database where words are listed,

  db_emotion_category: str - name of the column from affective database from where the categories will be taken,

  database: str - name of an affective database you want to analyse your data with --> type "nawl" or "emean"

  db_path: str - optional path to the excel file of the chosen database
  --> by default the Google Drive location used by this notebook is read

  Returns:
  the same dataframe with the new column "Emotion_categories"; each cell holds one
  category per emotive word (NaN for words that are not listed in the database).

  Raises:
  ValueError - if database is neither "nawl" nor "emean".
  '''
  import numpy as np

  default_paths = {
    "NAWL": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/NAWL_full_db.xlsx",
    "EMEAN": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/uniq_lemma_Emean.xlsx",
  }
  database = database.upper()
  if database not in default_paths:
    # Previously an unknown name crashed later with an UnboundLocalError.
    raise ValueError('Unknown database %r: type "nawl" or "emean"' % database)

  affective_database = pd.read_excel(default_paths[database] if db_path is None else db_path, index_col=0)

  # set_index returns a new frame, so no in-place change is made on a column slice.
  categories = affective_database[[db_words, db_emotion_category]].set_index(db_words)[db_emotion_category]
  # If a word is listed more than once, its first entry is used, so every lookup
  # yields a single category rather than a Series of several rows.
  categories = categories[~categories.index.duplicated(keep="first")]
  # Plain dict: one constant-time lookup per word instead of a .loc call.
  category_by_word = dict(zip(categories.index, categories.to_numpy()))

  all_emotion_categories = [
    [category_by_word.get(str(word), np.nan) for word in emotive_words]
    for emotive_words in dataframe[emotive_words_column]
  ]
  
  dataframe["Emotion_categories"] = all_emotion_categories
  return dataframe

In [7]:
def count_categories(dataframe, emotion_categories_column, db_emotion_category, database = "nawl", db_path = None):
  '''Count how many emotive words of each text belong to each emotion category.

  Parameters: 
  dataframe: dataframe with data (it is not modified; a new dataframe is returned),
  
  emotion_categories_column: str - name of a column in your dataframe where emotion categories are located
  (each cell is expected to hold a list of categories),
  
  db_emotion_category: str - name of the column from affective database from where the categories will be taken,

  database: str - name of an affective database you want to analyse your data with --> type "nawl" or "emean"

  db_path: str - optional path to the excel file of the chosen database
  --> by default the Google Drive location used by this notebook is read

  Returns:
  new dataframe with one integer column "CATEGORY_<category>" per category; categories
  that are listed in the database but never occur in the data get a column of zeros.

  Raises:
  ValueError - if database is neither "nawl" nor "emean".
  '''
  from collections import Counter

  default_paths = {
    "NAWL": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/NAWL_full_db.xlsx",
    "EMEAN": "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/uniq_lemma_Emean.xlsx",
  }
  database = database.upper()
  if database not in default_paths:
    # Previously an unknown name crashed later with an UnboundLocalError.
    raise ValueError('Unknown database %r: type "nawl" or "emean"' % database)

  affective_database = pd.read_excel(default_paths[database] if db_path is None else db_path, index_col=0)
  
  # Words without a category (NaN) are skipped: they do not name a category and
  # used to break the column-name concatenation below.
  all_categories = affective_database[db_emotion_category].dropna().unique().tolist()

  category_counts = (
    pd.DataFrame([Counter(categories) for categories in dataframe[emotion_categories_column]])
    .fillna(0)
    .astype(int)
    .add_prefix("CATEGORY_")
  )

  # The positional helper column lives only on a copy, so the caller's dataframe
  # no longer ends up with a stray "merge_indx" column.
  positioned = dataframe.assign(merge_indx=range(0, len(dataframe)))
  result = pd.merge(positioned, category_counts, how='left', left_on="merge_indx", right_index=True)
  result = result.drop(columns=["merge_indx"])
  
  for category in all_categories:
    column_name = "CATEGORY_" + str(category)
    if column_name not in result.columns:
      result[column_name] = 0
  
  return result


In [8]:
# Load your data from the excel file; with load_data's defaults the first column becomes the index.
my_data = load_data("/content/drive/MyDrive/Colab Notebooks/debates/DebateTVP_June.xlsx")
# Preview the first row.
my_data.head(1)

Unnamed: 0,argument
6,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że wirus groźny"


In [11]:
# Lemmatize the text in the "argument" column; the lemmas are stored in "argument_lemmatized".
my_data = lemmatization(my_data, "argument")
# Preview the first row.
my_data.head(1)

Unnamed: 0,argument,argument_lemmatized
6,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że wirus groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, wirus, groźny]"


In [12]:
# Find (retrieve) the emotive words in your texts using the "lemma" column of the EMEAN database;
# the result is stored in the "Emotive_words" column.
my_data = find_emotive_words(my_data, content_lemmatized_column = "argument_lemmatized", 
                           db_words = "lemma", database = "emean")
# Preview the first two rows.
my_data.head(2)

Unnamed: 0,argument,argument_lemmatized,Emotive_words
6,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że wirus groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, wirus, groźny]","[wirus, groźny]"
7,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że nie groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, nie, groźny]",[groźny]


In [13]:
# Assign an emotion category (EMEAN "classification" column) to every emotive word found in the texts;
# the result is stored in the "Emotion_categories" column.
my_data = emotion_category(my_data, emotive_words_column= "Emotive_words", db_words = "lemma", 
                         db_emotion_category = "classification", 
                         database = "emean")
# Preview the first two rows.
my_data.head(2)

Unnamed: 0,argument,argument_lemmatized,Emotive_words,Emotion_categories
6,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że wirus groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, wirus, groźny]","[wirus, groźny]","[FEA, FEA]"
7,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że nie groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, nie, groźny]",[groźny],[FEA]


In [14]:
# Compute the average emotion values of the emotive words in each text (EMEAN database),
# together with the individual per-word values.
my_data = average(my_data, emotive_words_column = "Emotive_words", database = "emean")
# Preview the first two rows.
my_data.head(2)

Unnamed: 0,argument,argument_lemmatized,Emotive_words,Emotion_categories,Happiness,Anger,Sadness,Fear,Disgust,Valence,Arousal,Surprise,Trust,Anticipation,Happiness_individual_values,Anger_individual_values,Sadness_individual_values,Fear_individual_values,Disgust_individual_values,Valence_individual_values,Arousal_individual_values,Surprise_individual_values,Trust_individual_values,Anticipation_individual_values
6,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że wirus groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, wirus, groźny]","[wirus, groźny]","[FEA, FEA]",0.16141,1.19345,1.61117,2.27031,1.12062,-1.38813,1.60907,0.74475,0.18102,0.42892,"[0.10714286, 0.21568628]","[1.0535715, 1.3333334]","[1.7321428, 1.4901961]","[2.2857144, 2.254902]","[1.3392857, 0.9019608]","[-1.4821428, -1.2941177]","[1.375, 1.8431373]","[0.60714287, 0.88235295]","[0.10714286, 0.25490198]","[0.25, 0.60784316]"
7,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że nie groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, nie, groźny]",[groźny],[FEA],0.21569,1.33333,1.4902,2.2549,0.90196,-1.29412,1.84314,0.88235,0.2549,0.60784,[0.21568628],[1.3333334],[1.4901961],[2.254902],[0.9019608],[-1.2941177],[1.8431373],[0.88235295],[0.25490198],[0.60784316]


In [15]:
# Count how many emotive words of each text belong to each emotion category;
# one "CATEGORY_<name>" column is added per category.
my_data = count_categories(my_data, "Emotion_categories", db_emotion_category = "classification", database = "emean")
# Preview the first two rows.
my_data.head(2)

Unnamed: 0,argument,argument_lemmatized,Emotive_words,Emotion_categories,Happiness,Anger,Sadness,Fear,Disgust,Valence,Arousal,Surprise,Trust,Anticipation,Happiness_individual_values,Anger_individual_values,Sadness_individual_values,Fear_individual_values,Disgust_individual_values,Valence_individual_values,Arousal_individual_values,Surprise_individual_values,Trust_individual_values,Anticipation_individual_values,CATEGORY_FEA,CATEGORY_ANT,CATEGORY_SAD,CATEGORY_TRU,CATEGORY_HAP,CATEGORY_SUR,CATEGORY_ANG,CATEGORY_NEU,CATEGORY_DIS
6,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że wirus groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, wirus, groźny]","[wirus, groźny]","[FEA, FEA]",0.16141,1.19345,1.61117,2.27031,1.12062,-1.38813,1.60907,0.74475,0.18102,0.42892,"[0.10714286, 0.21568628]","[1.0535715, 1.3333334]","[1.7321428, 1.4901961]","[2.2857144, 2.254902]","[1.3392857, 0.9019608]","[-1.4821428, -1.2941177]","[1.375, 1.8431373]","[0.60714287, 0.88235295]","[0.10714286, 0.25490198]","[0.25, 0.60784316]",2,0,0,0,0,0,0,0,0
7,"plączący się w swoich deklaracjach minister Szumowski minister Szumowski raz mówi, że nie groźny","[plączący, się, w, swoich, deklaracja, minister, szumowski, minister, szumowski, raz, mówić, że, nie, groźny]",[groźny],[FEA],0.21569,1.33333,1.4902,2.2549,0.90196,-1.29412,1.84314,0.88235,0.2549,0.60784,[0.21568628],[1.3333334],[1.4901961],[2.254902],[0.9019608],[-1.2941177],[1.8431373],[0.88235295],[0.25490198],[0.60784316],1,0,0,0,0,0,0,0,0
