In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# pandas display: show long text cells and never truncate columns/rows in rendered frames
pd.set_option("display.max_colwidth", 400)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Plot styling. The bundled seaborn style sheets were renamed to "seaborn-v0_8-*" in
# matplotlib 3.6 and the old names were removed in 3.8, so pick whichever name exists.
sns.set_theme(style="whitegrid")
talk_style = "seaborn-v0_8-talk" if "seaborn-v0_8-talk" in plt.style.available else "seaborn-talk"
plt.style.use(talk_style)

# Warning filters. NumPy 2.0 dropped np.VisibleDeprecationWarning from the top-level
# namespace; it lives in numpy.exceptions (available since NumPy 1.25).
try:
  from numpy.exceptions import VisibleDeprecationWarning
except ImportError:
  VisibleDeprecationWarning = np.VisibleDeprecationWarning

warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
np.seterr(divide='ignore')
warnings.filterwarnings(action='ignore', message='Mean of empty slice')
# Silences SettingWithCopyWarning for the whole notebook
pd.options.mode.chained_assignment = None

In [None]:
def clean_text(dataframe, text_column):
  '''Return a copy of the dataframe with an extra column holding cleaned text.

  Every value of `text_column` is converted to `str`, lower-cased, stripped of
  URLs, @mentions, newlines and #hashtags, and has runs of whitespace collapsed
  to a single space. The result is stored in a new column named
  "clean_" + text_column; the input dataframe is not modified.

  Parameters:
  dataframe: dataframe with data,

  text_column: str - name of the column in the dataframe where the text you want to clean is listed

  Returns:
  a copy of `dataframe` with the additional "clean_<text_column>" column
  '''
  import re

  def _clean(raw_text):
    text = str(raw_text).lower()
    # urls; NOTE(review): the trailing (.*) also removes whatever follows the URL
    # up to the end of that line - confirm this is intended.
    text = re.sub(r"(http|https):\/\/([\w\s\d\.]+)(\/?)(.*)", " ", text)
    text = re.sub(r"(www).([\w\s\d\.]+)(\/?)(.*)", " ", text)
    # mentions (raw string: "\w" and "\d" are invalid escapes in a normal literal)
    text = re.sub(r'@[\w\d]+', ' ', text)
    # new lines
    text = text.replace("\n", " ")
    # hashtags
    text = re.sub(r'\B#\w*[a-zA-Z0-9]+\w*', ' ', text)
    text = text.strip()
    # collapse runs of two or more whitespace characters
    return re.sub(r'\s\s+', ' ', text)

  df = dataframe.copy()
  df["clean_" + text_column] = [_clean(text) for text in df[text_column]]

  return df

In [None]:
def lemmatization(dataframe, text_column):
  '''Add a column with the lemmas of every text in `text_column`.

  Each value is converted to `str` and passed through the module-level `nlp`
  object via `nlp.pipe` (presumably a spaCy pipeline - it is not defined in this
  function). Tokens flagged as punctuation and tokens of length 1 or less are
  skipped. The lemma lists are stored in a new column named
  text_column + "_lemmatized" on a copy of the input.

  Parameters:
  dataframe: dataframe with your data,

  text_column: column of a dataframe where text is located

  Returns:
  a copy of `dataframe` with the additional "<text_column>_lemmatized" column
  '''
  result = dataframe.copy()
  texts = result[text_column].apply(str)
  result[text_column+"_lemmatized"] = [
    [tok.lemma_ for tok in parsed if not (tok.is_punct or len(tok) <= 1)]
    for parsed in nlp.pipe(texts)
  ]
  return result

In [None]:
def find_pathos_inducers(dataframe, content_lemmatized_column, affective_database_path, db_words = "Word", uniq_words=False):
  '''Collect, for every row, the lemmas that are listed in an affective word database.

  The result is written to `dataframe` itself (the input is modified in place and
  also returned) in a column whose name is the lemmatized column name with its
  last 10 characters replaced by "pathos_inducers", e.g.
  "premise_lemmatized" -> "premise_pathos_inducers".

  Parameters:
  dataframe: dataframe with your data,

  content_lemmatized_column: str - name of a column in dataframe where lemmatized text is located
  (each cell is expected to be an iterable of lemmas),

  affective_database_path: str - path to a file with affective database (.xlsx or .csv),

  db_words: str - name of a column in affective database where words are listed,

  uniq_words: boolean - True if you want to retrieve only unique emotive words from your text data
  (kept in order of first appearance),
  False if you want to retrieve every emotive word (thus, there can be duplicated words),
  --> *by default it is set to False

  Returns:
  the same `dataframe` object with the additional "..pathos_inducers" column

  Raises:
  ValueError: if `affective_database_path` does not end with ".xlsx" or ".csv"
  '''

  if affective_database_path.endswith(".xlsx"):
    affective_database = pd.read_excel(affective_database_path)
  elif affective_database_path.endswith(".csv"):
    affective_database = pd.read_csv(affective_database_path)
  else:
    raise ValueError(
      f"Unsupported affective database format: {affective_database_path!r} (expected .xlsx or .csv)"
    )

  # A set makes each membership test O(1) instead of scanning the whole word list
  emotive_vocabulary = set(affective_database[db_words])

  all_emotive_words = []
  for lemmas_list in dataframe[content_lemmatized_column]:
    emotive_words = [word for word in lemmas_list if word in emotive_vocabulary]
    if uniq_words:
      # dict.fromkeys de-duplicates while keeping first-occurrence order
      emotive_words = list(dict.fromkeys(emotive_words))
    all_emotive_words.append(emotive_words)

  # [:-10] strips the trailing "lemmatized" from the column name
  dataframe[content_lemmatized_column[:-10]+"pathos_inducers"] = all_emotive_words
  return dataframe

In [None]:
def get_polarity_score(dataframe, content_lemmatized_column, affective_database_path, db_words = "Word",
                       valence_column = "Valence_standardized", negative_threshold = -0.5, positive_threshold = 1):
  '''Compute, per row, the share of lemmas with negative and with positive valence.

  For every row the lemmas found in the affective database are looked up; a lemma
  counts as negative when its valence is below `negative_threshold` and as
  positive when it is above `positive_threshold`. Both counts are divided by the
  total number of lemmas in the row and rounded to 3 decimals. Rows without any
  database word get NaN for both values.

  The result is written to `dataframe` itself (modified in place and returned) in
  two columns named after the lemmatized column with its last 10 characters
  replaced by "Negative_percentage" / "Positive_percentage".

  Parameters:
  dataframe: dataframe with your data,

  content_lemmatized_column: str - name of a column in dataframe where words-lemmas are listed

  affective_database_path: str - path to a file with affective database
  (read with the notebook-level `load_data` helper),

  db_words: str - name of a column in affective database where words are listed

  valence_column: str - name of the database column holding the valence score,

  negative_threshold: number - valence strictly below this value counts as negative,

  positive_threshold: number - valence strictly above this value counts as positive

  Returns:
  the same `dataframe` object with the two additional percentage columns
  '''
  affective_database = load_data(affective_database_path)

  # word -> valence lookup; a word listed several times yields several rows on lookup
  valence_by_word = affective_database[[db_words, valence_column]].set_index(db_words)[valence_column]
  # A set makes each membership test O(1) instead of scanning the whole word list
  known_words = set(affective_database[db_words])

  all_neg_percent = []
  all_pos_percent = []

  for lemmas_list in dataframe[content_lemmatized_column]:
    emotive_words = [word for word in lemmas_list if word in known_words]

    if len(emotive_words) > 0:
      scores = valence_by_word.loc[emotive_words]

      # NaN compares False on both sides, so missing valences are counted in neither group
      neg_scores_count = (scores < negative_threshold).sum()
      pos_scores_count = (scores > positive_threshold).sum()

      all_neg_percent.append(round((neg_scores_count / len(lemmas_list)), 3))
      all_pos_percent.append(round((pos_scores_count / len(lemmas_list)), 3))

    else:
      # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
      all_neg_percent.append(np.nan)
      all_pos_percent.append(np.nan)

  # [:-10] strips the trailing "lemmatized" from the column name
  dataframe[content_lemmatized_column[:-10]+"Negative_percentage"] = all_neg_percent
  dataframe[content_lemmatized_column[:-10]+"Positive_percentage"] = all_pos_percent

  return dataframe

# Preprocess data

In [None]:
# eliciting valence data: read the debate spreadsheet and preview its first row
debate_path = "/content/drive/MyDrive/Colab Notebooks/debates/debate_2020_June.xlsx"
data_debate = load_data(debate_path)
data_debate.head(1)

In [None]:
# Lemmatize each debate text column, then derive its pathos inducers and polarity shares.
# The original cell started every iteration from an undefined `df`, which discarded the
# columns of the previous iteration, and passed the raw text column to get_polarity_score.
# Here the result accumulates in `data`, starting from the debate frame loaded earlier in
# this notebook. NOTE(review): `data_debate` is assumed to be the frame `df` referred to -
# confirm.
data = data_debate
for col in ["conclusion", "premise", "full_argument"]:
  print(col)
  lemmatized_col = col + '_lemmatized'

  # lemmatization returns a copy, so data_debate itself is left untouched
  data = lemmatization(data, col)

  data = find_pathos_inducers(data, content_lemmatized_column = lemmatized_col,
                              affective_database_path = "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/joined_scaled_filled_0_NAWL-Sentimenti_db.xlsx",
                              db_words = "Word")

  # Both helpers name their output by stripping the last 10 characters ("lemmatized")
  # from the column name, so they must receive the "_lemmatized" column.
  data = get_polarity_score(data, content_lemmatized_column = lemmatized_col,
                            affective_database_path = "/content/drive/MyDrive/Colab Notebooks/Emotional word lists/valence_only10k_scaled_NAWL-Sentimenti_Imbir.xlsx")


In [None]:
# expressed sentiment data: load the tweets, order them by timestamp,
# keep those posted before the cutoff and add the cleaned text column

social_media = (
  load_data("/content/drive/MyDrive/Colab Notebooks/debates/validation_samples/tweet_Czerwiec_expressed_sentiment.xlsx")
  .sort_values(by = 'Data')
)
is_before_cutoff = social_media.Data < '2020-06-17 23:00:00'
social_media = clean_text(social_media[is_before_cutoff], "Tekst")
social_media.shape

# Filter social media (SM) data

In [None]:
def filter_candidates(social_media, candidate = 'Andrzej Duda'):
  """Select the tweets that mention keywords associated with a candidate.

  The 'Tekst' column is converted to str and lower-cased, and a row is kept when
  its text contains at least one of the candidate's keywords as a substring.
  Rows with identical (lower-cased) text are de-duplicated, keeping the first,
  and the result is ordered by 'Data'. The number of tweets found is printed.

  Parameters:
  social_media: dataframe with the columns 'Data', 'Tekst', 'clean_Tekst',
  'sentiment_tuned_PaRes' and 'sentiment_label_PaRes',

  candidate: str - one of
  'Andrzej Duda',
  'Rafał Trzaskowski'

  Returns:
  a new dataframe with the five columns above; 'Tekst' is lower-cased and the
  index holds the row positions of `social_media`

  Raises:
  ValueError: if `candidate` is not one of the two supported names
  """
  key_words_by_candidate = {
    'Andrzej Duda': ['duda', 'andrzej', 'pad ', 'dudy', 'dudzie', 'anżej', 'rzad', 'rząd'],
    'Rafał Trzaskowski': ['rafa', 'trzask', 'warszaw', 'platform', ' ko ', 'kidaw'],
  }
  if candidate not in key_words_by_candidate:
    raise ValueError(
      f"Unknown candidate: {candidate!r}; expected one of {list(key_words_by_candidate)}"
    )
  key_words = key_words_by_candidate[candidate]

  sm2 = social_media[['Data', 'Tekst', 'clean_Tekst',
                      'sentiment_tuned_PaRes', 'sentiment_label_PaRes']]

  sm2 = sm2.reset_index(drop=True)
  sm2['Tekst'] = sm2.Tekst.apply(lambda x: str(x).lower())

  # One pass over the rows; astype(bool) keeps the mask boolean for an empty frame too
  has_key_word = sm2['Tekst'].apply(lambda text: any(k in text for k in key_words)).astype(bool)

  # A stable sort keeps the input order for tweets that share a timestamp
  sm_filtered = (
    sm2[has_key_word]
    .drop_duplicates("Tekst")
    .sort_values(by = "Data", kind = "stable")
  )
  print(f"Found: {len(sm_filtered)} tweets for candidate: {candidate}")
  return sm_filtered

In [None]:
def normalise_data(dataframe_social_media, dataframe_debate):
  """Align per-minute expressed sentiment with per-minute debate valence.

  Debate side: `start` is parsed to a timestamp, the numeric columns are averaged
  per minute (minutes without data are filled with 0) and keyed by the time of
  day as a string.

  Social media side: timestamps in 'Data' are shifted one minute back, then for
  every minute the percentage share of each 'sentiment_label_PaRes' value is
  computed. Each share is finally expressed as the difference from that label's
  average share over the whole (merged) table.

  Parameters:

  dataframe_social_media: data with social media reactions (comments),
  needs the columns 'Data' (datetime) and 'sentiment_label_PaRes',

  dataframe_debate:  data with debate (politicians' arguments),
  needs the columns 'start', 'full_argument_Negative_percentage' and
  'full_argument_Positive_percentage'

  Returns:
  a new dataframe with one row per (minute, sentiment label) and the columns
  'Data', 'mean' (baseline-adjusted share in percent), 'sentiment_label_PaRes',
  'Time', 'full_argument_Negative_percentage', 'full_argument_Positive_percentage'.
  Neither input is modified.
  """
  sm_data = dataframe_social_media.sort_values(by = 'Data')
  deb_data = dataframe_debate.copy()

  # eliciting emotions
  deb_data["Time"] = pd.to_datetime(deb_data.start)
  # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises instead of
  # silently dropping them); "min" replaces the deprecated "T" minute alias.
  df_plot = (
    deb_data.sort_values(by = "Time")
    .set_index("Time")
    .resample("1min")
    .mean(numeric_only = True)
    .fillna(0)
    .reset_index()
  )
  df_plot["Time"] = df_plot["Time"].dt.time.apply(str)

  # Percentage share of every sentiment label per minute, with timestamps moved 1 minute back.
  # Renaming the series before reset_index avoids a clash between the series name and the
  # index level of the same name on older pandas versions.
  label_share = (
    sm_data.set_index("Data")
    .shift(periods = -1, freq = "min")
    .resample("1min")["sentiment_label_PaRes"]
    .value_counts(normalize = True)
    .rename("mean") * 100
  )
  df_plots_counts_normalized = label_share.reset_index()
  df_plots_counts_normalized.columns = ['Data', 'sentiment_label_PaRes', 'mean']

  df_plots_counts_normalized = df_plots_counts_normalized.sort_values(by = ['Data', 'sentiment_label_PaRes'])
  df_plots_counts_normalized["Time"] = df_plots_counts_normalized["Data"].dt.time.apply(str)

  # merge social media and debate data (on time of day, keeping every social media row)
  df_plot_join = pd.merge(df_plots_counts_normalized, df_plot, on = "Time", how = "left")

  cols2 = ['full_argument_Negative_percentage','full_argument_Positive_percentage']
  df_baselines_diff = df_plot_join[['Data', 'mean', 'sentiment_label_PaRes', 'Time']+cols2].copy()

  # expressed sentiment baselines: subtract each label's average share from its rows
  baseline = df_baselines_diff.groupby("sentiment_label_PaRes")["mean"].transform("mean")
  df_baselines_diff["mean"] = df_baselines_diff["mean"] - baseline

  return df_baselines_diff

## AD (Andrzej Duda)

In [None]:
# Tweets matching the default candidate (Andrzej Duda), aligned with the debate per minute
sm_duda = filter_candidates(social_media=social_media)
duda_df_base = normalise_data(dataframe_social_media=sm_duda, dataframe_debate=data_debate)

# Valence score = share of positive lemmas minus share of negative lemmas
duda_df_base['full_argument_valence_score'] = duda_df_base['full_argument_Positive_percentage'].sub(
  duda_df_base['full_argument_Negative_percentage']
)
print(duda_df_base.shape, '\n')

# Debate rows spoken by Andrzej Duda and the distinct start times of those rows
deb_duda = data_debate.loc[data_debate["speaker"] == 'Andrzej Duda']
print(deb_duda.shape, '\n')
duda_utterance_time = deb_duda["start"].unique()

duda_utterance_time

array(['21:04:00', '21:22:00', '22:06:00', '21:42:00'], dtype=object)

In [None]:
# Rows of the merged table whose minute coincides with one of Duda's utterance start times
is_duda_minute = duda_df_base["Time"].isin(duda_utterance_time)
deb_duda_scatter = duda_df_base.loc[is_duda_minute]

In [None]:
# Per-minute series for the negative and the positive sentiment label
df_plot_neg = duda_df_base[duda_df_base.sentiment_label_PaRes == "neg"]
df_plot_pos = duda_df_base[duda_df_base.sentiment_label_PaRes == "pos"]

# One row per utterance minute (the valence score is the same for both labels of a minute)
deb_duda_scatter2 = deb_duda_scatter[deb_duda_scatter.sentiment_label_PaRes=='neg']


sns.set_theme(style="whitegrid")
# "seaborn-talk" was renamed to "seaborn-v0_8-talk" in matplotlib 3.6 and removed in 3.8
plt.style.use("seaborn-v0_8-talk" if "seaborn-v0_8-talk" in plt.style.available else "seaborn-talk")

fig, ax1 = plt.subplots(1, 1, figsize=(15, 8.5))
x = list(df_plot_neg.Time)

# Valence of the debate arguments, scaled to percent
ax1.plot(df_plot_neg["Time"], df_plot_neg["full_argument_valence_score"]*100, label = "valence",
         color = "#525252", alpha=0.85, linewidth = 2.6)

# Mark the minutes in which Andrzej Duda started speaking
ax1.scatter(deb_duda_scatter2["Time"], deb_duda_scatter2["full_argument_valence_score"]*100,
            color = "#0900A4", label = "Andrzej Duda arguments", alpha = 0.9, s = 120)
xx = deb_duda_scatter2["Time"].values
yy = deb_duda_scatter2["full_argument_valence_score"].values*100  # not used below
for txt in xx:
    # Time labels are drawn at a fixed height (y = 27), not at the score itself
    ax1.annotate(txt, (txt, 27), xycoords="data",
                 xytext=(-20, 0), textcoords="offset points",
                 va="center", ha="left", color = '#0900A4',
                 bbox=dict(boxstyle="round", fc="w"),
                 arrowprops=dict(arrowstyle="->"))

# Baseline-adjusted share of negative / positive tweets per minute
ax1.plot(df_plot_neg["Time"], df_plot_neg["mean"], label = "negative sentiment expressed",
         color = "#E90000", alpha=0.65, linestyle="--", linewidth = 2.5)

ax1.plot(df_plot_pos["Time"], df_plot_pos["mean"], label = "positive sentiment expressed",
         color = "#00E965", alpha=0.75, linestyle="--", linewidth = 2.5)

# Every third minute gets a tick. The original styled the labels twice (size 12, then
# size 11 through plt.xticks); the size that ended up on the figure is applied here once.
ax1.set_xticks(x[::3])
ax1.set_xticklabels(x[::3], rotation=90, size=11)
ax1.set_xlabel("\nTime")

ax1.set_title("Positivity and negativity in 2020 June debate - Andrzej Duda keywords \n\n", fontsize = 15)
ax1.set_ylabel("Value\n")
#plt.yticks(np.arange(-20, 56, 5))
fig.tight_layout()
ax1.legend(loc="upper center", bbox_to_anchor=(0.5, 1.08), ncol=6)

ax1.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()

## RT (Rafał Trzaskowski)

In [None]:
# Tweets matching the Rafał Trzaskowski keywords, aligned with the debate per minute
sm_rt = filter_candidates(social_media=social_media, candidate="Rafał Trzaskowski")
rt_df_base = normalise_data(dataframe_social_media=sm_rt, dataframe_debate=data_debate)

# Valence score = share of positive lemmas minus share of negative lemmas
rt_df_base['full_argument_valence_score'] = rt_df_base['full_argument_Positive_percentage'].sub(
  rt_df_base['full_argument_Negative_percentage']
)
print(rt_df_base.shape, '\n')

# Debate rows spoken by Rafał Trzaskowski and the distinct start times of those rows
deb_rt = data_debate.loc[data_debate["speaker"] == 'Rafał Trzaskowski']
print(deb_rt.shape, '\n')

rt_utterance_time = deb_rt["start"].unique()

rt_utterance_time

array(['21:55:00', '21:13:00', '22:09:00', '21:45:00', '21:34:00'],
      dtype=object)

In [None]:
# Rows of the merged table whose minute coincides with one of Trzaskowski's utterance start times
is_rt_minute = rt_df_base["Time"].isin(rt_utterance_time)
deb_rt_scatter = rt_df_base.loc[is_rt_minute]

In [None]:
# Per-minute series for the negative and the positive sentiment label
df_plot_neg = rt_df_base[rt_df_base.sentiment_label_PaRes == "neg"]
df_plot_pos = rt_df_base[rt_df_base.sentiment_label_PaRes == "pos"]

# One row per utterance minute (the valence score is the same for both labels of a minute)
deb_rt_scatter2 = deb_rt_scatter[deb_rt_scatter.sentiment_label_PaRes=='neg']


sns.set_theme(style="whitegrid")
# "seaborn-talk" was renamed to "seaborn-v0_8-talk" in matplotlib 3.6 and removed in 3.8
plt.style.use("seaborn-v0_8-talk" if "seaborn-v0_8-talk" in plt.style.available else "seaborn-talk")

fig, ax1 = plt.subplots(1, 1, figsize=(15, 8.5))
# NOTE(review): tick positions come from the positive series here, while the Andrzej Duda
# plot takes them from the negative series - confirm which one is intended.
x = list(df_plot_pos.Time)

# Valence of the debate arguments, scaled to percent
ax1.plot(df_plot_neg["Time"], df_plot_neg["full_argument_valence_score"]*100, label = "valence",
         color = "#525252", alpha=0.85, linewidth = 2.6)

# Mark the minutes in which Rafał Trzaskowski started speaking
ax1.scatter(deb_rt_scatter2["Time"], deb_rt_scatter2["full_argument_valence_score"]*100,
            color = "#0900A4", label = "Rafał Trzaskowski arguments", alpha = 0.9, s = 120)
xx = deb_rt_scatter2["Time"].values
yy = deb_rt_scatter2["full_argument_valence_score"].values*100  # not used below
for txt in xx:
  # Time labels are drawn at a fixed height (y = 33), not at the score itself
  ax1.annotate(txt, (txt, 33), xycoords="data",
               xytext=(-20, 0), textcoords="offset points",
               va="center", ha="left", color = '#0900A4',
               bbox=dict(boxstyle="round", fc="w"),
               arrowprops=dict(arrowstyle="->"))

# Baseline-adjusted share of negative / positive tweets per minute
ax1.plot(df_plot_neg["Time"], df_plot_neg["mean"], label = "negative sentiment expressed",
         color = "#E90000", alpha=0.66, linestyle="--", linewidth = 2.5)

# Legend label typo fixed ("positivie" -> "positive")
ax1.plot(df_plot_pos["Time"], df_plot_pos["mean"], label = "positive sentiment expressed",
         color = "#00E965", alpha=0.7, linestyle="--", linewidth = 2.5)

# Every third minute gets a tick. The original styled the labels twice (size 12, then
# size 11 through plt.xticks); the size that ended up on the figure is applied here once.
ax1.set_xticks(x[::3])
ax1.set_xticklabels(x[::3], rotation=90, size=11)
ax1.set_xlabel("\nTime")

ax1.set_title("Positivity and negativity in TVP 2020 June debate - Rafał Trzaskowski keywords \n\n", fontsize = 15)
ax1.set_ylabel("Value\n")
#plt.yticks(np.arange(-20, 56, 5))
fig.tight_layout()
ax1.legend(loc="upper center", bbox_to_anchor=(0.5, 1.08), ncol=6)

ax1.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()