# Comparison of raw text toxicity vs toxicity of the preprocessed text
Notebook where we produce most of the final figures using the TUM colors, and run a basic t-test to test the pre- and post-takeover hypotheses.

In [None]:
%reload_ext autoreload
%autoreload 2

import os 
import sys
import pandas as pd
import numpy as np
import plotly 
import plotly.graph_objects as go
import time


# Move the working directory three levels up exactly once per kernel session.
# `run_only_once` is a sentinel: it is undefined the first time this cell runs
# (raising NameError) and defined on every later re-run, so re-executing the
# cell does not keep climbing the directory tree.
try:
    print(run_only_once)
except NameError:
    print(os.getcwd())
    os.chdir("./../../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

In [None]:
# Merged per-hashtag CSV files; the hashtag name is the part of the file name
# before the first underscore.
files_list = [
    'fitness_hashtag_merged.csv',
    'musk_hashtag_merged.csv',
    'netflix_hashtag_merged.csv',
    'trump_hashtag_merged.csv',
    'uno_hashtag_merged.csv',
    'vegan_hashtag_merged.csv',
    'vegetarian_hashtag_merged.csv',
]

# Colour palette used for the line traces in the time-series charts.
tum_colors = [
    "#165DB1",
    "#C7D97D",
    "#F7811E",
    "#FED702",
    "#B55CA5",
    "#6A757E",
]

# Base toxicity metric names; the score columns prefix them with "normal_"
# (raw text) or "lemma_" (preprocessed text).
tox_cols = [
    "toxicity",
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
]
tox_cols_raw = ["normal_" + metric_name for metric_name in tox_cols]
tox_cols_preprocessed = ["lemma_" + metric_name for metric_name in tox_cols]

# Tweet metadata columns kept next to the toxicity scores.
meta_cols = [
    "retweet_count",
    "reply_count",
    "like_count",
    "quote_count",
    "created_at",
]

def get_prepared_df_for_file(file_name):
    """Load one preprocessed CSV and parse its ``created_at`` column.

    Args:
        file_name: Name of a CSV file inside ``./data/preprocessed/``
            (relative to the current working directory).

    Returns:
        A DataFrame with the file's contents in which ``created_at`` has been
        converted with ``pd.to_datetime``.
    """
    csv_path = f"./data/preprocessed/{file_name}"
    loaded = pd.read_csv(csv_path)
    # Overwrite the textual timestamps with parsed datetimes, keeping the
    # column in its original position.
    return loaded.assign(created_at=pd.to_datetime(loaded["created_at"]))

Link to the page listing all events related to the takeover:    
   
https://www.searchenginejournal.com/elon-musks-twitter-takeover-a-timeline-of-events/470927/#close

## Load all the data files into a single one

In [None]:
def merge_data_files_into_one_df(files_list):
    """Load every file in ``files_list`` and stack them into one DataFrame.

    Each file is read with ``get_prepared_df_for_file``; the rows are
    concatenated in file order and re-indexed from 0. The total number of
    loaded tweets is printed.

    Args:
        files_list: File names of the per-hashtag CSVs to load.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when ``files_list``
        is empty.
    """
    frames = []
    for file_name in files_list:
        hashtag_name = file_name.split("_")[0]
        print(f"For hashtag {hashtag_name}, from file {file_name}")
        frames.append(get_prepared_df_for_file(file_name))

    tweet_count = sum(len(frame.index) for frame in frames)
    # pd.concat raises ValueError on an empty sequence, so an empty file list
    # yields an empty frame instead of crashing.
    merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(f"There is {tweet_count} tweets in all files together!")
    return merged_df
# Load all hashtag files once; the summary cells below read this combined frame.
combined_data_df = merge_data_files_into_one_df(files_list)

## Most toxic tweets in the dataset and toxicity summaries

In [None]:
# Show the ten tweets with the highest `lemma_toxicity` score, together with
# their original and processed text.
import dataframe_image as dfi
# Never truncate long cell contents, so the full tweet text stays readable.
pd.set_option('display.max_colwidth', None)
combined_data_df[["text", "processed_text", "lemma_toxicity"]].nlargest(10, "lemma_toxicity")

In [None]:
# describe() summary of the toxicity scores computed on the raw text
# (the "normal_*" columns).
raw_stats = combined_data_df.loc[:, tox_cols_raw].describe()
raw_stats

In [None]:
# describe() summary of the toxicity scores computed on the preprocessed text
# (the "lemma_*" columns).
processed_stats = combined_data_df.loc[:, tox_cols_preprocessed].describe()
processed_stats

## Compare the average of toxicity metrics for raw and lemmatized text

In [None]:
# One grouped bar chart per describe() statistic, comparing the raw-text score
# with the preprocessed-text score for every toxicity metric. The [1:-1] slice
# leaves out the first and the last row of the describe() index.
for metric in processed_stats.index[1:-1]:
    print(metric)
    raw_metric = raw_stats.loc[metric, tox_cols_raw]
    proc_metric = processed_stats.loc[metric, tox_cols_preprocessed]

    trace_list = [
        go.Bar(name='Raw text', x=tox_cols, y=raw_metric, marker_color='#005293'),
        go.Bar(name='Preprocessed text', x=tox_cols, y=proc_metric, marker_color='#e37222'),
    ]

    fig = go.Figure(data=trace_list)
    # barmode='group' places the raw and preprocessed bars side by side.
    fig.update_layout(
        barmode='group',
        title_text=f"Comparison of '{metric}' for raw and preprocessed text, on all toxicity metrics",
        yaxis_title="Value",
        xaxis_title="Metric name",
    )
    fig.show()

### Plot comparison of single metric for raw and preprocessed

In [None]:
# Compare one describe() statistic of a single toxicity metric between the raw
# and the preprocessed text, as a two-bar chart.
selected_stat = "mean"
# NOTE: despite their names, raw_mean / pre_mean hold the *column names* of the
# compared metric; the numeric values are raw_mean_val / pre_mean_val.
raw_mean = tox_cols_raw[2]
pre_mean = tox_cols_preprocessed[2]
raw_mean_val = raw_stats.loc[selected_stat, raw_mean]
pre_mean_val = processed_stats.loc[selected_stat, pre_mean]

# Signed relative change with respect to the raw text. The plain ratio
# pre/raw * 100 is not an increase/decrease: identical values would give 100.
relative_change = (pre_mean_val / raw_mean_val - 1) * 100
print(f"Preprocessing resulted in: {relative_change:+.3f}% change compared to the raw text metric.")
fig = go.Figure([
    go.Bar(x=[raw_mean],
           y=[raw_mean_val],
           marker_color='#005293', name="Raw text"),
    go.Bar(x=[pre_mean],
           y=[pre_mean_val],
           marker_color='#e37222', name="Preprocessed text"),
])
fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    title_text=f"Raw and preprocessed '{tox_cols[2]}' score, statistic '{selected_stat}'",
    # The y axis carries the statistic's value, the x axis the compared column.
    yaxis_title=f"{selected_stat.capitalize()} metric value",
    xaxis_title="Score column",
)
fig.show()
fig.write_image("test_svg.svg")

## Comparison of single hashtags through time
Here we plot the daily mean of selected toxicity metrics as a line chart over time

In [None]:
def plot_metrics_for_files(files_list, max_files=3):
    """Plot daily mean toxicity metrics over time, one chart per hashtag file.

    For each processed file the tweets are resampled to daily means and the
    ``lemma_toxicity``, ``lemma_obscene`` and ``lemma_insult`` columns are
    drawn as lines. Three takeover events are marked with vertical lines and
    text labels. Every chart is shown and written to
    ``<hashtag>_metrics_chart.svg`` in the current working directory.

    Args:
        files_list: File names of the per-hashtag CSVs.
        max_files: How many files from the start of ``files_list`` to plot.
            Defaults to 3, the number the original code hard-coded; pass
            ``None`` to plot every file.

    Returns:
        The list of created plotly figures, in file order.
    """
    # (x position, label, horizontal anchor of the label) for each marked event.
    takeover_events = [
        ('2022-10-03 00:00:00', "Acquisition moves forward", "right"),
        ('2022-10-27 00:00:00', "Deal closed", "left"),
        ('2022-12-16 00:00:00', "Musk bans a journalist", "left"),
    ]
    plotted_cols = ["lemma_toxicity", "lemma_obscene", "lemma_insult"]
    selected_files = files_list if max_files is None else files_list[:max_files]

    figures = []
    tweet_count = 0
    for file_name in selected_files:
        hashtag_name = file_name.split("_")[0]
        print(f"For hashtag {hashtag_name}, from file {file_name}")
        tweets_df = get_prepared_df_for_file(file_name)
        tweet_count += len(tweets_df.index)

        # Daily mean of every score and metadata column.
        grouped_by_day = tweets_df[tox_cols_preprocessed + meta_cols].resample("D", on="created_at").mean()

        # One line per plotted metric; the legend name drops the "lemma" prefix.
        traces = [
            go.Scatter(x=grouped_by_day.index, y=grouped_by_day[col],
                       name=" ".join(col.split("_")[1:]), mode="lines",
                       marker_color=color_i)
            for col, color_i in zip(plotted_cols, tum_colors[:3])
        ]

        fig = go.Figure(data=traces)
        fig.update_layout(
            title_text=f"Mean metrics for hashtag/keyword '{hashtag_name}' from 01/06/2022 to 03/01/2023",
            yaxis_title="Value",
            xaxis_title="Time",
            width=2000,
            height=400,
            autosize=False,
            margin={'l': 0, 'r': 0, 't': 25, 'b': 0},
            legend=dict(font=dict(size=25)),
        )
        fig.update_yaxes(rangemode="tozero")

        # Place the event labels slightly above the highest daily mean of any
        # preprocessed metric.
        annotation_height = grouped_by_day[tox_cols_preprocessed].max().max() + 0.01
        print(annotation_height)
        for event_time, event_label, label_anchor in takeover_events:
            fig.add_vline(x=event_time, line_color="#000000", line_width=3)
            fig.add_annotation(x=event_time, y=annotation_height, text=event_label,
                               xanchor=label_anchor, yanchor="top", showarrow=False)

        fig.show()
        fig.write_image(f"{hashtag_name}_metrics_chart.svg")
        figures.append(fig)

    # Only the plotted files are counted, so the message says so.
    print(f"There is {tweet_count} tweets in the {len(selected_files)} plotted files together!")
    return figures


figure_arr = plot_metrics_for_files(files_list)

Comparing the change in metrics pre- and post-takeover.

In [None]:
def plot_distribution_for_files(file_names):
    """Build a per-hashtag table of mean toxicity metrics pre/post takeover.

    Each file is split at the takeover date (27.10.2022): tweets created
    before that calendar day are "pre", tweets created on or after it are
    "post". For both halves the mean of every preprocessed toxicity column is
    taken; the resulting columns are named ``pre_<metric>`` / ``post_<metric>``.
    The table is displayed and returned.

    Args:
        file_names: File names of the per-hashtag CSVs.

    Returns:
        A DataFrame with one row per hashtag (indexed by hashtag name), or an
        empty DataFrame when ``file_names`` is empty.
    """
    # ISO format avoids the day-first / month-first ambiguity of "27/10/2022";
    # the date is loop-invariant, so it is parsed once.
    takeover_date = pd.Timestamp("2022-10-27").date()

    def _prefixed_means(frame, prefix):
        # Column means with the leading "lemma" replaced by `prefix`.
        means = frame[tox_cols_preprocessed].mean()
        means.index = [f"{prefix}_{'_'.join(col.split('_')[1:])}" for col in means.index]
        return means

    rows = []
    for file_name in file_names:
        hashtag_name = file_name.split("_")[0]
        print(f"\nFor hashtag {hashtag_name}, from file {file_name}")
        tweets_df = get_prepared_df_for_file(file_name)

        # Compare on calendar dates, as the original code did.
        # NOTE(review): presumably this sidesteps tz-aware vs naive timestamp
        # comparisons; confirm against the CSV contents.
        tweet_dates = tweets_df["created_at"].dt.date
        pre_to_df = tweets_df[tweet_dates < takeover_date]
        post_to_df = tweets_df[tweet_dates >= takeover_date]
        print(f"Pre take over data consists of {len(pre_to_df.index)} tweets, while post take over consists of {len(post_to_df.index)}")

        # One row per hashtag: pre_* means followed by post_* means.
        pre_post_means = pd.concat([_prefixed_means(pre_to_df, "pre"),
                                    _prefixed_means(post_to_df, "post")])
        rows.append(pre_post_means.to_frame(name=hashtag_name).T)

    # Concatenate once instead of growing a frame inside the loop.
    combined_df = pd.concat(rows) if rows else pd.DataFrame()
    display(combined_df)
    return combined_df

combined_diff_df = plot_distribution_for_files(files_list)

In [None]:
# One bar chart per toxicity metric showing, for every hashtag, the relative
# change (in percent) of the post-takeover mean compared to the pre-takeover mean.
for tox_metric in tox_cols:
    trace_arr = []
    for index, row in combined_diff_df.iterrows():
        diff = (row[f"post_{tox_metric}"] / row[f"pre_{tox_metric}"]) * 100 - 100
        # Blue for an increase (or no change), orange for a decrease.
        bar_color = '#005293' if diff >= 0 else '#e37222'
        trace_arr.append(go.Bar(x=[index],
                                y=[diff],
                                marker_color=bar_color,
                                name=index,
                                showlegend=False))
    fig = go.Figure(trace_arr)
    fig.update_layout(
        title_text=f"Relative change in mean '{tox_metric}' post takeover",
        yaxis_title="Relative change to pre-take over",
        xaxis_title="Hashtag/Keyword",
        # `diff` is already in percent, so only the unit is appended; the
        # previous ".0%" suffix produced labels such as "2.5.0%".
        yaxis=dict(ticksuffix="%"),
        width=500,
        height=300,
        autosize=False,
        margin={'l': 0, 'r': 0, 't': 25, 'b': 0}
    )

    fig.write_image(f"change_in_{tox_metric}_chart.svg")
    fig.show()

In [None]:
# Display the per-hashtag table of pre-/post-takeover mean metrics.
combined_diff_df

In [None]:
# Scatter of mean toxicity against mean severe toxicity, one labelled point per
# hashtag, drawn once for the pre-takeover and once for the post-takeover means.
x_col_pre, y_col_pre = "pre_toxicity", "pre_severe_toxicity"
x_col_post, y_col_post = "post_toxicity", "post_severe_toxicity"

# (legend name, x column, y column, marker colour, label position) per period.
trace_specs = [
    ('Pre-takeover', x_col_pre, y_col_pre, '#005293', 'top center'),
    ('Post-takeover', x_col_post, y_col_post, '#e37222', 'bottom center'),
]
trace_list = [
    go.Scatter(
        name=period_name,
        x=combined_diff_df[x_col],
        y=combined_diff_df[y_col],
        text=combined_diff_df.index,
        mode="markers+text",
        marker=dict(color=point_color, size=6),
        textposition=label_position,
    )
    for period_name, x_col, y_col, point_color, label_position in trace_specs
]

fig = go.Figure(data=trace_list)
fig.update_layout(
    title_text="Comparison of toxicity and severe-toxicity metrics pre-post takeover",
    yaxis_title="Severe toxicity",
    xaxis_title="Toxicity",
    margin={'l': 0, 'r': 0, 't': 25, 'b': 0},
    legend=dict(font=dict(size=25)),
)
# Start the y axis at zero.
fig.update_yaxes(rangemode="tozero")
fig.show()
fig.write_image("toxicity_vs_severe_toxicity.svg")

## T-test
https://www.geeksforgeeks.org/how-to-conduct-a-two-sample-t-test-in-python/

In [None]:
from statsmodels.stats.weightstats import ttest_ind
import numpy as np
import pingouin as pg

def t_test_v1(file_names):
    """Run a two-sample t-test on ``lemma_toxicity`` pre vs post takeover.

    Each file is split at the takeover date (27.10.2022): tweets created
    before that calendar day are "pre", tweets created on or after it are
    "post". The test is run with pingouin and cross-checked with statsmodels;
    both results are printed.

    Args:
        file_names: File names of the per-hashtag CSVs.

    Returns:
        A dict mapping each hashtag name to the result returned by
        ``pg.ttest`` for that hashtag.
    """
    # ISO format avoids the day-first / month-first ambiguity of "27/10/2022";
    # the date is loop-invariant, so it is parsed once.
    takeover_date = pd.Timestamp("2022-10-27").date()

    results = {}
    for file_name in file_names:
        hashtag_name = file_name.split("_")[0]
        print(f"\n\nFor hashtag {hashtag_name}, from file {file_name}")
        tweets_df = get_prepared_df_for_file(file_name)

        tweet_dates = tweets_df["created_at"].dt.date
        pre_to_df = tweets_df[tweet_dates < takeover_date]
        post_to_df = tweets_df[tweet_dates >= takeover_date]
        print(f"Pre take over data consists of {len(pre_to_df.index)} tweets, while post take over consists of {len(post_to_df.index)}")

        print("T-test")
        pre_scores = pre_to_df["lemma_toxicity"]
        post_scores = post_to_df["lemma_toxicity"]
        # correction=True requests Welch's unequal-variance test.
        result = pg.ttest(pre_scores, post_scores, correction=True)

        print("With pingouin", result)
        # usevar="unequal" makes the statsmodels cross-check use the same
        # unequal-variance assumption; its default is pooled variance.
        print("Stats models", ttest_ind(pre_scores, post_scores, usevar="unequal"))
        results[hashtag_name] = result
    return results

# Stored under its own name: assigning to `combined_diff_df` here would replace
# the pre/post table that earlier cells read.
t_test_results = t_test_v1(files_list)