# Comparison of raw text toxicity vs toxicity of the preprocessed text

In [None]:
%reload_ext autoreload
%autoreload 2

import os 
import sys
import pandas as pd
import numpy as np
import plotly 
import plotly.graph_objects as go
import time


# Change the working directory three levels up exactly once per kernel session
# (presumably to the repository root, where ./data lives -- confirm).
# `run_only_once` only exists after the first execution of this cell, so a
# NameError here means the directory has not been changed yet.
try:
    print(run_only_once)
except NameError:
    print(os.getcwd())
    os.chdir("./../../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

In [None]:
# One merged CSV export per hashtag/keyword; the part of the file name before
# the first underscore is the hashtag itself.
files_list = [
    f"{tag}_hashtag_merged.csv"
    for tag in ("fitness", "musk", "netflix", "trump", "uno", "vegan", "vegetarian")
]

# Base names of the toxicity scores. Each score exists twice in the data:
# prefixed with "normal_" (raw text) and with "lemma_" (preprocessed text).
tox_cols = [
    "toxicity",
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
]
tox_cols_raw = ["normal_" + name for name in tox_cols]
tox_cols_preprocessed = ["lemma_" + name for name in tox_cols]

# Engagement counters plus the tweet timestamp column.
meta_cols = [
    "retweet_count",
    "reply_count",
    "like_count",
    "quote_count",
    "created_at",
]

def get_prepared_df_for_file(file_name):
    """Load one CSV from ./data/preprocessed/ with ``created_at`` as datetimes.

    Args:
        file_name: Name of the CSV file inside ``./data/preprocessed/``
            (resolved relative to the current working directory).

    Returns:
        pandas.DataFrame: The file contents, with the ``created_at`` column
        converted by ``pd.to_datetime``.
    """
    raw_frame = pd.read_csv(f"./data/preprocessed/{file_name}")
    parsed_timestamps = pd.to_datetime(raw_frame["created_at"])
    return raw_frame.assign(created_at=parsed_timestamps)

Link to the page where all events related to the takeover are listed: (link missing — TODO: add the URL).
## Compare the average of toxicity metrics for raw and lemmatized text

In [None]:
def plot_metrics_for_files(files_list):
    """Load every file in ``files_list`` and concatenate them into one frame.

    Despite its name this function does not plot anything; it only loads and
    merges the data. Note that a function with the same name is defined again
    further down in this file and replaces this one once that cell has run.

    Args:
        files_list: File names understood by ``get_prepared_df_for_file``.

    Returns:
        pandas.DataFrame: All rows of all files, re-indexed from 0.
    """
    loaded_frames = []
    for csv_name in files_list:
        # The hashtag is the part of the file name before the first underscore.
        tag = csv_name.split("_")[0]
        print(f"For hashtag {tag}, from file {csv_name}")
        loaded_frames.append(get_prepared_df_for_file(csv_name))

    total_tweets = sum(len(frame.index) for frame in loaded_frames)
    all_tweets_df = pd.concat(loaded_frames, ignore_index=True)
    print(f"There is {total_tweets} tweets in all files together!")
    return all_tweets_df


combined_data_df = plot_metrics_for_files(files_list)

In [None]:
# describe() summary of the raw-text ("normal_*") toxicity columns.
raw_stats = combined_data_df.loc[:, tox_cols_raw].describe()
raw_stats

In [None]:
# describe() summary of the preprocessed-text ("lemma_*") toxicity columns.
processed_stats = combined_data_df.loc[:, tox_cols_preprocessed].describe()
processed_stats

In [None]:
# One grouped bar chart per summary statistic, comparing raw and preprocessed
# text across all toxicity metrics. The slice drops the first and last row of
# the describe() output (for numeric columns: "count" and "max").
for stat_name in processed_stats.index[1:-1]:
    print(stat_name)
    raw_values = raw_stats.loc[stat_name, tox_cols_raw]
    proc_values = processed_stats.loc[stat_name, tox_cols_preprocessed]

    # Both traces share the un-prefixed metric names on the x axis so the
    # bars of one metric end up side by side.
    fig = go.Figure(data=[
        go.Bar(name='Raw text', x=tox_cols, y=raw_values, marker_color='#005293'),
        go.Bar(name='Preprocessed text', x=tox_cols, y=proc_values, marker_color='#e37222'),
    ])
    fig.update_layout(
        barmode='group',
        title_text=f"Comparison of '{stat_name}' for raw and preprocessed text, on all toxicity metrics",
        yaxis_title="Value",
        xaxis_title="Metric name",
    )
    fig.show()

In [None]:
# Plot one statistic of one toxicity metric for raw vs. preprocessed text.
selected_stat = "mean"
# NOTE: despite their names, raw_mean / pre_mean hold the *column names* of the
# selected metric (index 2 of tox_cols, i.e. "obscene"); the actual statistic
# values are raw_mean_val / pre_mean_val.
raw_mean = tox_cols_raw[2]
pre_mean = tox_cols_preprocessed[2]
raw_mean_val = raw_stats.loc[selected_stat, raw_mean]
pre_mean_val = processed_stats.loc[selected_stat, pre_mean]

# Relative change of the preprocessed value with respect to the raw value, in
# percent. Subtracting 1 turns the ratio into a signed increase/decrease
# (a bare ratio * 100 would report e.g. "95%" for a 5% decrease).
change_pct = (pre_mean_val / raw_mean_val - 1) * 100
print(f"Preprocessing resulted in a {change_pct:+.3f}% change compared to the raw text metric.")

fig = go.Figure([
    go.Bar(x=[raw_mean], y=[raw_mean_val], marker_color='#005293', name="Raw text"),
    go.Bar(x=[pre_mean], y=[pre_mean_val], marker_color='#e37222', name="Preprocessed text"),
])
fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    title_text=f"Raw and preprocessed text for metric '{selected_stat}'",
    # The y axis carries the statistic value and the x axis the metric column
    # names, matching the axis titles of the grouped bar charts.
    yaxis_title="Value",
    xaxis_title="Metric name",
)
fig.show()
# Static image export; plotly needs an image export engine for this call.
fig.write_image("test_svg.svg")

## Comparison of single hashtags through time
Here we plot, for each hashtag, the daily mean of every toxicity metric of the preprocessed text as a line chart over time.

In [None]:
def plot_metrics_for_files(files_list):
    """Plot the daily mean of the preprocessed toxicity metrics, one figure per file.

    For every file the tweets are resampled per calendar day on ``created_at``
    and the mean of each ``lemma_*`` toxicity column is drawn as a line. Two
    dashed vertical lines mark 2022-10-04 ("Acquisition moves forward") and
    2022-10-27 ("Deal closed"). Each figure is shown as soon as it is built.

    This definition shadows the function of the same name defined earlier in
    this file.

    Args:
        files_list: File names understood by ``get_prepared_df_for_file``.

    Returns:
        list: The plotly figures, one per file, in the order of ``files_list``.
    """
    # One line color per entry of tox_cols_preprocessed (loop-invariant).
    tum_colors = ["#165DB1", "#FED702", "#F7811E", "#B55CA5", "#6A757E", "#C7D97D"]

    figures = []
    tweet_count = 0
    for file_name in files_list:
        # The hashtag is the part of the file name before the first underscore.
        hashtag_name = file_name.split("_")[0]
        print(f"For hashtag {hashtag_name}, from file {file_name}")
        tweets_df = get_prepared_df_for_file(file_name)
        tweet_count += len(tweets_df.index)

        # Daily mean of every selected column; `created_at` becomes the index.
        grouped_by_day = tweets_df[tox_cols_preprocessed + meta_cols].resample("D", on="created_at").mean()

        traces = [
            go.Scatter(
                x=grouped_by_day.index,
                y=grouped_by_day[col],
                # "lemma_severe_toxicity" -> "severe toxicity"
                name=" ".join(col.split("_")[1:]),
                mode="lines",
                marker_color=color_i,
            )
            for col, color_i in zip(tox_cols_preprocessed, tum_colors)
        ]

        fig = go.Figure(data=traces)
        fig.update_layout(
            title_text=f"Mean metrics for hashtag/keyword '{hashtag_name}' from 01/06/2022 to 03/01/2023",
            yaxis_title="Value",
            xaxis_title="Time",
        )

        # Place the event labels slightly above the highest plotted value.
        annotation_height = grouped_by_day[tox_cols_preprocessed].max().max() + 0.01
        print(annotation_height)
        fig.add_vline(x='2022-10-04 00:00:00', line_dash="dash", line_color="#8F81EA", line_width=3)
        fig.add_annotation(x='2022-10-04 00:00:00', y=annotation_height, text="Acquisition moves forward",
                           xanchor="right", yanchor="top", showarrow=False)

        fig.add_vline(x='2022-10-27 00:00:00', line_dash="dash", line_color="#D95117", line_width=3)
        fig.add_annotation(x='2022-10-27 00:00:00', y=annotation_height, text="Deal closed",
                           xanchor="left", yanchor="top", showarrow=False)
        fig.show()
        # Keep the figure so the caller receives something other than an
        # always-empty list.
        figures.append(fig)

    print(f"There is {tweet_count} tweets in all files together!")
    return figures


figure_arr = plot_metrics_for_files(files_list)

Comparing distributions of normal and severe toxicity for each hashtag through time.

In [None]:
import plotly.figure_factory as ff

def plot_distribution_for_file(file_name):
    """Print mean toxicity before and after the takeover date for one file.

    The tweets are split on 2022-10-27 (calendar date of ``created_at``):
    earlier tweets are "pre-take over", tweets on or after that date are
    "post take over". For both parts the mean of ``normal_toxicity`` (raw
    text) and ``lemma_toxicity`` (preprocessed text) is printed together with
    the relative change in percent, and the ``describe()`` table of the
    pre-take-over ``lemma_toxicity`` is displayed.

    Despite its name the function currently produces no plot.

    Args:
        file_name: File name understood by ``get_prepared_df_for_file``.

    Returns:
        None
    """
    hashtag_name = file_name.split("_")[0]
    print(f"For hashtag {hashtag_name}, from file {file_name}")
    tweets_df = get_prepared_df_for_file(file_name)

    # Split the data into pre-takeover and post-takeover (from 27.10.2022 on).
    # The ISO literal avoids any day/month ambiguity when parsing, and
    # comparing plain calendar dates keeps the split independent of whether
    # `created_at` carries a time zone.
    takeover_date = pd.Timestamp("2022-10-27").date()
    tweet_dates = tweets_df["created_at"].dt.date
    pre_takeover_mask = tweet_dates < takeover_date
    post_takeover_mask = tweet_dates >= takeover_date
    pre_to_df = tweets_df[pre_takeover_mask]
    post_to_df = tweets_df[post_takeover_mask]
    print(f"Pre take over data consists of {len(pre_to_df.index)} tweets, while post take over consists of {len(post_to_df.index)}")

    pre_toxicity = pre_to_df['normal_toxicity'].mean()
    post_toxicity = post_to_df['normal_toxicity'].mean()
    pre_toxicity_p = pre_to_df['lemma_toxicity'].mean()
    post_toxicity_p = post_to_df['lemma_toxicity'].mean()
    print(f"- Raw pre-take over toxicity: {pre_toxicity:.5f}")
    print(f"- Raw post-take over toxicity: {post_toxicity:.5f}")
    print(f"- Proc pre-take-over toxicity {pre_toxicity_p:.5f}")
    print(f"- Proc post-take-over toxicity {post_toxicity_p:.5f}")
    # Relative change from pre- to post-takeover, in percent.
    print(f"- Raw change: {((post_toxicity/pre_toxicity) * 100) - 100:.2f}%")
    # "Proc" = preprocessed text, matching the labels above ("Pre" would read
    # as "pre-takeover").
    print(f"- Proc change: {((post_toxicity_p/pre_toxicity_p) * 100) - 100:.2f}%")

    # Summary statistics of the preprocessed toxicity before the takeover.
    # `display` is the notebook built-in provided by IPython.
    display(pre_to_df[["lemma_toxicity"]].describe())
    
        
# Run the pre/post takeover comparison for the first file only; iterate over
# files_list instead to cover every hashtag.
if files_list:
    plot_distribution_for_file(files_list[0])