# Toxicity through time

In [None]:
%reload_ext autoreload
%autoreload 2

import os
import sys
import time

import nltk
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import torch
from detoxify import Detoxify

# nltk.download('stopwords')

# Move the working directory two levels up from the notebook's folder so the
# relative "./data/..." paths used below resolve (presumably the project
# root - confirm against the repository layout). The sentinel variable keeps a
# re-run of this cell from climbing two more levels.
try:
    print(run_only_once)
except NameError:
    # Only a missing sentinel (first execution in this kernel) should trigger
    # the directory change; any other failure is a real error and must surface.
    print(os.getcwd())
    os.chdir("./../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

In [None]:

# "./data/processed/merged_tweets_and_toxicity_31_12.csv"
# Tweets for one hashtag; per the file name, Detoxify scores are already attached.
file_name = "./data/detoxify_toxicity_added_hashtags/vegetarian_hashtag_6_1_2023_detoxify_toxicity_712.csv"

# Read the CSV and turn the timestamp column into real datetimes so it can be resampled.
tweets_df = pd.read_csv(file_name).assign(
    created_at=lambda frame: pd.to_datetime(frame["created_at"])
)
display(tweets_df)

# Daily mean of the engagement counters and of every toxicity metric.
toxicity_cols = ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
selected_cols = [
    "retweet_count",
    "reply_count",
    "like_count",
    "quote_count",
    "created_at",
    *toxicity_cols,
]
grouped_by_day = tweets_df.loc[:, selected_cols].resample(rule="D", on="created_at").mean()
display(grouped_by_day)

Daily mean toxicity metrics plotted over time.

In [None]:
# One trace per toxicity metric, showing its daily mean over time.
col_traces = [
    go.Scatter(x=grouped_by_day.index, y=grouped_by_day[column], name=column)
    for column in toxicity_cols
]

fig = go.Figure(col_traces)
fig.update_layout(
    title_text="Change of toxicity metrics through time",
    # The resampled dates are plotted on x and the metric means on y, so the
    # axis titles must follow that orientation.
    xaxis_title="Date",
    yaxis_title="Mean metric value",
)
fig.show()

## Most and least toxic tweets in our data set

In [None]:
# Show the tweet text in full instead of truncating long cells.
pd.set_option("display.max_colwidth", None)

# The ten tweets with the highest severe_toxicity score.
tweets_df.nlargest(n=10, columns=["severe_toxicity"]).loc[
    :, ["author_id", "text", *toxicity_cols]
]

In [None]:
# The ten tweets with the lowest severe_toxicity score.
tweets_df.nsmallest(n=10, columns=["severe_toxicity"]).loc[
    :, ["author_id", "text", *toxicity_cols]
]

More preprocessing has to be done on the tweets to improve the accuracy of the toxicity prediction model, because the current results are laughably inaccurate.

In [None]:
# Load the Detoxify 'original' model. Prefer the GPU, but fall back to the CPU
# so this cell also runs on machines without CUDA instead of raising.
detoxify_device = "cuda" if torch.cuda.is_available() else "cpu"
model = Detoxify('original', device=detoxify_device)



In [None]:
# Score one hand-picked tweet to sanity-check the model's output.
sample_tweet = "Big first world problems…but wtf is going on with Netflix app on Roku tv. Like good god. It’s worse than wow on a prepatch/ expansion launch day."
model.predict(sample_tweet)

## Hashtag toxicity comparison

In [None]:
def compare_hashtag_toxicity(files_list, data_dir="./data/detoxify_toxicity_added_hashtags"):
    """Plot daily mean toxicity metrics per hashtag, then compare the hashtags.

    For every CSV in ``files_list`` the tweets are resampled to daily means,
    the resulting table is displayed, and a figure with one line per toxicity
    metric is shown. After all files are processed, one more figure overlays
    the daily mean ``toxicity`` of every hashtag.

    Parameters
    ----------
    files_list : list of str
        CSV file names inside ``data_dir``. The text before the first
        underscore of each name is used as the hashtag label. Each CSV must
        contain ``created_at``, the four engagement count columns and the six
        toxicity metric columns selected below.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to the directory the
        notebook has always read from.

    Returns
    -------
    None
        Tables and figures are rendered as side effects.
    """
    # Column selections do not depend on the file, so build them once.
    toxicity_cols = ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
    selected_cols = ["retweet_count", "reply_count", "like_count", "quote_count", "created_at"] + toxicity_cols

    comparison_traces = []
    for file_name in files_list:
        hashtag_name = file_name.split("_")[0]
        print(f"For hashtag {hashtag_name}, from file {file_name}")

        tweets_df = pd.read_csv(f"{data_dir}/{file_name}")
        tweets_df["created_at"] = pd.to_datetime(tweets_df["created_at"])
        # Daily mean of every selected column, indexed by date.
        grouped_by_day = tweets_df[selected_cols].resample("D", on="created_at").mean()
        display(grouped_by_day)

        metric_traces = [
            go.Scatter(x=grouped_by_day.index, y=grouped_by_day[column], name=column, mode="lines")
            for column in toxicity_cols
        ]
        # Only the overall toxicity score takes part in the cross-hashtag comparison.
        comparison_traces.append(
            go.Scatter(x=grouped_by_day.index, y=grouped_by_day["toxicity"], name=hashtag_name, mode="lines")
        )

        fig = go.Figure(metric_traces)
        fig.update_layout(
            title_text="Change of toxicity metrics through time",
            # Dates are on x and metric means on y; label the axes accordingly.
            xaxis_title="Date",
            yaxis_title="Mean metric value",
        )
        fig.show()

    fig_com = go.Figure(comparison_traces)
    fig_com.update_layout(
        title_text="Change of toxicity through time for each hashtag",
        xaxis_title="Date",
        yaxis_title="Mean value",
    )
    fig_com.show()

In [None]:
# Per-hashtag tweet exports to compare against each other.
files_list = [
    "vegetarian_hashtag_6_1_2023_detoxify_toxicity_712.csv",
    "trump_hashtag_04_01_2023_detoxify_toxicity_712.csv",
    "uno_hashtag_09_01_2023_detoxify_toxicity_712.csv",
]

compare_hashtag_toxicity(files_list)