# Toxicity metrics data generation
In this notebook I generate toxicity metrics with the Detoxify library, which measures the toxicity of texts (in our case, tweets).
This is meant as a supplementary approach to the Perspective API, since we are limited by the number of queries when using it.

Make sure CUDA is installed: running the model on the GPU achieves at least a 5x speed-up.

In [None]:
%reload_ext autoreload
%autoreload 2

import os 
import sys
import pandas as pd
import numpy as np
import plotly 
import plotly.graph_objects as go
import time

import nltk
from detoxify import Detoxify
# nltk.download('stopwords')

# Move the working directory two levels up (presumably to the project root, so
# that relative paths such as "./data/..." resolve - confirm against the repo layout).
# `run_only_once` acts as a sentinel in the kernel namespace: once it exists,
# re-running this cell only prints it instead of climbing further up.
try:
    print(run_only_once)
except NameError:
    # Only the "sentinel not defined yet" case is expected here; any other
    # error should surface instead of being swallowed.
    print(os.getcwd())
    os.chdir("./../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

In [None]:
import torch

# Prefer the GPU: per the original author's note, scoring ~1.2 million tweets
# without CUDA takes on the order of 33 hours.
cuda_present = torch.cuda.is_available()
print(f"Cuda present: {cuda_present}")

# Pick the device from the availability check instead of assuming CUDA exists.
device = torch.device("cuda" if cuda_present else "cpu")

if cuda_present:
    # Clear cached GPU memory to reduce out-of-memory errors, then report usage.
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("WARNING: CUDA is not available - falling back to the CPU, which is far slower.")

# Load the model on the selected device ("cuda" when available, otherwise "cpu").
model = Detoxify('original', device=str(device))

In [None]:
# for single predictions
# model.predict("#Bridgerton season 2 was satisfying but very slow ... love d end #Netflix	")

In [None]:
# Load the raw tweet dump and keep only the tweets whose "lang" column is "en".
tweets_df = pd.read_csv("./data/twitter_1_million_tweet_dump_29_12_2022.csv")
total_len = len(tweets_df.index)
tweets_df = tweets_df[tweets_df["lang"] == "en"]

# Report against the size of the dump *before* filtering (total_len), not the
# number of tweets that remain afterwards.
removed_count = total_len - len(tweets_df.index)
print(f"Removed {removed_count} tweets out of {total_len}, since they were not in English")

# Renumber the rows 0..n-1: the toxicity scores are later matched to tweets by
# row position, so gaps left by the filter would make them mismatch down the line.
tweets_df = tweets_df.reset_index(drop=True)
tweets_df

## Generating toxicity scores for each tweet
The code below needed 19092 seconds (5.3 hours) to run the last time, with CUDA.

In [None]:
csv_file_n = "toxicity_temp_7_1_single_pred.csv"

# Score columns written to the CSV, in file order.
TOXICITY_COLUMNS = ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
# Number of tweets passed to a single model.predict call. 1 reproduces the
# one-tweet-at-a-time run; a larger value (e.g. 50) predicts in batches, which
# should be a bit faster.
BATCH_SIZE = 1
# Every FLUSH_EVERY rows the buffered scores are appended to the CSV and the
# GPU cache is cleared, which keeps memory consumption bounded.
FLUSH_EVERY = 500
if FLUSH_EVERY % BATCH_SIZE != 0:
    raise ValueError("FLUSH_EVERY must be a multiple of BATCH_SIZE, otherwise flushes are skipped")

# generating toxicity scores for each tweet
start_time = time.time()

# Start the output file with just the header row (this overwrites any previous run).
pd.DataFrame(columns=TOXICITY_COLUMNS).to_csv(csv_file_n)

content_list = tweets_df["text"].to_list()

# Per-batch score frames are buffered in a list and concatenated once per
# flush, instead of re-concatenating a growing DataFrame on every iteration.
pending_frames = []
for i in range(0, len(content_list), BATCH_SIZE):
    if i % FLUSH_EVERY == 0:
        print(f"At row: {i}")
        torch.cuda.empty_cache()
        if pending_frames:
            pd.concat(pending_frames, ignore_index=True).to_csv(csv_file_n, mode='a', header=False)
            pending_frames = []
        print("Cleared GPU cache and saved to file")

    batch_scores = model.predict(content_list[i:i + BATCH_SIZE])
    pending_frames.append(pd.DataFrame(batch_scores, columns=TOXICITY_COLUMNS))

# Append whatever is still buffered after the last flush. `toxicity_df` holds
# only this final chunk, not the scores of the whole run.
if pending_frames:
    toxicity_df = pd.concat(pending_frames, ignore_index=True)
else:
    toxicity_df = pd.DataFrame(columns=TOXICITY_COLUMNS)
toxicity_df.to_csv(csv_file_n, mode='a', header=False)
print(f"Execution took: {time.time() - start_time:.2f} seconds")
display(toxicity_df)

In [None]:
# Read the toxicity scores back from disk. The "Unnamed: 0" column (the index
# that was saved along with the scores) is discarded, so the frame gets a fresh
# 0..n-1 row index from read_csv.
tox_df = pd.read_csv(csv_file_n).drop(columns="Unnamed: 0")

# Give the tweets the same kind of positional index before the two are compared.
tweets_df = tweets_df.reset_index(drop=True)

display(tox_df, tweets_df)

In [None]:
# testing problems with toxicity
# pd.set_option('display.max_colwidth', None)
# display(tox_df.iloc[1136439])
# display(tweets_df.iloc[1136439])

In [None]:
merged_f = "merged_single_pred_toxicity_7_1.csv"

# Tweets and scores are matched on their index, i.e. by row position. The merge
# is an inner join, so a row-count mismatch would silently drop the unmatched
# rows - make that visible instead of letting it pass unnoticed.
if len(tweets_df.index) != len(tox_df.index):
    print(
        f"WARNING: {len(tweets_df.index)} tweets but {len(tox_df.index)} toxicity rows; "
        "the merge keeps only index labels present in both frames"
    )

# merge tweets with toxicity and save it to a file
merged_toxic_df = pd.merge(tweets_df, tox_df, left_index=True, right_index=True)
merged_toxic_df.to_csv(merged_f)
display(merged_toxic_df)

In [None]:
# merged_toxic_df.iloc[1136439]