# Toxicity metrics data generation
In this notebook I generate toxicity metrics with the Detoxify library which is used to measure toxicity of texts, in our case tweets. 
This is meant as a supplementary approach to the Perspective API, since we are limited by the number of queries when using it.

Make sure CUDA is installed; running on the GPU achieves at least a 5x speed-up.

In [None]:
%reload_ext autoreload
%autoreload 2

import os 
import sys
import pandas as pd
import numpy as np
import plotly 
import plotly.graph_objects as go
import time
import random
# progress monitoring
from tqdm import tqdm

from sentence_splitter import split_text_into_sentences
# for dealing with fucked up data
from unidecode import unidecode

import nltk
from detoxify import Detoxify
# nltk.download('stopwords')

# Move the working directory two levels up exactly once per kernel session.
# `run_only_once` is only assigned after the first execution, so a NameError
# means the directory has not been changed yet; re-running the cell merely
# prints the marker instead of climbing two more levels.
try:
    print(run_only_once)
except NameError:
    print(os.getcwd())
    os.chdir("./../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

In [None]:
import torch

# Prefer the GPU: without CUDA the run is very slow (about 33 hours for
# 1.2 million tweets the last time it was measured).
cuda_present = torch.cuda.is_available()
print(f"Cuda present: {cuda_present}")

if cuda_present:
    # clear cached allocations to reduce out-of-memory errors
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    # the CUDA-only calls above would raise here, so skip them and warn instead
    print("WARNING: CUDA is not available, falling back to the (much slower) CPU.")

device = torch.device("cuda" if cuda_present else "cpu")

# load the model on the selected device
model = Detoxify('original', device=device.type)

In [None]:
# Smoke test: score one example tweet and show the resulting metrics
sample_tweet = "Love on the Spectrum is the cutest show on Netflix rn ðŸ¥¹ðŸ’“"
model.predict(sample_tweet)

## Generating toxicity scores for each tweet
The code below needed 19092 seconds (5.3 hours) to run the last time, with CUDA on ~1 million tweets.

In [None]:
# Sanity check of the sentence splitter on a short English paragraph
sample_paragraph = 'This is a paragraph. It contains several sentences. "But why," you ask?'
sentences = split_text_into_sentences(text=sample_paragraph, language='en')

# print one detected sentence per line
for sent in sentences:
    print(sent)

This is the initial implementation of toxicity generation, without us doing any preprocessing on the text.

In [None]:
# TODO move this to .py file
def _append_batches_to_csv(batch_frames, csv_columns, output_file_name):
    """Append the collected batch frames to the CSV as one chunk (no header row)."""
    if not batch_frames:
        return
    chunk_df = pd.concat(batch_frames, ignore_index=True)
    # keep the header's column order first; any unexpected extra columns go last
    extra_columns = [col for col in chunk_df.columns if col not in csv_columns]
    chunk_df = chunk_df.reindex(columns=csv_columns + extra_columns)
    chunk_df.to_csv(output_file_name, mode='a', header=False)


def generate_toxicity_for_tweet_file(input_file, output_file_name):
    """Score every English tweet of a raw hashtag CSV with Detoxify and save it.

    Reads ``./data/raw_hashtags/<input_file>``, keeps the rows whose ``lang``
    column equals ``"en"``, runs the notebook-level ``model`` on the ``text``
    column in batches of 50 and writes the tweet columns plus the six toxicity
    columns to ``output_file_name``. Scored rows are appended to the file
    every 500 rows so that not everything is held in memory.

    Args:
        input_file: File name of the CSV inside ``./data/raw_hashtags/``.
        output_file_name: Path of the CSV to write; it is (re)created with a
            header row before any scores are appended.

    Returns:
        None. The scores are only written to disk.
    """
    print(f"\nStarted generating toxicity metrics for:\n"
      f"-input file: '{input_file}',\n"
      f"-output file: '{output_file_name}'")
    tweets_df = pd.read_csv("./data/raw_hashtags/" + input_file)
    total_len = len(tweets_df.index)
    # reset the index, otherwise the toxicity metrics will mismatch down the line
    tweets_df = tweets_df[tweets_df["lang"] == "en"].reset_index(drop=True)
    print(f"Removed {total_len - len(tweets_df.index)} tweets out of {total_len}, since they were not in English")

    print("Tweet df:")
    display(tweets_df.head(5))

    # generating toxicity scores for each tweet
    start_time = time.time()
    csv_columns = list(tweets_df.columns) + ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
    # save only the header row to the file; data chunks are appended below
    pd.DataFrame(columns=csv_columns).to_csv(output_file_name)
    content_list = tweets_df["text"].to_list()

    step = 50          # tweets per model call
    flush_every = 500  # rows collected before they are appended to the file
    pending_frames = []
    for i in range(0, len(content_list), step):
        if i % flush_every == 0 and i != 0:
            print(f"At row: {i}")
            torch.cuda.empty_cache()
            _append_batches_to_csv(pending_frames, csv_columns, output_file_name)
            print("Cleared GPU cache and saved to file")
            pending_frames = []

        curr_tox_dict = model.predict(content_list[i:i + step])
        # both frames carry a fresh 0..n-1 index, so an index merge pairs each
        # tweet with its own scores
        batch_df = tweets_df.iloc[i:i + step].reset_index(drop=True)
        pending_frames.append(pd.merge(batch_df, pd.DataFrame(curr_tox_dict),
                                       left_index=True, right_index=True))

    _append_batches_to_csv(pending_frames, csv_columns, output_file_name)
    print(f"Execution took: {time.time() - start_time:.2f} seconds")
    print(f"Finished saving to file '{output_file_name}'\n")

Here we implement a new version of the function above, which splits the text into sentences and computes the averages over them.

In [None]:
# TODO move this to .py file
def _split_tweet_into_sentences(row, processed_text, raw_text):
    """Split a tweet into sentences, falling back to the unidecoded raw text."""
    try:
        return split_text_into_sentences(text=processed_text, language='en')
    except TypeError:
        # for handling bad string regex etc.
        # a raw text that is not a string (e.g. a missing value) cannot be
        # unidecoded either, so it is treated as empty
        ascii_raw = unidecode(raw_text) if isinstance(raw_text, str) else ""
        print(f"At row {row}, encountered non-sentence splittable string '{processed_text}'")
        print(f"Trying to split the original sentence parsed with unidecode '{ascii_raw}'!")
        if not ascii_raw:
            return []
        return split_text_into_sentences(text=ascii_raw, language='en')


def _average_sentence_scores(sentence_scores, score_columns):
    """Average per-sentence score dicts; NaN for every column when there are none."""
    if not sentence_scores:
        return {column: float("nan") for column in score_columns}
    sentence_count = len(sentence_scores)
    return {key: sum(scores[key] for scores in sentence_scores) / sentence_count
            for key in sentence_scores[0].keys()}


def _append_scored_chunk_to_csv(tweets_chunk, tox_records, csv_columns, output_file_name):
    """Join a slice of tweets with its score records and append it to the CSV."""
    if not tox_records:
        return
    # both sides get a fresh 0..n-1 index so the index merge pairs them row by row
    chunk_df = pd.merge(tweets_chunk.reset_index(drop=True), pd.DataFrame(tox_records),
                        left_index=True, right_index=True)
    # keep the header's column order first; any unexpected extra columns go last
    extra_columns = [col for col in chunk_df.columns if col not in csv_columns]
    chunk_df = chunk_df.reindex(columns=csv_columns + extra_columns)
    chunk_df.to_csv(output_file_name, mode='a', header=False)


def upgraded_generate_toxicity_for_tweet_file(input_file, output_file_name):
    """Score every English tweet of a lemmatized CSV sentence by sentence.

    Reads ``./data/lemmatized/<input_file>``, keeps the rows whose ``lang``
    column equals ``"en"``, splits each ``processed_text`` into sentences,
    scores every sentence with the notebook-level ``model`` and stores the
    per-tweet average of each metric. When splitting raises ``TypeError`` the
    unidecoded ``text`` column is split instead. Tweets that yield no sentence
    at all get NaN for every metric. Scored rows are appended to
    ``output_file_name`` every 5000 rows.

    Args:
        input_file: File name of the CSV inside ``./data/lemmatized/``.
        output_file_name: Path of the CSV to write; it is (re)created with a
            header row before any scores are appended.

    Returns:
        None. The scores are only written to disk.
    """
    print(f"\nStarted generating toxicity metrics for:\n"
      f"-input file: '{input_file}',\n"
      f"-output file: '{output_file_name}'")
    tweets_df = pd.read_csv("./data/lemmatized/" + input_file)
    total_len = len(tweets_df.index)
    # reset the index, otherwise the toxicity metrics will mismatch down the line
    tweets_df = tweets_df[tweets_df["lang"] == "en"].reset_index(drop=True)
    print(f"Removed {total_len - len(tweets_df.index)} tweets out of {total_len}, since they were not in English")

    print("Tweet df:")
    display(tweets_df.head(5))

    # generating toxicity scores for each tweet
    start_time = time.time()
    score_columns = ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack"]
    csv_columns = list(tweets_df.columns) + score_columns
    # save only the header row to the file; data chunks are appended below
    pd.DataFrame(columns=csv_columns).to_csv(output_file_name)
    # lemmatized text is scored, the raw text is only the fallback
    content_list_p = tweets_df["processed_text"].to_list()
    content_list_raw = tweets_df["text"].to_list()

    flush_every = 5000  # rows collected before they are appended to the file
    chunk_start = 0     # position of the first row that is not yet on disk
    tox_records = []    # one averaged score dict per row since chunk_start
    for i in tqdm(range(len(content_list_p))):
        if i % flush_every == 0 and i != 0:
            torch.cuda.empty_cache()
            _append_scored_chunk_to_csv(tweets_df.iloc[chunk_start:i], tox_records,
                                        csv_columns, output_file_name)
            chunk_start = i
            tox_records = []

        # before predicting, split text into sentences
        sentences_arr = _split_tweet_into_sentences(i, content_list_p[i], content_list_raw[i])
        text_tox_arr = [model.predict(sentence) for sentence in sentences_arr]
        # merge all sentence toxicities and take average (we could also take max)
        tox_records.append(_average_sentence_scores(text_tox_arr, score_columns))

    _append_scored_chunk_to_csv(tweets_df.iloc[chunk_start:], tox_records,
                                csv_columns, output_file_name)
    print(f"Execution took: {time.time() - start_time:.2f} seconds")
    print(f"Finished saving to file '{output_file_name}'\n")

Here we actually run our toxify method.

In [None]:
# hashtag_files = ["vegetarian_hashtag_6_1_2023.csv", "trump_hashtag_04_01_2023.csv", "uno_hashtag_09_01_2023.csv", 
#                 "vegan_hashtag_6_1_2023.csv", "fitness_hashtag_08_01_2023.csv", "musk_hashtag_03_01_2023.csv",
#                "netflix_hashtag_08_01_2023.csv"]
hashtag_files_lemmatized = ["netflix_hashtag_08_01_2023_lemmatized.csv", "vegetarian_hashtag_6_1_2023_lemmatized.csv", 
                            "uno_hashtag_09_01_2023_lemmatized.csv", "vegan_hashtag_6_1_2023_lemmatized.csv", 
                            "fitness_hashtag_08_01_2023_lemmatized.csv", "musk_hashtag_03_01_2023_lemmatized.csv", 
                            "trump_hashtag_13_01_2023_lemmatized.csv"]

output_dir = "./data/detoxify_toxicity_added_hashtags"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

# to not override files by mistake: a random suffix alone can still collide
# with an earlier run, so every output path is checked before any scoring starts
hash_int = random.randrange(1000)
toxicity_jobs = []
for file_name in hashtag_files_lemmatized:
    replaced_str = file_name.replace('.csv', '').replace('_lemmatized', '')
    output_file = f"{output_dir}/lemmatized_{replaced_str}_detoxify_toxicity_{hash_int}.csv"
    if os.path.exists(output_file):
        raise FileExistsError(f"Output file '{output_file}' already exists, re-run this cell to draw a new suffix")
    toxicity_jobs.append((file_name, output_file))

for file_name, output_file in toxicity_jobs:
    # old version # generate_toxicity_for_tweet_file(file_name, output_file)
    upgraded_generate_toxicity_for_tweet_file(file_name, output_file)

## Use unidecode
The **unidecode** function automatically converts a string to be more ASCII-compliant. The problem occurred in the netflix file at line ~595628.