## Preprocessing text for the Detoxify model

In [None]:
%reload_ext autoreload
%autoreload 2

import os 
import sys
import pandas as pd
import numpy as np
import plotly 
import plotly.graph_objects as go
import time

import nltk
from detoxify import Detoxify
import spacy
import csv
import json
import re

# Load spaCy's small English pipeline once at module level; it is reused for
# lemmatizing every tweet further down in this notebook.
# The model has to be installed beforehand via
# "python -m spacy download en_core_web_sm".
nlp = spacy.load('en_core_web_sm')

# Move the working directory two levels up, but only the first time this cell
# is executed in a kernel session. `run_only_once` acts as a sentinel: it is
# undefined on the first run (-> NameError -> chdir) and defined on every
# re-run, so re-executing the cell does not keep walking up the directory tree.
# Only NameError is caught so that any unrelated failure is not silently
# turned into a second chdir.
try:
    print(run_only_once)
except NameError:
    print(os.getcwd())
    os.chdir("./../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

In [None]:
# Raw hashtag CSV exports that should be preprocessed. Only the file names are
# listed here; the directory they live in is added by the loading code.
hashtag_files = [
    "vegetarian_hashtag_6_1_2023.csv",
    "uno_hashtag_09_01_2023.csv",
    "vegan_hashtag_6_1_2023.csv",
    "fitness_hashtag_08_01_2023.csv",
    "netflix_hashtag_08_01_2023.csv",
    "musk_hashtag_03_01_2023.csv",
    "trump_hashtag_13_01_2023.csv",
]

In [None]:
# Characters to strip from tweets before lemmatization: emojis, pictographs,
# and related symbols / invisible joiners.
#
# NOTE: two of the ranges below are very broad:
#   * U+24C2-U+1F251 matches every code point in between, which includes far
#     more than emoji (for example CJK and Hangul characters);
#   * U+10000-U+10FFFF matches every code point above the Basic Multilingual
#     Plane.
# Most of the narrower ranges are therefore already contained in those two and
# are kept only to document intent. ASCII and Latin-1 text is never matched.
emoji_regex = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc. symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad range, see NOTE above
    "\U0001f926-\U0001f937"  # additional people / gesture emoji
    "\U00010000-\U0010ffff"  # everything above the BMP, see NOTE above
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc. symbols up to the heavy large circle
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030"                 # wavy dash
    "]+"
)

def prepare_text_for_tweet_file(input_file, output_file_name):
    """Lemmatize the English tweets of one raw hashtag CSV and save the result.

    The file ``./data/raw_hashtags/<input_file>`` is loaded, rows whose
    ``lang`` column is not ``"en"`` are dropped, every character matched by
    the module-level ``emoji_regex`` is removed from the ``text`` column, and
    the remaining text is lemmatized with the module-level spaCy pipeline
    ``nlp``. The space-joined lemmas are stored in a new ``text_processed``
    column.

    Args:
        input_file: File name (not path) of the CSV inside
            ``./data/raw_hashtags/``. It must contain ``lang`` and ``text``
            columns.
        output_file_name: Path of the CSV file to write. Missing parent
            directories are created.

    Returns:
        The filtered DataFrame (index reset to 0..n-1) including the new
        ``text_processed`` column, i.e. exactly what was written to disk.
    """
    print(f"\nStarted generating toxicity metrics for:\n"
      f"-input file: '{input_file}',\n"
      f"-output file: '{output_file_name}'")
    start_time = time.time()
    tweets_df = pd.read_csv("./data/raw_hashtags/" + input_file)
    total_len = len(tweets_df.index)

    tweets_df = tweets_df[tweets_df["lang"] == "en"]
    # Reset the index so that row positions and index labels agree; otherwise
    # values computed per position would be misaligned down the line.
    tweets_df = tweets_df.reset_index(drop=True)
    kept_len = len(tweets_df.index)
    # Report against the ORIGINAL number of rows, not the filtered one.
    print(f"Removed {total_len - kept_len} tweets out of {total_len}, since they were not in English\n")

    # A missing tweet text is read by pandas as NaN (a float), which the regex
    # cannot handle; treat it as an empty string instead of crashing.
    texts = tweets_df["text"].fillna("").astype(str)

    preprocessed_text_arr = []
    for index, raw_text in enumerate(texts):
        if index % 500 == 0:
            print(f"At row: {index}/{kept_len}")
        # remove emojis
        text = emoji_regex.sub("", raw_text)
        # lemmatize with spaCy and join the lemmas back into one string
        lemmas = " ".join(token.lemma_ for token in nlp(text))
        preprocessed_text_arr.append(lemmas)

    # One entry per row was collected (the loop runs over every row), so the
    # list can be assigned directly as the new column.
    tweets_df["text_processed"] = preprocessed_text_arr
    print("Finished lemmitization")

    # Make sure the target directory exists before writing.
    output_dir = os.path.dirname(output_file_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    tweets_df.to_csv(output_file_name, header=True)
    print(f"\nExecution took: {time.time() - start_time:.2f} seconds")
    print(f"Finished saving to file '{output_file_name}'\n")
    return tweets_df

In [None]:
# Run the preprocessing for every configured hashtag file. The output name is
# the input file name up to its first dot, with "_lemmatized.csv" appended,
# placed inside ./data/lemmatized/.
for file in hashtag_files:
    base_name = file.partition(".")[0]
    output_file = "./data/lemmatized/" + f"{base_name}_lemmatized.csv"
    tmp_df = prepare_text_for_tweet_file(input_file=file, output_file_name=output_file)
    