## Preprocessing text for the Detoxify model

In [1]:
%reload_ext autoreload
%autoreload 2

import os 
import sys
import pandas as pd
import numpy as np
import plotly 
import plotly.graph_objects as go
import time

import nltk
from detoxify import Detoxify
import spacy
import csv
import json
import re
from tqdm import tqdm

# download this via "python -m spacy download en_core_web_sm"
nlp = spacy.load('en_core_web_sm')

try:
    print(run_only_once)
except Exception as e:
    print(os.getcwd())
    os.chdir("./../../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

C:\Users\Zan\Desktop\Faksic_TUM\3_semester\seminar_social_computing\impact-of-twitter-take-over\notebooks\zan
C:\Users\Zan\Desktop\Faksic_TUM\3_semester\seminar_social_computing\impact-of-twitter-take-over


In [2]:
# symbols to remove
emoji_regex = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251" # TODO this line should be removed/modified else netflix hashtag won't work
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

def prepare_text_for_tweet_file(input_file, output_file_name):
    print(f"\nStarted generating toxicity metrics for:\n"
      f"-input file: '{input_file}',\n"
      f"-output file: '{output_file_name}'")
    start_time = time.time()
    tweets_df = pd.read_csv("./data/raw_hashtags/" + input_file)
    total_len = len(tweets_df.index)
    
    tweets_df = tweets_df[tweets_df["lang"] == "en"]
    print(f"Removed {total_len - len(tweets_df.index)} tweets out of {len(tweets_df.index)}, since they were not in English\n")
    # if we don't do it, the toxicity metrics will missmatch down the line
    tweets_df = tweets_df.reset_index(drop=True)
    

    # add new column for processed text
    tweets_df["text_processed"] = np.nan
    preprocessed_text_arr = []
    for index, row in tweets_df.iterrows():
        if index % 500 == 0:
            print(f"At row: {index}/{len(tweets_df.index)}")
        # remove emojis       
        text = emoji_regex.sub(r'', row["text"])    
        # lematize it using spacy
        lemmas = ' '.join([x.lemma_ for x in nlp(text)])
        
        # add it to the array which will be added as a column at the end
        preprocessed_text_arr.append(lemmas)
 
    tweets_df["text_processed"] = pd.Series(preprocessed_text_arr) 
    print("Finished lemmitization")
    
    tweets_df.to_csv(output_file_name, header=True)
    print(f"\nExecution took: {time.time() - start_time:.2f} seconds")
    print(f"Finished saving to file '{output_file_name}'\n")
    return tweets_df

In [3]:
# optimized version
def optimized_prepare_text_for_tweet_file(input_file, output_file_name):
    print(f"\nStarted generating toxicity metrics for:\n"
      f"-input file: '{input_file}',\n"
      f"-output file: '{output_file_name}'")
    start_time = time.time()
    tweets_df = pd.read_csv("./data/raw_hashtags/" + input_file)
    total_len = len(tweets_df.index)
    
    tweets_df = tweets_df[tweets_df["lang"] == "en"]
    print(f"Removed {total_len - len(tweets_df.index)} tweets out of {len(tweets_df.index)}, since they were not in English\n")
    # if we don't do it, the toxicity metrics will missmatch down the line
    tweets_df = tweets_df.reset_index(drop=True)
    
    # removed emoji and other weird symbols
    symbol_removed_col = tweets_df["text"].str.replace(emoji_regex, "")
    # remove new lines, tabs, and multiple spaces.
    symbol_removed_col = symbol_removed_col.str.replace(r'\r+|\n+|\t+','', regex=True).replace(r'\s+', ' ', regex=True)

    # array in which to store the data
    preprocessed_text_arr = []
    
    lemmas_arr = []
    # perform multithreaded execution
    for doc in tqdm(nlp.pipe(symbol_removed_col.astype("unicode").values, batch_size=15, n_process=3)):
        if doc.has_annotation:
            # contains actual word/token
            # tokens.append([n.text for n in doc])
            # contains the label of the token
            # pos.append([n.pos_ for n in doc])
            
            # contains the lemmatized sentence
            lemmas_arr.append(" ".join([token.lemma_ for token in doc]))
        else:
            # We want to make sure that the lists of parsed results have the
            # same number of entries of the original Dataframe, so add some blanks in case the parse fails
            tokens.append(None)
            lemma.append(None)
            pos.append(None)

    tweets_df['processed_text'] = pd.Series(lemmas_arr) 
    print("Finished lemmitization")
    
    tweets_df.to_csv(output_file_name, header=True)
    print(f"\nExecution took: {time.time() - start_time:.2f} seconds")
    print(f"Finished saving to file '{output_file_name}'\n")


## Run Lemmatization
This will run lemmatization  on text for all the files

In [4]:
# files that we want to prepare
hashtag_files = ["vegetarian_hashtag_6_1_2023.csv", "uno_hashtag_09_01_2023.csv", 
                 "vegan_hashtag_6_1_2023.csv", "fitness_hashtag_08_01_2023.csv", "netflix_hashtag_08_01_2023.csv", 
                 "musk_hashtag_03_01_2023.csv", "trump_hashtag_13_01_2023.csv"]

In [5]:
for file in hashtag_files:
    output_file = f"./data/lemmatized/{file.split('.')[0]}_lemmatized.csv"
    optimized_prepare_text_for_tweet_file(file, output_file)


Started generating toxicity metrics for:
-input file: 'vegetarian_hashtag_6_1_2023.csv',
-output file: './data/lemmatized/vegetarian_hashtag_6_1_2023_lemmatized.csv'
Removed 30 tweets out of 71134, since they were not in English



71134it [01:50, 643.52it/s]


Finished lemmitization

Execution took: 112.79 seconds
Finished saving to file './data/lemmatized/vegetarian_hashtag_6_1_2023_lemmatized.csv'


Started generating toxicity metrics for:
-input file: 'uno_hashtag_09_01_2023.csv',
-output file: './data/lemmatized/uno_hashtag_09_01_2023_lemmatized.csv'
Removed 1429 tweets out of 107575, since they were not in English



107575it [02:15, 794.57it/s] 


Finished lemmitization

Execution took: 138.08 seconds
Finished saving to file './data/lemmatized/uno_hashtag_09_01_2023_lemmatized.csv'


Started generating toxicity metrics for:
-input file: 'vegan_hashtag_6_1_2023.csv',
-output file: './data/lemmatized/vegan_hashtag_6_1_2023_lemmatized.csv'
Removed 151 tweets out of 248143, since they were not in English



248143it [05:34, 741.22it/s] 


Finished lemmitization

Execution took: 341.70 seconds
Finished saving to file './data/lemmatized/vegan_hashtag_6_1_2023_lemmatized.csv'


Started generating toxicity metrics for:
-input file: 'fitness_hashtag_08_01_2023.csv',
-output file: './data/lemmatized/fitness_hashtag_08_01_2023_lemmatized.csv'
Removed 1076 tweets out of 280376, since they were not in English



280376it [06:54, 676.43it/s] 


Finished lemmitization

Execution took: 423.53 seconds
Finished saving to file './data/lemmatized/fitness_hashtag_08_01_2023_lemmatized.csv'


Started generating toxicity metrics for:
-input file: 'netflix_hashtag_08_01_2023.csv',
-output file: './data/lemmatized/netflix_hashtag_08_01_2023_lemmatized.csv'
Removed 8433 tweets out of 1637171, since they were not in English



1637171it [32:53, 829.79it/s] 


Finished lemmitization

Execution took: 2018.51 seconds
Finished saving to file './data/lemmatized/netflix_hashtag_08_01_2023_lemmatized.csv'


Started generating toxicity metrics for:
-input file: 'musk_hashtag_03_01_2023.csv',
-output file: './data/lemmatized/musk_hashtag_03_01_2023_lemmatized.csv'
Removed 1024 tweets out of 742395, since they were not in English



742395it [17:49, 694.38it/s]


Finished lemmitization

Execution took: 1092.98 seconds
Finished saving to file './data/lemmatized/musk_hashtag_03_01_2023_lemmatized.csv'


Started generating toxicity metrics for:
-input file: 'trump_hashtag_13_01_2023.csv',
-output file: './data/lemmatized/trump_hashtag_13_01_2023_lemmatized.csv'
Removed 1165 tweets out of 2362363, since they were not in English



2362363it [1:00:45, 648.07it/s]


Finished lemmitization

Execution took: 3727.66 seconds
Finished saving to file './data/lemmatized/trump_hashtag_13_01_2023_lemmatized.csv'

