In [266]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

# save for later
#from sklearn.decomposition import PCA
#import seaborn as sns
#import matplotlib.pyplot as plt
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
#from sklearn.linear_model import LogisticRegression
#from sklearn.datasets import make_classification
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import classification_report

#os.chdir(r"C:\Users\raned\Documents\GitHub\PostModeration")


This notebook will start off with preprocessing the two csv files to train different supervised learning models. 
- Removal of usernames, URLs, and special characters
- Lowercasing text
- Tokenization (nltk or spaCy): breaking text into smaller units 
- Stopword removal: remove common words that become index terms ("and", "or", "the", "in")
- Lemmatization: reduces words to their base or dictionary form
- TF-IDF vectorization for feature extraction: a technique that converts text data into numerical vectors, representing the importance of words in a document relative to a collection of documents, by combining term frequency with inverse document frequency

In [271]:
df = pd.read_csv("TrainingData/labeled_data.csv")
print(df.describe())
print(df.shape)
print(df.head())
print(df.info())


         Unnamed: 0         count   hate_speech  offensive_language  \
count  24783.000000  24783.000000  24783.000000        24783.000000   
mean   12681.192027      3.243473      0.280515            2.413711   
std     7299.553863      0.883060      0.631851            1.399459   
min        0.000000      3.000000      0.000000            0.000000   
25%     6372.500000      3.000000      0.000000            2.000000   
50%    12703.000000      3.000000      0.000000            3.000000   
75%    18995.500000      3.000000      0.000000            3.000000   
max    25296.000000      9.000000      7.000000            9.000000   

            neither         class  
count  24783.000000  24783.000000  
mean       0.549247      1.110277  
std        1.113299      0.462089  
min        0.000000      0.000000  
25%        0.000000      1.000000  
50%        0.000000      1.000000  
75%        0.000000      1.000000  
max        9.000000      2.000000  
(24783, 7)
   Unnamed: 0  count  hat

**count**: number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were

**hate_speech**: number of CF users who judged the tweet to be hate speech

**offensive_language**: number of CF users who judged the tweet to be offensive

**neither**: number of CF users who judged the tweet to be neither offensive nor non-offensive

**class**: class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither


In [272]:
# scrubbing text: removing usernames, URLs, special characters and ensuring all text is lowercase
tweet_column = df['tweet'].astype(str).str.casefold()  # lowercase
tweet_column.head()


0    !!! rt @mayasolovely: as a woman you shouldn't...
1    !!!!! rt @mleew17: boy dats cold...tyga dwn ba...
2    !!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...
3    !!!!!!!!! rt @c_g_anderson: @viva_based she lo...
4    !!!!!!!!!!!!! rt @shenikaroberts: the shit you...
Name: tweet, dtype: object

In [274]:
#removes usernames first, urls, then any special characters
clean_tweet = tweet_column.str.replace(r'(rt)?\s?@\w+:?', ' ', regex=True).str.replace(r'http.+', ' ', regex=True).str.replace(r'\W+', ' ', regex=True)
clean_tweet.head()

0     as a woman you shouldn t complain about clean...
1     boy dats cold tyga dwn bad for cuffin dat hoe...
2     dawg you ever fuck a bitch and she start to c...
3                               she look like a tranny
4     the shit you hear about me might be true or i...
Name: tweet, dtype: object

In [275]:
#tokenization, stop words, and lemmatization
from nltk.corpus import stopwords

nltk.download('punkt_tab')
#nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) #stopwords

def clean_tokenize(text): 
    tokens = word_tokenize(text)  # Keeps contractions like "don't"; tokenization
    tokens = [t.lower() for t in tokens if t.isalpha() or "'" in t]  # keep letters + contractions
    tokens = [t for t in tokens if t != "rt" and t not in stop_words]  # remove 'rt' and stopwords
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens] #lemmatization
    return lemmatized

cleaned_tokens = clean_tweet.apply(clean_tokenize)
print(cleaned_tokens.head())

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abigailcalderon/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0    [woman, complain, cleaning, house, amp, man, a...
1    [boy, dat, cold, tyga, dwn, bad, cuffin, dat, ...
2    [dawg, ever, fuck, bitch, start, cry, confused...
3                                 [look, like, tranny]
4    [shit, hear, might, true, might, faker, bitch,...
Name: tweet, dtype: object


In [276]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

df['cleaned_text'] = cleaned_tokens.apply(lambda tokens: ' '.join(tokens))


tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['cleaned_text'])

feature_names = tfidf.get_feature_names_out()
print(feature_names[:100])  
print(df)

['aa' 'aaaaaaaaand' 'aaahhhhh' 'aahahah' 'aaliyah' 'aan' 'aap' 'aaron'
 'aaronmacgruder' 'aaryn' 'ab' 'abandonado' 'abbey' 'abby' 'abc' 'abdelka'
 'abduction' 'abdullah' 'abdurahman' 'abed' 'abel' 'aberdeen' 'ability'
 'able' 'abo' 'aborted' 'abortion' 'abou' 'abound' 'abouta' 'abouttime'
 'abraham' 'absent' 'absolute' 'absolutely' 'absoluteyvile' 'absolved'
 'abstract' 'absurd' 'abt' 'abu' 'abundance' 'abus' 'abuse' 'abused'
 'abuser' 'abusive' 'ac' 'aca' 'acab' 'academic' 'accelerated' 'accent'
 'accept' 'acceptable' 'acceptance' 'accepted' 'access' 'accessible'
 'accessorize' 'accessory' 'accident' 'accidentally' 'accipiter'
 'accipitridae' 'accnt' 'accolade' 'accompanied' 'accord' 'according'
 'accordingly' 'account' 'accountable' 'accountant' 'acct' 'accuracy'
 'accurate' 'accurately' 'accused' 'accuses' 'accustomed' 'acdc' 'ace'
 'aceptar' 'aceves' 'ach' 'achieve' 'achilles' 'aching' 'acid' 'ackin'
 'acknowledge' 'acknowledged' 'acknowledging' 'acl' 'acne' 'acoustic'
 'acquire' '

In [277]:
#PREPROCESSING FOR HateSpeechDatasetBalanced.csv

#Load dataset and take a 27,000-row sample; easier to have random 27,000 samples due to how big the actual dataset is 
df = pd.read_csv("TrainingData/HateSpeechDatasetBalanced.csv")
df_subset = df.sample(n=27000, random_state=42).copy()

df_subset['Content'] = df_subset['Content'].astype(str).str.casefold()

def clean_tokenize(text):
    tokens = word_tokenize(text)  # splits into words and keeps contractions
    tokens = [t for t in tokens if t.isalpha()]  # keep only alphabetic tokens
    tokens = [t for t in tokens if t not in stop_words]
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens]
    return lemmatized


df_subset['cleaned_tokens'] = df_subset['Content'].apply(clean_tokenize)
df_subset['cleaned_text'] = df_subset['cleaned_tokens'].apply(lambda x: ' '.join(x))


df_subset['cleaned_text'] = df_subset['cleaned_text'].fillna('')


tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df_subset['cleaned_text'])  # Features
y = df_subset['Label']  # Target labels

print("TF-IDF shape:", X_tfidf.shape)
print(tfidf.get_feature_names_out()[:100])

TF-IDF shape: (27000, 33004)
['aa' 'aaa' 'aaaa' 'aaaaa' 'aaaaaaaaaaaaaaaaa' 'aaaaaaacopyrighta'
 'aaaaaaareaareaaaaaaaaaaaaaa' 'aaaaarrrrrggggghhhhh' 'aaaacg'
 'aaadonaaat' 'aaah' 'aaand' 'aachen' 'aaeyou' 'aag' 'aah' 'aaib' 'aaj'
 'aak' 'aalukkoru' 'aand' 'aanti' 'aap' 'aardvark' 'aaron' 'aaroncrick'
 'aarp' 'aau' 'ab' 'aba' 'aback' 'abacus' 'abandon' 'abandoned' 'abash'
 'abated' 'abaxial' 'abb' 'abba' 'abbey' 'abbott' 'abbreviated'
 'abbreviation' 'abc' 'abd' 'abdf' 'abdomen' 'abduce' 'abdul' 'abdullah'
 'abe' 'abecedary' 'abeh' 'abel' 'abelson' 'aberdeen' 'abet' 'abeyance'
 'abf' 'abhishek' 'abhorrent' 'abidance' 'abide' 'abiding' 'abigail'
 'ability' 'abiogenic' 'abject' 'abk' 'abkhazia' 'able' 'abm' 'abnegation'
 'abner' 'abnormal' 'abnormality' 'aboard' 'abode' 'abolish' 'abolished'
 'abolishment' 'abolitionist' 'abominable' 'abominably' 'abominate'
 'abomination' 'aboridzinima' 'aboriginal' 'aborigine' 'abort' 'aborted'
 'abortion' 'abortive' 'abortively' 'abound' 'abp' 'abraha

In [281]:
import html

# PARAMETERS
emotion_data_size = 12000
hateSpeechBalanced_size = 6000
assert emotion_data_size + hateSpeechBalanced_size == 18000

# ================== Process text.csv (emotion data) =======================
emotions_df = pd.read_csv("TrainingData/text.csv")
emotions_df.drop(columns=['Unnamed: 0'], inplace=True)

# Split into 'feeling' and 'no feeling' texts
no_feeling_size = int(emotion_data_size / 2)
feeling_size = int(no_feeling_size / 5)

feeling_df = emotions_df[emotions_df['text'].str.contains(r'\bfeel(?:s|ing)?\b', case=False, regex=True)]
no_feeling_df = emotions_df[~emotions_df['text'].str.contains(r'\bfeel(?:s|ing)?\b', case=False, regex=True)].sample(n=no_feeling_size, random_state=42).copy()

emotions = pd.DataFrame()
for i in range(0, 5):
    temp = feeling_df[feeling_df['label'] == i].sample(n=feeling_size, random_state=42).copy()
    emotions = pd.concat([emotions, temp], ignore_index=True)

emotions = pd.concat([emotions, no_feeling_df], ignore_index=True)
emotions['Content'] = emotions['text']
emotions['Label'] = 0  # all emotion data treated as clean
emotions.drop(columns=['text', 'label'], inplace=True)

# ================== Process labeled_data.csv =======================
df1 = pd.read_csv("TrainingData/labeled_data.csv")
df1 = df1[['tweet', 'class']]

hatespeech_ld = df1[df1['class'] == 0].sample(n=1430, random_state=42).copy()
clean_ld = df1[df1['class'] == 2].sample(n=1430, random_state=42).copy()

labeled_data = pd.concat([hatespeech_ld, clean_ld], ignore_index=True)
labeled_data['Content'] = labeled_data['tweet']
labeled_data['Label'] = labeled_data['class'].replace({0: 1, 2: 0})
labeled_data.drop(columns=['tweet', 'class'], inplace=True)

# ================== Process HateSpeechDatasetBalanced.csv =======================
df2 = pd.read_csv("TrainingData/HateSpeechDatasetBalanced.csv")
df2 = df2[['Content', 'Label']]

# Subsample from df2 to match hateSpeechBalanced_size
hs_size = hateSpeechBalanced_size // 2
non_hs_size = hateSpeechBalanced_size - hs_size

hs_df = df2[df2['Label'] == 1].sample(n=hs_size, random_state=42).copy()
clean_df = df2[df2['Label'] == 0].sample(n=non_hs_size, random_state=42).copy()
balanced_df = pd.concat([hs_df, clean_df], ignore_index=True)

# ================== Final Merge =======================
# Combine emotion data, balanced hateSpeech data, and labeled_data.csv
training_data_df = pd.concat([emotions, balanced_df, labeled_data], ignore_index=True)

# ================== Cleaning =======================
training_data_df['Content'] = training_data_df['Content'].astype(str).str.casefold()

# Unescape emojis (from &#12345; to actual emoji)
training_data_df['Content'] = training_data_df['Content'].apply(html.unescape)

# Basic cleaning: remove mentions and links
training_data_df['Content'] = training_data_df['Content'] \
    .str.replace(r'(rt)?\s?@\w+:?', ' ', regex=True) \
    .str.replace(r'http\S+', ' ', regex=True)

# ================== Save =======================
training_data_df.to_csv('combined_balanced_dataset_BERT.csv', index=False) #named this after Bert Model cuz i was only training bert at the time and didnt wanna get confused when i come back

# ================== Summary =======================
print(training_data_df['Label'].value_counts())
print("✅ Combined dataset size:", training_data_df.shape)

Label
0    16430
1     4430
Name: count, dtype: int64
✅ Combined dataset size: (20860, 2)


IGNORE THIS PART

I was trying to see if adding emoji detection would improve dataset

In [279]:
training_data_df
print(hs_size)

3000


In [280]:
# ---------------------------------------------------------
# Clean tweets to support emoji detection
# ---------------------------------------------------------
import re
import emoji

# Define emoji-aware cleaning function
def clean_tweet(text):
    if pd.isnull(text):
        return ""
    
    # Remove 'rt', mentions, and links
    text = re.sub(r'\brt\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'@\w+:?', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Convert emojis to descriptive words
    text = emoji.demojize(text, delimiters=(" ", " "))
    
    # Remove HTML entities (like &#8230;)
    text = re.sub(r'&#\d+;', '', text)
    
    # Remove special characters (keep only alphanumerics and emoji words)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Normalize spacing + lowercase
    text = re.sub(r'\s+', ' ', text).strip().lower()
    
    return text

# Apply the function to create a cleaned version of the content
training_data_df['CleanContent'] = training_data_df['Content'].apply(clean_tweet)

# Save to a new emoji-aware dataset
training_data_df[['CleanContent', 'Label']].to_csv("combined_dataset_emoji_cleaned.csv", index=False)

# Preview
training_data_df[['CleanContent', 'Label']].head()


Unnamed: 0,CleanContent,Label
0,i feel nothing i feel worthless and pain and i...,0
1,i instantly become terrified but it was the di...,0
2,i want seem to be pulling me in different dire...,0
3,i write this short post for any and all of you...,0
4,i don t have any money for beauty supplies or ...,0
