# Import Library

In [2]:
# Standard libraries
import os
import re
import ast

import pandas as pd

# NLP libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Emoji handling
import emoji


# Drop Unused Column

In [4]:
# Location of the raw, unseen hate-speech CSV (relative to the notebook folder)
file_1_path = r".\Ori_Hate_Speech_Dataset\HateSpeechDataset_Unseen.csv"

# Read the file, keep only the comment text and its label, and expose the
# comment column under the name 'text' used by every later cell.
df1 = pd.read_csv(
    file_1_path,
    encoding='ISO-8859-1',
    low_memory=False,
).loc[:, ['comment', 'label']]
data = df1.rename(columns={'comment': 'text'})


# Check NULL & Drop NULL


In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41108 entries, 0 to 41107
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    41108 non-null  object
 1   label   41108 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 642.4+ KB
None


# Remove Duplicates

In [58]:
# Rows that exactly repeat an earlier row (same text and same label)
duplicate_rows = data.loc[data.duplicated()]
print(f"Number of duplicated rows: {len(duplicate_rows)}")

# Keep the first occurrence of every row and discard the repeats
data = data.drop_duplicates(keep='first')

# Re-count to confirm nothing is left
print("After Remove Dup")
duplicate_rows = data.loc[data.duplicated()]
print(f"Number of duplicated rows: {len(duplicate_rows)}")


Number of duplicated rows: 10
After Remove Dup
Number of duplicated rows: 0


# Lowercase

In [59]:
# Convert every comment to lower case, then preview the first rows
data = data.assign(text=data['text'].str.lower())
print(data['text'].head(5))

0                                  dalits are lowlives
1               gay people are a burden to our society
2                                arabs are not welcome
3    i'm not saying we should actually eliminate he...
4                         bananas are for black people
Name: text, dtype: object


# Remove

## URL/HTML/Emoji

In [60]:
# Remove URLs
# The leading \b anchors the match at the start of a token. Without it the
# pattern also fired in the middle of ordinary words that merely contain
# "www" or "http" (for example "awwww" lost everything after the "a").
url_pattern = r'\b(?:https?|www)\S+'

# Remove HTML
# sub(pattern, replace, text)
html_tags_pattern = r'<.*?>'
# Any token that ends in "html" (left-over page/file names)
html_suffix_pattern = r'\S+html\b'


def _strip_urls_and_html(text):
    """Return `text` (coerced to str) without URLs, HTML tags and *html tokens."""
    text = re.sub(url_pattern, '', str(text))
    text = re.sub(html_tags_pattern, '', text)
    return re.sub(html_suffix_pattern, '', text)


# One pass over the column instead of three separate apply() calls
data['text'] = data['text'].apply(_strip_urls_and_html)

# Remove emojis
def remove_emojis(text):
    """Return `text` with emoji characters and literal ``:shortcode:`` tokens removed.

    Emoji characters are deleted directly with ``emoji.replace_emoji``. The
    previous approach (demojize, then strip ``:\\w+:``) only removed emoji whose
    textual alias consists purely of word characters; aliases containing other
    characters (e.g. a hyphen) were left behind as text.

    NOTE(review): ``emoji.replace_emoji`` needs a reasonably recent ``emoji``
    release - confirm the installed version provides it.
    """
    # Drop the emoji characters themselves
    text = emoji.replace_emoji(text, replace='')
    # Keep the original handling of shortcodes that were typed as plain text
    text = re.sub(r':\w+:', '', text)
    return text
data['text'] = data['text'].apply(remove_emojis)


## Common symbol substitutions

In [61]:
# Common symbol substitutions (look-alike characters used to disguise words)
replace_dict = {
    '@': 'a',
    '$': 's',
    '$$':'ss',
    '0': 'o',
    '3': 'e',
    '1': 'i',
    '5': 's',
    '7': 't',
    '4': 'a',
    '9' : 'g',
}
def replace_symbols(text):
    """Replace look-alike symbols/digits with letters when they touch a letter.

    A symbol is substituted only if it is directly preceded or directly
    followed by an ASCII letter, so free-standing numbers such as "2020"
    or "$100" are left untouched.

    Args:
        text: Value to clean; it is coerced with ``str()`` first.

    Returns:
        The text with every qualifying symbol replaced by its letter.
    """
    text = str(text)
    # Longest symbols first: otherwise the '$' rule consumes one half of '$$'
    # and the '$$' rule can never match (e.g. "a$$" used to become "as$").
    # sorted() is stable, so symbols of equal length keep their dict order.
    for symbol in sorted(replace_dict, key=len, reverse=True):
        escaped = re.escape(symbol)
        # after a letter:  (?<=[A-Za-z])symbol
        # before a letter: symbol(?=[A-Za-z])
        # ("between two letters" is already covered by the first alternative)
        pattern = rf'(?<=[A-Za-z]){escaped}|{escaped}(?=[A-Za-z])'
        text = re.sub(pattern, replace_dict[symbol], text)
    return text

# Run the symbol-to-letter substitution on every comment
data['text'] = data['text'].map(replace_symbols)


## Replace Abbreviations 1.0

In [None]:
# Contractions that still contain an apostrophe; they have to be expanded
# here, before the punctuation-removal step strips the apostrophe.
abbreviations = {
    "it's":"it is",
    "we're":"we are",  # fixed typo: was "were are"
    "let's":"let us",
    "i'll":"i will",
}
# 9. Replace Abbreviations
def replace_abbreviations(text):
    """Expand every whole-word occurrence of a key of `abbreviations`.

    Args:
        text: Value to process; it is coerced with ``str()`` first.

    Returns:
        The text with each matched contraction replaced by its long form.
    """
    text = str(text)
    for abbr, full_form in abbreviations.items():
        # \b on both sides restricts the match to whole words
        text = re.sub(r'\b' + re.escape(abbr) + r'\b', full_form, text)
    return text
# Expand the apostrophe contractions in every comment
data['text'] = data['text'].map(replace_abbreviations)


## Remove Non-ASCII Characters/Punctuation/White Space/All-Number Rows/Elongation

In [63]:
# 11. Keep only ASCII letters, digits and whitespace; this removes punctuation,
#     symbols and every non-ASCII character.
data['text'] = data['text'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', str(x)))

# 12. Remove excessive whitespace
# Collapse each whitespace run to one space; .strip() removes leading/trailing whitespace
data['text'] = data['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# 13. Remove rows whose text consists only of digits
# astype(str) -> strip() -> isdigit(): True when the whole string is digits
is_only_digits = data['text'].astype(str).str.strip().str.isdigit()
only_numbers_df = data[is_only_digits]  # kept only for manual inspection
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the old one (avoids pandas' SettingWithCopyWarning).
data = data[~is_only_digits].copy()

# 14. (Removing repeated punctuation is not needed: step 11 already strips all punctuation.)

# 15. Removing elongation (example: goodddddddddd -> goodd)
# Restricted to letters: the previous '(.)' also shortened runs of digits and
# silently changed numbers (e.g. "1000" became "100").
data['text'] = data['text'].apply(lambda x: re.sub(r'([A-Za-z])\1{2,}', r'\1\1', x))

## Replace Abbreviations 2.0

In [64]:
# Runs after punctuation removal, so every key is written without apostrophes.

# Abbreviation list based on the unknown-word visualization
abbreviations = {
    "auser" : "",
    "werent":"were not","arent": "are not",
    "isnt": "is not",
    "cant": "can not",
    "shes": "she is","hes": "he is",
    "youre": "you are", 
    "youll": "you will",
    "youve": "you have",
    "weve": "we have",
    "yall":"you all",
    "theyre": "they are", 
    "theyve": "they have",
    "doesnt": "does not", 
    "dont":"do not",
    "didnt": "did not",
    "wont": "will not",
    "wouldnt": "would not",
    "shouldnt": "should not",
    "couldnt": "could not",
    "im": "i am",
    "iam": "i am",
    "ive": "i have",
    "id": "i would",
    "wth":"what the hell", "wtf":"what the fuck",  # fixed typo: was "what the heal"
    # NOTE(review): the keys containing '*' cannot match at this stage because
    # the punctuation-removal step has already deleted every '*'.
    "fk":"fuck", "f**k":"fuck","fu*k":"fuck", "f*ck":"fuck","fck":"fuck","fcking":"fucking",
    "cuz":"because", "bcuz":"because","becuz":"because",
    "bihday":"birthday",
    "etc":"et cetera",
    "selfie":"self portrait photograph",
    "lol":"laughing out loud",
    "lmao":"laughing my ass off",
    "forex":" foreign exchange",
    # NOTE(review): "lgbt" is expanded to a single word only - confirm this is intended.
    "lgbt":"transgender",
    "blm":"black lives matter",
    # NOTE(review): this value re-introduces capital letters into lower-cased text;
    # left unchanged because a lower-case value would be expanded a second time
    # if this replacement is run again on its own output.
    "obama":"Barack Obama",
    "omg":"oh my god",
    "ppl":"people",
    "fathersday":"father day",
}

# A single alternation over all keys, compiled once, replaces one regex pass
# per key for every row. Longest keys come first in the alternation.
_abbreviation_pattern = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(abbr) for abbr in sorted(abbreviations, key=len, reverse=True))
    + r')\b'
)

# Replace Abbreviations
def replace_abbreviations(text):
    """Expand every whole-word abbreviation listed in `abbreviations`.

    Args:
        text: Value to process; it is coerced with ``str()`` first.

    Returns:
        The expanded text. Whitespace is normalised afterwards because some
        replacements are empty ("auser") or start with a space ("forex") and
        would otherwise leave doubled or leading spaces behind.
    """
    expanded = _abbreviation_pattern.sub(
        lambda match: abbreviations[match.group(0)], str(text)
    )
    return re.sub(r'\s+', ' ', expanded).strip()
# Expand the punctuation-free abbreviations in every comment
data['text'] = data['text'].map(replace_abbreviations)

In [65]:
# Count the missing values in each column
data.isna().sum()

text     0
label    0
dtype: int64

In [None]:
# Report missing values, drop the rows without text, then report again
null_counts_before = data.isnull().sum()
print("\nMissing values per column:\n", null_counts_before)

data = data.dropna(subset=['text'])

null_counts_after = data.isnull().sum()
print("\nMissing values per column after dropping NaN:\n", null_counts_after)




Missing values per column:
 text     0
label    0
dtype: int64

Missing values per column after dropping NaN:
 text     0
label    0
dtype: int64


## Tokenization

In [67]:
# Split every comment into a list of word tokens and store the lists in 'text'
filteredTokens = [
    nltk.tokenize.word_tokenize(str(comment))
    for comment in data['text']
]
data['text'] = filteredTokens


## Remove Stop Word

In [68]:
# Token lists read back from a CSV file arrive as strings; turn those back
# into real lists and leave values that are already lists untouched.
data['text'] = data['text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x) #(Yadav, 2023)

# Fetch the stop-word corpus when it is not installed yet, so a fresh
# environment does not stop here with a LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

stopTokens = nltk.corpus.stopwords.words("english")
# Negations are taken out of the stop list so they survive in the text
stopTokens.remove('not')
stopTokens.remove('no')

# Set copy for O(1) membership tests; scanning the list for every token is
# needlessly slow. It is a snapshot of stopTokens taken at this point.
_stop_token_set = frozenset(stopTokens)

def removeStopWord(words):
    """Return the tokens of `words` whose lower-cased form is not a stop word."""
    return [word for word in words if word.lower() not in _stop_token_set]
data['text'] = data['text'].apply(removeStopWord)

## Lemmatization

In [None]:
# Lemmatization
from functools import lru_cache

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pandas as pd
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger') #used for Part-of-Speech (POS) tagging

lemmatizer = WordNetLemmatizer()

# WordNet only distinguishes four parts of speech; the key is the first
# letter of the tag returned by nltk.pos_tag.
_WORDNET_POS_BY_INITIAL = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}


@lru_cache(maxsize=None)
def get_pos_tagging(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own (as a one-token list), so the result
    depends only on the word itself. It is therefore cached: the same word
    occurs many times in the corpus and is now tagged only once.

    Args:
        word: A single token (must be hashable, e.g. ``str``).

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``;
        ``wordnet.NOUN`` when the tag starts with any other letter.
    """
    # nltk.pos_tag([word]) -> [('running', 'VBG')]; [0][1][0] is the tag's first letter ('V')
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return _WORDNET_POS_BY_INITIAL.get(tag, wordnet.NOUN)  # Default to noun if no match


def lemmatize_text(text):
    """Lemmatize every token of `text` and join the lemmas with single spaces.

    Args:
        text: Iterable of tokens (one list per row of the 'text' column).

    Returns:
        One space-separated string of lemmas.
    """
    return ' '.join(
        lemmatizer.lemmatize(word, get_pos_tagging(word)) for word in text
    )

# Lemmatize every token list; each row becomes one space-separated string
data['text'] = data['text'].apply(lemmatize_text)

output_file_path = r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
# to_csv does not create missing folders, so make sure the target folder exists.
# dirname is empty when the path has no folder part; nothing to create then.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
data.to_csv(output_file_path, index=False, encoding="utf-8")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Final Check

In [None]:
# Re-read the saved file to verify what was actually written to disk.
# The file is written with encoding="utf-8", so it is read back as utf-8 too
# (it was previously read as ISO-8859-1, which does not match the writer).
checkNull_file_path =r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
checkNull = pd.read_csv(checkNull_file_path, encoding='utf-8')
checkNull.info()
checkNull.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41098 entries, 0 to 41097
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    41098 non-null  object
 1   label   41098 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 642.3+ KB


text     0
label    0
dtype: int64

In [None]:
# drop NULL
# Rows whose text ended up empty are stored as empty CSV fields, which
# read_csv loads as NaN by default; they are removed here.
# The file is read and written as utf-8, matching the cells that create it
# and read it afterwards (ISO-8859-1 was used here before).
file_path = r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
data = pd.read_csv(file_path, encoding='utf-8')

print("\nMissing values per column:\n", data.isnull().sum())
data = data.dropna(subset=['text'])

print("\nMissing values per column after dropping NaN:\n", data.isnull().sum())

output_file =r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
data.to_csv(output_file, index=False, encoding='utf-8')



Missing values per column:
 text     0
label    0
dtype: int64

Missing values per column after dropping NaN:
 text     0
label    0
dtype: int64


In [72]:
# Pre-processing can map different raw comments to the same final text, so
# duplicates are removed once more on the saved file.
# The file is read and written as utf-8: the next cell reads it with
# encoding='utf-8', so writing ISO-8859-1 here did not match its reader.
file_path = r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
data = pd.read_csv(file_path, encoding='utf-8')

duplicate_rows = data[data.duplicated()]
print(f"Number of duplicated rows: {duplicate_rows.shape[0]}")

data = data.drop_duplicates()

print("After Remove Dup")
duplicate_rows = data[data.duplicated()]
print(f"Number of duplicated rows: {duplicate_rows.shape[0]}")

output_file = r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
data.to_csv(output_file, index=False, encoding='utf-8')

Number of duplicated rows: 460
After Remove Dup
Number of duplicated rows: 0


In [73]:
# Replace abbreviations once more on the saved, lemmatized file
file_path = r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
df = pd.read_csv(file_path, encoding='utf-8')

# Abbreviation list based on the unknown-word visualization
abbreviations = {
    "auser" : "",
    "werent":"were not","arent": "are not",
    "isnt": "is not",
    "cant": "can not",
    "shes": "she is","hes": "he is",
    "youre": "you are", 
    "youll": "you will",
    "youve": "you have",
    "weve": "we have",
    "yall":"you all",
    "theyre": "they are", 
    "theyve": "they have",
    "doesnt": "does not", 
    "dont":"do not",
    "didnt": "did not",
    "wont": "will not",
    "wouldnt": "would not",
    "shouldnt": "should not",
    "couldnt": "could not",
    "im": "i am",
    "iam": "i am",
    "ive": "i have",
    "id": "i would",
    "wth":"what the hell", "wtf":"what the fuck",  # fixed typo: was "what the heal"
    # NOTE(review): the keys containing '*' are not expected to match here -
    # the punctuation-removal step earlier in the notebook deletes every '*'.
    "fk":"fuck", "f**k":"fuck","fu*k":"fuck", "f*ck":"fuck","fck":"fuck","fcking":"fucking",
    "cuz":"because", "bcuz":"because","becuz":"because",
    "bihday":"birthday",
    "etc":"et cetera",
    "selfie":"self portrait photograph",
    "lol":"laughing out loud",
    "lmao":"laughing my ass off",
    "forex":" foreign exchange",
    # NOTE(review): "lgbt" is expanded to a single word only - confirm this is intended.
    "lgbt":"transgender",
    "blm":"black lives matter",
    # NOTE(review): this value re-introduces capital letters into lower-cased text;
    # left unchanged because a lower-case value would be expanded a second time
    # if this replacement is run again on its own output.
    "obama":"Barack Obama",
    "omg":"oh my god",
    "ppl":"people",
    "fathersday":"father day",
}

# A single alternation over all keys, compiled once, replaces one regex pass
# per key for every row. Longest keys come first in the alternation.
_abbreviation_pattern = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(abbr) for abbr in sorted(abbreviations, key=len, reverse=True))
    + r')\b'
)

def replace_abbreviations(text):
    """Expand every whole-word abbreviation listed in `abbreviations`.

    Args:
        text: Value to process; it is coerced with ``str()`` first.

    Returns:
        The expanded text. Whitespace is normalised afterwards because some
        replacements are empty ("auser") or start with a space ("forex") and
        would otherwise leave doubled or leading spaces behind.
    """
    expanded = _abbreviation_pattern.sub(
        lambda match: abbreviations[match.group(0)], str(text)
    )
    return re.sub(r'\s+', ' ', expanded).strip()

df['text'] = df['text'].apply(replace_abbreviations)

output_file_path =r".\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv"
df.to_csv(output_file_path, index=False, encoding="utf-8")
print(f"Data has been saved to {output_file_path}")

Data has been saved to .\Pre_Hate_Dataset\UnseenData_ForTestSetUsed.csv
