<a href="https://colab.research.google.com/github/bermudezfc/CCMACLRL_EXERCISES_COM221ML/blob/main/Exercise7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 7: Hate Speech Classification using Multinomial Naive Bayes

Instructions:
- You do not need to split your data. Use the training, validation and test sets provided below.
- Use Multinomial Naive Bayes to train a model that can classify if a sentence if a hate speech or not
- A sentence with a label of zero (0) is classified as non-hate speech
- A sentence with a label of one (1) is classified as a hate speech

Apply text pre-processing techniques such as
- Converting to lowercase
- Stop word Removal
- Removal of digits, special characters
- Stemming or Lemmatization but not both
- Count Vectorizer or TF-IDF Vectorizer but not both

Evaluate your model by:
- Providing input by yourself
- Creating a Confusion Matrix
- Calculating the Accuracy, Precision, Recall and F1-Score

In [377]:
import pandas as pd
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [378]:
splits = {'train': 'unique_train_dataset.csv', 'validation': 'unique_validation_dataset.csv', 'test': 'unique_test_dataset.csv'}

## Training Set

Use this to train your model

In [379]:
df_train = pd.read_csv("hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits["train"])

## Validation Set

Use this set to evaluate your model

In [380]:
df_validation = pd.read_csv("hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits["validation"])

## Test Set

Use this set to test your model

In [381]:
df_test = pd.read_csv("hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits["test"])

In [382]:
df_test.head()

Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0


In [383]:
print(len(df_train))

21773


In [384]:
df_train.isnull().any().sum()

0

In [385]:
df_train.head()

Unnamed: 0,text,label
0,Presidential candidate Mar Roxas implies that ...,1
1,Parang may mali na sumunod ang patalastas ng N...,1
2,Bet ko. Pula Ang Kulay Ng Posas,1
3,[USERNAME] kakampink,0
4,Bakit parang tahimik ang mga PINK about Doc Wi...,1


In [386]:
tagalog = [
    "ako", "sa", "akin", "ko", "aking", "sarili", "kami", "atin", "ang", "aming",
    "amin", "ating", "ka", "iyong", "iyo", "inyong", "siya", "kanya", "mismo", "ito",
    "nito", "kanyang", "sila", "nila", "kanila", "kanilang", "kung", "ano", "alin",
    "sino", "kanino", "na", "mga", "iyon", "am", "ay", "maging", "naging", "mayroon",
    "may", "nagkaroon", "pagkakaroon", "gumawa", "ginagawa", "ginawa", "paggawa",
    "ibig", "dapat", "maaari", "marapat", "kong", "ikaw", "tayo", "hindi", "namin",
    "gusto", "nais", "niyang", "nilang", "niya", "huwag", "ginawang", "gagawin",
    "maaaring", "sabihin", "narito", "kapag", "ni", "nasaan", "bakit", "paano",
    "kailangan", "walang", "katiyakan", "isang", "at", "pero", "o", "dahil", "bilang",
    "hanggang", "habang", "ng", "pamamagitan", "para", "tungkol", "laban", "pagitan",
    "panahon", "bago", "pagkatapos", "itaas", "ibaba", "mula", "pataas", "pababa",
    "palabas", "ibabaw", "ilalim", "muli", "pa", "minsan", "dito", "doon", "saan",
    "lahat", "anumang", "kapwa", "bawat", "ilan", "karamihan", "iba", "tulad",
    "lamang", "pareho", "kaya", "kaysa", "masyado", "napaka", "isa", "bababa",
    "kulang", "marami", "ngayon", "kailanman", "sabi", "nabanggit", "din", "kumuha",
    "pumunta", "pumupunta", "ilagay", "makita", "nakita", "katulad", "mahusay",
    "likod", "kahit", "paraan", "noon", "gayunman", "dalawa", "tatlo", "apat",
    "lima", "una", "pangalawa"
]


In [387]:

# convert text to lowercase
df_train["text"] = df_train["text"].str.lower()

# Removing stopwords from the data
stop_words = tagalog + stopwords.words("english")
df_train["text"] = df_train["text"].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

# removing links
df_train["text"] = df_train["text"].apply(lambda x: re.sub(r"http\S+|www\.\S+", "", x))

# removing email addresses
df_train["text"] = df_train["text"].apply(lambda x: re.sub(r"\w+@\w+\.com", "", x))

# removing punctuation marks
df_train["text"] = df_train["text"].apply(lambda x: re.sub(r"[.,;:!\?\"'`]", "", x))

# removing special characters
df_train["text"] = df_train["text"].apply(lambda x: re.sub(r"[@#$%^&*\/\+-_=\{\}<>]", "", x))

# removing unnecessary characters
df_train["text"] = df_train["text"].apply(lambda x: re.sub(r"½m|½s|½t|½ï", "", x))



In [388]:
df_train.head()

Unnamed: 0,text,label
0,presidential candidate mar roxas implies govt ...,1
1,parang mali sumunod patalastas nescaf coffee b...,1
2,bet ko pula kulay posas,1
3,username kakampink,0
4,parang tahimik pink doc willie ong reaction paper,1


In [389]:
df_train.sample(10)

Unnamed: 0,text,label
6123,norberto gonzales owemjiiiiii lee kuan yew phi...,0
21325,sana kakampinks let leni lead leni robredo,0
18744,username smnipresidential debate carl balita c...,0
12567,believe mar roxas call sen grace poe taken ser...,0
2237,di gane kaayo halos hadlok pangutana boy abund...,1
18857,makialam,0
21131,jackkosh usernameconvicted several tax evasion...,1
18766,grabe binay oi,0
17181,totoo bang num si binay presidential survey t...,1
19522,username lakas makamind conditioning survey ho...,0


In [390]:
df_train.head()

Unnamed: 0,text,label
0,presidential candidate mar roxas implies govt ...,1
1,parang mali sumunod patalastas nescaf coffee b...,1
2,bet ko pula kulay posas,1
3,username kakampink,0
4,parang tahimik pink doc willie ong reaction paper,1


In [391]:
wnl = WordNetLemmatizer()
df_train["text"] = df_train["text"].apply(lambda x: " ".join(wnl.lemmatize(word, "v") for word in x.split()))

df_train.head()

Unnamed: 0,text,label
0,presidential candidate mar roxas imply govt li...,1
1,parang mali sumunod patalastas nescaf coffee b...,1
2,bet ko pula kulay posas,1
3,username kakampink,0
4,parang tahimik pink doc willie ong reaction paper,1


In [392]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df_train["text"])
count_array = count_vectorizer.transform(df_train["text"]).toarray()

data_cv = pd.DataFrame(count_array, columns=count_vectorizer.get_feature_names_out())
data_cv


Unnamed: 0,aa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaaaaaaaaaaa,aaaaaah,aaaaaahhhhhhh,aaaaahhhhh,aaaah,aaaannd,...,zubiaga,zubiagapartylist,zubiri,zubiripinag,zulueta,zumba,zuriaga,zyx,zzaj,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21768,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [393]:
# Applying preprocessing to the datasets
df_train['cleaned_text'] = df_train['text']
df_validation['cleaned_text'] = df_validation['text']
df_test['cleaned_text'] = df_test['text']


In [394]:
# Vectorization
vectorizer = TfidfVectorizer()  # or CountVectorizer()
X_train = vectorizer.fit_transform(df_train['cleaned_text'])
y_train = df_train['label']

X_validation = vectorizer.transform(df_validation['cleaned_text'])
y_validation = df_validation['label']

X_test = vectorizer.transform(df_test['cleaned_text'])
y_test = df_test['label']

In [395]:
# Multinomial Naive Bayes Model Training
model = MultinomialNB()
model.fit(X_train, y_train)

In [396]:
# Validation
y_pred_val = model.predict(X_validation)
print("Validation Accuracy: ", accuracy_score(y_validation, y_pred_val))

Validation Accuracy:  0.8228571428571428


In [397]:
# Test
y_pred_test = model.predict(X_test)
print("Test Accuracy: ", accuracy_score(y_test, y_pred_test))

Test Accuracy:  0.8202846975088968


In [398]:
# Evaluation Metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Confusion Matrix:
[[1073  339]
 [ 166 1232]]


In [399]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      1412
           1       0.78      0.88      0.83      1398

    accuracy                           0.82      2810
   macro avg       0.83      0.82      0.82      2810
weighted avg       0.83      0.82      0.82      2810



In [410]:
# NEW: Testing Tagalog hate speech detection with new text input
new_text = pd.Series("pogi")

# Apply the same preprocessing steps to the new input
#new_text_cleaned = new_text.apply(preprocess_text)

# Transform the new text using the trained vectorizer (vect)
new_text_transform = vectorizer.transform(new_text)

# Make the prediction using the trained Naive Bayes model (nb)
prediction = model.predict(new_text_transform)

# Interpret the prediction result
if prediction == 1:
    print("The sentence is classified as hate speech.")
else:
    print("The sentence is classified as non-hate speech.")

The sentence is classified as hate speech.
