<a href="https://colab.research.google.com/github/badong0/CCMACLRL_EXERCISES_COM221ML/blob/main/Exercise7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 7: Hate Speech Classification using Multinomial Naive Bayes

Instructions:
- You do not need to split your data. Use the training, validation and test sets provided below.
- Use Multinomial Naive Bayes to train a model that can classify whether a sentence is hate speech or not
- A sentence with a label of zero (0) is classified as non-hate speech
- A sentence with a label of one (1) is classified as hate speech

Apply text pre-processing techniques such as
- Converting to lowercase
- Stop word Removal
- Removal of digits, special characters
- Stemming or Lemmatization but not both
- Count Vectorizer or TF-IDF Vectorizer but not both

Evaluate your model by:
- Providing input by yourself
- Creating a Confusion Matrix
- Calculating the Accuracy, Precision, Recall and F1-Score

In [47]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import seaborn as sns
import re
import os, types

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK data packages requested by this notebook: the "punkt"
# tokenizer models, the WordNet corpus and the stop-word lists.
nltk.download("punkt")
nltk.download('wordnet')
# This call is the last expression of the cell, so its return value is what
# the cell displays as output.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# CSV file name of each dataset split, keyed by split name.
splits = dict(
    train='unique_train_dataset.csv',
    validation='unique_validation_dataset.csv',
    test='unique_test_dataset.csv',
)

## Training Set

Use this to train your model

In [18]:
# Build the hf:// URL of the training split, then read it into a DataFrame.
train_csv_url = "hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits["train"]
df_train = pd.read_csv(train_csv_url)

## Validation Set

Use this set to evaluate your model

In [None]:
# Build the hf:// URL of the validation split, then read it into a DataFrame.
validation_csv_url = "hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits["validation"]
df_validation = pd.read_csv(validation_csv_url)

## Test Set

Use this set to test your model

In [None]:
# Build the hf:// URL of the test split, then read it into a DataFrame.
test_csv_url = "hf://datasets/mapsoriano/2016_2022_hate_speech_filipino/" + splits["test"]
df_test = pd.read_csv(test_csv_url)

In [48]:
# Importing necessary libraries
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Preview the first training rows. Jupyter only renders a bare expression when
# it is the LAST statement of a cell, so the original mid-cell
# `df_train.head()` produced no output; display it explicitly instead.
# (`display` is made available as a builtin by IPython/Jupyter.)
display(df_train.head())

# Download the necessary resources for nltk (if needed)
nltk.download('punkt')
nltk.download('wordnet')
# Initialize the WordNet Lemmatizer (shared by preprocess_text below)
lemmatizer = WordNetLemmatizer()

# Preprocessing Function for Tagalog Text with Extended Stop Words and Lemmatization
# Stop words (Tagalog + English) removed during preprocessing.
# Built once at module level: preprocess_text runs once per dataset row, and
# rebuilding this set on every call was pure overhead. The name also no longer
# shadows the `stopwords` corpus imported from nltk.
_STOP_WORDS = frozenset({
    # Tagalog stopwords
    'akin', 'aking', 'ako', 'alin', 'am', 'amin', 'aming', 'ang', 'ano', 'anumang', 'apat', 'at', 'atin', 'ating', 'ay',
    'bababa', 'bago', 'bakit', 'bawat', 'bilang', 'dahil', 'dalawa', 'dapat', 'din', 'dito', 'doon', 'gagawin',
    'gayunman', 'ginagawa', 'ginawa', 'ginawang', 'gumawa', 'gusto', 'habang', 'hanggang', 'hindi', 'huwag',
    'iba', 'ibaba', 'ibabaw', 'ibig', 'ikaw', 'ilagay', 'ilalim', 'ilan', 'inyong', 'isa', 'isang', 'itaas', 'ito',
    'iyo', 'iyon', 'iyong', 'ka', 'kahit', 'kailangan', 'kailanman', 'kami', 'kanila', 'kanilang', 'kanino', 'kanya',
    'kanyang', 'kapag', 'kapwa', 'karamihan', 'katiyakan', 'katulad', 'kaya', 'kaysa', 'ko', 'kong', 'kulang',
    'kumuha', 'kung', 'laban', 'lahat', 'lamang', 'likod', 'lima', 'maaari', 'maaaring', 'maging', 'mahusay',
    'makita', 'marami', 'marapat', 'masyado', 'may', 'mayroon', 'mga', 'minsan', 'mismo', 'mula', 'muli', 'na',
    'nabanggit', 'naging', 'nagkaroon', 'nais', 'nakita', 'namin', 'napaka', 'narito', 'nasaan', 'ng', 'ngayon', 'ni',
    'nila', 'nilang', 'nito', 'niya', 'niyang', 'noon', 'o', 'pa', 'paano', 'pababa', 'paggawa', 'pagitan',
    'pagkakaroon', 'pagkatapos', 'palabas', 'pamamagitan', 'panahon', 'pangalawa', 'para', 'paraan', 'pareho',
    'pataas', 'pero', 'pumunta', 'pumupunta', 'sa', 'saan', 'sabi', 'sabihin', 'sarili', 'sila', 'sino', 'siya',
    'tatlo', 'tayo', 'tulad', 'tungkol', 'una', 'walang',
    # Additional common Tagalog stop words (entries duplicated elsewhere in this set were dropped)
    'ba', 'eh', 'ha', 'hoy', 'nang', 'pala', 'po', 'yung', 'iyang', 'yan', 'ngunit',
    # English stopwords
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'could', 'did', 'do',
    'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'has', 'have', 'having',
    'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it',
    'its', 'itself', 'just', 'me', 'more', 'most', 'my', 'myself', 'no', 'nor', 'not', 'now', 'of', 'off', 'on',
    'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'she',
    'should', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'with', 'would', 'you',
    'your', 'yours', 'yourself', 'yourselves',
})

# Matches every digit and every character that is neither a word character nor
# whitespace. A single pass removes exactly what the two original
# substitutions (r'\d+' then r'[^\w\s]') removed. Note that '_' is a word
# character and is therefore kept.
_NOISE_RE = re.compile(r'\d|[^\w\s]')


def preprocess_text(text):
    """Normalize one sentence for vectorization.

    Steps, in order: lowercase, delete digits and special characters, split on
    whitespace, drop stop words (Tagalog + English), lemmatize each remaining
    word with the module-level ``lemmatizer``, and re-join with single spaces.

    Args:
        text: The raw sentence. Non-string values (e.g. NaN from a missing
            cell) are treated as empty text instead of raising.

    Returns:
        The cleaned sentence as a space-separated string; ``''`` when nothing
        remains or when ``text`` is not a string.
    """
    # Missing values reach .apply() as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ''
    # Characters are deleted (not replaced by a space), so "hate-speech"
    # becomes "hatespeech" - same as the original behavior.
    text = _NOISE_RE.sub('', text.lower())
    # NOTE(review): the lemmatizer is WordNet-based, so it presumably only
    # changes English tokens and passes Tagalog words through - confirm.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in _STOP_WORDS
    )

# Add a cleaned copy of the text column to every split.
for split_frame in (df_train, df_validation, df_test):
    split_frame['cleaned_text'] = split_frame['text'].apply(preprocess_text)

# Target labels of each split.
y_train, y_validation, y_test = df_train['label'], df_validation['label'], df_test['label']

# TF-IDF features: the vectorizer is fitted on the training split only and then
# applied as-is to the validation and test splits.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train['cleaned_text'])
X_validation, X_test = (
    vectorizer.transform(frame['cleaned_text']) for frame in (df_validation, df_test)
)

# Train the Multinomial Naive Bayes classifier on the training features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict labels for the validation and test splits.
y_pred_val = model.predict(X_validation)
y_pred_test = model.predict(X_test)

# Overall accuracy on each split.
val_accuracy = accuracy_score(y_validation, y_pred_val)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Validation Accuracy: ", val_accuracy)
print("Test Accuracy: ", test_accuracy)

# Detailed test-set metrics: confusion matrix, then per-class
# precision / recall / F1.
test_confusion = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(test_confusion)

test_report = classification_report(y_test, y_pred_test)
print("\nClassification Report:")
print(test_report)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validation Accuracy:  0.8382142857142857
Test Accuracy:  0.8380782918149466
Confusion Matrix:
[[1134  278]
 [ 177 1221]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.80      0.83      1412
           1       0.81      0.87      0.84      1398

    accuracy                           0.84      2810
   macro avg       0.84      0.84      0.84      2810
weighted avg       0.84      0.84      0.84      2810



In [56]:
# Classify a new, hand-written Tagalog sentence with the trained model.
new_text = pd.Series("takot ka naman")
# Clean the input with the same preprocessing that was applied to the training
# text; the vectorizer was fitted on cleaned text, so raw input would be
# tokenized differently (e.g. around punctuation) from what the model learned.
new_text_cleaned = new_text.apply(preprocess_text)
# Turn the cleaned text into TF-IDF features with the fitted vectorizer.
new_text_transform = vectorizer.transform(new_text_cleaned)
# Predict with the trained Naive Bayes model.
prediction = model.predict(new_text_transform)
# `prediction` holds one label per input sentence; read the single label
# explicitly rather than relying on the truthiness of a one-element array.
if prediction[0] == 1:
    print("The sentence is classified as hate speech.")
else:
    print("The sentence is classified as non-hate speech.")

The sentence is classified as hate speech.
