# Naive Bayes Classifier Models on Text Data
## Author: Aron Gu
## Date: November 9, 2024

## Import Libraries

In [483]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.metrics import classification_report, f1_score, accuracy_score
import re

## Load Text Dataset

In [484]:
text_data = pd.read_csv('valid.csv')

## Dataset Info

In [485]:
# Display first 30 rows of dataset
text_data.head(30)

Unnamed: 0,id,text,label
0,0,We started giving this to my 5 year old Labrad...,dogs
1,1,This product is easy to set up and use. I have...,dogs
2,2,my dog has nerves and wants to itch and chew.....,dogs
3,3,the breeded gave us a can of Nupro when we too...,dogs
4,4,"Very study, well made poop bag. Easy to open a...",dogs
5,5,Perfect size for 2 cats. It's made out of stur...,cats
6,6,Overpriced for the size. I was expecting a muc...,dogs
7,7,"Our dog is usually a nibbler, but not with thi...",dogs
8,8,My dogs love these! I give them either as stan...,dogs
9,9,My mom gave me one of these for my cats as I h...,cats


In [486]:
# Display info on dataset
text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15499 entries, 0 to 15498
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      15499 non-null  int64 
 1   text    15499 non-null  object
 2   label   15499 non-null  object
dtypes: int64(1), object(2)
memory usage: 363.4+ KB


In [487]:
# Get statistics on labels for dataset
text_data['label'].describe()

count     15499
unique        2
top        dogs
freq       9333
Name: label, dtype: object

## Preprocess Text in Dataset

In [488]:
# Basic text cleaning function without manual stop word removal
def preprocess_text(text):
    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Convert text to lowercase
    return text.lower()

# Apply preprocessing to text data
text_data['processed_text'] = text_data['text'].apply(preprocess_text)
text_data.head(30)

Unnamed: 0,id,text,label,processed_text
0,0,We started giving this to my 5 year old Labrad...,dogs,we started giving this to my year old labrado...
1,1,This product is easy to set up and use. I have...,dogs,this product is easy to set up and use i have ...
2,2,my dog has nerves and wants to itch and chew.....,dogs,my dog has nerves and wants to itch and chewth...
3,3,the breeded gave us a can of Nupro when we too...,dogs,the breeded gave us a can of nupro when we too...
4,4,"Very study, well made poop bag. Easy to open a...",dogs,very study well made poop bag easy to open and...
5,5,Perfect size for 2 cats. It's made out of stur...,cats,perfect size for cats its made out of sturdy ...
6,6,Overpriced for the size. I was expecting a muc...,dogs,overpriced for the size i was expecting a much...
7,7,"Our dog is usually a nibbler, but not with thi...",dogs,our dog is usually a nibbler but not with this...
8,8,My dogs love these! I give them either as stan...,dogs,my dogs love these i give them either as stand...
9,9,My mom gave me one of these for my cats as I h...,cats,my mom gave me one of these for my cats as i h...


## Feature Engineering(Remove Irrelevant Features)

- Since we only will use the preprocessed text to determine the labels, the text and id columns can be removed from the dataset

In [489]:
# Remove text and id columns from text_data set
text_data.drop(['id', 'text'], axis=1, inplace=True)
text_data.head(30)

Unnamed: 0,label,processed_text
0,dogs,we started giving this to my year old labrado...
1,dogs,this product is easy to set up and use i have ...
2,dogs,my dog has nerves and wants to itch and chewth...
3,dogs,the breeded gave us a can of nupro when we too...
4,dogs,very study well made poop bag easy to open and...
5,cats,perfect size for cats its made out of sturdy ...
6,dogs,overpriced for the size i was expecting a much...
7,dogs,our dog is usually a nibbler but not with this...
8,dogs,my dogs love these i give them either as stand...
9,cats,my mom gave me one of these for my cats as i h...


## Segreating Variables: Independent and Dependent Variables

In [490]:
X = text_data['processed_text']
y = text_data['label']

## Split Data into Train and Test Sets

In [491]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Create Bag of Words for Training and Test Data(Multinomial and Complement Naive Bayes Model)

In [492]:
count_vector = CountVectorizer(stop_words='english')
train_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

## Create Bag of Words for Training and Test Data(Bernoulli Naive Bayes Model)

In [493]:
count_vector_binary = CountVectorizer(stop_words='english', binary=True)
train_data_binary = count_vector.fit_transform(X_train)
test_data_binary = count_vector.transform(X_test)

## Build Multinomial Naive Bayes Model

In [494]:
multinomial_NB_model = MultinomialNB()

# Train model and make predictions on training and test data
multinomial_NB_model.fit(train_data, y_train)

y_predict_train = multinomial_NB_model.predict(train_data)
y_predict = multinomial_NB_model.predict(test_data)

In [495]:
# Get accuracy for train and test data
print(f"Accuracy for Training data: {accuracy_score(y_train, y_predict_train): .4f}")
print(f"Accuracy for Testing data: {accuracy_score(y_test, y_predict): .4f}")

Accuracy for Training data:  0.8930
Accuracy for Testing data:  0.8413


In [496]:
# Get f1 score for train and test data
print(f"F1 Score for Training data on Dogs: {f1_score(y_train, y_predict_train, pos_label='dogs'): .4f}")
print(f"F1 Score for Testing data on Dogs: {f1_score(y_test, y_predict, pos_label='dogs'): .4f}")
print(f"F1 Score for Training data on Cats: {f1_score(y_train, y_predict_train, pos_label='cats'): .4f}")
print(f"F1 Score for Testing data on Cats: {f1_score(y_test, y_predict, pos_label='cats'): .4f}")

F1 Score for Training data on Dogs:  0.9131
F1 Score for Testing data on Dogs:  0.8708
F1 Score for Training data on Cats:  0.8607
F1 Score for Testing data on Cats:  0.7942


In [497]:
# Print classification report for training data
print("Classification Report:\n", classification_report(y_train, y_predict_train))

Classification Report:
               precision    recall  f1-score   support

        cats       0.89      0.83      0.86      4301
        dogs       0.90      0.93      0.91      6548

    accuracy                           0.89     10849
   macro avg       0.89      0.88      0.89     10849
weighted avg       0.89      0.89      0.89     10849



In [498]:
# Print classification report for test data
print("Classification Report:\n", classification_report(y_test, y_predict))

Classification Report:
               precision    recall  f1-score   support

        cats       0.83      0.76      0.79      1865
        dogs       0.85      0.89      0.87      2785

    accuracy                           0.84      4650
   macro avg       0.84      0.83      0.83      4650
weighted avg       0.84      0.84      0.84      4650



## Build Bernoulli Naive Bayes Model

In [499]:
bernoulli_NB_model = BernoulliNB()

# Train model and make predictions on training and test data
bernoulli_NB_model.fit(train_data_binary, y_train)

y_predict_train_binary = bernoulli_NB_model.predict(train_data_binary)
y_predict_binary = bernoulli_NB_model.predict(test_data_binary)

In [500]:
# Get accuracy for train and test data
print(f"Accuracy for Training data: {accuracy_score(y_train, y_predict_train_binary): .4f}")
print(f"Accuracy for Testing data: {accuracy_score(y_test, y_predict_binary): .4f}")

Accuracy for Training data:  0.8483
Accuracy for Testing data:  0.7905


In [501]:
# Get f1 score for train and test data
print(f"F1 Score for Training data on Dogs: {f1_score(y_train, y_predict_train_binary, pos_label='dogs'): .4f}")
print(f"F1 Score for Testing data on Dogs: {f1_score(y_test, y_predict_binary, pos_label='dogs'): .4f}")
print(f"F1 Score for Training data on Cats: {f1_score(y_train, y_predict_train_binary, pos_label='cats'): .4f}")
print(f"F1 Score for Testing data on Cats: {f1_score(y_test, y_predict_binary, pos_label='cats'): .4f}")

F1 Score for Training data on Dogs:  0.8844
F1 Score for Testing data on Dogs:  0.8414
F1 Score for Training data on Cats:  0.7793
F1 Score for Testing data on Cats:  0.6916


In [502]:
# Print classification report for training data
print("Classification Report:\n", classification_report(y_train, y_predict_train_binary))

Classification Report:
               precision    recall  f1-score   support

        cats       0.92      0.68      0.78      4301
        dogs       0.82      0.96      0.88      6548

    accuracy                           0.85     10849
   macro avg       0.87      0.82      0.83     10849
weighted avg       0.86      0.85      0.84     10849



In [503]:
# Print classification report for test data
print("Classification Report:\n", classification_report(y_test, y_predict_binary))

Classification Report:
               precision    recall  f1-score   support

        cats       0.84      0.59      0.69      1865
        dogs       0.77      0.93      0.84      2785

    accuracy                           0.79      4650
   macro avg       0.81      0.76      0.77      4650
weighted avg       0.80      0.79      0.78      4650



## Build Complement Naive Bayes Model

In [504]:
complement_NB_model = ComplementNB()

# Train model and make predictions on training and test data
complement_NB_model.fit(train_data, y_train)

y_predict_train_complement = complement_NB_model.predict(train_data)
y_predict_complement = complement_NB_model.predict(test_data)

In [505]:
# Get accuracy for train and test data
print(f"Accuracy for Training data: {accuracy_score(y_train, y_predict_train_complement): .4f}")
print(f"Accuracy for Testing data: {accuracy_score(y_test, y_predict_complement): .4f}")

Accuracy for Training data:  0.8944
Accuracy for Testing data:  0.8394


In [506]:
# Get f1 score for train and test data
print(f"F1 Score for Training data on Dogs: {f1_score(y_train, y_predict_train_complement, pos_label='dogs'): .4f}")
print(f"F1 Score for Testing data on Dogs: {f1_score(y_test, y_predict_complement, pos_label='dogs'): .4f}")
print(f"F1 Score for Training data on Cats: {f1_score(y_train, y_predict_train_complement, pos_label='cats'): .4f}")
print(f"F1 Score for Testing data on Cats: {f1_score(y_test, y_predict_complement, pos_label='cats'): .4f}")

F1 Score for Training data on Dogs:  0.9130
F1 Score for Testing data on Dogs:  0.8672
F1 Score for Training data on Cats:  0.8656
F1 Score for Testing data on Cats:  0.7967


In [507]:
# Print classification report for training data
print("Classification Report:\n", classification_report(y_train, y_predict_train_complement))

Classification Report:
               precision    recall  f1-score   support

        cats       0.87      0.86      0.87      4301
        dogs       0.91      0.92      0.91      6548

    accuracy                           0.89     10849
   macro avg       0.89      0.89      0.89     10849
weighted avg       0.89      0.89      0.89     10849



In [508]:
# Print classification report for test data
print("Classification Report:\n", classification_report(y_test, y_predict_complement))

Classification Report:
               precision    recall  f1-score   support

        cats       0.81      0.78      0.80      1865
        dogs       0.86      0.88      0.87      2785

    accuracy                           0.84      4650
   macro avg       0.83      0.83      0.83      4650
weighted avg       0.84      0.84      0.84      4650



## Conclusions From Comparing the 3 Naive Bayes Models
- Multinomial and Complement Naive Bayes performs better than Bernoulli Naive Bayes given the higher accuracy scores
- This is to be expected since Multinomial and Complement accounts for the frequency of the words in each document(i.e. sentence) whereas Bernoulli only accounts for the presence or absence of words in each document(1 indicates presence of word in sentence, 0 indicates absence of word insentence)