In [3]:
!pip install nltk scikit-learn pandas matplotlib

import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fetch the NLTK resources used later: the stop-word lists and the punkt
# tokenizer data.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Read the uploaded product-review CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/product_reviews_dataset.csv')

# Preview the top five rows as a quick sanity check
df.head(n=5)


Unnamed: 0,review_text,review_label
0,Good value for money. Highly recommend! Very u...,good
1,Five stars! Very useful and easy to use.,good
2,Very useful and easy to use. Very useful and e...,good
3,Loved it. Very useful and easy to use. Excelle...,good
4,Worst product ever! Very disappointed.,worst


In [6]:
# Set up stopwords and stemmer
import nltk
# Make sure every NLTK resource needed by this cell is present: the stop-word
# lists plus both punkt tokenizer packages (punkt and punkt_tab).
for resource_name in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource_name)
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# English stop words held in a set for fast membership checks, and a single
# Porter stemmer instance; both are read by preprocess() below.
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()

# Function to preprocess review text
def preprocess(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped; the rest are stemmed with the
    module-level ``stemmer`` and joined with single spaces.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            missing value) is treated as an empty review.

    Returns:
        The cleaned review as a ``str``; empty when no token survives.
    """
    # Anything that is not a string is handled as an empty review.
    raw = text if isinstance(text, str) else ''

    stems = []
    for token in nltk.word_tokenize(raw.lower()):
        # Skip punctuation/number tokens and stop words.
        if not token.isalpha() or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Run preprocess() over every review and keep the result in a new column
df['cleaned_review'] = df['review_text'].map(preprocess)

# Show raw text, cleaned text and label side by side for the first rows
df.loc[:, ['review_text', 'cleaned_review', 'review_label']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,review_text,cleaned_review,review_label
0,Good value for money. Highly recommend! Very u...,good valu money highli recommend use easi use ...,good
1,Five stars! Very useful and easy to use.,five star use easi use,good
2,Very useful and easy to use. Very useful and e...,use easi use use easi use highli recommend,good
3,Loved it. Very useful and easy to use. Excelle...,love use easi use excel product five star work...,good
4,Worst product ever! Very disappointed.,worst product ever disappoint,worst


In [8]:
# Convert cleaned text into numerical features
from sklearn.feature_extraction.text import TfidfVectorizer

# Target labels: one class name per review
y = df['review_label']

# Learn the TF-IDF vocabulary from the cleaned reviews and encode them as a
# document-term matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=df['cleaned_review'])

In [9]:
# Split the cleaned text (rather than the pre-computed matrix X) so that the
# TF-IDF vocabulary and IDF weights can be learned from the training rows only.
# The split depends only on the number of rows and random_state, so the
# train/test row assignment is the same as splitting X directly.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_review'], y, test_size=0.2, random_state=42
)

# Refit the vectorizer on the training text alone. Fitting it on the full
# corpus before splitting leaks test-set vocabulary and document frequencies
# into the features and makes the evaluation optimistic. The test text is only
# transformed with the statistics learned from the training text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)


In [10]:
# Label the held-out reviews with the trained classifier
y_pred = model.predict(X_test)

# Print each evaluation metric under its heading: overall accuracy, the
# per-class precision/recall/F1 report, and the confusion matrix.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.99

Classification Report:
               precision    recall  f1-score   support

         bad       1.00      0.97      0.99        73
        good       1.00      1.00      1.00        57
       worst       0.97      1.00      0.99        70

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200


Confusion Matrix:
 [[71  0  2]
 [ 0 57  0]
 [ 0  0 70]]


In [11]:
import joblib

# Persist the fitted classifier and its TF-IDF vectorizer to the working
# directory; both files are needed together to score new reviews.
joblib.dump(value=model, filename='naive_bayes_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']