# OBJECTIVE
To automatically classify customer product reviews into predefined categories or themes
(such as Delivery Issue, Product Quality, Customer Service, etc.) using Natural Language Processing (NLP) 
and Machine Learning (Multinomial Naive Bayes).
# IMPORTANCE
Companies like Amazon, Flipkart, etc., receive thousands of customer reviews. Manually reading and analyzing them is:
Time-consuming
Prone to human error
Hard to scale
This project automates that process by:
Reading each review, understanding what issue or topic it is about, and categorizing it into a theme.

# INPUT DATA
A CSV file (product_reviews.csv) containing:
review_text (the customer's review)
category (manually labeled themes for training)

# OUTPUT DATA
A trained machine learning model that can predict the category (theme) of a customer review.

# CHALLENGES
1 IMBALANCED DATA
2 LACK OF MULTILINGUAL SUPPORT
3 LACK OF SEMANTIC UNDERSTANDING

# IMPROVEMENTS
1 ADD MORE TRAINING DATA
2 REDUCE MODEL BIAS






In [19]:
# IMPORTING LIBRARIES
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  
from sklearn.metrics import classification_report
import joblib

# PREPROCESSING
# Built once at import time: clean_text runs for every row of the dataset,
# so rebuilding these per call is wasted work.
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a raw review string for vectorization.

    The text is lower-cased, digit runs and ASCII punctuation
    (``string.punctuation``) are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The normalized string; ``""`` when nothing but digits, punctuation
        and whitespace was supplied.

    Raises:
        AttributeError: If ``text`` is not a string (e.g. ``None`` or NaN).
    """
    text = text.lower()
    text = _DIGITS_RE.sub('', text)  # remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # remove ASCII punctuation
    text = _WHITESPACE_RE.sub(' ', text)  # collapse whitespace runs
    # Deleting digits/punctuation at either end leaves a dangling space
    # behind (e.g. "42 items" -> " items"), so trim the edges as well.
    return text.strip()

# LOAD DATASET
# Uses the columns `review_text` (raw review) and `category` (label).
df = pd.read_csv('data/product_reviews.csv')
# Only discard rows that are missing a field the model actually needs; a
# blanket dropna() would also throw away usable reviews that merely have a
# gap in some unrelated column.
df.dropna(subset=['review_text', 'category'], inplace=True)
df['clean_review'] = df['review_text'].apply(clean_text)

# SPLITTING DATASET
# 80/20 train/test split; random_state pins the shuffle so runs are repeatable.
# NOTE(review): the split is not stratified, so with imbalanced categories the
# test set may not mirror the class proportions - consider stratify=.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'], df['category'], test_size=0.2, random_state=42
)

# VECTORIZER
# The vocabulary is capped at 3000 terms and is learned from the training
# split only; the test split is merely transformed with it.
vectorizer = TfidfVectorizer(max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# MODEL TRAINING
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# EVALUATE
# Prints per-category precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

# SAVING
# The fitted vectorizer is persisted alongside the model: predictions must be
# made with the same vocabulary the model was trained on.
joblib.dump(model, "review_classifier_nb.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

#PREDICTION
# Filled on the first successful load so later predictions do not re-read and
# re-unpickle both artifact files from disk.
_ARTIFACT_CACHE = {}


def _load_artifacts():
    """Return the persisted ``(model, vectorizer)`` pair, loading it once."""
    if not _ARTIFACT_CACHE:
        # Load both into locals first so a failure on the second file does
        # not leave a half-populated cache behind.
        # NOTE: joblib.load unpickles arbitrary objects - only point this at
        # artifact files produced by a trusted training run.
        loaded_model = joblib.load("review_classifier_nb.pkl")
        loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")
        _ARTIFACT_CACHE["model"] = loaded_model
        _ARTIFACT_CACHE["vectorizer"] = loaded_vectorizer
    return _ARTIFACT_CACHE["model"], _ARTIFACT_CACHE["vectorizer"]


def predict_theme(text):
    """Predict the category (theme) of a single customer review.

    The review is cleaned with ``clean_text``, transformed with the persisted
    TF-IDF vectorizer and classified with the persisted model. Both artifacts
    are read from ``review_classifier_nb.pkl`` / ``tfidf_vectorizer.pkl`` in
    the current working directory on the first call and reused afterwards, so
    files replaced on disk later are not picked up until the cache is cleared
    (or this definition is re-executed).

    Args:
        text: The raw review text.

    Returns:
        The predicted label for the review (the first and only element of
        the model's prediction array).

    Raises:
        FileNotFoundError: If either artifact file does not exist yet.
    """
    model, vectorizer = _load_artifacts()
    text = clean_text(text)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    vec = vectorizer.transform([text])
    return model.predict(vec)[0]


# Smoke test: classify one sample review using the persisted artifacts
sample_theme = predict_theme("delivery was late")
print(sample_theme)

                  precision    recall  f1-score   support

Customer Service       1.00      1.00      1.00        29
  Delivery Issue       1.00      1.00      1.00        15
 Product Quality       1.00      1.00      1.00        16

        accuracy                           1.00        60
       macro avg       1.00      1.00      1.00        60
    weighted avg       1.00      1.00      1.00        60

Delivery Issue
