<a href="https://colab.research.google.com/github/akbhrr/Numpy/blob/main/Customer_Rating_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#import libraries

In [14]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, classification_report


#load dataset

In [15]:

df = pd.read_csv("product_rating_prediction_5000.csv")
print("Dataset Loaded Successfully!")
print(df.head())


Dataset Loaded Successfully!
        product_name product_category  \
0            T-Shirt      Electronics   
1   Wireless Earbuds           Sports   
2   Face Moisturizer             Home   
3    Smartphone Case      Electronics   
4  Bluetooth Speaker          Fashion   

                                 product_description  \
0  horrible horrible scam waste of money. Packagi...   
1  superb high quality high quality premium. It a...   
2  satisfied reliable decent good. The product fe...   
3  excellent perfect high quality loved it. The b...   
4  worst broken scam horrible. The build quality ...   

                                         review_text     price  discount  \
0          terrible worst. The product feels sturdy.   1964.54        29   
1  superb value for money. The product feels sturdy.  15200.27        20   
2          decent decent. The build quality is okay.  16161.85        58   
3     superb high quality. The product feels sturdy.  13259.14        37   
4      

#cleaning

In [16]:
df["product_description"] = df["product_description"].fillna("")
df["review_text"] = df["review_text"].fillna("")
df["product_category"] = df["product_category"].fillna("Unknown")

#encoding

In [41]:
le= LabelEncoder()
df["product_category_encoded"] = le.fit_transform(df["product_category"])

#TF-IDF VECTORIZATION FOR TEXT

In [42]:
tfidf_desc = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_review = TfidfVectorizer(stop_words="english", max_features=1000)

desc_tfidf = tfidf_desc.fit_transform(df["product_description"]).toarray()
review_tfidf = tfidf_review.fit_transform(df["review_text"]).toarray()

desc_df = pd.DataFrame(desc_tfidf)
review_df = pd.DataFrame(review_tfidf)

#numeric features

In [43]:
numeric_features = df[["price", "discount", "product_category_encoded"]].reset_index(drop=True)

#combine all the features

In [44]:
X = pd.concat([numeric_features, desc_df, review_df], axis=1)
y = df["rating"]
print("features\n",X)
print("class label\n",y)

features
          price  discount  product_category_encoded    0         1         2  \
0      1964.54        29                         1  0.0  0.000000  0.000000   
1     15200.27        20                         4  0.0  0.000000  0.282422   
2     16161.85        58                         3  0.0  0.000000  0.000000   
3     13259.14        37                         1  0.0  0.000000  0.000000   
4     12995.90        21                         2  0.0  0.000000  0.000000   
...        ...       ...                       ...  ...       ...       ...   
4995  13934.07        50                         3  0.0  0.000000  0.000000   
4996  10725.94        42                         2  0.0  0.000000  0.373107   
4997  13610.74        61                         2  0.0  0.706952  0.000000   
4998  18912.02         0                         3  0.0  0.000000  0.000000   
4999  12876.18         3                         4  0.0  0.000000  0.000000   

             3         4         5       

# Convert all column names to string type to avoid TypeError in scikit-learn

In [45]:
X.columns = X.columns.astype(str)

print("Final feature shape:", X.shape)

Final feature shape: (5000, 97)


#TRAIN/TEST SPLIT

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#TRAIN RANDOM FOREST MODE

In [47]:
# 9. TRAIN RANDOM FOREST MODEL
# ============================================================
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42
)


model.fit(X_train, y_train)

#predict

In [48]:
y_pred = model.predict(X_test)


In [49]:
y_pred

array([2, 4, 5, 4, 3, 4, 4, 3, 2, 2, 5, 1, 5, 4, 4, 5, 3, 1, 1, 5, 1, 5,
       1, 4, 3, 3, 4, 3, 1, 1, 5, 2, 4, 5, 1, 4, 2, 4, 1, 1, 2, 1, 1, 3,
       2, 4, 1, 4, 1, 5, 1, 2, 5, 2, 5, 4, 2, 2, 4, 5, 5, 5, 2, 3, 1, 3,
       3, 4, 1, 1, 4, 3, 5, 3, 5, 2, 4, 3, 2, 4, 3, 2, 4, 2, 2, 2, 2, 3,
       5, 2, 3, 1, 4, 1, 1, 4, 2, 2, 4, 4, 5, 3, 4, 3, 4, 5, 4, 3, 4, 5,
       1, 4, 1, 3, 3, 2, 2, 2, 4, 1, 5, 5, 4, 3, 5, 1, 1, 1, 4, 3, 4, 5,
       5, 4, 2, 3, 3, 3, 1, 4, 4, 3, 2, 2, 4, 4, 3, 4, 2, 3, 5, 2, 1, 2,
       2, 1, 3, 5, 1, 3, 5, 5, 2, 2, 5, 4, 2, 2, 1, 4, 4, 5, 5, 5, 5, 1,
       5, 4, 1, 4, 2, 1, 2, 5, 1, 2, 3, 1, 5, 4, 4, 2, 5, 4, 5, 2, 3, 4,
       3, 2, 2, 2, 2, 1, 4, 3, 2, 1, 4, 3, 4, 5, 5, 4, 5, 5, 1, 1, 1, 3,
       4, 2, 1, 5, 4, 4, 2, 4, 4, 3, 1, 3, 1, 4, 1, 5, 4, 2, 1, 2, 1, 3,
       2, 2, 2, 3, 5, 3, 2, 4, 1, 1, 1, 5, 1, 1, 3, 2, 2, 1, 3, 2, 4, 3,
       2, 1, 1, 3, 5, 3, 3, 4, 4, 4, 1, 5, 2, 5, 3, 1, 1, 2, 3, 3, 5, 3,
       4, 1, 4, 5, 5, 4, 5, 1, 3, 5, 1, 5, 3, 5, 5,

#Evaluate Model

In [50]:
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00       193
           2       1.00      1.00      1.00       209
           3       1.00      1.00      1.00       212
           4       1.00      1.00      1.00       196
           5       1.00      1.00      1.00       190

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Accuracy: 1.0


#predict new

In [60]:
def predict_rating(product_name, category, description, review, price, discount):

    # 1. Encode category using same label encoder
    category_encoded = le.transform([category])[0]

    # 2. TF-IDF transform for description
    desc_vec = tfidf_desc.transform([description]).toarray()

    # 3. TF-IDF transform for review text
    review_vec = tfidf_review.transform([review]).toarray()

    # 4. numeric features
    numeric = np.array([[price, discount, category_encoded]])
    # 5. combine all features
    full_features = np.concatenate([numeric, desc_vec, review_vec], axis=1)

    # . Predict using trained model
    predicted_rating = model.predict(full_features)[0]

    return predicted_rating


# Example: Try predicting a new product
new_rating = predict_rating(
    product_name="Headphone",
    category="Electronics",
    description="average sound quality and normal battery backup",
    review="value for the money",
    price=1499,
    discount=20
)

print("Predicted Rating:", new_rating)


Predicted Rating: 3


