In [20]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Normalizer
# Fetch only the NLTK resources this notebook uses instead of the entire
# 'all' collection, which is very large and slow to download:
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up punkt_tab)
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

In [54]:
# Load the reviews file and keep just the review text and its star rating.
df = pd.read_csv('reviews.csv')[['Description', 'Stars']]

In [55]:
# Discard every row that is missing either the description or the rating,
# then display the remaining reviews.
df = df.dropna(axis='index', how='any')
df

Unnamed: 0,Description,Stars
0,"I am a large, sometimes an XL. I ordered a 4XL...",5.0
1,I purchased various Legendary Whitetails Men's...,5.0
2,"I love this jacket shirt. It is really nice, w...",5.0
3,The short answer to if you should go down from...,4.0
4,I got this for my husband for Christmas. Appar...,5.0
...,...,...
2995,Got this for my husband not knowing if it was ...,5.0
2996,"Great fabric and lining, this ""Shaket"" is made...",3.0
2997,This is a really nice jacket. It’s lined and ...,5.0
2998,"Was looking for a ""sh-acket"" that was not only...",5.0


In [53]:
# Shared resources used by preprocess_text:
# a WordNet lemmatizer and the English stopword list as a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [56]:
def preprocess_text(text):
    """Return a cleaned, space-joined version of ``text``.

    The text is tokenized with ``word_tokenize``, each token is lowercased,
    tokens found in the module-level ``stop_words`` set are discarded, and
    the remaining tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw review text to clean.

    Returns:
        A single string containing the surviving lemmas separated by spaces.
    """
    lowered = (word.lower() for word in word_tokenize(text))
    # Stopword filtering happens after lowercasing so capitalised stopwords match.
    kept = (word for word in lowered if word not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

In [84]:
# Add the cleaned review text as a new column and store the star ratings
# as integers, then display the result.
df = df.assign(
    processed_description=df['Description'].apply(preprocess_text),
    Stars=df['Stars'].astype(int),
)
df

Unnamed: 0,Description,Stars,processed_description
0,"I am a large, sometimes an XL. I ordered a 4XL...",5,"large , sometimes xl . ordered 4xl based diagr..."
1,I purchased various Legendary Whitetails Men's...,5,purchased various legendary whitetail men 's j...
2,"I love this jacket shirt. It is really nice, w...",5,"love jacket shirt . really nice , warm comfort..."
3,The short answer to if you should go down from...,4,"short answer go typical size : yes , probably ..."
4,I got this for my husband for Christmas. Appar...,5,got husband christmas . apparently popular . l...
...,...,...,...
2995,Got this for my husband not knowing if it was ...,5,got husband knowing going fit well . 's hard f...
2996,"Great fabric and lining, this ""Shaket"" is made...",3,"great fabric lining , `` shaket '' made materi..."
2997,This is a really nice jacket. It’s lined and ...,5,really nice jacket . ’ lined medium weight rai...
2998,"Was looking for a ""sh-acket"" that was not only...",5,looking `` sh-acket '' lightweight provided wa...


In [86]:
# Remove the raw text column now that processed_description holds the cleaned
# version. Rebinding df (rather than inplace=True) and ignoring an
# already-missing column keeps this cell safe to re-run; the in-place drop
# raised KeyError when the cell was executed a second time.
column_to_drop = 'Description'
df = df.drop(columns=[column_to_drop], errors='ignore')


In [87]:

# Display the frame to confirm that only Stars and processed_description remain.
df

Unnamed: 0,Stars,processed_description
0,5,"large , sometimes xl . ordered 4xl based diagr..."
1,5,purchased various legendary whitetail men 's j...
2,5,"love jacket shirt . really nice , warm comfort..."
3,4,"short answer go typical size : yes , probably ..."
4,5,got husband christmas . apparently popular . l...
...,...,...
2995,5,got husband knowing going fit well . 's hard f...
2996,3,"great fabric lining , `` shaket '' made materi..."
2997,5,really nice jacket . ’ lined medium weight rai...
2998,5,looking `` sh-acket '' lightweight provided wa...


(600,)

In [88]:
# TF-IDF featurizer for the review text: lowercases input, strips accents to
# ASCII, and removes scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    lowercase=True,
    use_idf=True,
)

In [89]:
# Feature matrix: TF-IDF weights learned from the preprocessed review text.
x = vectorizer.fit_transform(df['processed_description'])
# Target vector: the integer star rating of each review.
y = df['Stars']

In [90]:
# Split the preprocessed *text* first and fit TF-IDF on the training portion
# only. Fitting the vectorizer on every row before splitting lets test-set
# vocabulary and document frequencies leak into the training features.
# test_size, random_state and the number of rows are unchanged, so the same
# rows land in the train and test partitions as before.
text_Train, text_Test, y_Train, y_Test = train_test_split(
    df.processed_description, y, test_size=0.2, random_state=42
)
X_Train = vectorizer.fit_transform(text_Train)  # vocabulary + IDF from training rows only
X_Test = vectorizer.transform(text_Test)        # held-out rows reuse the training vocabulary
# NOTE(review): the classifiers trained on this split report 1.00 accuracy,
# which is implausible for this task. Presumably the dataset contains
# duplicate reviews that end up on both sides of the split - TODO confirm
# with df.duplicated().sum() and de-duplicate before splitting if so.

In [91]:
# Baseline 1: logistic regression on the TF-IDF features.
# LogisticRegression, accuracy_score and classification_report are imported
# in the notebook's top import cell rather than re-imported here.
logreg_classifier = LogisticRegression(max_iter=1000)  # allow up to 1000 solver iterations
logreg_classifier.fit(X_Train, y_Train)

# Evaluate on the held-out split: overall accuracy plus per-class metrics.
y_pred = logreg_classifier.predict(X_Test)
accuracy = accuracy_score(y_Test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_Test, y_pred))


Accuracy: 1.00
              precision    recall  f1-score   support

           3       1.00      1.00      1.00        56
           4       1.00      1.00      1.00        60
           5       1.00      1.00      1.00       484

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



In [92]:
# Baseline 2: multinomial naive Bayes on the same TF-IDF features.
# MultinomialNB, accuracy_score and classification_report are already imported
# in the notebook's top import cell, so the duplicate imports are dropped.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_Train, y_Train)

# Evaluate on the held-out split: overall accuracy plus per-class metrics.
y_pred = nb_classifier.predict(X_Test)
accuracy = accuracy_score(y_Test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_Test, y_pred))


Accuracy: 1.00
              precision    recall  f1-score   support

           3       1.00      1.00      1.00        56
           4       1.00      1.00      1.00        60
           5       1.00      1.00      1.00       484

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



In [93]:

random_review = "good fit and sewing"

# Apply the same cleaning (tokenize, lowercase, stopword removal, lemmatize)
# that the training text went through before vectorizing; otherwise the raw
# tokens may not match the vocabulary the classifier was trained on.
vectorized_review = vectorizer.transform([preprocess_text(random_review)])

# Predict with the logistic regression model; [0] unwraps the single prediction.
predicted_rating = logreg_classifier.predict(vectorized_review)[0]

print(f"The predicted rating for the review '{random_review}' is: {predicted_rating}")

The predicted rating for the review 'good fit and sewing' is: 5


In [94]:
random_review = "good fit and sewing"

# Apply the same cleaning (tokenize, lowercase, stopword removal, lemmatize)
# that the training text went through before vectorizing; otherwise the raw
# tokens may not match the vocabulary the classifier was trained on.
vectorized_review = vectorizer.transform([preprocess_text(random_review)])

# Predict with the naive Bayes model; [0] unwraps the single prediction.
predicted_rating = nb_classifier.predict(vectorized_review)[0]

print(f"The predicted rating for the review '{random_review}' is: {predicted_rating}")

The predicted rating for the review 'good fit and sewing' is: 5


In [107]:
random_review = "good fit but bad sewing"

# Apply the same cleaning (tokenize, lowercase, stopword removal, lemmatize)
# that the training text went through before vectorizing; otherwise the raw
# tokens may not match the vocabulary the classifier was trained on.
vectorized_review = vectorizer.transform([preprocess_text(random_review)])

# Predict with the naive Bayes model; [0] unwraps the single prediction.
predicted_rating = nb_classifier.predict(vectorized_review)[0]

print(f"The predicted rating for the review '{random_review}' is: {predicted_rating}")

The predicted rating for the review 'good fit but bad sewing' is: 4
