In [22]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

%matplotlib inline
# Display settings: show at most 10 DataFrame rows, and print NumPy floats
# with 4 decimals without scientific notation.
pd.set_option("display.max_rows", 10)
np.set_printoptions(suppress=True, precision=4)

In [2]:
# Load the review dataset (path is relative to the working directory) and
# preview it as the cell output.
reviews_csv = 'balanced_reviews.csv'
df = pd.read_csv(reviews_csv)
df

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,1163076,42738,1728091,frosty1961,0,This was very good but I added 1/2 a cup of sh...,2010-11-12T14:38:46Z,2010-11-12T14:38:46Z
1,187623,129556,90324,Tarra1,0,We call this ABC Dip almonds bacon and cheese....,2005-07-13T10:05:25Z,2005-07-13T10:05:25Z
2,1343062,208146,2465854,jarvisg,0,I made this for an office potluck; threw it to...,2012-10-24T16:41:50Z,2012-10-24T16:41:50Z
3,538267,49769,64642,Molly53,0,I have this very same recipe in an old cookboo...,2007-12-20T21:10:59Z,2007-12-20T21:10:59Z
4,1174163,169929,679953,weekend cooker,0,Great cookies! !! Made recipe as posted with n...,2010-12-20T17:56:14Z,2010-12-20T17:56:14Z
...,...,...,...,...,...,...,...,...
95995,1427035,268418,114197,Dave of Tucson,5,The seasoning is the best that I have tried! T...,2014-01-24T16:17:08Z,2014-01-24T16:17:08Z
95996,988862,145591,7676,Dr. Paul,5,Great Recipe. I'm living in a guest house in K...,2009-11-14T12:15:27Z,2009-11-14T12:15:27Z
95997,489331,118821,513848,memaw 2,5,delicious,2007-10-01T16:33:20Z,2007-10-01T16:33:20Z
95998,317323,128343,130819,Gerry,5,We love crunchy when it comes to perogies and ...,2006-09-28T16:10:03Z,2006-09-28T16:10:03Z


## Preprocess the text

In [4]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English entries in a set for membership tests during cleaning.
nltk.download("stopwords")
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)

def clean_text(text, stopword_set=None):
    """Normalize one review for bag-of-words features.

    Steps: lowercase, drop digit runs, turn punctuation into spaces, then
    remove stop words.

    Parameters
    ----------
    text : object
        Raw review. Missing values (``None``/``NaN``) yield ``""``; any other
        non-string value is converted with ``str`` before cleaning.
    stopword_set : collection of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words``
        set is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words
    # str() guards against numeric cells, which have no .lower().
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    # Replace punctuation with a space instead of deleting it: deletion glues
    # unspaced neighbours together ("good,but" -> "goodbut") and fuses
    # contractions ("don't" -> "dont") so their parts can no longer match
    # stop-word entries.
    text = re.sub(r'[^\w\s]', ' ', text)
    return " ".join(word for word in text.split() if word not in stopword_set)

# Derive both columns first, then write them back: the cleaned review text
# goes into a new column and the ratings are cast to integers.
cleaned_reviews = df["Review"].apply(clean_text)
integer_ratings = df["Rating"].astype(int)
df["Cleaned_Review"] = cleaned_reviews
df["Rating"] = integer_ratings

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\awanh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Applying TF-IDF

In [6]:
# Fit TF-IDF over the cleaned reviews, capping the vocabulary at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

tfidf_matrix = tfidf_vectorizer.fit_transform(df["Cleaned_Review"])

# Wrap the scipy sparse matrix without densifying it. Calling .toarray() on an
# (n_reviews x 5000) float matrix needs roughly 40 KB per review (several GB
# for the ~96k rows loaded above) even though almost every entry is zero.
# The result is still a DataFrame with one column per vocabulary term.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

tfidf_df


Unnamed: 0,abc,abit,able,absolute,absolutely,absolutly,absorb,absorbed,abundance,accent,...,zesty,zing,zingo,zip,ziploc,ziplock,ziti,zucchini,zuchinni,zwt
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.499542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95996,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95997,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95998,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Target: the integer Rating column. Features: the TF-IDF columns.
y = df["Rating"]

# NOTE(review): tfidf_df is produced by a vectorizer fitted on every review,
# so the held-out rows already influenced the vocabulary and IDF weights.
# Consider splitting the text first and fitting the vectorizer on the
# training portion only.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_df,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Score the held-out 20% and report plain accuracy.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.48


## ROC Curve