In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

In [11]:

# Load Dataset
# Read the reviews CSV (path is relative to the working directory) into `df`
# and show the first five rows as a preview.
df = pd.read_csv(filepath_or_buffer="reviews.csv")
df.head(n=5)

Unnamed: 0,category,rating,text,label
0,Home_and_Kitchen,5.0,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen,5.0,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen,5.0,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen,1.0,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen,5.0,Very nice set. Good quality. We have had the s...,1


In [12]:

# Drop duplicates and missing values, then clean text: strip and lowercase.
# The steps are chained so the cleaned frame is built in one expression and
# `df` is rebound to the result.
df = (
    df.drop_duplicates()
    .dropna()
    .assign(
        text=lambda frame: frame['text'].str.strip().str.lower(),
        category=lambda frame: frame['category'].str.strip().str.lower(),
    )
)

# Encode Categorical Columns
# `le_category` keeps the fitted mapping from category strings to integer
# codes; the `category` column is overwritten with those codes.
le_category = LabelEncoder()
df['category'] = le_category.fit_transform(df['category'])

# Text Vectorization 
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_text = vectorizer.fit_transform(df['text'])


In [13]:
# Combine Features 
# Use category and rating as additional features
X_other = df[['category', 'rating']].values
X_other = StandardScaler().fit_transform(X_other)

# Combine sparse text matrix with numeric features
X = hstack([X_text, X_other])
y = df['label']


In [14]:

# Train-Test Split 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model Training 
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8619923466238736