In [2]:
import pandas as pd
import re
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

df = pd.read_csv('reviews_badminton\data.csv')
df['Sentiment'] = df['Ratings'].apply(lambda x: 1 if x >= 4 else 0)

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['Cleaned_Review'] = df['Review text'].apply(clean_text)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['Cleaned_Review'], df['Sentiment'], test_size=0.2, random_state=42
)

tfidf_ngram = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_ngram = tfidf_ngram.fit_transform(X_train_raw)
X_test_ngram = tfidf_ngram.transform(X_test_raw)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf_model.fit(X_train_ngram, y_train)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_ngram, y_train)



balanced_log_model = LogisticRegression(max_iter=1000)
balanced_log_model.fit(X_train_res, y_train_res)

print("--- Random Forest + Bigrams ---")
print(classification_report(y_test, rf_model.predict(X_test_ngram)))

print("\n--- SMOTE + Balanced Logistic Regression ---")
print(classification_report(y_test, balanced_log_model.predict(X_test_ngram)))

joblib.dump(rf_model, 'best_sentiment_model.pkl')
joblib.dump(tfidf_ngram, 'tfidf_ngram_vectorizer.pkl')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ychau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ychau\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


--- Random Forest + Bigrams ---
              precision    recall  f1-score   support

           0       0.68      0.59      0.63       325
           1       0.91      0.93      0.92      1379

    accuracy                           0.87      1704
   macro avg       0.79      0.76      0.78      1704
weighted avg       0.86      0.87      0.86      1704


--- SMOTE + Balanced Logistic Regression ---
              precision    recall  f1-score   support

           0       0.59      0.70      0.64       325
           1       0.93      0.88      0.90      1379

    accuracy                           0.85      1704
   macro avg       0.76      0.79      0.77      1704
weighted avg       0.86      0.85      0.85      1704



['tfidf_ngram_vectorizer.pkl']