In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load datasets
# The data directory is defined once so the notebook can be pointed at another
# location without editing every read. It defaults to the original location
# and can be overridden with the INF1279H_DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get("INF1279H_DATA_DIR", "C:/GitHub/Machine-Learning/INF1279H"))

train_df = pd.read_csv(DATA_DIR / "train_data.csv")
test_df = pd.read_csv(DATA_DIR / "test_data.csv")

In [3]:
# Drop unnecessary columns (Clothing ID - not useful for prediction)
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
train_df = train_df.drop(columns=["Clothing ID"], errors="ignore")
test_df = test_df.drop(columns=["Clothing ID"], errors="ignore")

In [4]:
# Handle missing values
# NOTE(review): `imputer` is created here but not used by any cell visible in
# this notebook - confirm whether it is still needed.
imputer = SimpleImputer(strategy='most_frequent')

# Missing titles / review bodies become empty strings in both splits so the
# text columns can be concatenated later without producing NaN.
for frame in (train_df, test_df):
    for text_col in ("Title", "Review Text"):
        frame[text_col] = frame[text_col].fillna("")

In [5]:

# Encode categorical features
# Each categorical column is label-encoded with an encoder fitted on the
# training data only. The fitted encoders are kept in `label_encoders`, keyed
# by column name.
label_encoders = {}
categorical_cols = ["Division Name", "Department Name", "Class Name"]

for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str before fitting so the encoder sees one uniform type.
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # Handle unseen categories in test set.
    # The class -> code lookup is built once per column instead of calling
    # le.transform() (and scanning le.classes_) for every test row. The codes
    # are the positions in le.classes_, which is what le.transform() returns;
    # categories not seen during fitting are mapped to -1.
    class_to_code = {cls: code for code, cls in enumerate(le.classes_)}
    test_df[col] = (
        test_df[col]
        .astype(str)
        .map(class_to_code)
        .fillna(-1)          # unseen category -> -1
        .astype("int64")     # map() yields floats when any value was unseen
    )

    label_encoders[col] = le

In [6]:
# Convert text data into TF-IDF features
# Review body and title are joined into one string per row, separated by a
# space. The vectorizer is fitted on the training text only and then applied
# unchanged to the test text.
train_text = train_df["Review Text"] + " " + train_df["Title"]
test_text = test_df["Review Text"] + " " + test_df["Title"]

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_text)
test_tfidf = tfidf_vectorizer.transform(test_text)


In [7]:

# Define features and target
# Columns excluded from the tabular part of the feature matrix: the two raw
# text columns (represented by the TF-IDF matrices instead), the target
# "Rating", and "Recommended IND".
non_feature_cols = ["Review Text", "Title", "Rating", "Recommended IND"]

train_tabular = train_df.drop(columns=non_feature_cols)
test_tabular = test_df.drop(columns=non_feature_cols)

# TF-IDF columns come first, followed by the remaining tabular columns.
X_train = hstack((train_tfidf, train_tabular))
X_test = hstack((test_tfidf, test_tabular))

y_train = train_df["Rating"]
y_test = test_df["Rating"]


In [8]:
# Train Random Forest Model with Hyperparameter Tuning
# Search space for the grid search: 2 x 3 x 3 = 18 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

In [9]:
# Base estimator: fixed seed, with balanced class weights.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Search over `param_grid` with 3-fold cross-validation, selecting by
# accuracy; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model
best_rf = grid_search.best_estimator_

In [11]:
# Predictions
# Score the test features with the best estimator found by the grid search.
y_pred = best_rf.predict(X_test)

# Evaluate model
# Per-class precision / recall / F1 first, then overall accuracy to two decimals.
report = classification_report(y_test, y_pred)
final_accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:")
print(report)
print(f"Final Accuracy: {final_accuracy:.2f}")

Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.22      0.28       153
           2       0.36      0.12      0.18       317
           3       0.44      0.28      0.34       620
           4       0.41      0.16      0.23      1017
           5       0.66      0.95      0.78      2591

    accuracy                           0.61      4698
   macro avg       0.46      0.34      0.36      4698
weighted avg       0.55      0.61      0.54      4698

Final Accuracy: 0.61
