In [63]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [64]:
# Load datasets
# The data directory defaults to the original location but can be overridden with the
# INF1279H_DATA_DIR environment variable, so the notebook can run on another machine
# without editing this cell.
DATA_DIR = Path(os.environ.get("INF1279H_DATA_DIR", "C:/GitHub/Machine-Learning/INF1279H"))

train_df = pd.read_csv(DATA_DIR / "train_data.csv")
test_df = pd.read_csv(DATA_DIR / "test_data.csv")


In [65]:
# Remove identifier columns that are not used as model inputs ("Clothing ID").
# The same in-place drop is applied to the training and the test frame.
unused_cols = ["Clothing ID"]
for frame in (train_df, test_df):
    frame.drop(columns=unused_cols, inplace=True)

In [66]:
# Handle missing values
# NOTE(review): `imputer` is created here but is never fitted or applied in the
# visible cells -- confirm whether it is still needed.
imputer = SimpleImputer(strategy='most_frequent')

# Missing free-text fields become empty strings in both frames.
for frame in (train_df, test_df):
    for text_col in ("Title", "Review Text"):
        frame[text_col] = frame[text_col].fillna("")

In [67]:
# Encode categorical features
# Each categorical column is cast to string and label-encoded; the encoder is fitted
# on the training frame only and kept in `label_encoders` for later reuse.
label_encoders = {}
categorical_cols = ["Division Name", "Department Name", "Class Name"]

for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # Build the category -> code lookup once per column instead of calling
    # le.transform() and scanning le.classes_ for every single test row.
    # A label's code is its position in le.classes_, which is what transform() returns.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}

    # Categories that were not seen during fitting are absent from the lookup,
    # so map() yields NaN for them; those rows get the sentinel code -1.
    test_df[col] = (
        test_df[col]
        .astype(str)
        .map(code_by_category)
        .fillna(-1)
        .astype("int64")
    )

    label_encoders[col] = le

In [68]:
# Convert text data into TF-IDF features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# One document per row: the review body followed by its title, separated by a space.
train_corpus = train_df["Review Text"] + " " + train_df["Title"]
test_corpus = test_df["Review Text"] + " " + test_df["Title"]

# The vectorizer is fitted on the training corpus only; the test corpus is
# transformed with that already-fitted vectorizer.
train_tfidf = tfidf_vectorizer.fit_transform(train_corpus)
test_tfidf = tfidf_vectorizer.transform(test_corpus)

In [69]:

# Define features and target
# Columns left out of the tabular feature block: the two raw text columns,
# the target itself ("Recommended IND") and "Rating".
excluded_cols = ["Review Text", "Title", "Recommended IND", "Rating"]

y_train = train_df["Recommended IND"]
y_test = test_df["Recommended IND"]

# Final design matrices: TF-IDF columns first, then the remaining frame columns.
X_train = hstack([train_tfidf, train_df.drop(columns=excluded_cols)])
X_test = hstack([test_tfidf, test_df.drop(columns=excluded_cols)])

In [70]:
# # Train-Test Split for validation
# X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [71]:
# Hyperparameter search space for the random forest
# (2 x 3 x 3 = 18 candidate combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

In [72]:
# Base estimator: fixed seed for repeatable runs, balanced class weights.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Search over `param_grid` with 3-fold cross-validation, scored by weighted F1,
# using all available cores.
# NOTE(review): the TF-IDF vectorizer was fitted on the whole training set before
# this search, so each validation fold has already influenced the features;
# the cross-validation scores may be slightly optimistic.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [73]:
# Best model: the estimator exposed by the finished grid search for its
# best-scoring parameter combination (GridSearchCV's default refit behaviour
# is not overridden in the search cell).
best_rf = grid_search.best_estimator_

In [74]:

# Predictions: labels predicted by the selected model for the test feature matrix.
y_pred = best_rf.predict(X_test)


In [75]:
# Evaluate model
# Compare the test-set predictions with the true labels: a per-class report
# plus the weighted F1 score.
report = classification_report(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print("Classification Report:")
print(report)
print(f"Weighted F1-score: {weighted_f1:.4f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.74      0.64       851
           1       0.94      0.88      0.91      3847

    accuracy                           0.85      4698
   macro avg       0.75      0.81      0.77      4698
weighted avg       0.87      0.85      0.86      4698

Weighted F1-score: 0.8586


In [76]:
# # Final Model Training on Full Data
# best_rf.fit(X_train, y_train)

In [77]:
# # Predictions on Test Set
# y_test_pred = best_rf.predict(X_test)

In [78]:
# # Test Evaluation
# print("\nTest Set Performance:")
# print(classification_report(y_test, y_test_pred))
# print(f"Final Weighted F1-score: {f1_score(y_test, y_test_pred, average='weighted'):.4f}")