In [1]:
import os
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load datasets
# The data directory defaults to the original location; set the
# INF1279H_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = Path(os.environ.get("INF1279H_DATA_DIR", "C:/GitHub/Machine-Learning/INF1279H"))
train_df = pd.read_csv(DATA_DIR / "train_data.csv")
test_df = pd.read_csv(DATA_DIR / "test_data.csv")

In [3]:
# Drop unnecessary columns (Clothing ID - not useful for prediction)
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell is a no-op rather than a KeyError.
train_df = train_df.drop(columns=["Clothing ID"], errors="ignore")
test_df = test_df.drop(columns=["Clothing ID"], errors="ignore")

In [4]:
# Handle missing values
# NOTE(review): `imputer` is created here but not referenced again in the
# visible cells; kept so the name stays defined.
imputer = SimpleImputer(strategy='most_frequent')

# Missing text becomes the empty string in both splits.
for frame in (train_df, test_df):
    for text_col in ("Title", "Review Text"):
        frame[text_col] = frame[text_col].fillna("")

In [5]:
# Encode categorical features
# Each categorical column is label-encoded with an encoder fitted on the
# training split only; the fitted encoders are kept in `label_encoders`.
label_encoders = {}
categorical_cols = ["Division Name", "Department Name", "Class Name"]

for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # Handle unseen categories in test set.
    # LabelEncoder codes are positions in `le.classes_`, so build that lookup
    # once per column instead of calling `le.transform` for every row.
    class_to_code = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = (
        test_df[col]
        .astype(str)
        .map(class_to_code)   # categories absent from training become NaN
        .fillna(-1)           # ...and are encoded as -1
        .astype("int64")
    )

    label_encoders[col] = le

In [6]:
# Convert text data into TF-IDF features
# The review body and its title are joined with a single space and
# vectorized together; the vocabulary is fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

train_text = train_df["Review Text"] + " " + train_df["Title"]
train_tfidf = tfidf_vectorizer.fit_transform(train_text)

test_text = test_df["Review Text"] + " " + test_df["Title"]
test_tfidf = tfidf_vectorizer.transform(test_text)


In [7]:

# Define features and target
# Columns removed from the tabular features: the raw text (already covered by
# the TF-IDF matrix), the target "Rating", and "Recommended IND".
non_feature_cols = ["Review Text", "Title", "Rating", "Recommended IND"]

y_train = train_df["Rating"]
y_test = test_df["Rating"]

# TF-IDF columns come first, followed by the remaining DataFrame columns.
X_train = hstack((train_tfidf, train_df.drop(columns=non_feature_cols)))
X_test = hstack((test_tfidf, test_df.drop(columns=non_feature_cols)))

In [16]:

# Train LightGBM Model with Hyperparameter Tuning
# Search space: 3 values for each of 4 hyperparameters (81 combinations).
param_grid = dict(
    num_leaves=[31, 50, 100],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 500],
    max_depth=[-1, 10, 20],
)

In [9]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, scored by
# accuracy and run on all available cores.
lgb_clf = lgb.LGBMClassifier(class_weight='balanced', random_state=42)
grid_search = GridSearchCV(
    estimator=lgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model
best_lgb = grid_search.best_estimator_

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 103705
[LightGBM] [Info] Number of data points in the train set: 18788, number of used features: 2034
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


In [17]:
# Convert to DataFrame with feature names
# `.toarray()` materializes the full dense matrix, so both frames hold every
# TF-IDF column explicitly. Presumably the named columns are wanted so that
# fitting and predicting use matching feature names -- TODO confirm.
feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]
X_train_df = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test.toarray(), columns=feature_names)

# Train LightGBM Model
# The hyperparameter search has already been run by `grid_search`; repeating
# the whole search on a dense copy of the same data only repeats that work.
# Refit just the winning configuration on the named-column frame instead.
lgb_clf = lgb.LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    **grid_search.best_params_,
)

# Best model (`fit` returns the fitted estimator itself)
best_lgb = lgb_clf.fit(X_train_df, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 103705
[LightGBM] [Info] Number of data points in the train set: 18788, number of used features: 2034
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


In [18]:
# Predictions
# Predict on the named-column test frame.
y_pred = best_lgb.predict(X_test_df)

# Evaluate model
# Per-class precision/recall/F1 first, then overall accuracy to 4 decimals.
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print(f"Final Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Classification Report:
              precision    recall  f1-score   support

           1       0.33      0.20      0.25       153
           2       0.32      0.22      0.26       317
           3       0.39      0.34      0.36       620
           4       0.42      0.39      0.40      1017
           5       0.77      0.85      0.81      2591

    accuracy                           0.62      4698
   macro avg       0.44      0.40      0.42      4698
weighted avg       0.60      0.62      0.61      4698

Final Accuracy: 0.6222
