<a href="https://colab.research.google.com/github/ayowasco/brackets/blob/master/Dissertation_Code_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Model Libraries
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Model Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Ignore warnings
warnings.filterwarnings('ignore')


In [None]:
# LOAD AND PREPROCESS DATA
from sklearn.preprocessing import StandardScaler

# Read the engineered dataset from disk.
data = pd.read_csv('/mnt/data/engineered_dataset.csv')

# The 'Target' column is the label; every other column is a predictor.
y = data['Target']
X = data.drop(columns='Target')

# Replace each missing predictor value with the median of its own column.
X = X.fillna(X.median())

# Standardise the predictors with a StandardScaler.
# NOTE(review): both the medians above and this scaler are computed from
# every row of X, not from a training subset.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [None]:
# SPLITTING DATA INTO TRAINING AND TESTING

# Split the unscaled features once: an 80/20 split, stratified on the target,
# with a fixed seed. The raw frames feed the tree-based models directly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting on the full dataset would let the test rows' mean/variance leak
# into the features that the scale-sensitive models (SVM, MLP) are trained on.
# This rebinds `scaler`, replacing the instance fitted on all of X.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# INITIALIZE AND TRAIN MULTIPLE MODELS

# Candidate classifiers, keyed by display name. Each keeps its library's
# default hyperparameters apart from the fixed seed and the few options
# spelled out here. Insertion order is the order in which they are evaluated.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('CatBoost', CatBoostClassifier(verbose=0, random_state=42)),
    # probability=True makes SVC fit the extra model needed for predict_proba.
    ('SVM', SVC(probability=True, random_state=42)),
    ('MLP', MLPClassifier(random_state=42)),
])

# Function to train and evaluate models
def evaluate_model(model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the object passed in is trained
    by the time this function returns.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training labels.
        y_te: True labels for the test features.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and
        F1 are computed with ``average='weighted'``.
    """
    model.fit(X_tr, y_tr)
    predicted = model.predict(X_te)

    # The three class-averaged metrics share the same call signature.
    weighted_scorers = (precision_score, recall_score, f1_score)
    weighted_scores = [
        scorer(y_te, predicted, average='weighted') for scorer in weighted_scorers
    ]
    return (accuracy_score(y_te, predicted), *weighted_scores)

# Evaluate all models
# SVM and MLP are trained on the standardised features; every other model
# receives the unscaled ones.
results = {}
for label, estimator in models.items():
    needs_scaling = label in ('SVM', 'MLP')
    train_features = X_train_scaled if needs_scaling else X_train
    test_features = X_test_scaled if needs_scaling else X_test
    scores = evaluate_model(estimator, train_features, test_features, y_train, y_test)
    results[label] = dict(zip(['Accuracy', 'Precision', 'Recall', 'F1 Score'], scores))

# Display results
# One row per model, ranked from best to worst weighted F1.
results_df = pd.DataFrame(results).T
ranked_results = results_df.sort_values(by='F1 Score', ascending=False)
print(ranked_results)


In [None]:
# Hyperparameter Tuning for CatBoost, XGBoost, and LightGBM
# CATBOOST

# Search space: 2 x 2 x 3 x 3 = 36 candidate settings.
catboost_param_grid = dict(
    iterations=[100, 200],
    learning_rate=[0.01, 0.1],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
)

# Exhaustive 3-fold search scored by weighted F1, using all available cores.
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
catboost_grid = GridSearchCV(
    catboost_model,
    catboost_param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
catboost_grid.fit(X_train, y_train)
catboost_best = catboost_grid.best_estimator_

# Cross-validated outcome of the search.
print("Best Parameters for CatBoost:", catboost_grid.best_params_)
print("Best F1 Score for CatBoost:", catboost_grid.best_score_)

# Score the selected estimator on the held-out test split.
y_pred_catboost = catboost_best.predict(X_test)
catboost_test_f1 = f1_score(y_test, y_pred_catboost, average='weighted')
print("Test F1 Score for CatBoost:", catboost_test_f1)


In [None]:
# XGBOOST

# Search space: 2 x 2 x 3 x 2 x 2 = 48 candidate settings.
xgboost_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[4, 6, 8],
    subsample=[0.7, 1],
    colsample_bytree=[0.7, 1],
)

# Exhaustive 3-fold search scored by weighted F1, using all available cores.
xgboost_model = XGBClassifier(random_state=42, eval_metric='mlogloss', use_label_encoder=False)
xgboost_grid = GridSearchCV(
    xgboost_model,
    xgboost_param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
xgboost_grid.fit(X_train, y_train)
xgboost_best = xgboost_grid.best_estimator_

# Cross-validated outcome of the search.
print("Best Parameters for XGBoost:", xgboost_grid.best_params_)
print("Best F1 Score for XGBoost:", xgboost_grid.best_score_)

# Score the selected estimator on the held-out test split.
y_pred_xgboost = xgboost_best.predict(X_test)
xgboost_test_f1 = f1_score(y_test, y_pred_xgboost, average='weighted')
print("Test F1 Score for XGBoost:", xgboost_test_f1)


In [None]:
# LIGHTGBM

# Search space: 2 x 2 x 2 x 3 x 2 x 2 = 96 candidate settings.
lightgbm_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 50],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 1],
    'colsample_bytree': [0.7, 1]
}

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is positive. With the default of 0, bagging is disabled and
# both `subsample` values in the grid would train identical models, so the
# reported "best" subsample would be meaningless. Re-sample every iteration.
lightgbm_model = LGBMClassifier(random_state=42, subsample_freq=1)

# Exhaustive 3-fold search scored by weighted F1, using all available cores.
lightgbm_grid = GridSearchCV(estimator=lightgbm_model,
                             param_grid=lightgbm_param_grid,
                             cv=3,
                             scoring='f1_weighted',
                             n_jobs=-1)
lightgbm_grid.fit(X_train, y_train)
lightgbm_best = lightgbm_grid.best_estimator_

# Cross-validated outcome of the search.
print("Best Parameters for LightGBM:", lightgbm_grid.best_params_)
print("Best F1 Score for LightGBM:", lightgbm_grid.best_score_)

# Score the selected estimator on the held-out test split.
y_pred_lightgbm = lightgbm_best.predict(X_test)
lightgbm_test_f1 = f1_score(y_test, y_pred_lightgbm, average='weighted')
print("Test F1 Score for LightGBM:", lightgbm_test_f1)


In [None]:
# EVALUATING THE CONFUSION MATRIX

# Confusion Matrix for CatBoost
# Pin the label order explicitly (the sorted union of true and predicted labels,
# which is also what confusion_matrix uses by default) so the same labels can
# be drawn on the axes. Without tick labels the heatmap shows positional
# indices 0..k-1, which is misleading when the classes are not coded that way.
class_labels = np.union1d(y_test, y_pred_catboost)
conf_matrix = confusion_matrix(y_test, y_pred_catboost, labels=class_labels)

# Rows are the actual classes, columns the predicted classes.
fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - CatBoost')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [None]:
# FEATURE IMPORTANCE ANALYSIS

# Feature importance for CatBoost
# Pair each predictor column with the importance reported by the tuned
# CatBoost model, then show the ten highest-ranked features.
feature_names = X.columns
feature_importances = catboost_best.get_feature_importance()
fi_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(fi_df.head(10))


In [None]:
# FINAL MODEL EVALUATION AND CONCLUSION


# Compile and compare final results for CatBoost, XGBoost, and LightGBM
# Each entry records the best cross-validated weighted F1 from the grid search,
# the weighted F1 of the selected estimator on the test split, and the winning
# hyperparameters.
tuned_results = {
    model_name: {
        'Best CV F1 Score': grid.best_score_,
        'Test F1 Score': test_f1,
        'Best Parameters': grid.best_params_
    }
    for model_name, grid, test_f1 in [
        ('CatBoost', catboost_grid, catboost_test_f1),
        ('XGBoost', xgboost_grid, xgboost_test_f1),
        ('LightGBM', lightgbm_grid, lightgbm_test_f1),
    ]
}

# One row per tuned model.
tuned_results_df = pd.DataFrame(tuned_results).T
print(tuned_results_df)
