In [1]:
#for comparison purposes, we are running various classification methods on this data and comparing results
#we use a 2500 observation random sample, determine which models perform best, run on full dataset, then validate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy.linalg as LA

# Load the data
df_csv = pd.read_csv("league_data.csv", dtype={'win': str})

# Drop irrelevant/metadata columns
columns_to_drop = [
    'game_id', 'game_version', 'participant_id', 'puuid', 'summoner_name', 'summoner_id',
    'solo_tier', 'solo_rank', 'solo_lp', 'solo_wins', 'solo_losses',
    'flex_tier', 'flex_rank', 'flex_lp', 'flex_wins', 'flex_losses',
    'champion_mastery_lastPlayTime', 'champion_mastery_lastPlayTime_utc',
    'champion_id', 'map_id', 'platform_id', 'game_type', 'team_id',
    'game_start_utc', 'queue_id', 'game_mode'
]

# Filter for CLASSIC + ranked solo/duo games
df_filtered = df_csv[(df_csv['game_mode'] == 'CLASSIC') & (df_csv['queue_id'] == 420)].copy()

# Drop metadata columns
df_filtered_cleaned = df_filtered.drop(columns=[col for col in columns_to_drop if col in df_filtered.columns])

# Convert 'win' column to binary
df_filtered_cleaned['win'] = (df_filtered_cleaned['win'] == 'TRUE').astype(int)

# Drop non-numeric/categorical columns (and item columns)
df_numeric_only = df_filtered_cleaned.drop(columns=df_filtered_cleaned.select_dtypes(include=['object', 'category']).columns)
df_numeric_only = df_numeric_only.drop(columns=[col for col in df_numeric_only.columns if col.startswith("item")])

# Final predictor/response matrices
X = df_numeric_only.drop(columns=['win']).fillna(df_numeric_only.mean())
y = df_numeric_only['win']

# --- Subsample preparation (2,500 observations) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Robust PCA Manual Calculation of L and S ---
def robust_pca_fast(M, max_iter=150, tol=1e-4):
    def shrinkage_operator(x, tau):
        return np.sign(x) * np.maximum(np.abs(x) - tau, 0.)
    def svd_thresholding_operator(X, tau):
        U, S, Vh = LA.svd(X, full_matrices=False)
        S_thresh = shrinkage_operator(S, tau)
        return U @ np.diag(S_thresh) @ Vh
    S = np.zeros_like(M)
    Y = np.zeros_like(M)
    mu = np.prod(M.shape) / (4.0 * LA.norm(M, ord=1))
    mu_inv = 1.0 / mu
    lam = 1.0 / np.sqrt(np.max(M.shape))
    for _ in range(max_iter):
        L = svd_thresholding_operator(M - S + mu_inv * Y, mu_inv)
        S = shrinkage_operator(M - L + mu_inv * Y, lam * mu_inv)
        Y = Y + mu * (M - L - S)
        error = LA.norm(M - L - S, ord='fro')
        if error < tol:
            break
    return L, S

# --- Preprocessing and robust PCA ---
scaler_raw = StandardScaler()
X_scaled_sample = scaler_raw.fit_transform(X_sample)
L_sample, S_sample = robust_pca_fast(X_scaled_sample)

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(L_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- PCA and Logistic Regression ---
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_pca, y_train)
y_pred = log_reg.predict(X_test_pca)
y_proba = log_reg.predict_proba(X_test_pca)[:, 1]

# --- Performance output ---
print("PCLR Subsample Results (2,500 rows):")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba):.4f}")


PCLR Subsample Results (2,500 rows):
Accuracy:  0.8173
F1 Score:  0.8218
ROC AUC:   0.8950


In [2]:
#logistic regression, no PCA, l1 & L2 regularization
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 for quick performance) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- L2 (Ridge) Regularization ---
log_reg_l2 = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
log_reg_l2.fit(X_train_scaled, y_train)
y_pred_l2 = log_reg_l2.predict(X_test_scaled)
y_proba_l2 = log_reg_l2.predict_proba(X_test_scaled)[:, 1]

# --- L1 (Lasso) Regularization ---
log_reg_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
log_reg_l1.fit(X_train_scaled, y_train)
y_pred_l1 = log_reg_l1.predict(X_test_scaled)
y_proba_l1 = log_reg_l1.predict_proba(X_test_scaled)[:, 1]

# --- Print results ---
print("Logistic Regression WITHOUT PCA\n")

print("L2 Regularization:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_l2):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_l2):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_l2):.4f}\n")

print("L1 Regularization:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_l1):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_l1):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_l1):.4f}")


Logistic Regression WITHOUT PCA

L2 Regularization:
Accuracy:  0.8867
F1 Score:  0.8877
ROC AUC:   0.9532

L1 Regularization:
Accuracy:  0.8840
F1 Score:  0.8848
ROC AUC:   0.9527


In [3]:
#decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (reuse or reset this as needed) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train Decision Tree ---
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train_scaled, y_train)
y_pred_tree = tree_clf.predict(X_test_scaled)
y_proba_tree = tree_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("Decision Tree Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_tree):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_tree):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_tree):.4f}")


Decision Tree Results:
Accuracy:  0.7760
F1 Score:  0.7789
ROC AUC:   0.7760


In [4]:
#random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (reuse if already defined) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train Random Forest ---
forest_clf = RandomForestClassifier(n_estimators=100)
forest_clf.fit(X_train_scaled, y_train)
y_pred_forest = forest_clf.predict(X_test_scaled)
y_proba_forest = forest_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("Random Forest Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_forest):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_forest):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_forest):.4f}")


Random Forest Results:
Accuracy:  0.8640
F1 Score:  0.8654
ROC AUC:   0.9377


In [5]:
#XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 rows for speed) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train XGBoost ---
xgb_clf = XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_clf.predict(X_test_scaled)
y_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("XGBoost Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_xgb):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_xgb):.4f}")


XGBoost Results:
Accuracy:  0.8733
F1 Score:  0.8742
ROC AUC:   0.9469


In [6]:
#lightgbm
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 rows for consistency) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


# --- Train LightGBM ---
lgbm_clf = LGBMClassifier(verbose=-1)
lgbm_clf.fit(X_train_scaled, y_train)
y_pred_lgbm = lgbm_clf.predict(X_test_scaled)
y_proba_lgbm = lgbm_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("LightGBM Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_lgbm):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_lgbm):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_lgbm):.4f}")


LightGBM Results:
Accuracy:  0.8693
F1 Score:  0.8727
ROC AUC:   0.9524


In [7]:
#support vector machines
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 rows for consistency) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train SVM (with probability enabled for ROC AUC) ---
svm_clf = SVC(probability=True, kernel='rbf')
svm_clf.fit(X_train_scaled, y_train)
y_pred_svm = svm_clf.predict(X_test_scaled)
y_proba_svm = svm_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("Support Vector Machine Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_svm):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_svm):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_svm):.4f}")


Support Vector Machine Results:
Accuracy:  0.8720
F1 Score:  0.8703
ROC AUC:   0.9547


In [8]:
#k nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 rows for consistency) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train KNN ---
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train_scaled, y_train)
y_pred_knn = knn_clf.predict(X_test_scaled)
y_proba_knn = knn_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("K-Nearest Neighbors Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_knn):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_knn):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_knn):.4f}")


K-Nearest Neighbors Results:
Accuracy:  0.7787
F1 Score:  0.7769
ROC AUC:   0.8473


In [9]:
#naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 rows for consistency) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train Naive Bayes ---
nb_clf = GaussianNB()
nb_clf.fit(X_train_scaled, y_train)
y_pred_nb = nb_clf.predict(X_test_scaled)
y_proba_nb = nb_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("Naive Bayes Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_nb):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_nb):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_nb):.4f}")


Naive Bayes Results:
Accuracy:  0.7520
F1 Score:  0.7207
ROC AUC:   0.8250


In [10]:
#MLP classifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Subsample (2500 rows for consistency) ---
sample_indices = np.random.choice(X.index, size=2500, replace=False)
X_sample = X.loc[sample_indices]
y_sample = y.loc[sample_indices]

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train Neural Network (MLP) ---
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
mlp_clf.fit(X_train_scaled, y_train)
y_pred_mlp = mlp_clf.predict(X_test_scaled)
y_proba_mlp = mlp_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("Neural Network (MLP) Results:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_mlp):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_mlp):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_mlp):.4f}")


Neural Network (MLP) Results:
Accuracy:  0.8640
F1 Score:  0.8587
ROC AUC:   0.9371


In [11]:
#MLP: tinkering with hyperparameters and running on full dataset: two hidden layers (128, 64), relu activation, adam optimizer, 1000 epochs
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# --- Full dataset (already preprocessed into X and y) ---
X_full = X.copy()
y_full = y.copy()

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, stratify=y_full)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Tuned MLP Neural Network ---
mlp_tuned = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=1000)
mlp_tuned.fit(X_train_scaled, y_train)
y_pred_mlp = mlp_tuned.predict(X_test_scaled)
y_proba_mlp = mlp_tuned.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("Tuned Neural Network Results (Full Dataset):")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_mlp):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_mlp):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_mlp):.4f}")


Tuned Neural Network Results (Full Dataset):
Accuracy:  0.8795
F1 Score:  0.8780
ROC AUC:   0.9479


In [12]:
#lightGBM on full dataset
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# --- Full dataset (already preprocessed into X and y) ---
X_full = X.copy()
y_full = y.copy()

# --- Train/test split and scale ---
# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, stratify=y_sample)
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


# --- Train LightGBM ---
lgbm_clf = LGBMClassifier(verbose=-1)
lgbm_clf.fit(X_train_scaled, y_train)
y_pred_lgbm = lgbm_clf.predict(X_test_scaled)
y_proba_lgbm = lgbm_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("LightGBM Results (Full Dataset):")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_lgbm):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_lgbm):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_lgbm):.4f}")


LightGBM Results (Full Dataset):
Accuracy:  0.8587
F1 Score:  0.8515
ROC AUC:   0.9375


In [13]:
#XGBoost on full dataset
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# --- Full dataset (already preprocessed into X and y) ---
X_full = X.copy()
y_full = y.copy()

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, stratify=y_full)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train XGBoost ---
xgb_clf = XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_clf.predict(X_test_scaled)
y_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("XGBoost Results (Full Dataset):")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_xgb):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_xgb):.4f}")


XGBoost Results (Full Dataset):
Accuracy:  0.8911
F1 Score:  0.8909
ROC AUC:   0.9642


In [14]:
#L1 Logistic Regression on full dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# --- Full dataset (already preprocessed into X and y) ---
X_full = X.copy()
y_full = y.copy()

# --- Train/test split and scale ---
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, stratify=y_full)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- L1 (Lasso) Logistic Regression ---
log_reg_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
log_reg_l1.fit(X_train_scaled, y_train)
y_pred_l1 = log_reg_l1.predict(X_test_scaled)
y_proba_l1 = log_reg_l1.predict_proba(X_test_scaled)[:, 1]

# --- Evaluate performance ---
print("L1 Logistic Regression Results (Full Dataset):")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_l1):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred_l1):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba_l1):.4f}")


L1 Logistic Regression Results (Full Dataset):
Accuracy:  0.8807
F1 Score:  0.8781
ROC AUC:   0.9486


In [15]:
#lightgbm and xgboost are the best performers.  shuffle tests for validation:
from sklearn.utils import shuffle
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# --- Shuffle labels independently ---
y_shuffled = y_full.sample(frac=1, random_state=42).reset_index(drop=True)
X_shuffled = X_full.reset_index(drop=True)  # Align index with shuffled y

# --- Train/test split and scale (on mismatched X/y) ---
X_train, X_test, y_train_shuffled, y_test_shuffled = train_test_split(
    X_shuffled, y_shuffled, test_size=0.3, stratify=y_shuffled, random_state=42
)

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# --- LightGBM on shuffled labels ---
lgbm_clf = LGBMClassifier(verbose=-1)
lgbm_clf.fit(X_train_scaled, y_train_shuffled)
y_pred_lgbm = lgbm_clf.predict(X_test_scaled)
y_proba_lgbm = lgbm_clf.predict_proba(X_test_scaled)[:, 1]

# --- XGBoost on shuffled labels ---
xgb_clf = XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X_train_scaled, y_train_shuffled)
y_pred_xgb = xgb_clf.predict(X_test_scaled)
y_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1]

# --- Results ---
print("LightGBM (Shuffled Labels):")
print(f"Accuracy:  {accuracy_score(y_test_shuffled, y_pred_lgbm):.4f}")
print(f"F1 Score:  {f1_score(y_test_shuffled, y_pred_lgbm):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test_shuffled, y_proba_lgbm):.4f}\n")

print("XGBoost (Shuffled Labels):")
print(f"Accuracy:  {accuracy_score(y_test_shuffled, y_pred_xgb):.4f}")
print(f"F1 Score:  {f1_score(y_test_shuffled, y_pred_xgb):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test_shuffled, y_proba_xgb):.4f}")

LightGBM (Shuffled Labels):
Accuracy:  0.5030
F1 Score:  0.5033
ROC AUC:   0.5028

XGBoost (Shuffled Labels):
Accuracy:  0.4992
F1 Score:  0.4979
ROC AUC:   0.4960


In [16]:
#K-fold cross-validation for XGBoost & LightGBM
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np

# --- Full dataset (already preprocessed) ---
X_full = X.copy()
y_full = y.copy()

# --- Scale entire dataset up front ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index) #will need to be a dataframe with named columns for the loop below


# --- 5-Fold Stratified CV ---
kf = StratifiedKFold(n_splits=5, shuffle=True)

lgbm_aucs = []
xgb_aucs = []

for train_index, test_index in kf.split(X_scaled_df, y_full):
    X_train, X_test = X_scaled_df.iloc[train_index], X_scaled_df.iloc[test_index]
    y_train, y_test = y_full.iloc[train_index], y_full.iloc[test_index]

    
    # LightGBM
    lgbm_model = LGBMClassifier(verbose=-1)
    lgbm_model.fit(X_train, y_train)
    lgbm_proba = lgbm_model.predict_proba(X_test)[:, 1]
    lgbm_auc = roc_auc_score(y_test, lgbm_proba)
    lgbm_aucs.append(lgbm_auc)
    
    # XGBoost
    xgb_model = XGBClassifier(eval_metric='logloss')
    xgb_model.fit(X_train, y_train)
    xgb_proba = xgb_model.predict_proba(X_test)[:, 1]
    xgb_auc = roc_auc_score(y_test, xgb_proba)
    xgb_aucs.append(xgb_auc)

# --- Results ---
print("LightGBM K-Fold ROC AUCs:", np.round(lgbm_aucs, 4))
print(f"Mean AUC: {np.mean(lgbm_aucs):.4f} | Std Dev: {np.std(lgbm_aucs):.4f}\n")

print("XGBoost K-Fold ROC AUCs:", np.round(xgb_aucs, 4))
print(f"Mean AUC: {np.mean(xgb_aucs):.4f} | Std Dev: {np.std(xgb_aucs):.4f}")


LightGBM K-Fold ROC AUCs: [0.9651 0.9654 0.9639 0.9681 0.9664]
Mean AUC: 0.9658 | Std Dev: 0.0014

XGBoost K-Fold ROC AUCs: [0.9635 0.9643 0.9628 0.9676 0.965 ]
Mean AUC: 0.9646 | Std Dev: 0.0017


In [17]:
#L1-regularization we shuffle test and k-fold cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.utils import shuffle
import numpy as np

# --- Full dataset ---
X_full = X.copy()
y_full = y.copy()

# --- Scale full data ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full)

# ------------------------------
# Shuffle Test (L1 Logistic)
# ------------------------------
_, y_shuffled = shuffle(X_scaled, y_full)

X_train, X_test, y_train_shuff, y_test_shuff = train_test_split(X_scaled, y_shuffled, test_size=0.3, stratify=y_shuffled)

logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
logreg_l1.fit(X_train, y_train_shuff)
y_proba_shuff = logreg_l1.predict_proba(X_test)[:, 1]

print("L1 Logistic Regression – Shuffle Test")
print(f"ROC AUC (Shuffled): {roc_auc_score(y_test_shuff, y_proba_shuff):.4f}")
print()

# ------------------------------
# 5-Fold Cross-Validation
# ------------------------------
kf = StratifiedKFold(n_splits=5, shuffle=True)
auc_scores = []

for train_index, test_index in kf.split(X_scaled, y_full):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_full.iloc[train_index], y_full.iloc[test_index]
    
    model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    auc_scores.append(auc)

print("L1 Logistic Regression – 5-Fold Cross-Validation")
print("AUCs:", np.round(auc_scores, 4))
print(f"Mean AUC: {np.mean(auc_scores):.4f} | Std Dev: {np.std(auc_scores):.4f}")


L1 Logistic Regression – Shuffle Test
ROC AUC (Shuffled): 0.5060

L1 Logistic Regression – 5-Fold Cross-Validation
AUCs: [0.9455 0.9493 0.9531 0.9418 0.9485]
Mean AUC: 0.9476 | Std Dev: 0.0038


In [38]:
# validating the neural network model with shuffle test and k fold cross validation
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
import numpy as np

# Assume X and y are ready and preprocessed (quantitative only, scaled)
# X should be the same one used for the tuned MLP
X_nn = X.copy()
y_nn = y.copy()

# =========================
# Shuffle Test
# =========================
_, y_shuffled = shuffle(X_nn, y_nn)

X_train, X_test, y_train_shuff, y_test_shuff = train_test_split(
    X_nn, y_shuffled, test_size=0.3, stratify=y_shuffled
)

mlp_shuff = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', max_iter=1000)
mlp_shuff.fit(X_train, y_train_shuff)
y_proba_shuff = mlp_shuff.predict_proba(X_test)[:, 1]

print("MLP – Shuffle Test")
print(f"ROC AUC (Shuffled): {roc_auc_score(y_test_shuff, y_proba_shuff):.4f}\n")

# =========================
# 5-Fold Cross-Validation
# =========================
kf = StratifiedKFold(n_splits=5, shuffle=True)
auc_scores = []

for train_index, test_index in kf.split(X_nn, y_nn):
    X_train, X_test = X_nn.iloc[train_index], X_nn.iloc[test_index]
    y_train, y_test = y_nn.iloc[train_index], y_nn.iloc[test_index]
    
    model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', max_iter=1000)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    auc_scores.append(auc)

print("MLP – 5-Fold Cross-Validation")
print("AUCs:", np.round(auc_scores, 4))
print(f"Mean AUC: {np.mean(auc_scores):.4f} | Std Dev: {np.std(auc_scores):.4f}")


MLP – Shuffle Test
ROC AUC (Shuffled): 0.5011

MLP – 5-Fold Cross-Validation
AUCs: [0.8519 0.8484 0.8626 0.8712 0.8875]
Mean AUC: 0.8643 | Std Dev: 0.0141
