In [1]:
# Library import

import pandas as pd
import numpy as np
import random
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV

random_seed = 123
random.seed(random_seed)
np.random.seed(random_seed)

In [2]:
# Loading data
X_train = pd.read_csv("../Data/Split_Data/X_train.csv")
X_train_PCA = pd.read_csv("../Data/Split_Data/X_train_PCA.csv")
y_train = pd.read_csv("../Data/Split_Data/y_train.csv")
X_valid = pd.read_csv("../Data/Split_Data/X_valid.csv")
X_valid_PCA = pd.read_csv("../Data/Split_Data/X_valid_PCA.csv")
y_valid = pd.read_csv("../Data/Split_Data/y_valid.csv")
X_test = pd.read_csv("../Data/Split_Data/X_test.csv")
X_test_PCA = pd.read_csv("../Data/Split_Data/X_test_PCA.csv")
y_test = pd.read_csv("../Data/Split_Data/y_test.csv")

In [3]:
# Training dimensions
X_train_new = pd.concat([X_train, X_valid])
y_train_new = pd.concat([y_train, y_valid])
print(X_train_new.shape)
print(y_train_new.shape)

(1136, 1124)
(1136, 1)


In [4]:
# Training PCA dimensions
X_train_PCA_new = pd.concat([X_train_PCA, X_valid_PCA])
print(X_train_PCA_new.shape)

(1136, 75)


In [5]:
# Model test function
def test(models, X_train, y_train, cv = 5):
    results = {}
    kf = KFold(n_splits=cv)
    for i in models:
        predicted_vals = []
        actual_vals = []
        for j, (train_index, test_index) in enumerate(kf.split(X_train)):
            train_x = X_train.iloc[train_index,:]
            train_y = y_train.iloc[train_index,:]
            test_x = X_train.iloc[test_index,:]
            test_y = y_train.iloc[test_index,:]
            predicted_vals.extend(models[i].fit(train_x, train_y.values.ravel()).predict(test_x))
            actual_vals.extend(test_y.values.ravel())
        results[i] = [roc_auc_score(actual_vals, predicted_vals)]
    return pd.DataFrame(results)

In [6]:
# Full dataset models
reg_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}

models = {'OLS': LogisticRegression(solver='saga', penalty = "none"),
           'Lasso': GridSearchCV(LogisticRegression(solver='saga', penalty = "l1"), 
                               param_grid=reg_params,
                               cv = 10,
                               scoring = "neg_log_loss").fit(X_train_new, y_train_new.values.ravel()).best_estimator_,
           'Ridge': GridSearchCV(LogisticRegression(solver='saga', penalty = "l2"), 
                               param_grid=reg_params,
                               cv = 10,
                               scoring = "neg_log_loss").fit(X_train_new, y_train_new.values.ravel()).best_estimator_,}



In [7]:
reg_params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}

PCA_models = {'OLS': LogisticRegression(solver='saga', penalty = "none"),
           'Lasso': GridSearchCV(LogisticRegression(solver='saga', penalty = "l1"), 
                               param_grid=reg_params,
                               cv = 10,
                               scoring = "neg_log_loss").fit(X_train_PCA_new, y_train_new.values.ravel()).best_estimator_,
           'Ridge': GridSearchCV(LogisticRegression(solver='saga', penalty = "l2"), 
                               param_grid=reg_params,
                               cv = 10,
                               scoring = "neg_log_loss").fit(X_train_PCA_new, y_train_new.values.ravel()).best_estimator_,}



In [8]:
# Full dataset cv
test(models, X_train_new, y_train_new)



Unnamed: 0,OLS,Lasso,Ridge
0,0.725503,0.799881,0.738723


In [9]:
# PCA models cv
test(PCA_models, X_train_PCA_new, y_train_new)



Unnamed: 0,OLS,Lasso,Ridge
0,0.800522,0.800522,0.800522


In [10]:
# Full dataset best AUC
pred_y = models['Lasso'].fit(X_train_new, y_train_new.values.ravel()).predict(X_test)
results = roc_auc_score(y_test, pred_y)
print("AUC for the full LASSO model: " + str(round(results, 3)))

AUC for the full LASSO model: 0.75




In [11]:
# PCA dataset best AUC
pred_y_PCA = PCA_models['Lasso'].fit(X_train_PCA_new, y_train_new.values.ravel()).predict(X_test_PCA)
results_PCA = roc_auc_score(y_test, pred_y_PCA)
print("AUC for the PCA LASSO model: " + str(round(results_PCA, 3)))



AUC for the PCA LASSO model: 0.817
