In [None]:
# tell Jupyter to render matplotlib charts inline in the notebook
%matplotlib inline 
import pandas as pd # data frames
import numpy as np # matrix algebra
from matplotlib import pyplot as plt # normal charts
import seaborn as sns # fancy charts

# Regression

## Organize data

### Load data

In [None]:
# read data: download the Boston CSV and preview the first rows
boston_url = "https://raw.githubusercontent.com/askoshiyama/mli-cohort/master/boston.csv"
df = pd.read_csv(boston_url)
df.head()

In [None]:
# defining vars: inputs are the columns V1..V13, the output is T1
input_vars = ["V%d" % idx for idx in range(1, 14)]
output_var = ["T1"]

# dictionary of variables (tab-separated text file describing each column)
dictionary_url = "https://raw.githubusercontent.com/askoshiyama/mli-cohort3/master/Boston%20-%20Dictionary%20of%20Variables.txt"
pd.read_table(dictionary_url)

### Some "integrity" checks

In [None]:
# column data types as inferred by pandas when reading the CSV
df.dtypes

In [None]:
# summary statistics for every numeric column
df.describe()

In [None]:
# pairwise correlations, keeping only the row for the target T1 (as a one-row DataFrame)
df.corr().loc[["T1"]]

In [None]:
# scatter of the target against V2; `y` must be a single column label, so pass the
# string rather than the one-element list (otherwise the axis label reads "['T1']")
df.plot(y=output_var[0], x="V2", kind="scatter")

In [None]:
# joint distribution of V13 and T1 with a fitted regression line;
# x/y are keyword-only in current seaborn, so they are passed by name
sns.jointplot(data=df, x="V13", y="T1", kind="reg")

## Modelling

### Settings

In [None]:
from sklearn import metrics
from sklearn.model_selection import KFold

# 5-fold cv: rows are shuffled before splitting; the fixed random_state makes the folds reproducible
k_folds = KFold(n_splits=5, random_state=10, shuffle=True)

# performance metrics
# homework: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
def mse_score(y_true, y_pred):
    """Mean squared error: the average of the squared residuals ``y_true - y_pred``."""
    residuals = y_true - y_pred
    return np.mean(residuals ** 2.0)

def rmse_score(y_true, y_pred):
    """Root mean squared error: the square root of ``mse_score(y_true, y_pred)``."""
    mean_squared_error = mse_score(y_true, y_pred)
    return np.sqrt(mean_squared_error)

def mad_score(y_true, y_pred):
    """Mean absolute deviation: the average of ``|y_true - y_pred|``."""
    absolute_errors = np.abs(y_true - y_pred)
    return np.mean(absolute_errors)

def pr2_score(y_true, y_pred):
    """Pseudo-R2: the squared Pearson correlation between ``y_true`` and ``y_pred``."""
    correlation_matrix = np.corrcoef(y_true, y_pred)
    pearson_r = correlation_matrix[0, 1]  # off-diagonal entry = corr(y_true, y_pred)
    return pearson_r ** 2.0

#def pr2_score(y_true, y_pred):
#    return (np.cov(y_true, y_pred)[0, 1] / (np.std(y_true) * np.std(y_pred))) ** 2.0

def r2_score(y_true, y_pred):
    """Coefficient of determination: ``1 - SS_residual / SS_total``.

    ``SS_total`` is measured around the mean of ``y_true``.
    """
    ss_residual = np.sum((y_true - y_pred) ** 2)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_residual / ss_total

#def r2_score(y_true, y_pred):
#    y_bar = np.mean(y_true)
#    return np.sum((y_pred - y_bar)**2) / np.sum((y_true - y_bar)**2)

def adjpr2_score(y_true, y_pred, p):
    """Pseudo-R2 scaled by ``(n - p) / n``, where ``n = y_true.shape[0]``.

    NOTE(review): this is a simple shrink factor, not the textbook adjusted R2
    ``1 - (1 - R2) * (n - 1) / (n - p - 1)`` -- confirm this is intended.
    """
    n_obs = y_true.shape[0]
    pseudo_r2 = np.corrcoef(y_true, y_pred)[0, 1] ** 2.0
    return pseudo_r2 * ((n_obs - p) / n_obs)

# registry of metric name -> function(y_true, y_pred); the CV loops iterate over it.
# adjpr2_score is not registered here because it takes a third argument (p).
perf_metrics = {"MSE": mse_score, 
                "RMSE": rmse_score, 
                "Pseudo-R2": pr2_score,
                "R2": r2_score,
                "MAD": mad_score
               }

### Dummy/Baseline Model - Average

In [None]:
# pre-instantiation
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values  # feature matrix, extracted once instead of once per fold
y_all = df[target].values                      # kept 2-D (n_samples, 1) because `target` is a list
metric_rows = []  # one dict per (fold, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # "fit" model: the baseline always predicts the mean of the training fold
    model_pred = y_train.mean()

    # get predictions
    y_train_pred = np.full(y_train.shape[0], model_pred)
    y_test_pred = np.full(y_test.shape[0], model_pred)

    # compute metrics
    # NOTE: the predictions are constant, so the correlation behind "Pseudo-R2"
    # is undefined and that metric comes out as NaN for this baseline.
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()
    for pf, metric_fn in perf_metrics.items():
        metric_rows.append({"Fold": f,
                            "Metric": pf,
                            "Train": metric_fn(y_train_flat, y_train_pred),
                            "Test": metric_fn(y_test_flat, y_test_pred)})

    # if using sklearn: from sklearn.dummy import DummyRegressor
    #ml = DummyRegressor().fit(x_train, y_train)
    #pred_train = ml.predict(x_train)
    #pred_test = ml.predict(x_test)

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Metric", "Train", "Test"])

In [None]:
# long-format results: one row per (fold, metric) with train and test scores
df_metrics

In [None]:
# average every numeric column across folds, one row per metric
df_metrics.pivot_table(index="Metric", aggfunc="mean")

### Simple Linear Regression

In [None]:
# pre-instantiation
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values
x_all = np.hstack([np.ones((x_all.shape[0], 1)), x_all])  # leading column of ones = intercept term
y_all = df[target].values                                 # kept 2-D (n_samples, 1)
metric_rows = []  # one dict per (fold, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # fit model
    # train model - coefs = (X_train'X_train)^-1 X_train'y_train
    # (pseudo-inverse, so a singular X'X still yields a solution)
    inv_component = np.linalg.pinv(x_train.T @ x_train)
    coefs = inv_component @ (x_train.T @ y_train)

    # get predictions
    y_train_pred = x_train @ coefs  # X_train*coefs
    y_test_pred = x_test @ coefs    # X_test*coefs

    # compute metrics
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()
    for pf, metric_fn in perf_metrics.items():
        metric_rows.append({"Fold": f,
                            "Metric": pf,
                            "Train": metric_fn(y_train_flat, y_train_pred.ravel()),
                            "Test": metric_fn(y_test_flat, y_test_pred.ravel())})

    # if using sklearn: from sklearn.linear_model import LinearRegression
    #ml = LinearRegression().fit(x_train, y_train)
    #pred_train = ml.predict(x_train)
    #pred_test = ml.predict(x_test)

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Metric", "Train", "Test"])

#### Stats

In [None]:
# mean of each numeric column per metric (pivot_table aggregates with the mean by default)
df_metrics.pivot_table(index="Metric")

### Linear Regression with Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# settings forwarded to PolynomialFeatures: full degree-2 expansion
# (squares and pairwise products) plus a constant bias column
poly_params = {"degree": 2,
               "interaction_only": False,
               "include_bias": True
              }

In [None]:
# shape of x_train as left behind by the last fold of the previous CV loop
x_train.shape

In [None]:
# demo of the expansion on the full feature table (all rows, not a CV fold);
# note that this rebinds x_train
all_features = df.drop(labels=target, axis=1)
poly_func = PolynomialFeatures(**poly_params)
poly_func = poly_func.fit(all_features)
x_train = poly_func.transform(all_features)

In [None]:
# shape after the polynomial expansion (rows, expanded feature count)
x_train.shape

In [None]:
# x_test is untouched by the expansion demo, so this is still the shape from the previous CV loop
x_test.shape

In [None]:
# pre-instantiation
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values  # feature matrix, extracted once instead of once per fold
y_all = df[target].values                      # kept 2-D (n_samples, 1)
metric_rows = []  # one dict per (fold, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # polynomial features - the bias column is added when poly_params["include_bias"] is True.
    # The transformer is fitted on the training fold only and then applied to both folds.
    poly_func = PolynomialFeatures(**poly_params).fit(x_train)
    x_train = poly_func.transform(x_train)
    x_test = poly_func.transform(x_test)

    # fit model
    # train model - coefs = (X'X)^-1 X'y via the pseudo-inverse
    inv_component = np.linalg.pinv(x_train.T @ x_train)
    coefs = inv_component @ (x_train.T @ y_train)

    # get predictions
    y_train_pred = x_train @ coefs
    y_test_pred = x_test @ coefs

    # compute metrics
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()
    for pf, metric_fn in perf_metrics.items():
        metric_rows.append({"Fold": f,
                            "Metric": pf,
                            "Train": metric_fn(y_train_flat, y_train_pred.ravel()),
                            "Test": metric_fn(y_test_flat, y_test_pred.ravel())})

    # if using sklearn: from sklearn.linear_model import LinearRegression
    #ml = LinearRegression().fit(x_train, y_train)
    #pred_train = ml.predict(x_train)
    #pred_test = ml.predict(x_test)

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Metric", "Train", "Test"])

In [None]:
# mean of each numeric column per metric (pivot_table aggregates with the mean by default)
df_metrics.pivot_table(index="Metric")

In [None]:
# NOTE(review): hard-coded figures, presumably copied by hand from earlier metric tables
# (a relative reduction of one score against another) -- confirm, and ideally compute from df_metrics
1-3.95/9.21

### Polynomial Regression with Backward Elimination

In [None]:
# pre-instantiation
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
poly_params = {"degree": 2,
               "interaction_only": False,
               "include_bias": True
              }
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values  # feature matrix, extracted once instead of once per fold
y_all = df[target].values                      # kept 2-D (n_samples, 1)
metric_rows = []  # one dict per (fold, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # polynomial features - bias column included because include_bias=True;
    # fitted on the training fold only, then applied to both folds
    poly_func = PolynomialFeatures(**poly_params).fit(x_train)
    x_train = poly_func.transform(x_train)
    x_test = poly_func.transform(x_test)

    # fit model: recursive feature elimination wrapped around a linear regression
    # (RFE defaults decide how many features are kept)
    model = RFE(LinearRegression()).fit(x_train, y_train)
    #model = RFECV(model, step=1, cv=5).fit(x_train, y_train)

    # get predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # compute metrics
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()
    for pf, metric_fn in perf_metrics.items():
        metric_rows.append({"Fold": f,
                            "Metric": pf,
                            "Train": metric_fn(y_train_flat, y_train_pred.ravel()),
                            "Test": metric_fn(y_test_flat, y_test_pred.ravel())})

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Metric", "Train", "Test"])

In [None]:
# shape of the last fold's training matrix restricted to the columns RFE kept (boolean mask support_)
x_train[:, model.support_].shape

In [None]:
# RFE rank of every expanded feature for the last fold's model (selected features have rank 1)
model.ranking_

In [None]:
# number of features RFE selected in the last fold's model
model.n_features_

In [None]:
# mean of each numeric column per metric (pivot_table aggregates with the mean by default)
df_metrics.pivot_table(index="Metric")

#### Different implementation

In [None]:
# pre-instantiation
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
poly_params = {"degree": 2,
               "interaction_only": False,
               "include_bias": True
              }
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values  # feature matrix, extracted once instead of once per fold
y_all = df[target].values                      # kept 2-D (n_samples, 1)
metric_rows = []  # one dict per (fold, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # fit model: the pipeline chains the polynomial expansion and the RFE-wrapped regression,
    # so x_train / x_test stay in the original feature space
    model = Pipeline([("Poly", PolynomialFeatures(**poly_params)),
                      ("LR-BE", RFE(LinearRegression()))])
    model.fit(x_train, y_train)

    # get predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # compute metrics
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()
    for pf, metric_fn in perf_metrics.items():
        metric_rows.append({"Fold": f,
                            "Metric": pf,
                            "Train": metric_fn(y_train_flat, y_train_pred.ravel()),
                            "Test": metric_fn(y_test_flat, y_test_pred.ravel())})

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Metric", "Train", "Test"])

In [None]:
# mean of each numeric column per metric (pivot_table aggregates with the mean by default)
df_metrics.pivot_table(index="Metric")

### Ridge Regression

In [None]:
from sklearn.preprocessing import StandardScaler
# pre-instantiation
# NOTE(review): the grid starts at a negative shrinkage, which is not a valid ridge penalty --
# confirm this is intentional (e.g. to show the test-error curve on both sides of zero).
ridge_shrinkage = np.linspace(-0.1, 0.4, num=200)
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values  # feature matrix, extracted once instead of once per fold
y_all = df[target].values                      # kept 2-D (n_samples, 1)
metric_rows = []  # one dict per (fold, shrinkage, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # scale variables (statistics come from the training fold only), then prepend the intercept column
    scaler_x = StandardScaler(with_mean=True, with_std=True).fit(x_train)
    x_train = np.hstack([np.ones((x_train.shape[0], 1)), scaler_x.transform(x_train)])
    x_test = np.hstack([np.ones((x_test.shape[0], 1)), scaler_x.transform(x_test)])

    # X'X and X'y do not depend on the shrinkage value, so compute them once per fold
    n_coefs = x_train.shape[1]
    gram = x_train.T @ x_train
    xty = x_train.T @ y_train
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()

    # fit model
    for lam in ridge_shrinkage:
        # train model - coefs = (X'X + I * \lambda * p)^-1 X'y
        # NOTE: the identity spans every column, so the intercept is shrunk as well
        inv_component = np.linalg.pinv(gram + np.eye(n_coefs) * lam * n_coefs)
        coefs = inv_component @ xty

        # get predictions
        y_train_pred = x_train @ coefs  # X * coefs
        y_test_pred = x_test @ coefs

        # compute metrics
        for pf, metric_fn in perf_metrics.items():
            metric_rows.append({"Fold": f,
                                "Shrinkage": lam,
                                "Metric": pf,
                                "Train": metric_fn(y_train_flat, y_train_pred.ravel()),
                                "Test": metric_fn(y_test_flat, y_test_pred.ravel())})

        # if using sklearn: from sklearn.linear_model import Ridge
        #ml = Ridge(alpha=lam).fit(x_train, y_train)
        #pred_train = ml.predict(x_train)
        #pred_test = ml.predict(x_test)

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Shrinkage", "Metric", "Train", "Test"])

#### Stats

In [None]:
# test score averaged across folds: one row per shrinkage value, one column per metric (first rows only)
df_metrics.pivot_table(index="Shrinkage", columns="Metric", values="Test").head()

In [None]:
# keep the fold-averaged test scores (shrinkage x metric) for the plots and lookups below
df_agg_metrics = df_metrics.pivot_table(index="Shrinkage", columns="Metric", values="Test")

In [None]:
# fold-averaged test RMSE as a function of the shrinkage value
df_agg_metrics[["RMSE"]].plot()

In [None]:
# all fold-averaged test metrics at the shrinkage value with the highest Pseudo-R2
df_agg_metrics.loc[df_agg_metrics["Pseudo-R2"].idxmax()]

### Ridge + Polynomial Features

In [None]:
# pre-instantiation
from sklearn.linear_model import Ridge
# imported here as well so the cell does not depend on earlier cells having been run
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
poly_params = {"degree": 2,
               "interaction_only": False,
               "include_bias": False  # no bias column from the expansion
              }
ridge_shrinkage = np.linspace(8.0, 14.0, num=200)
target = ["T1"]
x_all = df.drop(labels=target, axis=1).values  # feature matrix, extracted once instead of once per fold
y_all = df[target].values                      # kept 2-D (n_samples, 1)
metric_rows = []  # one dict per (fold, shrinkage, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all), start=1):
    print(f)
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]
    y_train_flat, y_test_flat = y_train.ravel(), y_test.ravel()

    # fit model
    for lam in ridge_shrinkage:
        # scaling, polynomial expansion and the ridge fit all live in the pipeline,
        # so every step is fitted on the training fold only
        model = Pipeline([("StdScaler", StandardScaler()),
                          ("Poly", PolynomialFeatures(**poly_params)),
                          ("Ridge", Ridge(alpha=lam))])
        model.fit(x_train, y_train)

        # get predictions
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        # compute metrics
        for pf, metric_fn in perf_metrics.items():
            metric_rows.append({"Fold": f,
                                "Shrinkage": lam,
                                "Metric": pf,
                                "Train": metric_fn(y_train_flat, y_train_pred.ravel()),
                                "Test": metric_fn(y_test_flat, y_test_pred.ravel())})

        # to try an L1 penalty instead: from sklearn.linear_model import Lasso
        # and swap the last pipeline step for ("Lasso", Lasso(alpha=lam))

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Shrinkage", "Metric", "Train", "Test"])

#### Stats

In [None]:
# fold-averaged test scores: one row per shrinkage value, one column per metric
df_agg_metrics = df_metrics.pivot_table(index="Shrinkage", columns="Metric", values="Test")

In [None]:
# fold-averaged test RMSE as a function of the shrinkage value
df_agg_metrics[["RMSE"]].plot()

In [None]:
# all fold-averaged test metrics at the shrinkage value with the lowest RMSE
df_agg_metrics.loc[df_agg_metrics["RMSE"].idxmin()]

In [None]:
# NOTE(review): hard-coded figures, presumably copied by hand from earlier metric tables
# (a ratio of one score to another) -- confirm, and ideally compute from df_agg_metrics
3.46/9.21

# Classification

## Organize data

### Load data

In [None]:
# download the German credit CSV and preview it; this rebinds `df`, replacing the Boston frame
credit_url = "https://raw.githubusercontent.com/askoshiyama/mli-cohort/master/german_credit.csv"
df = pd.read_csv(credit_url)
df.head()

### Pre-process variables

In [None]:
# variable types
# column data types as inferred by pandas when reading the CSV
df.dtypes

In [None]:
# categorical variables
# columns treated as categorical (one-hot encoded in a later cell)
cat_variables = ['account_check_status', 'credit_history', 'purpose', 'savings', 'present_emp_since', 'personal_status_sex',
                'property', 'other_installment_plans', 'housing', 'job', 'telephone', "other_debtors", 'foreign_worker']

# target variable
output_variable = ["default"]

# other integer variables
# NOTE: 'age' and 'credit_amount' are deliberately last -- a later cell drops them with int_variables[:-2]
int_variables = ['credits_this_bank', 'present_res_since', 'duration_in_month', 'people_under_maintenance', 
                 'installment_as_income_perc', 'age', 'credit_amount']
# list(set(df.columns) - set(output_variable) - set(cat_variables))

In [None]:
# pre-allocation
# one-hot encoding of categorical variables
from sklearn.preprocessing import OneHotEncoder

# pre-allocation: empty frame sharing df's index, filled one dummy column at a time
df_cat = pd.DataFrame(index=df.index)

# I will do a loop for pedagogical reasons, but it is not entirely necessary
for cat in cat_variables:
    # fit the encoder on this single column, then map it to a dense 0/1 matrix
    one_hot_func = OneHotEncoder().fit(df[[cat]])
    cat_mapped = one_hot_func.transform(df[[cat]]).toarray()

    # storing: one column per category level, named "<variable>_<level>"
    for cat_label, dummy_column in zip(one_hot_func.categories_[0], cat_mapped.T):
        df_cat[cat + "_" + cat_label] = dummy_column

# quick check: the mean of a 0/1 dummy is the share of rows in that category
df_cat.mean()

In [None]:
# bracketing integer variable - age
# bracketing integer variable - age: three quantile bins, one-hot encoded
from sklearn.preprocessing import KBinsDiscretizer
age_bin_labels = ["young", "adult", "senior"]
kbin_func = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='quantile').fit(df[["age"]])
age_bins = kbin_func.transform(df[["age"]]).toarray()
df_age = pd.DataFrame(age_bins, columns=age_bin_labels)

# checking: bin indicators side by side with the raw age
pd.concat([df_age, df[["age"]]], axis=1).head(10)

In [None]:
# other types of transformations possible - log transformation 
# other types of transformations possible - log transformation
from sklearn.preprocessing import FunctionTransformer
log_mapping = FunctionTransformer(func=np.log, inverse_func=np.exp)

# log of the credit amount, kept as a single-column 2-D array
credit_log = log_mapping.transform(df[["credit_amount"]].values)

# histograms before and after the transformation, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.hist(df["credit_amount"])
ax1.set_title("Pre-transformation")
ax2.hist(credit_log)
ax2.set_title("After log-transformation")

df_log = pd.DataFrame(credit_log, columns=["log(credit)"])

### Consolidating a final dataset

In [None]:
# 'age' and 'credit_amount' enter the model through df_age (bins) and df_log (log scale),
# so their raw columns are left out of the integer block
replaced_columns = ("age", "credit_amount")
raw_int_columns = [col for col in int_variables if col not in replaced_columns]
model_parts = [df[raw_int_columns], df_cat, df_age, df_log, df[output_variable]]
df_model = pd.concat(model_parts, axis=1)
df.shape, df_model.shape

In [None]:
# preview of the consolidated modelling table
df_model.head()

## Modelling

In [None]:
# summary statistics of the target column
df_model[output_variable].describe()

### Settings

In [None]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# 3-fold cv, stratified on the target; shuffled with a fixed seed so the folds are reproducible
k_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=10)

# performance metrics - 
# homework: https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
# homework: https://en.wikipedia.org/wiki/Confusion_matrix, 
# NOTE: this rebinds perf_metrics, replacing the regression metrics defined earlier.
# The loops below threshold predictions at 0.5 for Accuracy/Precision/Recall/F1-Score
# and pass the raw scores to AUC and Brier.
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
                "Brier": metrics.brier_score_loss
               }

### Logistic Regression - From "Scratch"

In [None]:
# some functions
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def logistic_prediction(X, theta):
    """Logistic-model output: the sigmoid of the linear score ``X . theta``."""
    return sigmoid(np.dot(X, theta))

def loss(h, y):
    """Mean binary cross-entropy between predicted probabilities and 0/1 labels.

    Parameters
    ----------
    h : array-like
        Predicted probabilities.
    y : array-like
        Observed labels, with the same number of elements as ``h``.

    Returns
    -------
    float
        ``-mean(y * log(h) + (1 - y) * log(1 - h))``.

    Both inputs are flattened first. Without that, an ``(n,)`` prediction
    vector against an ``(n, 1)`` label column broadcasts to an ``(n, n)``
    matrix and the mean is no longer the cross-entropy.
    """
    h = np.ravel(h)
    y = np.ravel(y)
    h = np.clip(h, 1e-15, 1 - 1e-15) # too close to zero or one -> log would be -inf
    return -(y * np.log(h) + (1 - y) * np.log(1 - h)).mean()

In [None]:
# quick test
# quick test: loss of a random coefficient vector on the full dataset
x_check = df_model.drop(labels=output_variable, axis=1).values
# flatten the (n, 1) target column so it lines up element-wise with the (n,) predictions
y_check = df_model[output_variable].values.ravel()
theta = np.random.normal(size=x_check.shape[1])
pred = logistic_prediction(x_check, theta)
loss(pred, y_check)

In [None]:
# main loop
# pre-instantiation
x_all = df_model.drop(labels=output_variable, axis=1).values
# 1-D target: an (n, 1) column would broadcast against the (n,) predictions inside loss()
y_all = df_model[output_variable].values.ravel()
n_vars = x_all.shape[1] # number of variables
step_size = 0.01 # finite difference step-size
lr = 0.01 # learning rate -- dictate the speed
max_iter = 30 # iteration cap: the training loop performs at most max_iter - 1 updates
rng = np.random.default_rng(10) # seeded generator so the random initialisations are reproducible
threshold_metrics = ("Accuracy", "Precision", "Recall", "F1-Score") # these need hard 0/1 predictions
metric_rows = [] # one dict per (fold, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all, y_all), start=1):
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # scale variables
    #scaler_x = StandardScaler().fit(x_train)
    #x_train = np.hstack([np.ones((x_train.shape[0], 1)), scaler_x.transform(x_train)])
    #x_test = np.hstack([np.ones((x_test.shape[0], 1)), scaler_x.transform(x_test)])

    # fit model
    # initialization
    theta = rng.normal(size=n_vars) # random coefs
    pred = logistic_prediction(x_train, theta)
    loss_iter = loss(pred, y_train)
    print("fold: %d, iter: %d, loss: %.4f" % (f, 0, loss_iter), end="\r")

    # training
    for it in range(1, max_iter):
        log_grad = np.zeros(shape=(n_vars,))
        # compute numerical gradient - forward finite differences, one coefficient at a time
        for c in range(n_vars):
            new_theta = np.copy(theta)
            new_theta[c] = new_theta[c] + step_size
            new_pred = logistic_prediction(x_train, new_theta)
            c_loss = loss(new_pred, y_train)
            log_grad[c] = (c_loss - loss_iter) / step_size # f'(x) ~ (f(x+e) - f(x)) / e

        # update model
        theta = theta - lr * log_grad

        # get current loss (computed once, reused for the stopping rule and the next iteration)
        pred = logistic_prediction(x_train, theta)
        new_loss = loss(pred, y_train)
        diff_loss = np.abs(loss_iter - new_loss)
        loss_iter = new_loss
        print("fold: %d, iter: %d, loss: %.4f, grad_norm: %.4f" % (f, it, loss_iter, np.linalg.norm(log_grad)), end="\r")
        if diff_loss < 1e-3: # stop once the loss has effectively stopped changing
            break

    # compute final predictions
    y_train_pred = logistic_prediction(x_train, theta)
    y_test_pred = logistic_prediction(x_test, theta)

    # store results
    for pf, metric_fn in perf_metrics.items():
        if pf in threshold_metrics: # put a threshold
            train_score = metric_fn(y_train, y_train_pred > 0.5)
            test_score = metric_fn(y_test, y_test_pred > 0.5)
        else:
            train_score = metric_fn(y_train, y_train_pred)
            test_score = metric_fn(y_test, y_test_pred)
        metric_rows.append({"Fold": f, "Metric": pf, "Train": train_score, "Test": test_score})

    # if using sklearn: from sklearn.linear_model import LogisticRegression
    #ml = LogisticRegression(penalty=None).fit(x_train, y_train)  # penalty="none" on older releases
    #y_train_pred = ml.predict_proba(x_train)[:, 1]
    #y_test_pred = ml.predict_proba(x_test)[:, 1]

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Metric", "Train", "Test"])

In [None]:
# train and test scores averaged across folds, one row per metric
df_metrics.pivot_table(index="Metric", values=["Train", "Test"])

### Regularized Logistic Regression (a.k.a. Ridge Logistic)

In [None]:
# pre-instantiation
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# values passed to LogisticRegression as C, the INVERSE regularisation strength:
# smaller values mean stronger shrinkage (the column is still labelled "Shrinkage")
shrink = np.linspace(0.001, 2.0, num=100)
x_all = df_model.drop(labels=output_variable, axis=1).values
y_all = df_model[output_variable].values.ravel()  # 1-D target, as the sklearn estimator and metrics expect
threshold_metrics = ("Accuracy", "Precision", "Recall", "F1-Score")  # these need hard 0/1 predictions
metric_rows = []  # one dict per (fold, C, metric); converted to a DataFrame once at the end

# main loop
for f, (train, test) in enumerate(k_folds.split(x_all, y_all), start=1):
    # separate variables and folds
    x_train, x_test = x_all[train], x_all[test]
    y_train, y_test = y_all[train], y_all[test]

    # scale variables (statistics come from the training fold only)
    scaler_x = StandardScaler().fit(x_train)
    x_train = scaler_x.transform(x_train)
    x_test = scaler_x.transform(x_test)

    # fit model
    for c_value in shrink:
        # train model
        model = LogisticRegression(penalty="l2", C=c_value, solver="lbfgs").fit(x_train, y_train)

        # get predictions as probabilities: AUC and Brier need scores, not the hard labels
        # that model.predict returns
        # assumes `default` is coded 0/1 so column 1 is P(default = 1) -- TODO confirm
        y_train_pred = model.predict_proba(x_train)[:, 1]
        y_test_pred = model.predict_proba(x_test)[:, 1]

        # compute metrics
        for pf, metric_fn in perf_metrics.items():
            if pf in threshold_metrics: # put a threshold
                train_score = metric_fn(y_train, y_train_pred > 0.5)
                test_score = metric_fn(y_test, y_test_pred > 0.5)
            else:
                train_score = metric_fn(y_train, y_train_pred)
                test_score = metric_fn(y_test, y_test_pred)
            metric_rows.append({"Fold": f,
                                "Shrinkage": c_value,
                                "Metric": pf,
                                "Train": train_score,
                                "Test": test_score})
    print("fold: %d" % (f))

# final organization: build the frame in one go (columns already have numeric dtypes,
# so no pd.to_numeric pass -- its errors="ignore" option is deprecated)
df_metrics = pd.DataFrame(metric_rows, columns=["Fold", "Shrinkage", "Metric", "Train", "Test"])

In [None]:
# test scores averaged across folds: rows are "Shrinkage" values, columns are metrics (first rows only)
df_metrics.pivot_table(index=["Shrinkage"], columns=["Metric"], values=["Test"]).head()

In [None]:
# fold-averaged test accuracy as a function of the "Shrinkage" value
df_metrics.pivot_table(index=["Shrinkage"], columns=["Metric"], values=["Test"])["Test"]["Accuracy"].plot()