## Cross validation of different models

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold

In [5]:
#load data from txt file with pandas
data = pd.read_csv('case1Data.txt', sep=", ", engine='python')
y = data['y']

X = pd.read_csv('case1Data_one_hot.csv').to_numpy()
X_new = pd.read_csv('case1Data_Xnew_one_hot.csv').to_numpy()
y_vec = y.values
print('y_vec shape:', y_vec.shape)

def transform_to_categorical_bool(inp_array):
    N = inp_array.shape[0]
    for i in range(N):
        for j in range(4):
            idx = np.argmax(inp_array[i, 95+(j*5) : 95+(j+1)*5 ])
            inp_array[i, 95+(j*5) : 95+(j+1)*5 ] = 0,0,0,0,0
            inp_array[i, 95+(j*5) + idx] = 1
    return inp_array

def custom_scale(scaler, data_X, data_Xnew):
    """ 
        Scale both X and Xnew based only on their continous values.
    """
    scaler.fit(np.concatenate((data_X[:,:95], data_Xnew[:,:95]), axis = 0))
    X_con, Xnew_con = scaler.transform(data_X[:,:95]), scaler.transform(data_Xnew[:,:95])
    data_X_norm = np.concatenate((X_con, data_X[:,95:]), axis = 1)
    data_Xnew_norm = np.concatenate((Xnew_con, data_Xnew[:,95:]), axis = 1)
    all_data = np.concatenate((data_X_norm, data_Xnew_norm), axis = 0)

    return scaler, data_X_norm, all_data

def custom_transform(scaler, data):
    return np.concatenate((scaler.transform(data[:,:95]), data[:,95:]), axis = 1)


y_vec shape: (100,)


## Lets try the setup for CV of imputation and model at the same time 

### Ridge regression

In [10]:
from sklearn.linear_model import Ridge 
K = 10
kf = KFold(n_splits=K)
alphas = np.linspace(5, 10, 20)
neighbors = range(5, 20, 1)
methods = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors), len(methods), len(alphas)))

for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i, neigh in enumerate(neighbors):
        for m, method in enumerate(methods):
            imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for j, alpha in enumerate(alphas):
                model = Ridge(alpha=alpha).fit(Xtrain_imputed, ytrain)
                preds = model.predict(Xtest_imputed)
                errors[k, i, m, j] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors: ', neighbors[idx[0]])
print('Optimal method:', methods[idx[1]])
print('optimal alpha: ', alphas[idx[2]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
1
2
3
4
5
6
7
8
9
Optimal neighbors:  8
Optimal method: distance
optimal alpha:  7.631578947368421
Optimal RMSE 7.306692197378842


## KNN regressor:

In [19]:
from sklearn.neighbors import KNeighborsRegressor
K = 10
kf = KFold(n_splits=K)
neighbors1 = range(1, 20, 1)
methods1 = ['uniform', 'distance']

neighbors2 = range(1, 20, 1)
methods2 = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors1), len(methods1), len(neighbors2), len(methods2)))

for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i1, neigh1 in enumerate(neighbors1):
        for m1, method1 in enumerate(methods1):
            imputer = KNNImputer(n_neighbors=neigh1, weights=method1).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for i2, neigh2 in enumerate(neighbors2):
                for m2, method2 in enumerate(methods2):
                    knn_reg = KNeighborsRegressor(n_neighbors=neigh2, weights=method2).fit(Xtrain_imputed, ytrain)
                    preds = knn_reg.predict(Xtest_imputed)
                    errors[k, i1, m1, i2, m2] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors in imputation: ', neighbors1[idx[0]])
print('Optimal method in imputation:', methods1[idx[1]])
print('Optimal neighbors in prediction: ', neighbors2[idx[2]])
print('Optimal method  in prediction:', methods2[idx[3]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
1
2
3
4
5
6
7
8
9
Optimal neighbors in imputation:  2
Optimal method in imputation: uniform
Optimal neighbors in prediction:  4
Optimal method  in prediction: uniform
Optimal RMSE 11.488682352926261


## Lasso regression:

In [18]:
from sklearn.linear_model import Lasso

K = 10
kf = KFold(n_splits=K)
alphas = np.linspace(0.1, 5, 20)
neighbors = range(5, 20, 1)
methods = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors), len(methods), len(alphas)))

for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i, neigh in enumerate(neighbors):
        for m, method in enumerate(methods):
            imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for j, alpha in enumerate(alphas):
                model = Lasso(alpha=alpha, max_iter=10000).fit(Xtrain_imputed, ytrain)
                preds = model.predict(Xtest_imputed)
                errors[k, i, m, j] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors: ', neighbors[idx[0]])
print('Optimal method:', methods[idx[1]])
print('optimal alpha: ', alphas[idx[2]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
1
2
3
4
5
6
7
8
9
Optimal neighbors:  9
Optimal method: distance
optimal alpha:  1.1315789473684212
Optimal RMSE 6.378234813416324


### Elastic net regression (L1 + L2 combined regularization)

In [27]:
from sklearn.linear_model import ElasticNet # Slow to run this CV loop!

K = 10
kf = KFold(n_splits=K)
alphas = np.linspace(0.1, 5, 10)
l1_ratios = np.linspace(0.1, 1, 10)
neighbors = range(5, 21, 2)
methods = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors), len(methods), len(alphas), len(l1_ratios)))
c = 0
for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i, neigh in enumerate(neighbors):
        for m, method in enumerate(methods):
            imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for j, alpha in enumerate(alphas):
                for l, l1 in enumerate(l1_ratios):
                    c+=1
                    if c % 500 == 0:
                        print(c)
                    model = ElasticNet(alpha=alpha, l1_ratio = l1, max_iter=10000, tol = 1e-3, warm_start=True).fit(Xtrain_imputed, ytrain)
                    preds = model.predict(Xtest_imputed)
                    errors[k, i, m, j, l] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors: ', neighbors[idx[0]])
print('Optimal method:', methods[idx[1]])
print('optimal alpha: ', alphas[idx[2]])
print('optimal l1 ratio: ', l1_ratios[idx[3]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
2
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
3
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
4
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
5
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
6
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
7
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
8
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
9
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
Optimal neighbors:  9
Optimal method: distance
optimal alpha:  1.188888888888889
optimal l1 ratio:  1.0
Optimal RMSE 6.363758869

In [32]:
from sklearn.linear_model import Lars

K = 10
kf = KFold(n_splits=K)
nonzero_coefs = range(5, 121, 5)
neighbors = range(5, 20, 1)
methods = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors), len(methods), len(nonzero_coefs)))
c = 0
for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i, neigh in enumerate(neighbors):
        for m, method in enumerate(methods):
            imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for j, non_zero in enumerate(nonzero_coefs):
                    model = Lars(n_nonzero_coefs=non_zero, eps = 1e-5).fit(Xtrain_imputed, ytrain)
                    preds = model.predict(Xtest_imputed)
                    errors[k, i, m, j] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors: ', neighbors[idx[0]])
print('Optimal method:', methods[idx[1]])
print('optimal non_zero coefs: ', nonzero_coefs[idx[2]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
1
2
3
4
5
6
7
8
9
Optimal neighbors:  9
Optimal method: uniform
optimal non_zero coefs:  20
Optimal RMSE 7.416760065567319


In [51]:
from sklearn.linear_model import BayesianRidge

K = 10
kf = KFold(n_splits=K)
alphas1 = np.linspace(1e-6, 1e-4, 5)
alphas2 = np.linspace(1e-6, 1e-4, 5)
neighbors = range(1, 10, 1)
methods = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors), len(methods), len(alphas1), len(alphas2)))
c = 0
for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i, neigh in enumerate(neighbors):
        for m, method in enumerate(methods):
            imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for a1, alpha1 in enumerate(alphas1):
                for a2, alpha2 in enumerate(alphas2):
                    model = BayesianRidge(alpha_1=alpha1, alpha_2=alpha2).fit(Xtrain_imputed, ytrain)
                    preds = model.predict(Xtest_imputed)
                    errors[k, i, m, a1, a2] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors: ', neighbors[idx[0]])
print('Optimal method:', methods[idx[1]])
print('optimal alpha1: ', alphas1[idx[2]])
print('optimal alpha2: ', alphas2[idx[3]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
1
2
3
4
5
6
7
8
9
Optimal neighbors:  4
Optimal method: uniform
optimal alpha1:  1e-06
optimal alpha2:  0.0001
Optimal RMSE 7.34616877255294


In [53]:
from sklearn.ensemble import RandomForestRegressor

K = 10
kf = KFold(n_splits=K)
estimators = range(10, 100, 10)
neighbors = range(1, 10, 1)
methods = ['uniform', 'distance']
errors = np.zeros((K, len(neighbors), len(methods), len(estimators)))
c = 0
for k, (train_index, test_index) in enumerate(kf.split(X)):
    print(k)
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), X[train_index], X_new) #Scale based on train data (leaving k'th fold out) and new data
    Xtest_norm = custom_transform(scaler, X[test_index])
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    for i, neigh in enumerate(neighbors):
        for m, method in enumerate(methods):
            imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_data)
            #Impute data
            Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
            Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))

            for e, estimator in enumerate(estimators):
                    model = RandomForestRegressor(n_estimators=estimator).fit(Xtrain_imputed, ytrain)
                    preds = model.predict(Xtest_imputed)
                    errors[k, i, m, e] = np.sum((preds - ytest)**2) #Sum up all squared error

mean_error = errors.mean(axis = 0)
idx = np.unravel_index(np.argmin(mean_error, axis=None), mean_error.shape)

print('Optimal neighbors: ', neighbors[idx[0]])
print('Optimal method:', methods[idx[1]])
print('optimal number of estimators: ', estimators[idx[2]])
print('Optimal RMSE', np.sqrt(mean_error[idx] / len(X)))

0
1
2
3
4
5
6
7
8
9
Optimal neighbors:  9
Optimal method: distance
optimal number of estimators:  10
Optimal RMSE 10.739905716764365
