# Nested CV using sklearn pipelines

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, GridSearchCV

In [6]:
# Raw case data. The fields are separated by ", " (comma + space), which is a
# multi-character separator and is therefore parsed with the python engine.
data = pd.read_csv('case1Data.txt', sep=", ", engine='python')

# Response column, kept both as a pandas Series and as a plain array.
y = data['y']
y_vec = y.values

# Feature matrices read from the "*_one_hot.csv" files, converted to numpy
# arrays: X pairs with y, X_new is the additional set without responses.
X, X_new = (
    pd.read_csv(path).to_numpy()
    for path in ('case1Data_one_hot.csv', 'case1Data_Xnew_one_hot.csv')
)

In [99]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize the leading columns of a matrix and pass the rest through.

    Only the first ``_N_CONTINUOUS`` columns are scaled; every later column is
    copied unchanged. The scaling statistics are estimated from the rows given
    to ``fit`` stacked on top of the rows of ``X_new``.

    Parameters
    ----------
    X_new : ndarray of shape (n_new, n_features) or None, default=None
        Extra rows whose leading columns are included when estimating the
        scaling statistics. When None, the statistics come from the rows
        passed to ``fit`` alone.
    """

    # Number of leading columns that are standardized.
    _N_CONTINUOUS = 95

    def __init__(self, X_new=None):
        self.X_new = X_new
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Estimate the scaling statistics from ``X`` (and ``X_new`` if given).

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        n_cont = self._N_CONTINUOUS
        fit_rows = X[:, :n_cont]
        # X_new defaults to None; without this guard a default-constructed
        # scaler would fail when slicing None.
        if self.X_new is not None:
            fit_rows = np.concatenate((fit_rows, self.X_new[:, :n_cont]), axis=0)
        self.scaler.fit(fit_rows)
        return self

    def transform(self, X, y=None):
        """Return a new array with the leading columns standardized.

        Columns from ``_N_CONTINUOUS`` onwards are appended unchanged.
        """
        n_cont = self._N_CONTINUOUS
        X_con = self.scaler.transform(X[:, :n_cont])
        return np.concatenate((X_con, X[:, n_cont:]), axis=1)
    
class CustomKNNImputer(BaseEstimator, TransformerMixin):
    """KNN imputation followed by snapping one-hot groups back to 0/1.

    The underlying ``KNNImputer`` is fitted on the rows given to ``fit``
    stacked with a normalized copy of ``X_new``. After imputation, each of the
    ``_N_GROUPS`` consecutive blocks of ``_GROUP_SIZE`` columns starting at
    column ``_CAT_START`` is rewritten so that only its largest entry is 1
    and the others are 0.

    Parameters
    ----------
    n_neighbors : int, default=1
        Forwarded to ``KNNImputer``.
    weights : str, default='distance'
        Forwarded to ``KNNImputer``.
    X_new : ndarray of shape (n_new, n_features) or None, default=None
        Extra rows added to the imputer's reference set. When None, the
        imputer is fitted on the rows passed to ``fit`` alone.

    NOTE(review): ``fit`` builds its own ``CustomScaler`` from the ``X`` it
    receives. When this step follows a ``CustomScaler`` step in a pipeline,
    that ``X`` is already standardized while ``X_new`` is still raw, so the
    internal scaler is fitted on a mix of scaled and unscaled rows. Confirm
    whether that is intended.
    """

    # Layout of the one-hot columns that are re-binarized after imputation.
    _CAT_START = 95
    _N_GROUPS = 4
    _GROUP_SIZE = 5

    def __init__(self, n_neighbors=1, weights='distance', X_new=None):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.X_new = X_new

    def fit(self, X, y=None):
        """Fit the KNN imputer on ``X`` plus the normalized ``X_new`` rows.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        self.imputer = KNNImputer(n_neighbors=self.n_neighbors, weights=self.weights)
        # X_new defaults to None; without this guard a default-constructed
        # imputer would fail when the scaler slices None.
        if self.X_new is None:
            self.scaler = None
            self.imputer.fit(X)
            return self

        self.scaler = CustomScaler(X_new=self.X_new).fit(X)
        X_new_norm = self.scaler.transform(self.X_new)
        self.imputer.fit(np.concatenate((X, X_new_norm), axis=0))
        return self

    def transform(self, X, y=None):
        """Impute missing values, then force each one-hot group to a single 1."""
        X = self.imputer.transform(X)
        n_rows = X.shape[0]
        if n_rows == 0:
            return X

        start = self._CAT_START
        stop = start + self._N_GROUPS * self._GROUP_SIZE
        # View the categorical columns as (row, group, position-in-group).
        groups = X[:, start:stop].reshape(n_rows, self._N_GROUPS, self._GROUP_SIZE)
        # argmax keeps the first maximum on ties, like the row-by-row version.
        winners = groups.argmax(axis=2)
        one_hot = np.zeros(groups.shape, dtype=X.dtype)
        row_idx = np.arange(n_rows)[:, None]
        group_idx = np.arange(self._N_GROUPS)[None, :]
        one_hot[row_idx, group_idx, winners] = 1
        X[:, start:stop] = one_hot.reshape(n_rows, stop - start)
        return X
    
class Debugger(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints what flows through it.

    ``fit`` learns nothing; ``transform`` prints the shape and the first rows
    of the data it receives and returns that data unchanged.
    """

    def fit(self, data, y=None, **fit_params):
        # Nothing to learn: this step only inspects the data.
        return self

    def transform(self, data):
        print("Shape of data at this point", data.shape)
        preview = pd.DataFrame(data).head()
        print(preview)
        return data

In [108]:
from sklearn.linear_model import Lasso

# Define the pipeline
# NOTE(review): the 'impute' step fits its own CustomScaler on the data it
# receives, which the 'scale' step has already standardized, stacked with the
# raw X_new. The statistics used to normalize X_new inside the imputer
# therefore come from a mix of scaled and unscaled rows. Confirm whether that
# is intended.
scaler = CustomScaler(X_new=X_new)
imputer = CustomKNNImputer(X_new = X_new)
model = Lasso(max_iter=10000)
pipe = Pipeline(steps=[('scale', scaler), ('impute', imputer), ('model', model)])

# Hyper-parameters searched in the inner loop.
param_grid = {'impute__weights': ['uniform', 'distance'],
              'impute__n_neighbors': range(3, 10),
              'model__alpha': np.linspace(0.1, 3, 10)
              }

# Define the cross-validation
K_outer = 5
K_inner = 3
outer_cv = KFold(n_splits=K_outer, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=K_inner, shuffle=True, random_state=42)

# Nested CV: the inner grid search selects hyper-parameters on the outer
# training split only; the outer test split is used once, to score the result.
RMSE = np.zeros(K_outer)
for i, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
    print(f"Running outer fold {i+1}/{K_outer}")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y_vec[train_idx], y_vec[test_idx]
    search = GridSearchCV(pipe, param_grid, cv=inner_cv, scoring='neg_mean_squared_error', n_jobs=-1, refit=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_

    # refit=True has already refitted the winning pipeline on all of X_train,
    # so it is used directly instead of fitting `pipe` a second time.
    best_model = search.best_estimator_
    preds = best_model.predict(X_test)

    RMSE[i] = np.sqrt(np.mean((preds - y_test)**2))
    # best_score_ is the negated mean squared error (see `scoring` above).
    print("Best parameter (CV score=%0.3f):" % search.best_score_, search.best_params_, 'RMSE:', f"{RMSE[i]:.3f}")

print('Mean RMSE:', RMSE.mean())


Running outer fold 1/5
scaler fit
scaler fit
scaler fit
scaler fit
Best parameter (CV score=-588.466): {'impute__n_neighbors': 3, 'impute__weights': 'distance', 'model__alpha': 1.3888888888888888} RMSE: 17.307
Running outer fold 2/5
scaler fit
scaler fit
scaler fit
scaler fit
Best parameter (CV score=-410.916): {'impute__n_neighbors': 3, 'impute__weights': 'distance', 'model__alpha': 1.3888888888888888} RMSE: 22.883
Running outer fold 3/5
scaler fit
scaler fit
scaler fit
scaler fit
Best parameter (CV score=-541.304): {'impute__n_neighbors': 4, 'impute__weights': 'distance', 'model__alpha': 1.0666666666666667} RMSE: 17.711
Running outer fold 4/5
scaler fit
scaler fit
scaler fit
scaler fit
Best parameter (CV score=-527.397): {'impute__n_neighbors': 4, 'impute__weights': 'distance', 'model__alpha': 0.7444444444444444} RMSE: 24.460
Running outer fold 5/5
scaler fit
scaler fit
scaler fit
scaler fit
Best parameter (CV score=-424.396): {'impute__n_neighbors': 4, 'impute__weights': 'distance',

## Without using pipeline

In [40]:
def transform_to_categorical_bool(inp_array, start=95, n_groups=4, group_size=5):
    """Snap blocks of columns back to one-hot form, in place.

    Starting at column ``start``, the next ``n_groups * group_size`` columns
    are treated as ``n_groups`` consecutive groups of ``group_size`` columns.
    In every row, each group is overwritten so that the position of its
    largest value holds 1 and all other positions hold 0. On ties the first
    maximum wins.

    Parameters
    ----------
    inp_array : ndarray of shape (n_rows, n_columns)
        Array to modify. It is changed in place.
    start : int, default=95
        Index of the first column of the first group.
    n_groups : int, default=4
        Number of consecutive groups.
    group_size : int, default=5
        Number of columns per group.

    Returns
    -------
    ndarray
        The same ``inp_array`` object, after modification.
    """
    n_rows = inp_array.shape[0]
    if n_rows == 0:
        return inp_array

    stop = start + n_groups * group_size
    # View the affected columns as (row, group, position-in-group).
    groups = inp_array[:, start:stop].reshape(n_rows, n_groups, group_size)
    winners = groups.argmax(axis=2)

    one_hot = np.zeros(groups.shape, dtype=inp_array.dtype)
    row_idx = np.arange(n_rows)[:, None]
    group_idx = np.arange(n_groups)[None, :]
    one_hot[row_idx, group_idx, winners] = 1

    inp_array[:, start:stop] = one_hot.reshape(n_rows, stop - start)
    return inp_array

def custom_scale(scaler, data_X, data_Xnew):
    """
        Fit ``scaler`` on the first 95 columns of X and Xnew stacked together
        and scale those columns in both; the remaining columns are untouched.

        Returns the fitted scaler, the scaled X, and the scaled X stacked on
        top of the scaled Xnew.
    """
    def _with_tail(scaled_head, original):
        # Re-attach the unscaled trailing columns to the scaled leading ones.
        return np.concatenate((scaled_head, original[:, 95:]), axis=1)

    head_X = data_X[:, :95]
    head_Xnew = data_Xnew[:, :95]

    scaler.fit(np.concatenate((head_X, head_Xnew), axis=0))
    scaled_X = scaler.transform(head_X)
    scaled_Xnew = scaler.transform(head_Xnew)

    data_X_norm = _with_tail(scaled_X, data_X)
    stacked = np.concatenate((data_X_norm, _with_tail(scaled_Xnew, data_Xnew)), axis=0)
    return scaler, data_X_norm, stacked

def custom_transform(scaler, data):
    """Scale the first 95 columns of ``data`` with an already-fitted scaler.

    The remaining columns are appended unchanged; a new array is returned.
    """
    scaled_head = scaler.transform(data[:, :95])
    untouched_tail = data[:, 95:]
    return np.concatenate((scaled_head, untouched_tail), axis=1)


In [70]:
from sklearn.linear_model import Lasso

# Manual nested cross-validation: the inner loop scores every combination of
# (n_neighbors, weights, alpha) on each inner fold; the outer loop refits the
# best combination on the full outer training split and records its RMSE on
# the outer test split.
K_outer = 15
K_inner = 5
cv_outer = KFold(n_splits=K_outer, shuffle=True, random_state=42)
cv_inner = KFold(n_splits=K_inner, shuffle=True, random_state=42)
# Hyper-parameter candidates.
alphas = np.linspace(0.1, 3, 10)
neighbors = range(2, 20, 2)
methods = ['uniform', 'distance']
# One RMSE per outer fold.
RMSE = np.zeros(K_outer)

for k1, (train_index, test_index) in enumerate(cv_outer.split(X)):
    print(k1)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y_vec[train_index], y_vec[test_index]

    # Summed squared error indexed by [inner fold, neighbors, method, alpha].
    errors_inner = np.zeros((K_inner, len(neighbors), len(methods), len(alphas)))
    for k2, (train_index_inner, test_index_inner) in enumerate(cv_inner.split(Xtrain)):
        Xtrain_inner, Xtest_inner = Xtrain[train_index_inner], Xtrain[test_index_inner]
        ytrain_inner, ytest_inner = ytrain[train_index_inner], ytrain[test_index_inner]
        # Scaler is fitted on the inner training rows together with X_new;
        # all_inner_data holds the scaled inner training rows stacked with the scaled X_new.
        scaler, Xtrain_inner_norm, all_inner_data = custom_scale(StandardScaler(), Xtrain_inner, X_new)
        Xtest_inner_norm = custom_transform(scaler, Xtest_inner)
        
        for i, neigh in enumerate(neighbors):
            for m, method in enumerate(methods):
                # The imputer depends only on (neigh, method), so it is fitted
                # once here and reused for every alpha below.
                imputer = KNNImputer(n_neighbors=neigh, weights=method).fit(all_inner_data)
                # Impute missing values, then snap the one-hot groups back to 0/1.
                Xtrain_inner_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_inner_norm))
                Xtest_inner_imputed = transform_to_categorical_bool(imputer.transform(Xtest_inner_norm))

                for j, alpha in enumerate(alphas):
                    model = Lasso(alpha=alpha, max_iter=10000).fit(Xtrain_inner_imputed, ytrain_inner)
                    preds_inner = model.predict(Xtest_inner_imputed)
                    errors_inner[k2, i, m, j] = np.sum((preds_inner - ytest_inner)**2) # Sum of squared errors on this inner test fold

    # Average over the inner folds and pick the combination with the smallest error.
    mean_error_inner = errors_inner.mean(axis = 0)
    # idx = (neighbors index, method index, alpha index) of the minimum.
    idx = np.unravel_index(np.argmin(mean_error_inner, axis=None), mean_error_inner.shape)
    print('Optimal neighbors: ', neighbors[idx[0]], ', optimal method:', methods[idx[1]], ', optimal alpha: ', alphas[idx[2]])

    # Refit the whole chain (scaler, imputer, Lasso) on the full outer training
    # split with the selected hyper-parameters, then score the outer test split.
    scaler, Xtrain_norm, all_data = custom_scale(StandardScaler(), Xtrain, X_new)
    Xtest_norm = custom_transform(scaler, Xtest)
    imputer = KNNImputer(n_neighbors=neighbors[idx[0]], weights=methods[idx[1]]).fit(all_data)
    Xtrain_imputed = transform_to_categorical_bool(imputer.transform(Xtrain_norm))
    Xtest_imputed = transform_to_categorical_bool(imputer.transform(Xtest_norm))
    best_inner_model = Lasso(alpha=alphas[idx[2]], max_iter=10000).fit(Xtrain_imputed, ytrain)
    preds_outer = best_inner_model.predict(Xtest_imputed)

    RMSE[k1] = np.sqrt(np.mean((preds_outer - ytest)**2))
    print('RMSE:', RMSE[k1])

# Average RMSE over the outer folds.
print('Mean RMSE:', RMSE.mean())

0
Optimal neighbors:  16 , optimal method: distance , optimal alpha:  1.0666666666666667
RMSE: 20.798210960978814
1
Optimal neighbors:  8 , optimal method: distance , optimal alpha:  0.42222222222222217
RMSE: 20.264205977390173
2
Optimal neighbors:  8 , optimal method: uniform , optimal alpha:  1.3888888888888888
RMSE: 16.7453649349476
3
Optimal neighbors:  8 , optimal method: distance , optimal alpha:  1.3888888888888888
RMSE: 22.987312124738324
4
Optimal neighbors:  8 , optimal method: uniform , optimal alpha:  1.3888888888888888
RMSE: 20.49045269384252
5
Optimal neighbors:  16 , optimal method: distance , optimal alpha:  1.3888888888888888
RMSE: 19.483760629671032
6
Optimal neighbors:  8 , optimal method: uniform , optimal alpha:  1.0666666666666667
RMSE: 18.2331960250399
7
Optimal neighbors:  8 , optimal method: uniform , optimal alpha:  0.7444444444444444
RMSE: 18.309026233627
8
Optimal neighbors:  4 , optimal method: distance , optimal alpha:  1.3888888888888888
RMSE: 14.27161002