In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import rbf_kernel

In [2]:
def get_data(path):
    '''
    --------------------------
    Read a CSV file into memory
    Input: path: Location passed unchanged to pandas.read_csv
    Output: Data-frame with the parsed contents
    -------------------------
    '''
    return pd.read_csv(path)

In [3]:
def train_test_split(df, train_percent=0.666666):
    '''
    --------------------------
    Shuffle the rows with a fixed seed and cut them into two parts
    Input: df: Data-frame to split
           train_percent: Fraction of rows placed in the first part
    Output: Tuple (training data-frame, test data-frame)
    -------------------------
    '''
    # Fixed seed on the global NumPy generator so the shuffle is repeatable
    np.random.seed(4109)
    shuffled = df.sample(frac=1).reset_index(drop=True)

    # Leading rows form the training part, everything after is the test part
    cut = round(train_percent * len(shuffled))
    return shuffled[:cut], shuffled[cut:]

In [4]:
def get_grid_search_vals(start_g, stop_g, start_s, stop_s):
    '''
    --------------------------
    Build the candidate values for the grid search as powers of 0.5
    Input: start_g, stop_g: First and last exponent for gamma (step 1)
           start_s, stop_s: First and last exponent for sigma (step 0.5)
    Output: Tuple (gamma, sigma) of arrays holding 0.5**exponent
    -------------------------
    '''
    gamma_step, sigma_step = 1, 0.5

    # Adding exactly one step to the stop value makes the last exponent
    # inclusive without producing an extra value beyond it
    gamma_powers = np.arange(start_g, stop_g + gamma_step, gamma_step)
    sigma_powers = np.arange(start_s, stop_s + sigma_step, sigma_step)

    gamma = np.power(0.5, gamma_powers)
    sigma = np.power(0.5, sigma_powers)

    return(gamma, sigma)

In [98]:
def get_pairwise_dist(A, B):
    '''
    --------------------------
    Squared Euclidean distance between every row of A and every row of B
    Uses ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    Input: A: Array of shape (m, d)
           B: Array of shape (n, d)
    Output: Array of shape (m, n), entry [i, j] is ||A[i] - B[j]||^2

    References for einsum:
    https://ajcr.net/Basic-guide-to-einsum/
    https://stackoverflow.com/questions/45896939/using-python-numpy-einsum-to-obtain-dot-product-between-2-matrices
    -------------------------
    '''
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    # Row-wise squared norms, shaped as a column (A) and a row (B) so that
    # their sum broadcasts to the full (m, n) grid
    sq_norm_A = np.einsum('ij,ij->i', A, A)[:, np.newaxis]
    sq_norm_B = np.einsum('ij,ij->i', B, B)[np.newaxis, :]

    dist = sq_norm_A + sq_norm_B - 2*np.dot(A, B.T)

    # Floating point cancellation can leave tiny negative values
    return(np.maximum(dist, 0))

In [None]:
def test_pairwise_dist(dist, A, B):
    '''
    --------------------------
    Check a squared-distance matrix against scikit-learn
    Input: dist: Matrix to verify
           A, B: Arrays the matrix was computed from
    Output: True when dist matches euclidean_distances(A, B)**2
            up to np.allclose tolerance, False otherwise
    -------------------------
    '''
    dist_sk = euclidean_distances(A, B)**2
    return(np.allclose(dist, dist_sk))

In [102]:
def get_gaussian_kernel(X, X_test, sigma):
    '''
    --------------------------
    Gaussian kernel between the rows of X and the rows of X_test:
    exp(-d / (2 * sigma^2)), where d is the value returned by
    get_pairwise_dist(X, X_test) (expected to be squared distances)
    Input: X: First set of points
           X_test: Second set of points
           sigma: Kernel width
    Output: Kernel values, same shape as the get_pairwise_dist result
    -------------------------
    '''
    # get_pairwise_dist returns a single array and takes the two point sets
    dist = get_pairwise_dist(X, X_test)
    scale = -0.5/np.power(sigma, 2)
    kernel = np.exp(dist*scale)
    return(kernel)

In [None]:
def test_gaussian_kernel(X, X_test, kernel, sigma=1.0):
    '''
    --------------------------
    Check a Gaussian kernel matrix against scikit-learn
    Input: X, X_test: Point sets the kernel was computed from
           kernel: Matrix to verify
           sigma: Kernel width used to build kernel (default 1.0)
    Output: True when kernel matches rbf_kernel up to np.allclose
            tolerance, False otherwise
    -------------------------
    '''
    # rbf_kernel evaluates exp(-gamma * ||x - y||^2), so the width sigma
    # corresponds to the positive value gamma = 1 / (2 * sigma^2)
    kernel_sk = rbf_kernel(X, X_test, gamma=0.5/np.power(sigma, 2))
    return(np.allclose(kernel, kernel_sk))

In [103]:
def get_kernel_ridge_coeff(K, gamma, Y):
    '''
    --------------------------
    Solve the regularised linear system (K + gamma * I) alpha = Y
    Input: K: Square kernel matrix
           gamma: Weight of the identity term
           Y: Right-hand side (targets)
    Output: alpha, the solution of the system
    -------------------------
    '''
    # Size of the identity is taken from the longest axis of Y
    n_samples = max(Y.shape)
    regularised = K + gamma * np.eye(n_samples)
    return np.linalg.solve(regularised, Y)

In [None]:
def get_kernel_ridge_prediction(alpha, X, X_test, df, sigma):
    '''
    --------------------------
    Predict with fitted kernel ridge coefficients:
    y_pred[j] = sum_i alpha[i] * K_test[i, j]
    Input: alpha: Coefficients, one per row of X
           X: Points the coefficients were fitted on
           X_test: Points to predict for
           df: Not used by this function
           sigma: Kernel width passed to get_gaussian_kernel
    Output: Predictions, one per column of the kernel matrix
    -------------------------
    '''
    # Assumes get_gaussian_kernel returns one row per row of X and one
    # column per row of X_test -- TODO confirm
    K_test = get_gaussian_kernel(X, X_test, sigma)

    # Contract over the training axis so the result is indexed by test point
    y_pred = np.dot(K_test.T, alpha)
    return(y_pred)

In [None]:
def get_k_folds(df_train, k = 5):
    '''
    --------------------------
    Shuffle the rows with a fixed seed and partition them into k folds
    Input: df_train: Data-frame to partition
           k: Number of folds (positive integer)
    Output: List of exactly k data-frames that together contain every
            row once; fold sizes differ by at most one row
    Raises: ValueError if k is smaller than 1
    -------------------------
    '''
    if k < 1:
        raise ValueError('k must be a positive integer')

    # Set the seed to make sure reproducible
    np.random.seed(43890)
    df_train = df_train.sample(frac=1).reset_index(drop=True)

    # array_split always yields k chunks and spreads any remainder over the
    # first chunks, so no row is dropped when len(df_train) % k != 0
    fold_rows = np.array_split(np.arange(len(df_train)), k)
    df_folds = [df_train.iloc[rows] for rows in fold_rows]
    return(df_folds)

In [None]:
def get_mse(y, y_hat):
    '''
    --------------------------
    Mean squared error between targets and predictions
    Input: y: True values
           y_hat: Predicted values
    Output: Mean of (y - y_hat)^2 as a float
    Both inputs are flattened first, so a column vector and a flat
    vector of the same length are compared element by element
    -------------------------
    '''
    y = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    return(np.mean(np.power(y - y_hat, 2)))

In [None]:
def plot_mse(gamma, sigma):
    '''
    --------------------------
    Placeholder: no plot is produced yet
    Input: gamma, sigma: Currently ignored
    Output: None
    -------------------------
    '''
    return None

In [None]:
def get_best_params(history):
    '''
    --------------------------
    Pick the entry with the smallest validation error
    Input: history: Table with columns/fields 'val_mse', 'alpha' and
                    'sigma' (structured array, data-frame or dict of
                    equal-length sequences)
    Output: Tuple (best_result, best_mse, best_params) where
            best_result: Position of the smallest 'val_mse'
            best_mse: That smallest 'val_mse' value
            best_params: Tuple (alpha, sigma) stored at that position
    -------------------------
    '''
    val_mse = np.asarray(history['val_mse'])
    best_result = val_mse.argmin()

    # Select the column first and then the position: this works the same
    # way for structured arrays, data-frames and dicts of sequences
    best_alpha = np.asarray(history['alpha'])[best_result]
    best_sigma = np.asarray(history['sigma'])[best_result]
    best_params = (best_alpha, best_sigma)
    best_mse = val_mse[best_result]

    return(best_result, best_mse, best_params)

In [None]:
def train_kernel_ridge(X_train, X_val, Y_train, Y_val, sigma, gamma):
    '''
    --------------------------
    Fit kernel ridge regression on the training data and score it on
    the training and validation data
    Input: X_train, Y_train: Training inputs and targets
           X_val, Y_val: Validation inputs and targets
           sigma: Kernel width
           gamma: Regularisation weight
    Output: Tuple (alpha, mse_train, mse_val)
    -------------------------
    '''
    # Kernel of the training points against themselves defines the system
    K_train = get_gaussian_kernel(X_train, X_train, sigma)
    alpha = get_kernel_ridge_coeff(K_train, gamma, Y_train)

    # The df argument of get_kernel_ridge_prediction is unused, pass None
    y_pred_train = get_kernel_ridge_prediction(alpha, X_train, X_train, None, sigma)
    y_pred_val = get_kernel_ridge_prediction(alpha, X_train, X_val, None, sigma)

    mse_train = get_mse(Y_train, y_pred_train)
    mse_val = get_mse(Y_val, y_pred_val)

    return(alpha, mse_train, mse_val)

In [None]:
def main():
    '''
    --------------------------
    Run the pipeline: download the data, split it into train and test
    parts and build the grid of candidate gamma and sigma values
    Input: None
    Output: None
    -------------------------
    '''
    path = 'http://www0.cs.ucl.ac.uk/staff/M.Herbster/boston-filter/Boston-filtered.csv'

    df = get_data(path)
    df_train, df_test = train_test_split(df)

    # get_grid_search_vals needs the first and last exponent for each grid.
    # NOTE(review): these bounds are an assumption, not taken from the
    # notebook: gamma = 2^-26 ... 2^-40 and sigma = 2^13 ... 2^7 (values are
    # computed as 0.5**exponent) -- TODO confirm the intended ranges
    gamma, sigma = get_grid_search_vals(26, 40, -13, -7)

In [None]:
# Entry point: executes main() when this cell is run
main()