# Prediction using Clustering + Kernel ridge regression

In this notebook, we explore the provided data to build intuition about which models to use and which features to retain, and more generally about the data challenge.

### Packages

In [None]:
# os libraries
import time
import os

In [None]:
# numerical libraries
import numpy as np
import pandas as pd

In [None]:
# statistical learning libraries
import sklearn.cluster as cl
import sklearn.kernel_ridge as kr
import sklearn.linear_model as lm
import sklearn.model_selection as ms
import sklearn.preprocessing as pr

In [None]:
# neural networks libraries
import torch
from torch import nn

In [None]:
# visualisation libraries
import matplotlib.pyplot as plt

### Functions

In [None]:
def MSE(model, X, y):
    '''
    Get MSE of model on test data.
    
    Predictions and targets are flattened before being compared, so that a
    column-shaped target of shape (n, 1) and 1-D predictions of shape (n,)
    are compared element-wise instead of being broadcast to an (n, n) matrix.
    
    Arguments:
        model: prediction model exposing a predict(X) method
        X: features, passed unchanged to model.predict
        y: true targets (array-like); its first dimension is the number of points
        
    Returns:
        score: MSE loss, i.e. the sum of squared errors divided by the number of points
        
    Raises:
        ValueError: if predictions and targets do not hold the same number of values
    '''
    
    # convert both sides to arrays so that shapes can be compared safely
    y_true = np.asarray(y)
    y_pred = np.asarray(model.predict(X))
    
    # compute number of points in data
    n = y_true.shape[0]
    
    # refuse mismatched inputs rather than letting numpy broadcast them
    if y_pred.size != y_true.size:
        raise ValueError('predictions and targets differ in size: {} vs {}'.format(y_pred.size, y_true.size))
    
    # return loss
    return (1/n) * np.sum(np.square(y_pred.reshape(-1) - y_true.reshape(-1)))

In [None]:
def export_results(model, X, path='submissions/submit.csv'):
    '''
    Export results into CSV file for submission.
    
    The file holds two columns: '_ID' (the index of X) and '0' (the predictions).
    
    Arguments:
        model: regression model exposing a predict(X) method
        X: pandas DataFrame of features; its index is written as the '_ID' column
        path: destination of the CSV file (defaults to 'submissions/submit.csv')
    '''
    
    # obtain predictions, flattened so that (n, 1) outputs fit in a single column
    pred = np.asarray(model.predict(X)).reshape(-1)
    
    # obtain index of data
    idx = X.index
    
    # set in dataframe
    df_results = pd.DataFrame({'_ID': idx, '0': pred})
    
    # make sure the destination folder exists before writing
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    
    # save dataframe (the DataFrame index is not written: '_ID' is already a regular column)
    df_results.to_csv(path, sep=',', index=False)

### Data Loading

In [None]:
# load the training features; the first CSV column becomes the row index
df_X_train = pd.read_csv('data/input_training.csv', index_col=0)
# plain numpy view of the same table
X_train = df_X_train.to_numpy()

In [None]:
# load the training targets; the first CSV column becomes the row index
df_y_train = pd.read_csv('data/output_training.csv', index_col=0)
# flatten the target table into a 1-D numpy array
y_train = df_y_train.to_numpy().reshape(-1)

In [None]:
# load the test features; the first CSV column becomes the row index
df_X_test = pd.read_csv('data/input_testing.csv', index_col=0)
# plain numpy view of the same table
X_test = df_X_test.to_numpy()

### Data Normalisation

In [None]:
# stack the train rows on top of the test rows in a single dataset
# NOTE(review): later cells join on the index, so IDs are presumably unique across both files - confirm
df = pd.concat((df_X_train, df_X_test), axis=0)

### Exploration and creation of an augmented dataset

In [None]:
# maximum number of distinct values for which the values themselves are listed
max_listed_values = 50

# create summary dataset: one row per feature of the concatenated train + test data
summary = pd.DataFrame(columns=['Mean', 'Standard deviation', 'Range', 'Number of values', 'Values'], index=df.columns)

# create Pandas summary dataset
summary_df = df.describe()

# compute statistics for each feature
for feature in df.columns:
    # read the statistics by label: integer positions on a label-indexed Series are deprecated
    stats = summary_df[feature]
    mean = stats.loc['mean']
    std = stats.loc['std']
    # named lowest/highest so that the builtins min() and max() are not overwritten
    lowest = stats.loc['min']
    highest = stats.loc['max']
    values = set(df[feature])
    n_values = len(values)
    
    # list the distinct values only when there are at most max_listed_values of them
    if n_values <= max_listed_values:
        listed_values = ', '.join(["{:0.2f}".format(x) for x in sorted(values)])
    else:
        listed_values = 'NA'
    
    # populate dataset (every cell is stored as a formatted string)
    summary.loc[feature] = pd.Series({'Mean':'{:0.2f}'.format(mean),
                                      'Standard deviation':'{:0.2f}'.format(std),
                                      'Range':'[{:0.2f}, {:0.2f}]'.format(lowest, highest),
                                      'Number of values':'{:0.0f}'.format(n_values),
                                      'Values':listed_values})

In [None]:
# show the per-feature summary table (bare expression, evaluated as the cell's last statement)
summary

In [None]:
# names of the features treated as categorical, built from their column numbers
categorical_features = ['X' + str(number) for number in (
    3, 6, 11, 15, 16, 18, 19, 22, 28, 32, 33, 35, 36,
    42, 49, 56, 58, 60, 62, 64, 68, 73, 74, 83, 86, 90,
    104, 108, 109, 116, 117, 122, 130, 137, 139, 140, 141,
    143, 144, 148, 149, 151, 162, 168, 169, 172, 174, 176,
    177, 182, 184, 186, 187, 192, 193, 195, 196, 197, 199,
    206, 209, 217, 219, 222, 231, 235, 238, 242, 246, 256,
    260, 270, 275, 281, 285, 286, 291, 298, 301, 303, 304,
    307, 308, 312, 314, 318, 330, 332, 336, 337, 338,
)]

In [None]:
# index of the features whose summary row reports exactly two distinct values
categorical_features_two = summary.loc[summary['Number of values'].astype(int).eq(2)].index

In [None]:
# keep the categorical features that are not listed among the two-valued ones
categorical_features_more_than_two = list(filter(lambda name: name not in categorical_features_two, categorical_features))

In [None]:
# create augmented dataset by one-hot encoding features with strictly more than two possible values
encoded_blocks = []
for feature in categorical_features_more_than_two:
    # uint8 is requested explicitly: recent pandas versions default to bool indicators,
    # and mixing bool with numeric columns turns `.values` into an object array
    dummies = pd.get_dummies(df[feature], dtype=np.uint8)
    # indicator columns are numbered by their position in the get_dummies output,
    # not by the category value itself
    dummies.columns = [feature+'-'+str(i) for i in range(1, len(dummies.columns)+1)]
    encoded_blocks.append(dummies)

# drop the encoded features and append all indicator columns in a single pass
# (same column order as dropping/joining one feature at a time)
df_augmented = pd.concat([df.drop(list(categorical_features_more_than_two), axis = 1)] + encoded_blocks, axis = 1)

In [None]:
# number of training rows: they come first in the concatenated dataset
n_train = df_X_train.shape[0]

# retrieve df_X_train by position, so the result does not depend on how the IDs are numbered
# (label-based truncation is off by one row when the IDs start at 0)
df_X_train_augmented = df_augmented.iloc[:n_train]
X_train_augmented = df_X_train_augmented.values

# retrieve df_X_test: all the remaining rows
df_X_test_augmented = df_augmented.iloc[n_train:]
X_test_augmented = df_X_test_augmented.values

In [None]:
# create validation dataset (15% of the training rows)
# a fixed seed makes the split, and every score computed from it, reproducible across runs
Xt, Xv, yt, yv = ms.train_test_split(X_train, y_train, test_size=0.15, random_state=0)

In [None]:
# create validation augmented dataset (15% of the training rows)
# a fixed seed makes the split, and every score computed from it, reproducible across runs
Xta, Xva, yta, yva = ms.train_test_split(X_train_augmented, y_train, test_size=0.15, random_state=0)

### Feature selection

In [None]:
# report the dimensions of the plain and augmented training splits
print('Train data shape:', np.shape(Xt))
print('Train data (augmented) shape:', np.shape(Xta))

### Prediction

In [None]:
# evaluate "k-means + one kernel ridge regressor per cluster" for several numbers of clusters
for k in range(2, 20):

    # compute k-means over dataset (train and test features together)
    clu = cl.KMeans(n_clusters = k).fit(df_augmented)
    
    batch_scores = []
    
    for batch in range(20):
    
        # initialise list for this split only: predictions from previous splits
        # (and previous values of k) must not leak into the score of this one
        df_results_list = []
    
        # create validation augmented dataset
        Xta, Xva, yta, yva = ms.train_test_split(df_X_train_augmented, df_y_train, test_size=1/5)

        # compute labels
        kmeans_labels_train = clu.predict(Xta)
        kmeans_labels_test = clu.predict(Xva)

        # compute predictions for each cluster
        for c in range(k):

            # set train datasets
            X_train_k = Xta[kmeans_labels_train == c]
            y_train_k = yta[kmeans_labels_train == c]

            # set test dataset
            X_test_k = Xva[kmeans_labels_test == c]
            
            # skip clusters with nothing to predict or nothing to learn from;
            # validation rows of a skipped cluster are left out of the score by the inner merge below
            if X_test_k.shape[0] == 0 or X_train_k.shape[0] == 0:
                continue

            # set and fit KernelRidge regressor
            krr = kr.KernelRidge(alpha=0.06, kernel = "laplacian")
            krr.fit(X_train_k, y_train_k)

            # predict cluster; reshape(-1) keeps a 1-D array even when the cluster holds a single row
            y_pred_k = krr.predict(X_test_k).reshape(-1)
            
            # store predictions together with the IDs of the rows they belong to
            df_results_list.append(pd.DataFrame({'_ID': X_test_k.index, 'YP': y_pred_k}))

        # no cluster could be predicted for this split: nothing to score
        if not df_results_list:
            continue

        # create results dataset
        df_results = pd.concat(df_results_list).set_index('_ID')

        # compute score (assumes the target column of df_y_train is named 'Y')
        df_score = yva.merge(df_results, left_index=True, right_index=True, how='inner')
        df_score['S'] = df_score['Y'] - df_score['YP']
        
        score = (1/df_score['S'].shape[0]) * np.sum(np.square(df_score['S']))
        
        batch_scores.append(score)
        
    # average validation MSE over the random splits for this k
    batch_score = np.mean(batch_scores)
    print(k, batch_score)