# k Nearest Neighbours and cross-validation

In [None]:
import math
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data; the SalePrice target is replaced by log1p(SalePrice).
data = pd.read_csv('house-prices-train.csv').assign(
    SalePrice=lambda frame: np.log1p(frame['SalePrice'])
)
data.info()

## data cleaning (copy/pasted from the previous tutorial)

In [None]:
from sklearn.preprocessing import LabelEncoder
def encode_categories(df, mappers, dummies=False):
    """Return a copy of ``df`` with its object-dtype columns encoded as numbers.

    Each object column is handled in one of three ways:

    * columns listed in ``mappers`` have their values replaced through
      ``mappers[col]`` (values without an entry are left as they are);
    * other columns with fewer than 30 distinct values get missing values
      filled with the string ``'NaN'`` and are label-encoded with integer
      codes assigned in sorted order of the distinct values; when ``dummies``
      is true the column is instead replaced by one-hot columns named
      ``d_<col>_<code>``, appended after the remaining columns;
    * all other object columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    mappers : dict
        ``{column name: replacement mapping}`` for ordinal columns.
    dummies : bool, default False
        One-hot encode the label-encoded columns.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    out = df.copy()  # work on a copy so the caller's frame stays intact
    dummy_blocks = []
    for col in df.select_dtypes('object').columns:
        if col in mappers:
            # infer_objects() makes the column numeric whenever every value
            # was mapped, without relying on replace() downcasting for us.
            out[col] = out[col].replace(mappers[col]).infer_objects()
        elif df[col].nunique() < 30:
            # sort=True numbers the categories in sorted order, which is the
            # numbering a label encoder fitted on the same values produces.
            codes, _ = pd.factorize(df[col].fillna('NaN'), sort=True)
            if dummies:
                dummy_blocks.append(
                    pd.get_dummies(pd.Series(codes, index=df.index), prefix='d_' + col)
                )
                out = out.drop(columns=[col])
            else:
                out[col] = codes
    if dummy_blocks:
        # One concat at the end instead of one per column.
        out = pd.concat([out] + dummy_blocks, axis=1)
    return out

In [None]:
# KitchenQual is treated as ordinal: its five grades are mapped to the integers 0-4.
ordinal_cols_mappers = {
    'KitchenQual': dict(zip(['Po', 'Fa', 'TA', 'Gd', 'Ex'], range(5))),
}
# Start again from the raw file, transform the target and encode the object columns
# (one-hot encoding is switched on through the third argument).
data = pd.read_csv('house-prices-train.csv')
data['SalePrice'] = np.log1p(data['SalePrice'])
data = encode_categories(data, mappers=ordinal_cols_mappers, dummies=True)
data.shape

  * The nature of kNN algorithms means that using kNN with nominal features is troublesome.
  * To overcome this, one can adopt one of these strategies:
    * Drop nominal features (and possibly keep the ordinal one if there is some meaning for measuring the distance).
    * Replace nominal features with dummies using one-hot encoding.
    * Use some [more sophisticated metrics](https://www.researchgate.net/publication/220907006_Similarity_Measures_for_Categorical_Data_A_Comparative_Evaluation) capable of measuring the similarity of nominal features.
  * We will give a try to the first two approaches.

## First attempt

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# First attempt: a kNN regressor configured with 5 neighbours.
knn_params = {'n_neighbors': 5}
knn = KNeighborsRegressor(**knn_params)

In [None]:
# Fit on every column except the target.
# NOTE: the numeric features still contain missing values at this point.
knn.fit(X=data.drop('SalePrice', axis=1), y=data['SalePrice'])

  * There is a problem with missing values of numeric features.

In [None]:
# Number of missing values per column, restricted to columns that have any.
data.isnull().sum().loc[lambda counts: counts > 0]

What we can do:
  * Drop the data points with missing values. But we do not have enough data for this.
  * We can replace the missing values with the respective column means. But that is too simple, and we have some dignity!
  * We can predict the missing values from the rest of the data! That's it! We will use the kNN algorithm.

### Task: predict the missing values using kNN

The idea is this (assume we want to fill missing values in `LotFrontage` column):
  * Split the dataset into two parts: 
    * `D1` = the rows with missing values in the `LotFrontage` column,
    * `D2` = the rest of the data.
  * Save the column `D2.LotFrontage` to `Y` and the remaining columns to `X` (exclude some columns if needed). Save the same columns of `D1` to `X2`.
  * Fit a model (we use the kNN) to predict `Y` using `X`.
  * Use this model to predict the missing values of `LotFrontage` using the `X2` data.

In [None]:
def replace_nans(df, cols_nan, params):
    """Fill the missing values of the given columns with kNN predictions.

    For every column in ``cols_nan`` a ``KNeighborsRegressor(**params)`` is
    fitted on the rows where the column is present and used to predict the
    rows where it is missing. The predictors are the ``float64``/``int64``
    columns of ``df`` other than the columns in ``cols_nan`` and ``'Id'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute. It is modified in place and also returned.
    cols_nan : iterable of str
        Names of the columns to impute (a list or a pandas Index both work).
    params : dict
        Keyword arguments passed to ``KNeighborsRegressor``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the missing values of ``cols_nan`` filled in.
    """
    cols_nan = list(cols_nan)
    # The predictors never include a column that is being imputed, so they do
    # not change inside the loop and can be built once. errors='ignore' lets
    # the function work on frames that have no 'Id' column.
    features = df.drop(columns=cols_nan + ['Id'], errors='ignore')
    # this is optional, but some treatment of nominal features is needed
    features = features.select_dtypes(['float64', 'int64'])
    for col_nan in cols_nan:
        missing = df[col_nan].isnull()
        if not missing.any():
            continue  # nothing to impute; predicting on zero rows would fail
        kNN = KNeighborsRegressor(**params)
        kNN.fit(features[~missing], df.loc[~missing, col_nan])
        df.loc[missing, col_nan] = kNN.predict(features[missing])
    return df

Let us check that we have some meaningful results:

In [None]:
# Impute on a copy so that `data` keeps its missing values for the comparison.
df = data.copy()
cols_nan = data.columns[data.isnull().any()]  # columns with at least one missing value
params = {
        'n_neighbors': 5
}
df = replace_nans(df, cols_nan, params)
# Summary statistics before (data) and after (df) the imputation.
display(data[cols_nan].describe())
display(df[cols_nan].describe())

## Cross-validation and hyperparameter tuning

  * Assume we want to go through the following values of the kNN hyperparameters.
  * Beside this, we also want to see the effect of different strategies of 
    * how to deal with nominal features (ignoring them, using dummies), 
    * how to normalise the data (no normalising vs normalising).

In [None]:
# Continue with the imputed data, without the Id column
# (drop returns a new frame, so df itself is left unchanged).
data = df.drop(columns='Id')

### Task: implement cross validation

In [None]:
def cross_val(X, y, folds, model, dummies = False):
    """Estimate the prediction error of ``model`` by cross-validation.

    Every row is assigned at random to one of ``folds`` folds. For each fold
    the model is fitted on the other folds and the root mean squared error of
    its predictions on the held-out fold is computed; the function returns the
    average of these errors. Since the target used in this notebook is on the
    log1p scale, the value is referred to as RMSLE.

    Parameters
    ----------
    X : pandas.DataFrame
        Features. The frame is not modified.
    y : pandas.Series
        Target values, sharing the index of ``X``.
    folds : int
        Number of folds.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    dummies : bool, default False
        When false, columns with at most three distinct values are left out
        of the features.

    Returns
    -------
    float
        Average RMSLE over the evaluated folds.

    Raises
    ------
    ValueError
        If no fold has both training and validation rows.
    """
    # Re-seeding on every call gives the same fold assignment each time, so
    # different models/hyperparameters are compared on identical splits.
    np.random.seed(seed=654)
    if not dummies:
        X = X.loc[:, X.nunique() > 3]
    # The fold ids live in a separate array instead of a helper column, so
    # the caller's frame is never touched.
    fold_ids = np.random.randint(folds, size=y.shape[0])
    fold_errors = []
    for k in range(folds):
        held_out = fold_ids == k
        val_labels = X.index[held_out]
        train_labels = X.index[~held_out]
        if len(val_labels) == 0 or len(train_labels) == 0:
            # Random assignment can leave a fold empty on very small data.
            continue
        model.fit(X.loc[train_labels], y.loc[train_labels])
        ypred = model.predict(X.loc[val_labels])
        residuals = (np.asarray(y.loc[val_labels], dtype=float).reshape(-1)
                     - np.asarray(ypred, dtype=float).reshape(-1))
        fold_errors.append(math.sqrt(np.mean(residuals ** 2)))
    if not fold_errors:
        raise ValueError('no fold with both training and validation rows')
    return sum(fold_errors) / len(fold_errors)

### Task: try kNN with and without normalisation/dummies

In [None]:
from sklearn.model_selection import ParameterGrid, train_test_split
# Hyperparameter values to try; ParameterGrid enumerates every combination.
param_grid = {
    'n_neighbors' : range(1,20),
    'p': range(1,5),
    'weights': ['uniform', 'distance']
}
dummies = True
param_comb = ParameterGrid(param_grid)
Xtrain, Xtest, ytrain, ytest = train_test_split(data.drop(columns=['SalePrice']), 
                                                data.SalePrice, 
                                                test_size=0.25, 
                                                random_state=6548)
### your code doing normalisation goes here:
# Cast everything to float so that all columns (including the one-hot ones)
# take part in the arithmetic below in the same way.
Xtrain = Xtrain.astype('float64')
Xtest = Xtest.astype('float64')
# Min-max statistics come from the training split only. The test split is
# scaled with the same statistics, so both splits live on the same scale and
# no information about the test data is used.
train_min = Xtrain.min(axis=0)
train_range = Xtrain.max(axis=0) - train_min
# to avoid division by zero: drop columns that are constant in the training split
one_val_cols = train_range[train_range == 0].index
Xtrain = Xtrain.drop(columns=one_val_cols)
Xtest = Xtest.drop(columns=one_val_cols)
train_min = train_min.drop(one_val_cols)
train_range = train_range.drop(one_val_cols)
Xtrain = (Xtrain - train_min) / train_range
Xtest = (Xtest - train_min) / train_range
###
# Cross-validated error of every hyperparameter combination, in grid order.
crossval_err = []
for params in param_comb:
    kNN = KNeighborsRegressor(**params)
    # cross_val gets a copy so the training frame is guaranteed to stay unchanged
    averageRMSLE = cross_val(Xtrain.copy(), ytrain, 12, kNN, dummies)
    crossval_err.append(averageRMSLE)
crossval_err

In [None]:
%%time
best_params = param_comb[np.argmin(crossval_err)]
kNN = KNeighborsRegressor(**best_params)
if not dummies:
    Xtrain = Xtrain.loc[:, Xtrain.nunique() > 3]
    Xtest = Xtest.loc[:, Xtrain.columns]
Xtest.fillna(0, inplace=True)
print(Xtrain.shape, Xtest.shape)

kNN.fit(Xtrain, ytrain)
ypred = kNN.predict(Xtest)
best_RMSLE = math.sqrt(metrics.mean_squared_error(ytest, ypred))
print('RMSLE (test): {0:.6f}'.format(best_RMSLE))
print('best parameters:', best_params)

There are of course packages in `sklearn` for Cross-Validation and normalisation:
  * [MinMaxScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
  * [cross_val_score](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)
  * [cross_validate](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)
  * [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

### Curse of dimensionality

  * Normalised data are all localised in the $n$-dimensional cube with sides of length one.
  * The diagonal of this cube equals $\sqrt{n}$.
  * The curse of dimensionality says that the higher the dimension, the further away the nearest neighbours get.
  * To measure this effect, we will increase the dimension and observe the mean distance to the nearest neighbours divided by the length of the diagonal.

**Try to experiment with the `n_neighbors` parameter!** How does the number of neighbours influence the mean distance?

In [None]:
# Xtrain and Xtest should be normalized here
# For k = 1..29, use only the first k feature columns and record the mean
# distance from the test points to their 150 nearest training points,
# divided by sqrt(k) (the diagonal of the k-dimensional unit cube).
mean_dist_ratio = []
for k in range(1, 30):
    kNN = KNeighborsRegressor(n_neighbors=150, p=2).fit(Xtrain.iloc[:, :k], ytrain)
    dist, nn = kNN.kneighbors(Xtest.iloc[:, :k])
    diagonal = math.sqrt(k)
    mean_dist_ratio.append(dist.mean() / diagonal)

In [None]:
# Mean-distance ratio (y) against the number of feature dimensions used (x).
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_xlabel('dimensions')
dimensions = range(1, len(mean_dist_ratio) + 1)
ax.plot(dimensions, mean_dist_ratio, 'bo-')