# KNN 

In [1]:
#load dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import sklearn_relief as sr
import pickle

sns.set_style("whitegrid")

#set Pandas display option 
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", 500)

#set figure size
plt.rcParams['figure.figsize'] = (8, 6)

#ensure that code is reproducible by setting random seed
np.random.seed(1916) 

In [2]:
#project setup
input_data_path = "../data/input"
output_data_path = "../data/output"

#load data
df = pd.read_csv(f"{output_data_path}/after_encoding_train.csv", index_col=0)
df_test = pd.read_csv(f"{output_data_path}/after_encoding_test.csv", index_col=0)

fr = pd.read_csv(f"{output_data_path}/feature_ranking.csv", index_col=0)

We omit standardization because we have encoded categorical variables

#### Searching for "good enough" model to feature selection

Mutual Information

In [3]:
var = fr.mi_score.sort_values(ascending=False).index.tolist()[0:10]

In [4]:
print(var)

["('never',).5", "('Kid(s)',)", "('Partner',)", "('gt8',)", "('2h',)", "('Restaurant(<20)',)", "('Legal',)", "('7AM',)", "('Single',)", "('Healthcare Practitioners & Technical',)"]


In [5]:
df.shape[0] ** (0.5)

100.7323185477233

In [6]:
param = {
    "n_neighbors": [5, 7, 10, 12, 15, 25, 40, 50, 100],
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "manhattan", "chebyshev"],
    "p": [1, 2],
}

In [7]:
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [9]:
model = KNeighborsClassifier()
grid_CV = GridSearchCV(
    model, param, cv=2, scoring=mse, return_train_score=True, n_jobs=-1
)
grid_CV.fit(df.loc[:, var].values, df.loc[:, "target"].values.ravel())

In [10]:
grid_CV.best_params_

{'metric': 'minkowski', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}

In [11]:
grid_CV.cv_results_

{'mean_fit_time': array([0.00781453, 0.0078156 , 0.01562285, 0.01562464, 0.0107435 ,
        0.00781167, 0.01650631, 0.01446378, 0.0136174 , 0.00781238,
        0.01562607, 0.01562703, 0.01055586, 0.01562738, 0.01468635,
        0.00289977, 0.01561201, 0.00781453, 0.01328504, 0.01638758,
        0.0114547 , 0.01529574, 0.01562595, 0.01562595, 0.00781286,
        0.01562512, 0.01587808, 0.01615703, 0.01854789, 0.01464629,
        0.03071797, 0.01495755, 0.01420009, 0.0193094 , 0.01537311,
        0.01562488, 0.0129801 , 0.01507008, 0.01592791, 0.01394904,
        0.03044498, 0.01446497, 0.01923227, 0.01562476, 0.01283097,
        0.01527023, 0.00781679, 0.02194619, 0.01556695, 0.01562464,
        0.00781226, 0.01641965, 0.01877975, 0.01562548, 0.01171076,
        0.01562834, 0.03165269, 0.02288103, 0.01562321, 0.01348042,
        0.01507664, 0.02338016, 0.01988935, 0.01562858, 0.01562655,
        0.01562595, 0.0176363 , 0.0148586 , 0.01403403, 0.02322102,
        0.01562655, 0.01727319,

Right now (temporary) we will this hyperparameters as the best one:

In [12]:
grid_CV.best_params_

{'metric': 'minkowski', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}

#### Feature selection

Feature ranking

In [13]:
fr.sort_values("mi_score", ascending=False, inplace=True)

In [14]:
fr.head()

Unnamed: 0,mi_score,Correlation
"('never',).5",0.046534,0.312901
"('Kid(s)',)",0.025818,0.041959
"('Partner',)",0.017346,0.017552
"('gt8',)",0.016947,0.0
"('2h',)",0.01685,0.136174


In [15]:
mi_features = fr.iloc[0:20].index.tolist()

In [16]:
fr["corr_abs"] = np.abs(fr["Correlation"])
fr.sort_values("corr_abs", ascending=False, inplace=True)
corr_features = fr.iloc[0:20].index.tolist()

Forward elimination

In [17]:
forward_elimination = [
 "('Home',)",
 "('No Urgent Place',)",
 "('Work',)",
 "('Alone',)",
 "('Friend(s)',)",
 "('Kid(s)',)",
 "('Partner',)",
 "('Rainy',)",
 "('Snowy',)",
 "('Sunny',)",
 '(30,)',
 '(55,)',
 '(80,)',
 "('10AM',)",
 "('10PM',)",
 "('2PM',)",
 "('6PM',)",
 "('7AM',)",
 "('Bar',)",
 "('Carry out & Take away',)",
 "('Coffee House',)",
 "('Restaurant(20-50)',)",
 "('Restaurant(<20)',)",
 "('1d',)",
 "('2h',)",
 "('Female',)",
 "('Male',)",
 "('21',)",
 "('26',)",
 "('31',)",
 "('36',)",
 "('41',)",
 "('46',)",
 "('50plus',)",
 "('below21',)",
 "('Divorced',)",
 "('Married partner',)",
 "('Single',)",
 "('Unmarried partner',)",
 "('Widowed',)",
 "('Associates degree',)",
 "('Bachelors degree',)",
 "('Graduate degree (Masters or Doctorate)',)",
 "('High School Graduate',)",
 "('Some High School',)",
 "('Some college - no degree',)",
 "('Architecture & Engineering',)",
 "('Arts Design Entertainment Sports & Media',)",
 "('Building & Grounds Cleaning & Maintenance',)",
 "('Business & Financial',)",
 "('Community & Social Services',)",
 "('Computer & Mathematical',)",
 "('Construction & Extraction',)",
 "('Education&Training&Library',)",
 "('Farming Fishing & Forestry',)",
 "('Food Preparation & Serving Related',)",
 "('Healthcare Practitioners & Technical',)",
 "('Healthcare Support',)",
 "('Installation Maintenance & Repair',)",
 "('Legal',)",
 "('Life Physical Social Science',)",
 "('Management',)",
 "('Office & Administrative Support',)",
 "('Personal Care & Service',)",
 "('Production Occupations',)",
 "('Protective Service',)",
 "('Retired',)",
 "('Sales & Related',)",
 "('Student',)",
 "('Transportation & Material Moving',)",
 "('Unemployed',)",
 "('$100000 or More',)",
 "('$12500 - $24999',)",
 "('$25000 - $37499',)",
 "('$37500 - $49999',)",
 "('$50000 - $62499',)",
 "('$62500 - $74999',)",
 "('$75000 - $87499',)",
 "('$87500 - $99999',)",
 "('Less than $12500',)",
 "('1~3',)",
 "('4~8',)",
 "('gt8',)",
 "('less1',)",
 "('never',)",
 "('1~3',).1",
 "('4~8',).1",
 "('gt8',).1",
 "('less1',).1",
 "('never',).1",
 "('1~3',).2",
 "('4~8',).2",
 "('gt8',).2",
 "('less1',).2",
 "('never',).2",
 "('1~3',).3",
 "('4~8',).3",
 "('gt8',).3",
 "('less1',).3",
 "('never',).3",
 "('1~3',).4",
 "('4~8',).4",
 "('gt8',).4",
 "('less1',).4",
 "('never',).4",
 "('1~3',).5",
 "('4~8',).5",
 "('gt8',).5",
 "('less1',).5",
 "('never',).5",
 "('High_Acceptance',)",
 "('Low_Acceptance',)",
 "('Medium_Acceptance',)",
 "('Medium_High_Acceptance',)",
 "('Medium_Low_Acceptance',)",
 'has_children',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'to_Coupon',
]

In [18]:
candidates = [i for i in forward_elimination if "]" not in i]

In [19]:
grid_CV.best_params_

{'metric': 'minkowski', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}

In [20]:
model = KNeighborsClassifier(**grid_CV.best_params_)

In [21]:
sf = SFS(
    model,
    n_features_to_select=5,
    direction='forward',
    scoring=mse,
    cv=2,
    n_jobs=-1,
)

In [23]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

In [24]:
sf_features_mask = sffit.get_support()
sf_features = df.loc[:, candidates].columns[sf_features_mask]
sf_features

Index(['('Partner',)', '('Carry out & Take away',)', '('Restaurant(20-50)',)',
       '('1~3',).1', '('never',).5'],
      dtype='object')

#### Hyperparametes tunning for each group of variables

In [25]:
param = {
    "n_neighbors": [3],
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "manhattan", "chebyshev"],
    "p": [1, 2],
}
mse = make_scorer(mean_squared_error, greater_is_better=True)

In [26]:
def cv_proc(var):
    model = KNeighborsClassifier()
    grid_CV = GridSearchCV(
        model, param, cv=2, scoring=mse, return_train_score=True, n_jobs=-1
    )
    grid_CV.fit(df.loc[:, var].values, df.loc[:, "target"].values.ravel())
    print(grid_CV.best_params_)
    print(grid_CV.best_score_)

In [27]:
cv_proc(mi_features)

{'metric': 'chebyshev', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
0.37025696024483223


In [28]:
cv_proc(corr_features)

{'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
0.34079158903578893


In [29]:
cv_proc(sf_features)

{'metric': 'chebyshev', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
0.447616766047399


#### Final models comparison - winner obtaining

In [30]:
def proper_CV(x, y, model, display_res=False):
    train_score = list()
    valid_score = list()
    kf = KFold(n_splits=6, shuffle=True, random_state=42)

    for train_index, valid_index in kf.split(x):
        train_x, valid_x = x.iloc[train_index], x.iloc[valid_index]
        train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]

        model.fit(train_x.values, train_y.values.ravel())

        pred_y_train = model.predict(train_x.values)
        rmse_train = np.sqrt(mean_squared_error(train_y, pred_y_train))
        train_score.append(rmse_train)

        pred_y_val = model.predict(valid_x.values)
        rmse_val = np.sqrt(mean_squared_error(valid_y, pred_y_val))
        valid_score.append(rmse_val)

    if display_res:
        view = pd.DataFrame([train_score, valid_score]).T.rename(
            columns={0: "cv_train", 1: "cv_val"}
        )
        display(view)
        return train_score, valid_score, view
    else:
        return train_score, valid_score

In [170]:
df = df.sort_values(by="rok").reset_index(drop=True)

In [32]:
hp = [
    {'metric': 'chebyshev', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'},
    {'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'},
    {'metric': 'chebyshev', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'},
    ]

In [34]:
model = KNeighborsClassifier(**hp[0])
var = mi_features
cv_output0 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.531781,0.618375
1,0.536841,0.613759
2,0.533415,0.579055
3,0.532083,0.606002
4,0.532972,0.583632
5,0.565276,0.579565


In [39]:
model = KNeighborsClassifier(**hp[1])
var = corr_features
cv_output1 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.533779,0.590508
1,0.525823,0.588174
2,0.530525,0.584139
3,0.531861,0.584139
4,0.528179,0.582618
5,0.529521,0.580075


In [40]:
model = KNeighborsClassifier(**hp[2])
var = sf_features
cv_output2 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.63367,0.644581
1,0.63531,0.636928
2,0.624231,0.631332
3,0.623188,0.624268
4,0.644366,0.650251
5,0.579021,0.558256


In [41]:
pd.DataFrame(
    [
        cv_output0[2].mean().tolist(),
        cv_output1[2].mean().tolist(),
        cv_output2[2].mean().tolist(),
        
    ],
    columns=["train_mean", "test_mean"],
)

Unnamed: 0,train_mean,test_mean
0,0.538728,0.596731
1,0.529948,0.584942
2,0.623298,0.624269


In [42]:
pd.DataFrame(
    [
        cv_output0[2].std().tolist(),
        cv_output1[2].std().tolist(),
        cv_output2[2].std().tolist(),
        
    ],
    columns=["train_std", "test_std"],
)

Unnamed: 0,train_std,test_std
0,0.013131,0.018017
1,0.002793,0.00379
2,0.023053,0.033632


Second model seems to be the best one 

In [43]:
print(corr_features)

["('never',).5", "('Carry out & Take away',)", "('4~8',).5", "('Restaurant(<20)',)", "('1~3',).5", "('Bar',)", "('2h',)", "('1d',)", "('Friend(s)',)", "('No Urgent Place',)", "('never',).1", 'to_Coupon', "('1~3',).1", 'toCoupon_GEQ25min', "('Alone',)", "('Sunny',)", "('Coffee House',)", "('gt8',).5", "('Restaurant(20-50)',)", "('2PM',)"]


#### Fit final model and save it

In [44]:
model = KNeighborsClassifier(**hp[1])
model.fit(df.loc[:, corr_features].values, df.loc[:, "target"].values.ravel())

In [49]:
filename = "../models/knn.sav"

In [50]:
pickle.dump(model, open(filename, "wb"))