# Building tableau for KNN model

This notebook documents the construction of the tableau for the KNN model.

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.feature_selection import f_classif
import itertools
import sys
import importlib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, make_scorer
from joblib import Parallel, delayed, parallel_backend
from threadpoolctl import threadpool_limits
import matplotlib.pylab as plt
import os
from sklearn.inspection import permutation_importance
from scipy.stats import norm, t
from sklearn.base import clone

sys.path.append("../")
from proj_mod import training
importlib.reload(training);

## Preparation 

In [2]:
# Location of the raw data and the frame loaded from it
str_data = "../data/raw.csv"
df = pd.read_csv(str_data)

# Raw feature columns fed to the pipeline, and the target column
features = ["X1", "X3", "X4", "X6"]
target = ["Y"]

# Feature / target frames (target is kept as a one-column DataFrame)
feat = df[features]
tar = df[target]

# Cross-validation scheme: 5 stratified folds repeated 20 times, fixed seed
n_split = 5
n_repeats = 20
RSKF = RepeatedStratifiedKFold(
    n_splits=n_split,
    n_repeats=n_repeats,
    random_state=420,
)

# Hyper-parameters
# `force` is handed to training.data_selector; "F_w_mean" is not a raw column,
# so it is presumably produced by training.data_creator -- TODO confirm.
force = ["X1", "X6", "F_w_mean"]
nn = 5  # number of neighbours for the KNN classifier

# Pipeline: create features -> select columns -> standardise -> KNN
pipe = Pipeline(
    steps=[
        ("DataCreate", training.data_creator()),
        ("DataSelector", training.data_selector(force=force)),
        ("scale", StandardScaler()),
        ("KNN", KNeighborsClassifier(n_neighbors=nn)),
    ]
)

## Train and save model 

In [3]:
# Pair every (train, test) split with its running fold number.
enu_split = enumerate(list(RSKF.split(X=feat, y=tar)))


def train_once(index: int, train_index, pipe: Pipeline):
    """Fit a fresh clone of ``pipe`` on the training rows of a single fold.

    Parameters
    ----------
    index : int
        Fold number; returned unchanged so results can be keyed by it.
    train_index
        Positional row indices (used with ``.iloc``) of the training rows.
    pipe : Pipeline
        Unfitted template pipeline; it is cloned, never fitted in place.

    Returns
    -------
    tuple
        ``(index, fitted_pipeline)``.
    """
    fold_features = feat.iloc[train_index]
    # Flatten the one-column target frame into a 1-D label array.
    fold_labels = np.ravel(tar.iloc[train_index].values)
    # Pipeline.fit returns the fitted pipeline itself.
    return index, clone(pipe).fit(X=fold_features, y=fold_labels)


# Train one pipeline per fold in parallel worker processes.
models = Parallel(n_jobs=-1, backend="loky", verbose=10)(
    delayed(train_once)(index=fold_no, train_index=fold_train, pipe=pipe)
    for fold_no, (fold_train, _) in enu_split
)

# fold number -> fitted pipeline
model_dict = dict(models)

model_dict

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.19069265047350503s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  80 out of 100 | elapsed:    2.5s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  91 out of 100 | elapsed:    2.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished


{0: Pipeline(steps=[('DataCreate', data_creator()),
                 ('DataSelector', data_selector(force=['X1', 'X6', 'F_w_mean'])),
                 ('scale', StandardScaler()), ('KNN', KNeighborsClassifier())]),
 1: Pipeline(steps=[('DataCreate', data_creator()),
                 ('DataSelector', data_selector(force=['X1', 'X6', 'F_w_mean'])),
                 ('scale', StandardScaler()), ('KNN', KNeighborsClassifier())]),
 2: Pipeline(steps=[('DataCreate', data_creator()),
                 ('DataSelector', data_selector(force=['X1', 'X6', 'F_w_mean'])),
                 ('scale', StandardScaler()), ('KNN', KNeighborsClassifier())]),
 3: Pipeline(steps=[('DataCreate', data_creator()),
                 ('DataSelector', data_selector(force=['X1', 'X6', 'F_w_mean'])),
                 ('scale', StandardScaler()), ('KNN', KNeighborsClassifier())]),
 4: Pipeline(steps=[('DataCreate', data_creator()),
                 ('DataSelector', data_selector(force=['X1', 'X6', 'F_w_mean'])),
      

## Apply a feature change, evaluate, and record the results

In [5]:

def eva_once(index: int, test_index, in_dict_deltas: dict, lower=0, upper=5):
    """Evaluate the effect of shifting the raw features on one CV fold.

    The fold's test rows are shifted by the per-feature amounts in
    ``in_dict_deltas``, clipped to ``[lower, upper]``, and scored with the
    pipeline stored for that fold in ``model_dict``.

    Parameters
    ----------
    index : int
        Fold number; key into ``model_dict``.
    test_index
        Positional row indices (used with ``.iloc``) of the fold's test rows.
    in_dict_deltas : dict
        Maps every name in ``features`` to the amount added to that column.
    lower, upper
        Bounds applied to the shifted values. The defaults 0 and 5 reproduce
        the previous hard-coded limits (presumably the valid range of the raw
        features -- TODO confirm).

    Returns
    -------
    tuple
        ``(rel_pred, rel_truth)``: the mean prediction on the shifted data,
        relative to the mean prediction on the original data and relative to
        the mean of the true targets. Assumes numeric (e.g. 0/1) labels.
        If either baseline mean is 0 the result is inf/nan (NumPy division).

    Raises
    ------
    KeyError
        If ``in_dict_deltas`` lacks a feature, or ``index`` is not in
        ``model_dict``.
    """
    # Original, untouched test features for this fold.
    x_te_o = feat.iloc[test_index].copy(deep=True)

    # Per-feature shift, ordered like `features` so it broadcasts across columns.
    arr_delt = np.array([in_dict_deltas[feature] for feature in features])

    # Shift and clip. Scalar bounds broadcast to any number of features, so the
    # code no longer depends on there being exactly four of them.
    arr_i = (x_te_o[features].to_numpy() + arr_delt).clip(min=lower, max=upper)

    # Keep the fold's row index so the altered frame stays aligned with x_te_o.
    x_te_i = pd.DataFrame(arr_i, columns=features, index=x_te_o.index)

    # Load in the pipeline that was trained on the matching fold.
    cur_pipe = model_dict[index]

    # Mean prediction before and after the change.
    pos_o = cur_pipe.predict(X=x_te_o).mean()
    pos_i = cur_pipe.predict(X=x_te_i).mean()

    # Mean of the true targets on the same rows.
    y_te_o = np.ravel(tar.iloc[test_index].values)
    pos_acc = y_te_o.mean()

    return (pos_i - pos_o) / pos_o, (pos_i - pos_acc) / pos_acc

In [9]:
# One-hot deltas: entry n of each list is the shift applied to that feature
# when feature number n (in `features` order) is the one being improved.
x1_delta=[1,0,0,0]
x3_delta=[0,1,0,0]
x4_delta=[0,0,1,0]
x6_delta=[0,0,0,1]

# Make sure the output folder exists so the cell also runs on a fresh checkout.
os.makedirs("tableau_assets", exist_ok=True)

# RSKF is seeded (random_state=420), so every call to split() yields the same
# folds in the same order. Materialise them once instead of once per feature.
cv_splits = list(RSKF.split(X=feat, y=tar))

for n, imp_feat in enumerate(features):
    # Shift exactly one feature by +1, leave the others unchanged.
    dict_deltas = {
        "X1": x1_delta[n],
        "X3": x3_delta[n],
        "X4": x4_delta[n],
        "X6": x6_delta[n],
    }

    # Fresh enumerate per pass: it is a one-shot iterator.
    enu_split = enumerate(cv_splits)
    results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
        delayed(eva_once)(
            index=index,
            test_index=test_index,
            in_dict_deltas=dict_deltas
        )
        for index, (_, test_index) in enu_split
    )

    # One row per fold; columns are the two relative changes from eva_once.
    imp_pct = np.array(results)

    df_whatif = pd.DataFrame(dict(zip(["rel pred", "rel truth"], imp_pct.transpose())))

    df_whatif.to_csv(f"tableau_assets/improvement_{imp_feat}.csv", index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0834810733795166s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  58 out of 100 | elapsed:    1.6s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  80 out of 100 | elapsed:    2.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.07076740264892578s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  58 out of 100 | elaps