# Find steps needed to return 'evidence' and 'labels' in proper format

In [56]:
import json
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv('shopping.csv')
df

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,Nov,2,2,3,11,Returning_Visitor,False,False


In [9]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [15]:
def transform_df(df):
    # transform months
    mapping = {
        'Month': {
            'Jan': 0, 'Feb': 1, 'Mar': 2,
            'Apr': 3, 'May': 4, 'June': 5,
            'Jul': 6, 'Aug': 7, 'Sep': 8,
            'Oct': 9, 'Nov': 10, 'Dec': 11
        },
        'VisitorType': {
            'Returning_Visitor': 1,
            'New_Visitor': 0,
            'Other': 0
        },
        'Weekend': {
            True: 1,
            False: 0
        },
        'Revenue': {
            True: 1,
            False: 0
        }
    }
    df = df.replace(mapping)
    return df

In [16]:
df_transformed = transform_df(df)
df_transformed

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,1,1,1,1,1,1,0,0
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,1,2,2,1,2,1,0,0
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,1,4,1,9,3,1,0,0
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,1,3,2,2,4,1,0,0
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,1,3,3,1,4,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,11,4,6,1,1,1,1,0
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,10,3,2,1,8,1,1,0
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,10,3,2,1,13,1,1,0
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,10,2,2,3,11,1,0,0


In [18]:
df_transformed.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                        int64
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                  int64
Weekend                      int64
Revenue                      int64
dtype: object

In [19]:
X = df_transformed.iloc[:,: -1]
y = df_transformed.iloc[:,-1]
print(X.shape)
print(y.shape)

(12330, 17)
(12330,)


In [53]:
def dataframe_to_list_of_lists(df):
    """
    Converts DataFrame to list of lists, while preserving dtypes.
    """
    j = df.to_json(orient='split')
    return json.loads(j)['data']

In [58]:
evidence = dataframe_to_list_of_lists(X)
labels = dataframe_to_list_of_lists(y)

# Find best hyperparameters for the classifier

In [67]:
X_train, X_test, y_train, y_test = train_test_split(
    evidence, labels, test_size=0.4
)
print(len(X_train), len(X_train[0]))
print(len(X_test), len(X_test[0]))
print(len(y_train))
print(len(y_test))

7398 17
4932 17
7398
4932


In [68]:
from sklearn.model_selection import GridSearchCV

In [69]:
model = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=10)
grid_search.fit(X, y)
# print available information
print(grid_search.cv_results_.keys())
print(grid_search.cv_results_)

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'param_weights', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
{'mean_fit_time': array([0.27953978, 0.23686419, 0.2232722 , 0.24695833, 0.21277869,
       0.20548077, 0.23496578, 0.3220674 , 0.2406625 , 0.23346643]), 'std_fit_time': array([0.07941065, 0.03378887, 0.04080362, 0.02972997, 0.0335856 ,
       0.02270951, 0.03173564, 0.13884667, 0.0145233 , 0.04480996]), 'mean_score_time': array([0.18249621, 0.0669626 , 0.15031621, 0.06276541, 0.1600091 ,
       0.09444695, 0.17350128, 0.11413653, 0.17779875, 0.08765032]), 'std_score_time': array([0.06441007, 0.01828168, 0.0308522 , 0.01687715, 0.03385827,
       0.01261295, 0.01822868, 0.05441273, 0.02624696, 0.0

In [71]:
grid_search.best_score_

0.8637469586374695

In [72]:
grid_search.best_params_

{'n_neighbors': 9, 'weights': 'distance'}

In [73]:
grid_search2 = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=10)
grid_search2.fit(X, y)
# print available information
print(grid_search2.best_score_)
print(grid_search2.best_params_)

0.4351833539526672
{'n_neighbors': 3, 'weights': 'uniform'}
