In [118]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt  
import seaborn as sns
from numpy import set_printoptions
from statistics import mean
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

In [43]:
df = pd.read_csv("online_shoppers_intention.csv")
preprocessed_df = pd.read_csv("preprocessedData.csv")

In [107]:
categorical_features = ['VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']
target = "Revenue"

preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category') 
preprocessed_df[target] = preprocessed_df[target].astype('category') 

In [108]:
preprocessed_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   Unnamed: 0                     12330 non-null  int64   
 1   PageValues                     12330 non-null  float64 
 2   ExitRates                      12330 non-null  float64 
 3   ProductRelated                 12330 non-null  float64 
 4   Administrative                 12330 non-null  float64 
 5   Informational                  12330 non-null  float64 
 6   SpecialDay                     12330 non-null  float64 
 7   Revenue                        12330 non-null  category
 8   Month_Aug                      12330 non-null  category
 9   Month_Dec                      12330 non-null  category
 10  Month_Feb                      12330 non-null  category
 11  Month_Jul                      12330 non-null  category
 12  Month_June                     1

In [109]:
# Split dataframe into x and y

X = preprocessed_df.drop(columns=["Revenue"])
y = preprocessed_df["Revenue"].values

### Hyperparameter Tuning

In [110]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Parameters for Hyperparameter Tuning
grid_params = { 'n_neighbors' : [2,3,4,5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)
g_res.best_params_

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:    9.6s finished


{'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}

### Building 10 KNN Models

In [145]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions = []
accuracies = []
confusionMatrices = []
f1_scores = []
precisions = []
recalls = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies.append(accuracy_score(y_test, y_preds_knn))
    f1_scores.append(f1_score(y_test, y_preds_knn))
    recalls.append(recall_score(y_test, y_preds_knn))
    precisions.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.539146

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.55      0.57      0.56       573
           1       0.55      0.53      0.54       572

    accuracy                           0.55      1145
   macro avg       0.55      0.55      0.55      1145
weighted avg       0.55      0.55      0.55      1145

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.576153

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.58      0.57      0.57       573
           1       0.57      0.58      0.58       572

    accuracy                           0.57      1145
   macro avg       0.57      0.57      0.57  

In [146]:
print("Average Accuracy: "+str(mean(accuracies)))
print("Average F1-Score: "+str(mean(f1_scores)))
print("Average Recall: "+str(mean(recalls)))
print("Average Precision: "+str(mean(precisions)))

Average Accuracy: 0.5634934497816594
Average F1-Score: 0.5611794229726882
Average Recall: 0.558606707428697
Average Precision: 0.5641018213751819


In [147]:
data = {"Accuracy":accuracies, "F1_Score":f1_scores, "Recall":recalls, "Precision":precisions}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation.csv")

data_y = {"Y_true":y_test}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions.csv")

knn_df_y.shape

(1145, 11)

###  Confusion Matrices

In [149]:
for i in range(len(confusionMatrices)):
    print("____________________________")
    print("KNN Model: "+str(i))
    print(confusionMatrices[i])

____________________________
KNN Model: 0
[[324 249]
 [269 303]]
____________________________
KNN Model: 1
[[327 246]
 [241 331]]
____________________________
KNN Model: 2
[[323 249]
 [243 330]]
____________________________
KNN Model: 3
[[310 262]
 [240 333]]
____________________________
KNN Model: 4
[[331 242]
 [259 313]]
____________________________
KNN Model: 5
[[348 224]
 [273 300]]
____________________________
KNN Model: 6
[[329 244]
 [247 325]]
____________________________
KNN Model: 7
[[318 254]
 [268 305]]
____________________________
KNN Model: 8
[[310 262]
 [254 319]]
____________________________
KNN Model: 9
[[334 239]
 [233 339]]


# Oversampling

In [150]:
# Class count
count_class_0, count_class_1 = preprocessed_df['Revenue'].value_counts()

# Divide by class
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over['Revenue'].value_counts())

X = df_test_over.drop(columns=["Revenue"])
y = df_test_over["Revenue"].values

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


In [126]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions_over = []
accuracies_over = []
confusionMatrices_over = []
f1_scores_over = []
precisions_over = []
recalls_over = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies.append(accuracy_score(y_test, y_preds_knn))
    f1_scores.append(f1_score(y_test, y_preds_knn))
    recalls.append(recall_score(y_test, y_preds_knn))
    precisions.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.696847

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.70      0.67      0.68      3127
           1       0.68      0.71      0.70      3127

    accuracy                           0.69      6254
   macro avg       0.69      0.69      0.69      6254
weighted avg       0.69      0.69      0.69      6254

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.697342

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.70      0.66      0.68      3127
           1       0.68      0.72      0.70      3127

    accuracy                           0.69      6254
   macro avg       0.69      0.69      0.69  

In [127]:
print("Average Accuracy: "+str(mean(accuracies)))
print("Average F1-Score: "+str(mean(f1_scores)))
print("Average Recall: "+str(mean(recalls)))
print("Average Precision: "+str(mean(precisions)))

Average Accuracy: 0.6920690757914935
Average F1-Score: 0.698431727039988
Average Recall: 0.7132075471698114
Average Precision: 0.6842824973385483


In [128]:
data = {"Accuracy":accuracies, "F1_Score":f1_scores, "Recall":recalls, "Precision":precisions}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation_over.csv")

data_y = {"Y_true":y_test}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions_over.csv")

knn_df_y.shape

(6254, 11)

In [141]:
for i in range(len(confusionMatrices)):
    print("____________________________")
    print("KNN Model (undersampling): "+str(i))
    print(confusionMatrices[i])

___________________________________________KNN Model (oversampling): 0
[[323 249]
 [277 296]]
___________________________________________KNN Model (oversampling): 1
[[327 245]
 [247 326]]
___________________________________________KNN Model (oversampling): 2
[[336 237]
 [252 320]]
___________________________________________KNN Model (oversampling): 3
[[317 256]
 [248 324]]
___________________________________________KNN Model (oversampling): 4
[[312 260]
 [275 298]]
___________________________________________KNN Model (oversampling): 5
[[316 256]
 [249 324]]
___________________________________________KNN Model (oversampling): 6
[[312 260]
 [241 332]]
___________________________________________KNN Model (oversampling): 7
[[328 245]
 [266 306]]
___________________________________________KNN Model (oversampling): 8
[[332 241]
 [244 328]]
___________________________________________KNN Model (oversampling): 9
[[342 231]
 [253 319]]


# Undersampling

In [135]:
# Class count
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under['Revenue'].value_counts())


X = df_test_under.drop(columns=["Revenue"])
y = df_test_under["Revenue"].values

Random under-sampling:
1    1908
0    1908
Name: Revenue, dtype: int64


In [136]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions = []
accuracies = []
confusionMatrices = []
f1_scores = []
precisions = []
recalls = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies.append(accuracy_score(y_test, y_preds_knn))
    f1_scores.append(f1_score(y_test, y_preds_knn))
    recalls.append(recall_score(y_test, y_preds_knn))
    precisions.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.529517

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.54      0.56      0.55       572
           1       0.54      0.52      0.53       573

    accuracy                           0.54      1145
   macro avg       0.54      0.54      0.54      1145
weighted avg       0.54      0.54      0.54      1145

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.569930

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.57      0.57      0.57       572
           1       0.57      0.57      0.57       573

    accuracy                           0.57      1145
   macro avg       0.57      0.57      0.57  

In [137]:
print("Average Accuracy: "+str(mean(accuracies)))
print("Average F1-Score: "+str(mean(f1_scores)))
print("Average Recall: "+str(mean(recalls)))
print("Average Precision: "+str(mean(precisions)))

Average Accuracy: 0.5605240174672489
Average F1-Score: 0.5576287465820228
Average Recall: 0.5542394342132562
Average Precision: 0.5612459929320167


In [139]:
data = {"Accuracy":accuracies, "F1_Score":f1_scores, "Recall":recalls, "Precision":precisions}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation_under.csv")

data_y = {"Y_true":y_test}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions_under.csv")

knn_df_y.shape

(1145, 11)

In [144]:
for i in range(len(confusionMatrices)):
    print("____________________________")
    print("KNN Model (undersampling): "+str(i))
    print(confusionMatrices[i])

____________________________
KNN Model (undersampling): 0
[[323 249]
 [277 296]]
____________________________
KNN Model (undersampling): 1
[[327 245]
 [247 326]]
____________________________
KNN Model (undersampling): 2
[[336 237]
 [252 320]]
____________________________
KNN Model (undersampling): 3
[[317 256]
 [248 324]]
____________________________
KNN Model (undersampling): 4
[[312 260]
 [275 298]]
____________________________
KNN Model (undersampling): 5
[[316 256]
 [249 324]]
____________________________
KNN Model (undersampling): 6
[[312 260]
 [241 332]]
____________________________
KNN Model (undersampling): 7
[[328 245]
 [266 306]]
____________________________
KNN Model (undersampling): 8
[[332 241]
 [244 328]]
____________________________
KNN Model (undersampling): 9
[[342 231]
 [253 319]]
