In [1]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt  
import seaborn as sns
from numpy import set_printoptions
from statistics import mean
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

In [2]:
df = pd.read_csv("online_shoppers_intention.csv")
preprocessed_df = pd.read_csv("preprocessedData.csv")

In [3]:
categorical_features = ['VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']
target = "Revenue"

preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category') 
preprocessed_df[target] = preprocessed_df[target].astype('category') 

In [4]:
preprocessed_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   Unnamed: 0                     12330 non-null  int64   
 1   PageValues                     12330 non-null  float64 
 2   ExitRates                      12330 non-null  float64 
 3   ProductRelated                 12330 non-null  float64 
 4   Administrative                 12330 non-null  float64 
 5   Informational                  12330 non-null  float64 
 6   SpecialDay                     12330 non-null  float64 
 7   Revenue                        12330 non-null  category
 8   Month_Aug                      12330 non-null  category
 9   Month_Dec                      12330 non-null  category
 10  Month_Feb                      12330 non-null  category
 11  Month_Jul                      12330 non-null  category
 12  Month_June                     1

In [5]:
# Split dataframe into x and y

X = preprocessed_df.drop(columns=["Revenue"])
y = preprocessed_df["Revenue"].values

### Hyperparameter Tuning

In [6]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Parameters for Hyperparameter Tuning
grid_params = { 'n_neighbors' : [2,3,4,5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)
g_res.best_params_

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   11.2s finished


{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}

### Building 10 KNN Models

In [7]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions = []
accuracies = []
confusionMatrices = []
f1_scores = []
precisions = []
recalls = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies.append(accuracy_score(y_test, y_preds_knn))
    f1_scores.append(f1_score(y_test, y_preds_knn))
    recalls.append(recall_score(y_test, y_preds_knn))
    precisions.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.010274

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      3127
           1       0.25      0.01      0.01       572

    accuracy                           0.84      3699
   macro avg       0.55      0.50      0.46      3699
weighted avg       0.75      0.84      0.78      3699

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.006908

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      3127
           1       0.29      0.00      0.01       572

    accuracy                           0.84      3699
   macro avg       0.57      0.50      0.46  

In [8]:
print("Average Accuracy: "+str(mean(accuracies)))
print("Average F1-Score: "+str(mean(f1_scores)))
print("Average Recall: "+str(mean(recalls)))
print("Average Precision: "+str(mean(precisions)))

Average Accuracy: 0.8437145174371452
Average F1-Score: 0.007547876271452072
Average Recall: 0.0038461538461538464
Average Precision: 0.21032828282828284


In [9]:
data = {"Accuracy":accuracies, "F1_Score":f1_scores, "Recall":recalls, "Precision":precisions}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation.csv")

data_y = {"Y_true":y_test}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions.csv")

knn_df_y.shape

(3699, 11)

###  Confusion Matrices

In [10]:
for i in range(len(confusionMatrices)):
    print("____________________________")
    print("KNN Model: "+str(i))
    print(confusionMatrices[i])

____________________________
KNN Model: 0
[[3118    9]
 [ 569    3]]
____________________________
KNN Model: 1
[[3122    5]
 [ 570    2]]
____________________________
KNN Model: 2
[[3118    9]
 [ 572    0]]
____________________________
KNN Model: 3
[[3119    8]
 [ 569    3]]
____________________________
KNN Model: 4
[[3114   13]
 [ 570    2]]
____________________________
KNN Model: 5
[[3119    8]
 [ 570    2]]
____________________________
KNN Model: 6
[[3121    6]
 [ 568    4]]
____________________________
KNN Model: 7
[[3120    7]
 [ 570    2]]
____________________________
KNN Model: 8
[[3116   11]
 [ 569    3]]
____________________________
KNN Model: 9
[[3120    7]
 [ 571    1]]


# Oversampling

In [11]:
# Class count
count_class_0, count_class_1 = preprocessed_df['Revenue'].value_counts()

# Divide by class
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over['Revenue'].value_counts())

X = df_test_over.drop(columns=["Revenue"])
y = df_test_over["Revenue"].values

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


In [12]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions_over = []
accuracies_over = []
confusionMatrices_over = []
f1_scores_over = []
precisions_over = []
recalls_over = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions_over.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices_over.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies_over.append(accuracy_score(y_test, y_preds_knn))
    f1_scores_over.append(f1_score(y_test, y_preds_knn))
    recalls_over.append(recall_score(y_test, y_preds_knn))
    precisions_over.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.713535

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.72      0.67      0.69      3127
           1       0.69      0.74      0.71      3127

    accuracy                           0.70      6254
   macro avg       0.70      0.70      0.70      6254
weighted avg       0.70      0.70      0.70      6254

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.705991

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.71      0.66      0.68      3127
           1       0.68      0.73      0.71      3127

    accuracy                           0.70      6254
   macro avg       0.70      0.70      0.70  

In [13]:
print("Average Accuracy: "+str(mean(accuracies_over)))
print("Average F1-Score: "+str(mean(f1_scores_over)))
print("Average Recall: "+str(mean(recalls_over)))
print("Average Precision: "+str(mean(precisions_over)))

Average Accuracy: 0.6981451870802686
Average F1-Score: 0.7077137774125685
Average Recall: 0.7309561880396547
Average Precision: 0.6859586022087348


In [14]:
data = {"Accuracy":accuracies_over, "F1_Score":f1_scores_over, "Recall":recalls_over, "Precision":precisions_over}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation_over.csv")

data_y = {"Y_true":y_test}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions_over[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions_over.csv")

knn_df_y.shape

(6254, 11)

In [15]:
for i in range(len(confusionMatrices_over)):
    print("____________________________")
    print("KNN Model (oversampling): "+str(i))
    print(confusionMatrices_over[i])

____________________________
KNN Model (oversampling): 0
[[2091 1036]
 [ 818 2309]]
____________________________
KNN Model (oversampling): 1
[[2064 1063]
 [ 841 2286]]
____________________________
KNN Model (oversampling): 2
[[2091 1036]
 [ 821 2306]]
____________________________
KNN Model (oversampling): 3
[[2123 1004]
 [ 843 2284]]
____________________________
KNN Model (oversampling): 4
[[2054 1073]
 [ 826 2301]]
____________________________
KNN Model (oversampling): 5
[[2042 1085]
 [ 813 2314]]
____________________________
KNN Model (oversampling): 6
[[2100 1027]
 [ 891 2236]]
____________________________
KNN Model (oversampling): 7
[[2085 1042]
 [ 901 2226]]
____________________________
KNN Model (oversampling): 8
[[2085 1042]
 [ 833 2294]]
____________________________
KNN Model (oversampling): 9
[[2070 1057]
 [ 826 2301]]


# Undersampling

In [16]:
# Class count
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under['Revenue'].value_counts())


X = df_test_under.drop(columns=["Revenue"])
y = df_test_under["Revenue"].values

Random under-sampling:
1    1908
0    1908
Name: Revenue, dtype: int64


In [17]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions_under = []
accuracies_under = []
confusionMatrices_under = []
f1_scores_under = []
precisions_under = []
recalls_under = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions_under.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices_under.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies_under.append(accuracy_score(y_test, y_preds_knn))
    f1_scores_under.append(f1_score(y_test, y_preds_knn))
    recalls_under.append(recall_score(y_test, y_preds_knn))
    precisions_under.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.566753

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.56      0.55      0.55       572
           1       0.56      0.57      0.57       573

    accuracy                           0.56      1145
   macro avg       0.56      0.56      0.56      1145
weighted avg       0.56      0.56      0.56      1145

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.544474

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       572
           1       0.56      0.53      0.54       573

    accuracy                           0.56      1145
   macro avg       0.56      0.56      0.56  

In [18]:
print("Average Accuracy: "+str(mean(accuracies_under)))
print("Average F1-Score: "+str(mean(f1_scores_under)))
print("Average Recall: "+str(mean(recalls_under)))
print("Average Precision: "+str(mean(precisions_under)))

Average Accuracy: 0.5566812227074236
Average F1-Score: 0.5570144247458094
Average Recall: 0.5575528746994716
Average Precision: 0.5567806476899141


In [19]:
data = {"Accuracy":accuracies_under, "F1_Score":f1_scores_under, "Recall":recalls_under, "Precision":precisions_under}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation_under.csv")

data_y = {"Y_true":y_test}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions_under[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions_under.csv")

knn_df_y.shape

(1145, 11)

In [20]:
for i in range(len(confusionMatrices_under)):
    print("____________________________")
    print("KNN Model (undersampling): "+str(i))
    print(confusionMatrices_under[i])

____________________________
KNN Model (undersampling): 0
[[313 259]
 [244 329]]
____________________________
KNN Model (undersampling): 1
[[335 237]
 [270 303]]
____________________________
KNN Model (undersampling): 2
[[338 235]
 [260 312]]
____________________________
KNN Model (undersampling): 3
[[336 236]
 [247 326]]
____________________________
KNN Model (undersampling): 4
[[302 271]
 [254 318]]
____________________________
KNN Model (undersampling): 5
[[330 243]
 [246 326]]
____________________________
KNN Model (undersampling): 6
[[302 270]
 [260 313]]
____________________________
KNN Model (undersampling): 7
[[310 263]
 [269 303]]
____________________________
KNN Model (undersampling): 8
[[307 265]
 [241 332]]
____________________________
KNN Model (undersampling): 9
[[309 264]
 [242 330]]
