In [3]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt  
import seaborn as sns
from numpy import set_printoptions
from statistics import mean
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
import numpy as np

In [4]:
preprocessed_df = pd.read_csv("preprocessedData.csv")

In [5]:
categorical_features = ['VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']
target = "Revenue"

preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category') 
preprocessed_df[target] = preprocessed_df[target].astype('category') 

In [6]:
preprocessed_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   Unnamed: 0                     12330 non-null  int64   
 1   PageValues                     12330 non-null  float64 
 2   ExitRates                      12330 non-null  float64 
 3   ProductRelated                 12330 non-null  float64 
 4   Administrative                 12330 non-null  float64 
 5   Informational                  12330 non-null  float64 
 6   SpecialDay                     12330 non-null  float64 
 7   Revenue                        12330 non-null  category
 8   Month_Aug                      12330 non-null  category
 9   Month_Dec                      12330 non-null  category
 10  Month_Feb                      12330 non-null  category
 11  Month_Jul                      12330 non-null  category
 12  Month_June                     1

In [7]:
# Split dataframe into x and y

X = preprocessed_df.drop(columns=["Revenue"])
y = preprocessed_df["Revenue"].values

### Hyperparameter Tuning

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Parameters for Hyperparameter Tuning
grid_params = { 'n_neighbors' : [2,3,4,5,7,9,11,13],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
g_res = gs.fit(X_train, y_train)
g_res.best_params_

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   11.5s finished


{'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}

### Building 10 KNN Models

In [None]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

preprocessed_df = pd.read_csv("preprocessedData.csv")
preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category') 
preprocessed_df[target] = preprocessed_df[target].astype('category') 


X = preprocessed_df.drop(columns=["Revenue"])
y = preprocessed_df["Revenue"].values

y_predictions = []
accuracies = []
confusionMatrices = []
f1_scores = []
precisions = []
recalls = []
y_pred_roc = []
y_tests = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
    y_tests.append(y_test)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds = knn.predict(X_test)
    y_predictions.append(y_preds)
    y_pred_roc.append(knn.predict_proba(X_test)[:,1])

    # Confusion Matrix
    confusionMatrices.append(confusion_matrix(y_test, y_preds))

    # Accuracy Scores
    accuracies.append(accuracy_score(y_test, y_preds))
    f1_scores.append(f1_score(y_test, y_preds))
    recalls.append(recall_score(y_test, y_preds))
    precisions.append(precision_score(y_test, y_preds))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.003484

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      3127
           1       0.50      0.00      0.00       572

    accuracy                           0.85      3699
   macro avg       0.67      0.50      0.46      3699
weighted avg       0.79      0.85      0.78      3699

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.006932

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      3127
           1       0.40      0.00      0.01       572

    accuracy                           0.85      3699
   macro avg       0.62      0.50      0.46  

In [None]:
print("Average Accuracy: "+str(mean(accuracies)))
print("Average F1-Score: "+str(mean(f1_scores)))
print("Average Recall: "+str(mean(recalls)))
print("Average Precision: "+str(mean(precisions)))

In [None]:
data = {"Accuracy":accuracies, "F1_Score":f1_scores, "Recall":recalls, "Precision":precisions}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation.csv")

max_f1 = f1_scores.index(max(f1_scores))

data_y = {"Y_true":y_tests[max_f1], "Y_Prob_Roc": y_pred_roc[max_f1]}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions.csv")

knn_df_y.shape

###  Confusion Matrices

In [None]:
for i in range(len(confusionMatrices)):
    print("____________________________")
    print("KNN Model: "+str(i))
    print(confusionMatrices[i])


In [None]:
# Draw confusion matrix with the best F1-Score
cf_matrix = confusionMatrices[f1_scores.index(max(f1_scores))]

ax= plt.subplot()
sns.heatmap(cf_matrix, annot=True, fmt='g', ax=ax);  #annot=True to annotate cells, ftm='g' to disable scientific notation

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('KNN Confusion Matrix'); 
ax.xaxis.set_ticklabels(['False', 'True']); ax.yaxis.set_ticklabels(['False', 'True']);

plt.savefig('KNN_confusionMatrix.png')

# Oversampling

In [None]:
# Class count
count_class_0, count_class_1 = preprocessed_df['Revenue'].value_counts()

# Divide by class
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over['Revenue'].value_counts())

X = df_test_over.drop(columns=["Revenue"])
y = df_test_over["Revenue"].values

In [None]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions_over = []
accuracies_over = []
confusionMatrices_over = []
f1_scores_over = []
precisions_over = []
recalls_over = []
y_pred_roc_over = []
y_tests_over = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
    y_tests_over.append(y_test)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds = knn.predict(X_test)
    y_predictions_over.append(y_preds)
    y_pred_roc_over.append(knn.predict_proba(X_test)[:,1])

    # Confusion Matrix
    confusionMatrices_over.append(confusion_matrix(y_test, y_preds))

    # Accuracy Scores
    accuracies_over.append(accuracy_score(y_test, y_preds))
    f1_scores_over.append(f1_score(y_test, y_preds))
    recalls_over.append(recall_score(y_test, y_preds))
    precisions_over.append(precision_score(y_test, y_preds))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds))

In [None]:
print("Average Accuracy: "+str(mean(accuracies_over)))
print("Average F1-Score: "+str(mean(f1_scores_over)))
print("Average Recall: "+str(mean(recalls_over)))
print("Average Precision: "+str(mean(precisions_over)))

In [None]:
data = {"Accuracy":accuracies_over, "F1_Score":f1_scores_over, "Recall":recalls_over, "Precision":precisions_over}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation_over.csv")

max_f1_over = f1_scores_over.index(max(f1_scores_over))

data_y = {"Y_true":y_tests_over[max_f1_over], "Y_Prob_Roc": y_pred_roc_over[max_f1_over]}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions_over[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions_over.csv")

knn_df_y.shape

In [None]:
for i in range(len(confusionMatrices_over)):
    print("____________________________")
    print("KNN Model (oversampling): "+str(i))
    print(confusionMatrices_over[i])

In [None]:
# Draw confusion matrix with the best F1-Score
cf_matrix_over = confusionMatrices_over[f1_scores_over.index(max(f1_scores_over))]

ax= plt.subplot()
sns.heatmap(cf_matrix_over, annot=True, fmt='g', ax=ax);  #annot=True to annotate cells, ftm='g' to disable scientific notation

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix (Overfitting)'); 
ax.xaxis.set_ticklabels(['False', 'True']); ax.yaxis.set_ticklabels(['False', 'True']);

plt.savefig('KNN_over_confusionMatrix.png')

# Undersampling

In [None]:
# Class count
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under['Revenue'].value_counts())


X = df_test_under.drop(columns=["Revenue"])
y = df_test_under["Revenue"].values

In [None]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions_under = []
y_tests_under = []
accuracies_under = []
confusionMatrices_under = []
f1_scores_under = []
precisions_under = []
recalls_under = []
y_pred_roc_under = []


for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
    y_tests_under.append(y_test)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds= knn.predict(X_test)
    y_predictions_under.append(y_preds)
    y_pred_roc_under.append(knn.predict_proba(X_test)[:,1])

    # Confusion Matrix
    confusionMatrices_under.append(confusion_matrix(y_test, y_preds))

    # Accuracy Scores
    accuracies_under.append(accuracy_score(y_test, y_preds))
    f1_scores_under.append(f1_score(y_test, y_preds))
    recalls_under.append(recall_score(y_test, y_preds))
    precisions_under.append(precision_score(y_test, y_preds))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds))

In [None]:
print("Average Accuracy: "+str(mean(accuracies_under)))
print("Average F1-Score: "+str(mean(f1_scores_under)))
print("Average Recall: "+str(mean(recalls_under)))
print("Average Precision: "+str(mean(precisions_under)))

In [None]:
data = {"Accuracy":accuracies_under, "F1_Score":f1_scores_under, "Recall":recalls_under, "Precision":precisions_under}
knn_df = pd.DataFrame(data)  
knn_df.to_csv("knn_evaluation_under.csv")

max_f1_under = f1_scores_under.index(max(f1_scores_under))

data_y = {"Y_true":y_tests_under[max_f1_under], "Y_Prob_Roc": y_pred_roc_under[max_f1_under]}
for i in  range(10):
    data_y['Y_pred_'+str(i)] = y_predictions_under[i]
    
knn_df_y = pd.DataFrame(data_y) 
knn_df_y.to_csv("knn_predictions_under.csv")

knn_df_y.shape

In [None]:
for i in range(len(confusionMatrices_under)):
    print("____________________________")
    print("KNN Model (undersampling): "+str(i))
    print(confusionMatrices_under[i])

In [None]:
# Draw confusion matrix with the best F1-Score
cf_matrix_under = confusionMatrices_under[f1_scores_under.index(max(f1_scores_under))]

ax= plt.subplot()
sns.heatmap(cf_matrix_under, annot=True, fmt='g', ax=ax);  #annot=True to annotate cells, ftm='g' to disable scientific notation

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix (Underfitting)'); 
ax.xaxis.set_ticklabels(['False', 'True']); ax.yaxis.set_ticklabels(['False', 'True']);

plt.savefig('KNN_under_confusionMatrix.png')