In [42]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt  
import seaborn as sns
from numpy import set_printoptions

In [43]:
df = pd.read_csv("online_shoppers_intention.csv")
preprocessed_df = pd.read_csv("preprocessedData.csv")

In [46]:
categorical_features = ['VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']
target = "Revenue"

preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category') 
preprocessed_df[target] = preprocessed_df[target].astype('category') 

In [47]:
preprocessed_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   Unnamed: 0                     12330 non-null  int64   
 1   PageValues                     12330 non-null  float64 
 2   ExitRates                      12330 non-null  float64 
 3   ProductRelated                 12330 non-null  float64 
 4   Administrative                 12330 non-null  float64 
 5   Informational                  12330 non-null  float64 
 6   SpecialDay                     12330 non-null  float64 
 7   Revenue                        12330 non-null  category
 8   Month_Aug                      12330 non-null  category
 9   Month_Dec                      12330 non-null  category
 10  Month_Feb                      12330 non-null  category
 11  Month_Jul                      12330 non-null  category
 12  Month_June                     1

In [48]:
# Split dataframe into x and y

X = preprocessed_df.drop(columns=["Revenue"])
y = preprocessed_df["Revenue"].values

### Building 10 KNN Models

In [58]:
# Split dataframe into train and test data
# Note: Stratify preserves the propotion of Revenue of T/F in the testing and training sets

y_predictions = []
accuracies = []
confusionMatrices = []
f1_scores = []
precisions = []
recalls = []

for i in range(10):
    print("__________________________________________________________________________________Iteration:"+str(i))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create KNN classifier
    knn = KNeighborsClassifier(n_neighbors = 3)

    # Fit the classifier to the data
    knn.fit(X_train, y_train)

    # Make predications on the test data
    y_preds_knn = knn.predict(X_test)
    y_predictions.append(y_preds_knn)

    # Confusion Matrix
    confusionMatrices.append(confusion_matrix(y_test, y_preds_knn))

    # Accuracy Scores
    accuracies.append(knn.score(X_test, y_preds_knn))
    f1_scores.append(f1_score(y_test, y_preds_knn))
    recalls.append(recall_score(y_test, y_preds_knn))
    precisions.append(precision_score(y_test, y_preds_knn))
    
    # Print some results
    print('Overall F1-Score                                    : %.6f'%f1_score(y_test, y_preds_knn))
    print('\nClassification Report                       : ')
    print(classification_report(y_test, y_preds_knn))

__________________________________________________________________________________Iteration:0
Overall F1-Score                                    : 0.135204

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      3127
           1       0.25      0.09      0.14       572

    accuracy                           0.82      3699
   macro avg       0.55      0.52      0.52      3699
weighted avg       0.76      0.82      0.78      3699

__________________________________________________________________________________Iteration:1
Overall F1-Score                                    : 0.127438

Classification Report                       : 
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      3127
           1       0.25      0.09      0.13       572

    accuracy                           0.82      3699
   macro avg       0.55      0.52      0.51  

###  Confusion Matrices

In [66]:
for i in range(len(confusionMatrices)):
    print("_______________________________________Iteration"+str(i))
    print(confusionMatrices[i])

[[2968  159]
 [ 519   53]]
[[2979  148]
 [ 523   49]]
[[2975  152]
 [ 515   57]]
[[2970  157]
 [ 530   42]]
[[2967  160]
 [ 529   43]]
[[2979  148]
 [ 521   51]]
[[2966  161]
 [ 532   40]]
[[2980  147]
 [ 520   52]]
[[2982  145]
 [ 523   49]]
[[2972  155]
 [ 516   56]]
