In [71]:
import os
import numpy as np
import pandas as pd
from numpy.random import RandomState
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [72]:
# Read train_encode.csv, report its dimensions, and preview the first rows.
data = pd.read_csv('train_encode.csv', sep=',')
nRow = data.shape[0]
nCol = data.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
data.head()

There are 11615 rows and 21 columns


Unnamed: 0,CustomerID,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,...,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AveMonthSpend,BikeBuyer
0,0,317,308,2734,197,31,0,73,52,4087,...,3,1,0,1,0,0,2,10182,64,0
1,1,218,90,1327,217,43,0,41,0,3892,...,3,1,1,0,1,3,3,7810,92,1
2,2,543,273,4632,112,38,0,107,74,3945,...,3,1,0,1,1,3,3,6918,98,0
3,3,136,319,866,167,24,0,21,52,4479,...,3,0,1,0,1,0,0,6384,25,0
4,4,283,6,1693,146,24,0,16,74,4019,...,3,0,1,1,1,0,0,6209,29,1


In [73]:
# Read test_encode.csv, report its dimensions, and preview the first rows.
test = pd.read_csv('test_encode.csv', sep=',')
nRow = test.shape[0]
nCol = test.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
test.head()

There are 4904 rows and 20 columns


Unnamed: 0,CustomerID,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AveMonthSpend
0,0,193,92,2854,236,18,0,29,21,2600,0,3,0,1,1,4,5,5,3139,66
1,1,318,181,2746,67,25,0,55,41,2214,0,3,1,1,1,1,0,0,3584,49
2,2,518,32,1251,91,25,0,67,0,2043,0,3,1,1,0,1,0,0,3711,45
3,3,145,113,139,67,25,0,55,7,1993,0,3,1,0,1,4,4,4,3186,117
4,4,542,19,3517,173,37,5,200,1122,2560,0,1,0,1,0,3,0,3,3658,20


In [74]:
# Separate features from the label by column NAME instead of by hard-coded
# position, so a reordered or extended CSV cannot silently shift the target.
# Every non-label column is kept as a feature, which matches the previous
# positional slice for the current 21-column layout (label in the last column).
# NOTE(review): identifier-like columns (e.g. CustomerID, FirstName, LastName)
# are still fed to the models; dropping them would also require dropping them
# from the test frame before predicting -- confirm whether that is intended.
TARGET_COLUMN = 'BikeBuyer'
X = data.drop(columns=TARGET_COLUMN)
y = data[TARGET_COLUMN]

In [75]:
# Preview the first five rows of the feature frame.
X.head()

Unnamed: 0,CustomerID,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AveMonthSpend
0,0,317,308,2734,197,31,0,73,52,4087,0,3,1,0,1,0,0,2,10182,64
1,1,218,90,1327,217,43,0,41,0,3892,0,3,1,1,0,1,3,3,7810,92
2,2,543,273,4632,112,38,0,107,74,3945,0,3,1,0,1,1,3,3,6918,98
3,3,136,319,866,167,24,0,21,52,4479,0,3,0,1,0,1,0,0,6384,25
4,4,283,6,1693,146,24,0,16,74,4019,0,3,0,1,1,1,0,0,6209,29


In [76]:
# Preview the first five labels.
y.head()

0    0
1    1
2    0
3    0
4    1
Name: BikeBuyer, dtype: int64

### Training Data

In [77]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)
X_test.head()

Unnamed: 0,CustomerID,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AveMonthSpend
4432,4432,625,306,1875,77,26,2,84,50,3632,3,0,1,1,1,2,2,2,3368,63
1320,1320,457,233,3429,147,43,0,35,64,3851,0,3,0,0,1,1,3,3,6573,57
2642,2642,323,131,4409,207,34,2,181,81,4761,0,0,1,0,1,0,0,0,4741,50
10527,10527,389,54,5747,261,8,4,311,70,3544,0,4,1,0,1,0,0,1,4176,54
6032,6032,368,287,8459,106,15,3,25,10,1505,2,3,0,0,1,2,1,4,5863,35


### Original Test Data

In [84]:
# Use the test frame loaded earlier as-is for the features (testX is the same
# object as test, not a copy), and take the labels from the second column of
# the solution file.
testX = test
solution = pd.read_csv('final/solution.csv', sep=',')
nRow = solution.shape[0]
nCol = solution.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
testY = solution.iloc[:, 1]

There are 4904 rows and 2 columns


### Naive Bayes

In [85]:
# Fit Gaussian naive Bayes on the training split, then predict the validation split.
naiveBayes = GaussianNB()
naiveBayes.fit(X_train, y_train)
nb_pred = naiveBayes.predict(X_test)

In [105]:
# Compare training vs. validation accuracy, then show the per-class breakdown
# of the validation predictions.
nb_train_acc = naiveBayes.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(nb_train_acc))
nb_valid_acc = naiveBayes.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(nb_valid_acc))
print(classification_report(y_true=y_test, y_pred=nb_pred))

Accuracy on training set: 0.765
Accuracy on test set: 0.775
              precision    recall  f1-score   support

           0       0.79      0.91      0.84      2332
           1       0.73      0.50      0.60      1153

    accuracy                           0.78      3485
   macro avg       0.76      0.71      0.72      3485
weighted avg       0.77      0.78      0.76      3485



In [106]:
# Predict the original test frame with naive Bayes and score against the solution labels.
nb_test_pred = naiveBayes.predict(testX)
nb_test_report = classification_report(y_true=testY, y_pred=nb_test_pred)
print(nb_test_report)

              precision    recall  f1-score   support

           0       0.78      0.93      0.85      3312
           1       0.76      0.44      0.56      1592

    accuracy                           0.77      4904
   macro avg       0.77      0.69      0.70      4904
weighted avg       0.77      0.77      0.75      4904



In [107]:
# Fill the sample submission's BikeBuyer column with the naive Bayes test
# predictions and write it out without the index.
nb_baseline = pd.read_csv('final/sample_submission.csv', sep=',').assign(
    BikeBuyer=nb_test_pred
)
nb_baseline.to_csv('final/nb_baseline.csv', index=False)

### KNN

In [88]:
# Fit k-nearest neighbours (library defaults) on the training split, then
# predict the validation split.
# NOTE(review): the features are passed in unscaled; distance-based models are
# usually sensitive to column magnitude -- consider standardising first.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

In [89]:
# Compare training vs. validation accuracy, then show the per-class breakdown
# of the validation predictions.
knn_train_acc = knn.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(knn_train_acc))
knn_valid_acc = knn.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(knn_valid_acc))
print(classification_report(y_true=y_test, y_pred=knn_pred))

Accuracy on training set: 0.784
Accuracy on test set: 0.689
              precision    recall  f1-score   support

           0       0.75      0.80      0.78      2332
           1       0.53      0.46      0.49      1153

    accuracy                           0.69      3485
   macro avg       0.64      0.63      0.63      3485
weighted avg       0.68      0.69      0.68      3485



In [90]:
# Predict the original test frame with KNN and score against the solution labels.
knn_test_pred = knn.predict(testX)
knn_test_report = classification_report(y_true=testY, y_pred=knn_test_pred)
print(knn_test_report)

              precision    recall  f1-score   support

           0       0.67      0.92      0.78      3312
           1       0.31      0.08      0.12      1592

    accuracy                           0.64      4904
   macro avg       0.49      0.50      0.45      4904
weighted avg       0.55      0.64      0.56      4904



### Random Forest

In [91]:
# Fit a random forest on the training split and predict the validation split.
# random_state is pinned (same seed as the train/validation split) because the
# forest's bootstrap sampling and per-split feature sub-sampling are random;
# without a seed the reported scores and the exported submission change on
# every re-run of the notebook.
randomForest = RandomForestClassifier(random_state=4)
rf_pred = randomForest.fit(X_train, y_train).predict(X_test)



In [92]:
# Compare training vs. validation accuracy, then show the per-class breakdown
# of the validation predictions.
rf_train_acc = randomForest.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf_train_acc))
rf_valid_acc = randomForest.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(rf_valid_acc))
print(classification_report(y_true=y_test, y_pred=rf_pred))

Accuracy on training set: 0.982
Accuracy on test set: 0.788
              precision    recall  f1-score   support

           0       0.80      0.91      0.85      2332
           1       0.74      0.55      0.63      1153

    accuracy                           0.79      3485
   macro avg       0.77      0.73      0.74      3485
weighted avg       0.78      0.79      0.78      3485



In [93]:
# Predict the original test frame with the random forest and score against the solution labels.
rf_test_pred = randomForest.predict(testX)
rf_test_report = classification_report(y_true=testY, y_pred=rf_test_pred)
print(rf_test_report)

              precision    recall  f1-score   support

           0       0.73      0.94      0.82      3312
           1       0.70      0.28      0.40      1592

    accuracy                           0.73      4904
   macro avg       0.72      0.61      0.61      4904
weighted avg       0.72      0.73      0.69      4904



In [104]:
# Fill the sample submission's BikeBuyer column with the random-forest test
# predictions and write it out without the index.
rf_baseline = pd.read_csv('final/sample_submission.csv', sep=',').assign(
    BikeBuyer=rf_test_pred
)
rf_baseline.to_csv('final/rf_baseline.csv', index=False)

### Decision Tree

In [96]:
# Fit a decision tree on the training split and predict the validation split.
# random_state is pinned (same seed as the train/validation split) because the
# tree permutes the features at each split, so ties between equally good
# splits are otherwise broken differently on each run and the scores drift.
decisionTree = DecisionTreeClassifier(random_state=4)
dt_pred = decisionTree.fit(X_train, y_train).predict(X_test)

In [97]:
# Compare training vs. validation accuracy, then show the per-class breakdown
# of the validation predictions.
dt_train_acc = decisionTree.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(dt_train_acc))
dt_valid_acc = decisionTree.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(dt_valid_acc))
print(classification_report(y_true=y_test, y_pred=dt_pred))

Accuracy on training set: 1.000
Accuracy on test set: 0.726
              precision    recall  f1-score   support

           0       0.80      0.78      0.79      2332
           1       0.58      0.61      0.60      1153

    accuracy                           0.73      3485
   macro avg       0.69      0.70      0.69      3485
weighted avg       0.73      0.73      0.73      3485



In [98]:
# Predict the original test frame with the decision tree and score against the solution labels.
dt_test_pred = decisionTree.predict(testX)
dt_test_report = classification_report(y_true=testY, y_pred=dt_test_pred)
print(dt_test_report)

              precision    recall  f1-score   support

           0       0.77      0.78      0.77      3312
           1       0.53      0.50      0.51      1592

    accuracy                           0.69      4904
   macro avg       0.65      0.64      0.64      4904
weighted avg       0.69      0.69      0.69      4904



### Verification

In [112]:
# Count the rows where the random-forest submission label equals the solution label.
rf_matches = solution.iloc[:, 1] == rf_baseline.iloc[:, 1]
print(rf_matches.sum())

3565


In [113]:
# Score the second column of the random-forest submission frame against the
# second column of the solution frame.
rf_submission_report = classification_report(
    y_true=solution.iloc[:, 1],
    y_pred=rf_baseline.iloc[:, 1],
)
print(rf_submission_report)

              precision    recall  f1-score   support

           0       0.73      0.94      0.82      3312
           1       0.70      0.28      0.40      1592

    accuracy                           0.73      4904
   macro avg       0.72      0.61      0.61      4904
weighted avg       0.72      0.73      0.69      4904



In [114]:
# Score the second column of the naive Bayes submission frame against the
# second column of the solution frame.
nb_submission_report = classification_report(
    y_true=solution.iloc[:, 1],
    y_pred=nb_baseline.iloc[:, 1],
)
print(nb_submission_report)

              precision    recall  f1-score   support

           0       0.78      0.93      0.85      3312
           1       0.76      0.44      0.56      1592

    accuracy                           0.77      4904
   macro avg       0.77      0.69      0.70      4904
weighted avg       0.77      0.77      0.75      4904

