In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score,\
precision_recall_curve, roc_curve, roc_auc_score ,auc, average_precision_score

In [3]:
df = pd.read_csv("../Dataset/bike_buyers_clean.csv")
df.head()

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000,1,Bachelors,Skilled Manual,Yes,0,0-1 Miles,Europe,42,No
1,24107,Married,Male,30000,3,Partial College,Clerical,Yes,1,0-1 Miles,Europe,43,No
2,14177,Married,Male,80000,5,Partial College,Professional,No,2,2-5 Miles,Europe,60,No
3,24381,Single,Male,70000,0,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,41,Yes
4,25597,Single,Male,30000,0,Bachelors,Clerical,No,0,0-1 Miles,Europe,36,Yes


In [4]:
df2 = df[["Children", "Cars", "Age"]]

In [5]:
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.set_output(transform="pandas")
one_hot_output = onehot_encoder.fit_transform(df[["Gender", "Marital Status", "Home Owner"]])

In [6]:
df2 = pd.concat([df2, one_hot_output], axis=1)

In [7]:
categories = []
for col in ["Education", "Occupation", "Commute Distance", "Region"]:
    categories.append(df[col].unique()) 
    print(df[col].unique())

['Bachelors' 'Partial College' 'High School' 'Partial High School'
 'Graduate Degree']
['Skilled Manual' 'Clerical' 'Professional' 'Manual' 'Management']
['0-1 Miles' '2-5 Miles' '5-10 Miles' '1-2 Miles' '10+ Miles']
['Europe' 'Pacific' 'North America']


In [8]:
categories[0] = categories[0][[3,2,1,0,4]]
categories[1] = categories[1][[1,2,4,0,3]]
categories[2] = categories[2][[0,2,3,1,4]]
categories[3] = categories[3][[0,1,2]]

categories

[array(['Partial High School', 'High School', 'Partial College',
        'Bachelors', 'Graduate Degree'], dtype=object),
 array(['Clerical', 'Professional', 'Management', 'Skilled Manual',
        'Manual'], dtype=object),
 array(['0-1 Miles', '5-10 Miles', '1-2 Miles', '2-5 Miles', '10+ Miles'],
       dtype=object),
 array(['Europe', 'Pacific', 'North America'], dtype=object)]

In [9]:
ordinal_encoder = OrdinalEncoder(categories=categories)
ordinal_encoder.set_output(transform="pandas")
ordinal_encoder_output =  ordinal_encoder.fit_transform(df[["Education", "Occupation", "Commute Distance", "Region"]])

In [10]:
df2 = pd.concat([df2, ordinal_encoder_output], axis=1)

In [11]:
data_x = df2
data_y = df["Purchased Bike"].factorize()[0]


In [12]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2)

In [13]:
d_tree = DecisionTreeClassifier()
d_tree.fit(train_x, train_y)

In [14]:
y_pred = d_tree.predict(test_x)
accuracy_score(test_y, y_pred)

0.66

In [15]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 1,2,3,4,5,6,7,8,9,10],
    'min_samples_leaf': [1, 2, 4],
}

In [16]:
grid_search = GridSearchCV(estimator=d_tree, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=4)
grid_search.fit(train_x, train_y)
print("Best parameters found:", grid_search.best_params_)
best_d_tree = grid_search.best_estimator_
y_pred = best_d_tree.predict(test_x)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy score:", accuracy)

Fitting 5 folds for each of 66 candidates, totalling 330 fits
Best parameters found: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4}
Accuracy score: 0.65


In [17]:
random_forest = RandomForestClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 1,2,3,4,5,6,7,8,9,10],
    'min_samples_leaf': [1, 2,3, 4],
}

In [18]:
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=4)
grid_search.fit(train_x, train_y)
print("Best parameters found:", grid_search.best_params_)
best_d_tree = grid_search.best_estimator_
y_pred = best_d_tree.predict(test_x)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy score:", accuracy)

Fitting 5 folds for each of 88 candidates, totalling 440 fits
Best parameters found: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1}
Accuracy score: 0.7
