In [90]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score,\
precision_recall_curve, roc_curve, roc_auc_score ,auc, average_precision_score

In [91]:
df = pd.read_csv("../Dataset/bike_buyers_clean.csv")

In [92]:
df.head()

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000,1,Bachelors,Skilled Manual,Yes,0,0-1 Miles,Europe,42,No
1,24107,Married,Male,30000,3,Partial College,Clerical,Yes,1,0-1 Miles,Europe,43,No
2,14177,Married,Male,80000,5,Partial College,Professional,No,2,2-5 Miles,Europe,60,No
3,24381,Single,Male,70000,0,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,41,Yes
4,25597,Single,Male,30000,0,Bachelors,Clerical,No,0,0-1 Miles,Europe,36,Yes


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                1000 non-null   int64 
 1   Marital Status    1000 non-null   object
 2   Gender            1000 non-null   object
 3   Income            1000 non-null   int64 
 4   Children          1000 non-null   int64 
 5   Education         1000 non-null   object
 6   Occupation        1000 non-null   object
 7   Home Owner        1000 non-null   object
 8   Cars              1000 non-null   int64 
 9   Commute Distance  1000 non-null   object
 10  Region            1000 non-null   object
 11  Age               1000 non-null   int64 
 12  Purchased Bike    1000 non-null   object
dtypes: int64(5), object(8)
memory usage: 101.7+ KB


In [94]:
df.shape

(1000, 13)

In [95]:
oneHot_cols = ["Marital Status", "Gender", "Home Owner", "Commute Distance", "Region"]
ordinal_cols = ["Education", "Occupation"]

In [96]:
categories = []
for col in ordinal_cols:
    categories.append(df[col].unique()) 
    print(df[col].unique())

['Bachelors' 'Partial College' 'High School' 'Partial High School'
 'Graduate Degree']
['Skilled Manual' 'Clerical' 'Professional' 'Manual' 'Management']


In [97]:
categories[0] = ['Partial High School', 'High School', 'Partial College', 'Bachelors', 'Graduate Degree']
categories[1] = ['Manual', 'Clerical', 'Skilled Manual', 'Professional', 'Management']

In [98]:
categories

[['Partial High School',
  'High School',
  'Partial College',
  'Bachelors',
  'Graduate Degree'],
 ['Manual', 'Clerical', 'Skilled Manual', 'Professional', 'Management']]

In [99]:
column_transformer = ColumnTransformer([
    ("one_hot", OneHotEncoder(sparse_output=False), oneHot_cols),
    ("ordinal", OrdinalEncoder(categories=categories), ordinal_cols)
], remainder="passthrough", sparse_threshold=0.0)
column_transformer.set_output(transform="pandas")

In [100]:
encoder_df = column_transformer.fit_transform(df)

In [101]:
encoder_df

Unnamed: 0,one_hot__Marital Status_Married,one_hot__Marital Status_Single,one_hot__Gender_Female,one_hot__Gender_Male,one_hot__Home Owner_No,one_hot__Home Owner_Yes,one_hot__Commute Distance_0-1 Miles,one_hot__Commute Distance_1-2 Miles,one_hot__Commute Distance_10+ Miles,one_hot__Commute Distance_2-5 Miles,...,one_hot__Region_North America,one_hot__Region_Pacific,ordinal__Education,ordinal__Occupation,remainder__ID,remainder__Income,remainder__Children,remainder__Cars,remainder__Age,remainder__Purchased Bike
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,3.0,2.0,12496,40000,1,0,42,No
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,2.0,1.0,24107,30000,3,1,43,No
2,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,2.0,3.0,14177,80000,5,2,60,No
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,3.0,24381,70000,0,1,41,Yes
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,3.0,1.0,25597,30000,0,0,36,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,3.0,23731,60000,2,2,54,Yes
996,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,4.0,3.0,28672,70000,4,0,35,Yes
997,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,3.0,2.0,11809,60000,2,0,38,Yes
998,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,3.0,4.0,19664,100000,3,3,38,No


In [102]:
data_x = encoder_df.drop(columns=["remainder__Purchased Bike"])
data_y = df["Purchased Bike"].factorize()[0]
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.3)

In [103]:
data_x.head()

Unnamed: 0,one_hot__Marital Status_Married,one_hot__Marital Status_Single,one_hot__Gender_Female,one_hot__Gender_Male,one_hot__Home Owner_No,one_hot__Home Owner_Yes,one_hot__Commute Distance_0-1 Miles,one_hot__Commute Distance_1-2 Miles,one_hot__Commute Distance_10+ Miles,one_hot__Commute Distance_2-5 Miles,...,one_hot__Region_Europe,one_hot__Region_North America,one_hot__Region_Pacific,ordinal__Education,ordinal__Occupation,remainder__ID,remainder__Income,remainder__Children,remainder__Cars,remainder__Age
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,3.0,2.0,12496,40000,1,0,42
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2.0,1.0,24107,30000,3,1,43
2,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,2.0,3.0,14177,80000,5,2,60
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,3.0,3.0,24381,70000,0,1,41
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,3.0,1.0,25597,30000,0,0,36


In [104]:
d_tree = DecisionTreeClassifier()
d_tree.fit(train_x, train_y)

In [105]:
y_pred = d_tree.predict(test_x)
accuracy_score(test_y, y_pred)

0.6766666666666666

In [115]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None,1,2,3,4,5,6,7,8,10],
    # 'min_samples_leaf': [1, 2, 4, 6],
}

In [116]:
grid_search = GridSearchCV(estimator=d_tree, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=4)
grid_search.fit(train_x, train_y)
print("Best parameters found:", grid_search.best_params_)
best_d_tree = grid_search.best_estimator_
y_pred = best_d_tree.predict(test_x)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy score:", accuracy)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'criterion': 'entropy', 'max_depth': 8}
Accuracy score: 0.6833333333333333


In [117]:
random_forest = RandomForestClassifier()
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=4)
grid_search.fit(train_x, train_y)
print("Best parameters found:", grid_search.best_params_)
best_d_tree = grid_search.best_estimator_
y_pred = best_d_tree.predict(test_x)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy score:", accuracy)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'criterion': 'entropy', 'max_depth': None}
Accuracy score: 0.7233333333333334


In [118]:
logistic_r = LogisticRegression(max_iter=1000)
logistic_r.fit(train_x, train_y)

In [119]:
y_predict = logistic_r.predict(test_x)
accuracy_score(test_y, y_predict)

0.5766666666666667

In [120]:
param_grid_logtic = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}
grid_search = GridSearchCV(estimator=logistic_r, param_grid=param_grid_logtic, cv=5, scoring='accuracy')
grid_search.fit(train_x, train_y)
print("Best parameters found:", grid_search.best_params_)
best_d_tree = grid_search.best_estimator_
y_pred = best_d_tree.predict(test_x)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy score:", accuracy)

Best parameters found: {'C': 0.01, 'penalty': 'l2'}
Accuracy score: 0.5666666666666667
