# Capstone Hotel reservation prediction

![Logo](img/hotel-logo.png)

## Modelling solvent guest
#### (VIP)

### Import required libraries

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

#modelling
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict, RandomizedSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

#visualisations
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")

import visuals as vs

%matplotlib inline

#### Loading cleaned dataset

In [2]:
df = pd.read_csv("data/dataset_w_feature.csv", index_col=0, encoding="iso-8859-15")

In [3]:
df["buchungsdatum"]      = pd.to_datetime(df["buchungsdatum"])
df["anreisedatum"]       = pd.to_datetime(df["anreisedatum"])
df["abreisedatum"]       = pd.to_datetime(df["abreisedatum"])

In [4]:
df.head()

Unnamed: 0,buchungsdatum,anreisedatum,abreisedatum,naechte_insgesamt,gebuchte_naechte,personen_anzahl,gesamtpreis,saison,buchung_telefon,buchung_online,buchung_reisebuero,zimmerarten,reise_adventure,reise_relax,reise_standard,reise_genuss,kind,entfernung,zahler,kunden_id,buchungsnr,buchungsnr_reisebuero,menue_code,zusatzleistung,destination,anrede,plz,ort,kundenclub,jahrgangsalter_klassiert,bundesland,einwohner_adjusted_plz,bev_m_plz,bev_w_plz,bev_insgesamt_ort,bev_weiblich_ort,bev_maennlich_ort,qkm_plz,flaeche_ort,einwohnerdichte_prokm2_plz,haushalte_regiog_2017_plz,kaufkraft_einwohner_in_eur_regio,kk_einwohner_index_regiog_2017,kk_hh_index_regiog_2017,gemeindetyp_regiostar_2016,wirtschaftskraftmr1,wirtschaftskraftmr2,buchungsjahr,buchungsquartal,buchungsmonat,buchungswoche,buchungstag,buchung_wochentag,anreisejahr,anreisequartal,anreisemonat,anreisewoche,anreisetag,anreise_wochentag,abreisejahr,abreisequartal,abreisemonat,abreisewoche,abreisetag,abreise_wochentag,buchungsmonat-jahr,buchungsquartal-jahr,anreisemonat-jahr,anreisequartal-jahr,abreisemonat-jahr,abreisequartal-jahr,buchungskanal,werbe_ban,anz_werbe_ban,aufenthalt,naechte_ausserhalb,anz_buch,buchungsvorlauf,preis_per,preis_nacht,preis_nacht_per,VIP,buch_zeilen,erstbuchung,folgebuchung,repeater
0,2010-01-29,2010-04-02,2010-04-10,8,8,2,2982,2010,0,0,1,classic,0,0,0,1,0,535.48,1,1,1151,456,5,5,linz,Herr,29664,Walsrode,0,55-64,Niedersachsen,25173,12540,12633,25173,11577,11491,344.326464,344.326464,73.108,11453,20641.89,92.819,98.104,Landregion - Zentrale Stadt,-0.333392,-0.388738,2010,1,1,4,29,Freitag,2010,2,4,13,2,Freitag,2010,2,4,14,10,Samstag,20101,20101,20104,20102,20104,20102,reisebuero,keine,0,8,0,1,63,1491.0,372.75,186.38,0,2,1,0,0
1,2010-01-14,2010-04-05,2010-04-10,5,5,2,1437,2010,0,0,1,classic,0,0,1,0,0,544.72,0,2,40,6413,5,5,linz,Frau,50226,Frechen,0,55-64,Nordrhein-Westfalen,52473,25645,26828,52473,26828,25645,45.11896,45.11896,1162.992,26227,25268.58,113.623,111.379,Landregion - Zentrale Stadt,0.853802,-0.345112,2010,1,1,2,14,Donnerstag,2010,2,4,14,5,Montag,2010,2,4,14,10,Samstag,20101,20101,20104,20102,20104,20102,reisebuero,generell,5,5,0,1,81,718.5,287.4,143.7,0,2,1,0,0
2,2010-03-01,2010-04-05,2010-04-10,5,5,2,1737,2010,0,0,1,classic,0,0,1,0,0,606.63,0,3,2134,4322,5,5,linz,Herr,23970,Wismar,0,75-84,Mecklenburg-Vorpommern,1423,692,731,46442,21844,20706,50.563027,83.131346,17.117,6355,18697.65,84.076,73.665,"Stadtregion - Kleinstädtischer, dörflicher Raum",-1.012536,-1.012536,2010,1,3,9,1,Montag,2010,2,4,14,5,Montag,2010,2,4,14,10,Samstag,20103,20101,20104,20102,20104,20102,reisebuero,keine,0,5,0,1,35,868.5,347.4,173.7,0,2,1,0,0
3,2010-01-25,2010-03-21,2010-03-25,4,4,2,1047,2010,0,0,1,classic,0,1,0,0,0,421.59,1,4,776,6414,69,9,linz,Frau,15732,Eichwalde,0,65-74,Brandenburg,14690,7302,7388,14690,4135,4087,11.965427,11.965427,1227.704,7159,23442.06,105.41,104.087,Landregion - Städtischer Raum,0.417226,0.417226,2010,1,1,4,25,Montag,2010,1,3,11,21,Sonntag,2010,1,3,12,25,Donnerstag,20101,20101,20103,20101,20103,20101,reisebuero,keine,0,4,0,1,55,523.5,261.75,130.88,0,1,1,0,0
4,2010-01-26,2010-03-07,2010-03-09,2,2,20,5279,2010,0,0,1,classic,1,0,0,0,0,37.88,0,5,813,2938,113,2,duesseldorf,Frau,53881,Euskirchen,1,45-54,Nordrhein-Westfalen,28531,13975,14556,57975,29578,28397,127.679129,139.430913,204.625,11683,23328.82,104.901,120.11,"Stadtregion - Kleinstädtischer, dörflicher Raum",0.212335,-0.579173,2010,1,1,4,26,Dienstag,2010,1,3,9,7,Sonntag,2010,1,3,10,9,Dienstag,20101,20101,20103,20101,20103,20101,reisebuero,keine,0,2,0,1,40,263.95,2639.5,131.98,0,7,1,0,1


## Preparation

#### Dropping redundant variables

In [5]:
df.drop("buchungskanal", axis=1, inplace=True)
df.drop("werbe_ban", axis=1, inplace=True)

#### Grouping variables

In [6]:
categorial = list(df.columns[df.dtypes==object])
datetime64 = ['buchungsdatum', 'anreisedatum', 'abreisedatum']
numeric    = list(df.select_dtypes(include=['int64', 'float64']).columns)

for entry in ['VIP', 'preis_nacht', 'preis_nacht_per', 'preis_per', 'gesamtpreis', 'naechte_insgesamt', 'aufenthalt', 'gebuchte_naechte', 'reise_standard', 'reise_relax', 'menue_code', 'zusatzleistung']: 
    numeric.remove(entry)
    
for entry in ['menue_code', 'zusatzleistung']: 
    categorial.append(entry)    

categorial.remove('ort')

# 11 = VIP, preis_nacht_per, preis_per, gesamtpreis, naechte_insgesamt, aufenthalt, gebuchte_naechte, reise_standard, reise_relax, ort
df.shape[1] == len(categorial) + len(datetime64) + len(numeric) + 11

True

#### renaming destinations

In [7]:
aenderung = { "saint etienne": "saint-etienne",
              "viana do castelo": "viana-do-castelo",
            }

df.destination.replace(aenderung, inplace=True)

aenderung2 = {
"Landregion - Kleinstädtischer, dörflicher Raum": "Kleinstädt_(Land)",
"Landregion - Städtischer Raum": "Stadt_(Land)",
"Landregion - Zentrale Stadt": "zen_Stadt_(Land)",
"Stadtregion - Kleinstädtischer, dörflicher Raum": "Kleinstädt_(Stadt)",
"Stadtregion - Metropole": "Metropole",
"Stadtregion - Mittelstadt, städtischer Raum": "Mittelstadt",
"Stadtregion -Regiopole und Großstadt": "Regiopole"
    }

df.gemeindetyp_regiostar_2016.replace(aenderung2, inplace=True)

In [8]:
df.menue_code     = df.menue_code.astype("str")
df.zusatzleistung = df.zusatzleistung.astype("str")

### Normalisation and dummy encoding

In [9]:
df = df.reset_index()
x = df[numeric]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
columns = df[numeric].columns
df_num = pd.DataFrame(x_scaled)
df_num.columns = columns

df_dum = pd.get_dummies(df[categorial])
df_dum.shape

(196989, 283)

## Logistic Regression
Basic (only numeric features)

In [10]:
model, acc, pre, rec, f1, roc = [], [], [], [], [], []

In [11]:
X = df[numeric]
y = df.VIP

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

lr1 = LogisticRegression(solver='liblinear', max_iter=10_000, random_state=1337)

y_pred1 = cross_val_predict(lr1, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("Basic Logistic Regression")
acc.append(accuracy_score(y, y_pred1)*100)
pre.append(precision_score(y, y_pred1)*100)
rec.append(recall_score(y, y_pred1)*100)
f1.append(f1_score(y, y_pred1)*100)
#roc1 = roc_curve(y, y_pred1)*100
roc.append(roc_auc_score(y, y_pred1)*100) #ändern test oder train?

conf_matrix1 = confusion_matrix(y, y_pred1)
class_rep1 = classification_report(y, y_pred1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   54.6s remaining:  1.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min finished


only numeric features but normalised

In [12]:
X = df_num
y = df.VIP

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, shuffle = True, stratify = y)

lr2 = LogisticRegression(solver='liblinear', max_iter=10_000, random_state=1337)

y_pred2 = cross_val_predict(lr2, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("Normalised Logistic Regression")
acc.append(accuracy_score(y, y_pred2)*100)
pre.append(precision_score(y, y_pred2)*100)
rec.append(recall_score(y, y_pred2)*100)
f1.append(f1_score(y, y_pred2)*100)
#roc2 = roc_curve(y, y_pred2)*100
roc.append(roc_auc_score(y, y_pred2)*100)

conf_matrix2 = confusion_matrix(y, y_pred2)
class_rep2 = classification_report(y, y_pred2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.1s remaining:   21.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.4s finished


### extended Logistic Regression
(categorial variables as dummies)

In [13]:
X = pd.concat([df_num, df_dum], axis=1)
y = df.VIP

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

lr3 = LogisticRegression(solver='liblinear', max_iter=10_000, random_state=1337)

y_pred3 = cross_val_predict(lr3, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("Logistic Regression (w. dummies)")
acc.append(accuracy_score(y, y_pred3)*100)
pre.append(precision_score(y, y_pred3)*100)
rec.append(recall_score(y, y_pred3)*100)
f1.append(f1_score(y, y_pred3)*100)
#roc3 = roc_curve(y, y_pred3)*100
roc.append(roc_auc_score(y, y_pred3)*100)

conf_matrix3 = confusion_matrix(y, y_pred3)
class_rep3 = classification_report(y, y_pred3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   24.8s remaining:   37.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   26.4s finished


### Dealing with imbalace 
in target feature folgebuchung by resampling

In [14]:
df.VIP.value_counts()

0    147855
1     49134
Name: VIP, dtype: int64

New train/test split without normalisation

In [15]:
X = pd.concat([df[numeric], df_dum], axis=1)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1337)

#### Upsampling

In [16]:
# concatenate our training data back together
X = pd.concat([X, y], axis=1)

# separate minority and majority classes
k_VIP  = X[X.VIP==0]
VIP    = X[X.VIP==1]

# upsample minority
VIP_upsampled = resample(VIP, replace=True, n_samples=len(k_VIP), random_state=1337) 

# combine majority and upsampled minority
upsampled = pd.concat([k_VIP, VIP_upsampled])

upsampled.VIP.value_counts()

1    147855
0    147855
Name: VIP, dtype: int64

#### Downsampling

In [17]:
# downsample majority
k_VIP_downsampled = resample(k_VIP, replace=False, n_samples=len(VIP), random_state=1337) 

# combine majority and upsampled minority
downsampled = pd.concat([VIP, k_VIP_downsampled])

downsampled.VIP.value_counts()

1    49134
0    49134
Name: VIP, dtype: int64

### balanced Logistic Regression
with upsampled data and additional dummy variables

In [18]:
y = upsampled.VIP
X = upsampled.drop('VIP', axis=1)

lr4 = LogisticRegression(solver='liblinear', max_iter=10_000, n_jobs=-1, random_state=1337)

y_pred4 = cross_val_predict(lr4, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("Balanced Logistic Regression (upsampled)")
acc.append(accuracy_score(y, y_pred4)*100)
pre.append(precision_score(y, y_pred4)*100)
rec.append(recall_score(y, y_pred4)*100)
f1.append(f1_score(y, y_pred4)*100)
#roc4 = roc_curve(y, y_pred4)*100
roc.append(roc_auc_score(y, y_pred4)*100)

conf_matrix4 = confusion_matrix(y, y_pred4)
class_rep4 = classification_report(y, y_pred4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.5min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished


### 2nd balanced Logistic Regression
downsampled

In [19]:
y = downsampled.VIP
X = downsampled.drop('VIP', axis=1)

lr5 = LogisticRegression(solver='liblinear', max_iter=10_000, n_jobs=-1, random_state=1337)

y_pred5 = cross_val_predict(lr5, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("Balanced Logistic Regression (downsampled)")
acc.append(accuracy_score(y, y_pred5)*100)
pre.append(precision_score(y, y_pred5)*100)
rec.append(recall_score(y, y_pred5)*100)
f1.append(f1_score(y, y_pred5)*100)
#roc5 = roc_curve(y, y_pred5)*100
roc.append(roc_auc_score(y, y_pred5)*100)

conf_matrix5 = confusion_matrix(y, y_pred5)
class_rep5 = classification_report(y, y_pred5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   29.6s remaining:   44.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   34.2s finished


## Advanced models
## Random forest
fixed settings after reandom search

In [20]:
X = pd.concat([df[numeric], df_dum], axis=1)
y = df.VIP

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1337)

lr6 = RandomForestClassifier(n_estimators= 120, min_samples_split= 5,
                             min_samples_leaf= 6, max_features= None,
                             max_depth= None, random_state= 1337)

y_pred6 = cross_val_predict(lr6, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("Random Forest")
acc.append(accuracy_score(y, y_pred6)*100)
pre.append(precision_score(y, y_pred6)*100)
rec.append(recall_score(y, y_pred6)*100)
f1.append(f1_score(y, y_pred6)*100)
#roc6 = roc_curve(y, y_pred6)*100
roc.append(roc_auc_score(y, y_pred6)*100)

conf_matrix6 = confusion_matrix(y, y_pred6)
class_rep6 = classification_report(y, y_pred6)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 18.8min remaining: 28.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 19.6min finished


### GaussianNB

In [21]:
X = pd.concat([df[numeric], df_dum], axis=1)
y = df.VIP

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1337)

lr7 = GaussianNB()

y_pred7 = cross_val_predict(lr7, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("GaussianNB")
acc.append(accuracy_score(y, y_pred7)*100)
pre.append(precision_score(y, y_pred7)*100)
rec.append(recall_score(y, y_pred7)*100)
f1.append(f1_score(y, y_pred7)*100)
#roc7 = roc_curve(y, y_pred7)*100
roc.append(roc_auc_score(y, y_pred7)*100)

conf_matrix7 = confusion_matrix(y, y_pred7)
class_rep7 = classification_report(y, y_pred7)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.3s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.4s finished


### KNeighborsClassifier

In [22]:
lr8 = KNeighborsClassifier(n_neighbors=3)

y_pred8 = cross_val_predict(lr8, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("KNeighborsClassifier")
acc.append(accuracy_score(y, y_pred8)*100)
pre.append(precision_score(y, y_pred8)*100)
rec.append(recall_score(y, y_pred8)*100)
f1.append(f1_score(y, y_pred8)*100)
#roc8 = roc_curve(y, y_pred8)*100
roc.append(roc_auc_score(y, y_pred8)*100)

conf_matrix8 = confusion_matrix(y, y_pred8)
class_rep8 = classification_report(y, y_pred8)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


### DecisionTreeClassifier

In [23]:
lr9 = DecisionTreeClassifier(random_state=1337)

y_pred9 = cross_val_predict(lr9, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("DecisionTreeClassifier")
acc.append(accuracy_score(y, y_pred9)*100)
pre.append(precision_score(y, y_pred9)*100)
rec.append(recall_score(y, y_pred9)*100)
f1.append(f1_score(y, y_pred9)*100)
#roc9 = roc_curve(y, y_pred9)*100
roc.append(roc_auc_score(y, y_pred9)*100)

conf_matrix9 = confusion_matrix(y, y_pred9)
class_rep9 = classification_report(y, y_pred9)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.6s remaining:   22.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.0s finished


### AdaBoost

In [24]:
lr10 = AdaBoostClassifier(DecisionTreeClassifier(random_state=1337))

y_pred10 = cross_val_predict(lr10, X, y, cv=5, n_jobs=-1, verbose=5)

model.append("AdaBoost")
acc.append(accuracy_score(y, y_pred10)*100)
pre.append(precision_score(y, y_pred10)*100)
rec.append(recall_score(y, y_pred10)*100)
f1.append(f1_score(y, y_pred10)*100)
#roc10 = roc_curve(y, y_pred10)*100
roc.append(roc_auc_score(y, y_pred10)*100)

conf_matrix10 = confusion_matrix(y, y_pred10)
class_rep10 = classification_report(y, y_pred10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   15.9s remaining:   24.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.5s finished


## Model comparsion

### Naive Predictor Performace

What if we chose a model that always predicted an individual as repeater, what would that model's accuracy, precision, recall and F-score be on this dataset?

In [25]:
# Calculate accuracy
VIP_c = df.query("VIP == 1").count()[0]
n_VIP_c = df.query("VIP == 0").count()[0]
accuracy = VIP_c / df.shape[0]

# Calculating precision
precision = VIP_c / (VIP_c + n_VIP_c)

#Calculating recall
recall = VIP_c / (VIP_c + 0)

# Calculate F-score using the formula above for beta = 0.5
fscore =  (1  + (0.5*0.5)) * ( precision * recall / (( 0.5*0.5 * (precision))+ recall))

# Print the results 
print (f"Naive Predictor:\n\tAccuracy score:\t{accuracy*100:.2f}\n\tPrecision score:{precision*100:.2f}\n\tRecall score:\t{recall*100:.2f}\n\tF-score:\t{fscore*100:.2f}")

Naive Predictor:
	Accuracy score:	24.94
	Precision score:24.94
	Recall score:	100.00
	F-score:	29.35


#### Confusion Matrix & Classification Report
<img src="img/conf_matrix.png" alt="Drawing" style="width: 400px;"/>

In [26]:
for entry in range(0,len(acc)):
    print(f"Model{entry+1} {model[entry]}:\n\tAccuracy:\t{round(acc[entry],2)}")
    print(f"\tPrecision:\t{round(pre[entry],2)}")
    print(f"\tRecall:\t\t{round(rec[entry],2)}")
    print(f"\tF1:\t\t{round(f1[entry],2)}")
    print(f"\tROC AUC:\t{round(roc[entry],2)}\n\n")

Model1 Basic Logistic Regression:
	Accuracy:	76.28
	Precision:	54.04
	Recall:		32.77
	F1:		40.8
	ROC AUC:	61.76


Model2 Normalised Logistic Regression:
	Accuracy:	79.06
	Precision:	60.62
	Recall:		45.87
	F1:		52.22
	ROC AUC:	67.98


Model3 Logistic Regression (w. dummies):
	Accuracy:	84.11
	Precision:	68.02
	Recall:		68.47
	F1:		68.24
	ROC AUC:	78.88


Model4 Balanced Logistic Regression (upsampled):
	Accuracy:	69.87
	Precision:	66.76
	Recall:		79.13
	F1:		72.42
	ROC AUC:	69.87


Model5 Balanced Logistic Regression (downsampled):
	Accuracy:	69.36
	Precision:	70.83
	Recall:		65.81
	F1:		68.23
	ROC AUC:	69.36


Model6 Random Forest:
	Accuracy:	64.08
	Precision:	33.68
	Recall:		45.4
	F1:		38.67
	ROC AUC:	57.85


Model7 GaussianNB:
	Accuracy:	68.39
	Precision:	40.95
	Recall:		60.44
	F1:		48.82
	ROC AUC:	65.74


Model8 KNeighborsClassifier:
	Accuracy:	53.09
	Precision:	22.79
	Recall:		36.89
	F1:		28.18
	ROC AUC:	47.68


Model9 DecisionTreeClassifier:
	Accuracy:	63.12
	Precision:	33.29
	Rec

## Best results

In [30]:
print("Logistic Regression (w. dummies)\n")
print(conf_matrix3)
print(f"\n{class_rep3}")

Logistic Regression (w. dummies)

[[132039  15816]
 [ 15494  33640]]

              precision    recall  f1-score   support

           0       0.89      0.89      0.89    147855
           1       0.68      0.68      0.68     49134

    accuracy                           0.84    196989
   macro avg       0.79      0.79      0.79    196989
weighted avg       0.84      0.84      0.84    196989



There first four most imporant features have a cumulative feature weight of 40%, 30 more features each add 2 more percent.