# Capstone Hotel reservation prediction

![Logo](img/hotel-logo.png)

## Modelling

### Import required libraries

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
#import eli5 # https://pypi.org/project/eli5/

#### Loading cleaned dataset

In [3]:
df = pd.read_csv("data/dataset_w_feature.csv", index_col=0, encoding="iso-8859-15")

In [4]:
df["buchungsdatum"]      = pd.to_datetime(df["buchungsdatum"])
df["anreisedatum"]       = pd.to_datetime(df["anreisedatum"])
df["abreisedatum"]       = pd.to_datetime(df["abreisedatum"])

In [5]:
df.head()

Unnamed: 0,buchungsdatum,anreisedatum,abreisedatum,naechte_insgesamt,gebuchte_naechte,personen_anzahl,gesamtpreis,saison,buchung_telefon,buchung_online,buchung_reisebuero,zimmerarten,reise_adventure,reise_relax,reise_standard,reise_genuss,kind,entfernung,zahler,kunden_id,buchungsnr,buchungsnr_reisebuero,menue_code,zusatzleistung,destination,anrede,plz,ort,kundenclub,jahrgangsalter_klassiert,bundesland,einwohner_adjusted_plz,bev_m_plz,bev_w_plz,bev_insgesamt_ort,bev_weiblich_ort,bev_maennlich_ort,qkm_plz,flaeche_ort,einwohnerdichte_prokm2_plz,haushalte_regiog_2017_plz,kaufkraft_einwohner_in_eur_regio,kk_einwohner_index_regiog_2017,kk_hh_index_regiog_2017,gemeindetyp_regiostar_2016,wirtschaftskraftmr1,wirtschaftskraftmr2,wochentag_buchung,buchungstag,buchungsmonat,buchungsjahr,buchungswoche,buchungsquartal,wochentag_anreise,anreisetag,anreisemonat,anreisejahr,anreisewoche,anreisequartal,wochentag_abreise,abreisetag,abreisemonat,abreisejahr,abreisewoche,abreisequartal,buchungskanal,werbe_ban,anz_werbe_ban,aufenthalt,naechte_ausserhalb,buchungsvorlauf,preis_per,preis_nacht,preis_nacht_per,buch_zeilen,erstbuchung,folgebuchung,repeater
0,2010-01-29,2010-04-02,2010-04-10,8,8,2,2982,2010,0,0,1,classic,0,0,0,1,0,535.48,1,1,1151,456,5,5,linz,Herr,29664,Walsrode,0,55-64,Niedersachsen,25173,12540,12633,25173,11577,11491,344.326464,344.326464,73.108,11453,20641.89,92.819,98.104,Landregion - Zentrale Stadt,-0.333392,-0.388738,Freitag,29,1,2010,4,1,Freitag,2,4,2010,13,2,Samstag,10,4,2010,14,2,reisebuero,keine,0,8,0,63,1491.0,372.75,186.38,2,1,0,0
1,2010-01-14,2010-04-05,2010-04-10,5,5,2,1437,2010,0,0,1,classic,0,0,1,0,0,544.72,0,2,40,6413,5,5,linz,Frau,50226,Frechen,0,55-64,Nordrhein-Westfalen,52473,25645,26828,52473,26828,25645,45.11896,45.11896,1162.992,26227,25268.58,113.623,111.379,Landregion - Zentrale Stadt,0.853802,-0.345112,Donnerstag,14,1,2010,2,1,Montag,5,4,2010,14,2,Samstag,10,4,2010,14,2,reisebuero,generell,5,5,0,81,718.5,287.4,143.7,2,1,0,0
2,2010-03-01,2010-04-05,2010-04-10,5,5,2,1737,2010,0,0,1,classic,0,0,1,0,0,606.63,0,3,2134,4322,5,5,linz,Herr,23970,Wismar,0,75-84,Mecklenburg-Vorpommern,1423,692,731,46442,21844,20706,50.563027,83.131346,17.117,6355,18697.65,84.076,73.665,"Stadtregion - Kleinstädtischer, dörflicher Raum",-1.012536,-1.012536,Montag,1,3,2010,9,1,Montag,5,4,2010,14,2,Samstag,10,4,2010,14,2,reisebuero,keine,0,5,0,35,868.5,347.4,173.7,1,1,0,0
3,2010-01-25,2010-03-21,2010-03-25,4,4,2,1047,2010,0,0,1,classic,0,1,0,0,0,421.59,1,4,776,6414,69,9,linz,Frau,15732,Eichwalde,0,65-74,Brandenburg,14690,7302,7388,14690,4135,4087,11.965427,11.965427,1227.704,7159,23442.06,105.41,104.087,Landregion - Städtischer Raum,0.417226,0.417226,Montag,25,1,2010,4,1,Sonntag,21,3,2010,11,1,Donnerstag,25,3,2010,12,1,reisebuero,keine,0,4,0,55,523.5,261.75,130.88,1,1,0,0
4,2010-01-26,2010-03-07,2010-03-09,2,2,20,5279,2010,0,0,1,classic,1,0,0,0,0,37.88,0,5,813,2938,113,2,duesseldorf,Frau,53881,Euskirchen,1,45-54,Nordrhein-Westfalen,28531,13975,14556,57975,29578,28397,127.679129,139.430913,204.625,11683,23328.82,104.901,120.11,"Stadtregion - Kleinstädtischer, dörflicher Raum",0.212335,-0.579173,Dienstag,26,1,2010,4,1,Sonntag,7,3,2010,9,1,Dienstag,9,3,2010,10,1,reisebuero,keine,0,2,0,40,263.95,2639.5,131.98,7,1,0,1


# Modelling

#### Dropping redundant variables

In [6]:
df.drop("buchungskanal", axis=1, inplace=True)
df.drop("werbeban", axis=1, inplace=True)

KeyError: "['werbeban'] not found in axis"

#### Grouping variables

In [None]:
categorial = list(df.columns[df.dtypes==object])
datetime64 = ['buchungsdatum', 'anreisedatum', 'abreisedatum']
numeric    = list(df.select_dtypes(include=['int64', 'float64']).columns)

for entry in ['erstbuchung', 'repeater', 'kunden_id', 'folgebuchung', 'menue_code', 'zusatzleistung', 'plz', 'kundenclub']: 
    numeric.remove(entry)
    
for entry in ['menue_code', 'zusatzleistung']: 
    categorial.append(entry)    
    
categorial.remove('ort') 

# 7 = 'erstbuchung', 'repeater', 'kunden_id', 'folgebuchung', 'ort', 'plz', 'kundenclub'
df.shape[1] == len(categorial) + len(datetime64) + len(numeric) + 7

## Logistic Regression

In [None]:
X = df[numeric]
y = df.folgebuchung

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1337)

In [None]:
logistic_regression= LogisticRegression(max_iter=10_000)
logistic_regression.fit(X_train,y_train)
y_pred=logistic_regression.predict(X_test)

#### Confusion Matrix & Classification Report

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print('normalized Accuracy: ',round(accuracy_score(y_test, y_pred)*100,2))

### Dummy Variable

In [None]:
aenderung = { "saint etienne": "saint-etienne",
              "viana do castelo": "viana-do-castelo",
            }

df.destination.replace(aenderung, inplace=True)

In [None]:
df_dum = pd.get_dummies(df[categorial])
# bei vielen Ausprägungen bins, erst einmal ohne
df_dum.shape

In [None]:
df_dum.columns

### Train-Test-Split

In [None]:
X = pd.concat([df[numeric], df_dum], axis=1)
y = df.folgebuchung

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1337)

### Dealing with imbalace 
in target feature folgebuchung by resampling

In [None]:
df.folgebuchung.value_counts()

#### Upsampling

In [None]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
erstbuchung  = X[X.folgebuchung==0]
folgebuchung = X[X.folgebuchung==1]

# upsample minority
folgebuchung_upsampled = resample(folgebuchung, replace=True, 
                          n_samples=len(erstbuchung), # match number in majority class
                          random_state=1337) 

# combine majority and upsampled minority
upsampled = pd.concat([erstbuchung, folgebuchung_upsampled])

upsampled.folgebuchung.value_counts()

#### Downsampling

In [None]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
erstbuchung  = X[X.folgebuchung==0]
folgebuchung = X[X.folgebuchung==1]

# downsample majority
erstbuchung_downsampled = resample(erstbuchung, replace=False, 
                          n_samples=len(folgebuchung), # match minority n
                          random_state=1337) 

# combine majority and upsampled minority
downsampled = pd.concat([folgebuchung, erstbuchung_downsampled])

downsampled.erstbuchung.value_counts()

### 2nd Logistic Regression
with balance data and additional dummy variables

In [None]:
y_train = upsampled.folgebuchung
X_train = upsampled.drop('folgebuchung', axis=1)

#check warum nur folgebuchung drop und nicht auch kunden_id

upsampled = LogisticRegression(solver='liblinear', max_iter=10_000).fit(X_train, y_train)

upsampled_pred = upsampled.predict(X_test);

In [None]:
y_train = downsampled.folgebuchung
X_train = downsampled.drop('folgebuchung', axis=1)

#check warum nur folgebuchung drop und nicht auch kunden_id

downsampled = LogisticRegression(solver='liblinear', max_iter=10_000).fit(X_train, y_train)

downsampled_pred = downsampled.predict(X_test);

#### Confusion Matrix & Classification Report

In [None]:
n_conf_matrix = confusion_matrix(y_test, upsampled_pred)
n_conf_matrix

In [None]:
print(classification_report(y_test, upsampled_pred))

In [None]:
print('normalized Accuracy: ',round(accuracy_score(y_test, upsampled_pred)*100,2))

### Cross Validation

In [None]:
clf = LogisticRegression(solver='liblinear', max_iter=10_000)

scoring = ['precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf, X_train, y_train, cv=5, scoring=scoring, 
                          return_train_score=False) # cv can also return train score but we set it to false
scores

#X_train oder x, gleiches für y

In [None]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#ändern

In [None]:
#Dummys in Bins wenn die Ladezeit überhand nimmt


In [None]:
#clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

In [None]:
#https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter

#sklearn.pipeline.Pipeline 

In [None]:
#df_dum.head(2)

In [None]:
#categorial