In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the training data; the file is semicolon-separated.
# NOTE(review): df.info() further down reports `temp` and `rain` as object
# dtype. If the raw file uses a decimal comma, passing decimal="," here would
# parse them as floats -- confirm against train.csv before changing.
df = pd.read_csv("train.csv", sep=";")

In [3]:
# Peek at three random rows, transposed so that each column becomes a row.
# A fixed seed keeps the displayed rows identical across Restart & Run All.
df.sample(3, random_state=42).T

Unnamed: 0,22187,8809,34483
id,22188,8810,34484
year,2018,2018,2018
month,9,7,11
day,17,3,26
day_part,2,2,1
start_hub_id,5fc78e91-33ee-4aa8-9b16-7684a38856c9,5fc78e91-33ee-4aa8-9b16-7684a38856c9,6b3ceafc-f10f-46a3-a5b0-6c86e95a3a68
weekday,1,2,1
week,38,27,48
checkouts,0,0,0
reservations,0,0,0


## Basic data cleaning

In [8]:
# Drop the row identifier (`id`) and `month`; the original author removed
# `month` as highly correlated with another feature (presumably `week` --
# TODO confirm).
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=["id", "month"], errors="ignore")

## Create a training dataset

In [9]:
# Target: the `checkouts` column, passed as the label vector to the model below.
y = df.loc[:, "checkouts"]

In [20]:
# Column overview: dtype and non-null count per column. Used below to decide
# which columns can go straight into the model (the object-typed ones are excluded).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50996 entries, 0 to 50995
Data columns (total 21 columns):
year                50996 non-null int64
day                 50996 non-null int64
day_part            50996 non-null int64
start_hub_id        50996 non-null object
weekday             50996 non-null int64
week                50996 non-null int64
checkouts           50996 non-null int64
reservations        50996 non-null int64
checkout_min12h     50996 non-null int64
checkout_min24h     50996 non-null int64
checkout_min1w      50996 non-null int64
checkout_min2w      50996 non-null int64
checkout_min3w      50996 non-null int64
checkout_min4w      50996 non-null int64
temp                48816 non-null object
rain                48816 non-null object
is_holiday          50996 non-null bool
holidays_in_week    50996 non-null int64
zuid_school         50996 non-null bool
midden_school       50996 non-null bool
noord_school        50996 non-null bool
dtypes: bool(4), int64(14), obj

In [21]:
# Preview the non-object (int and bool) columns that will feed the model.
# Only the first rows are shown; displaying the whole frame bloats the notebook.
df.select_dtypes(exclude="object").head(10)

Unnamed: 0,year,day,day_part,weekday,week,checkouts,reservations,checkout_min12h,checkout_min24h,checkout_min1w,checkout_min2w,checkout_min3w,checkout_min4w,is_holiday,holidays_in_week,zuid_school,midden_school,noord_school
0,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
1,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
2,2018,14,2,1,20,1,1,0,0,0,0,0,0,False,1,False,False,False
3,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
4,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
5,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
6,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
7,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
8,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False
9,2018,14,2,1,20,0,0,0,0,0,0,0,0,False,1,False,False,False


In [23]:
# Features: every non-object column except the target itself.
# NOTE(review): `reservations` stays in X; if it is recorded for the same time
# slot as `checkouts` it may leak the target -- confirm with the data owner.
X = (
    df
    .select_dtypes(exclude="object")
    .drop(columns=["checkouts"])
)

## Choose a model

### XGBoost

In [24]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error

In [25]:
# Hold out a third of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,   # fraction of rows reserved for the test set
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # keep the class proportions of y in both splits
)

In [31]:
# Gradient-boosted tree classifier with library defaults, apart from n_jobs=-1.
# NOTE(review): the variable name `xgboost` is the same as the package name;
# a later `import xgboost` would rebind it and break the cells below.
# NOTE(review): `checkouts` looks like a count but is fit here as discrete
# classes and later scored with RMSE -- confirm a regressor is not intended.
xgboost = XGBClassifier(n_jobs=-1)

In [32]:
%%time
# Fit on the training split only. The cell magic above reports how long the
# fit takes, and the fitted estimator's repr is shown as the cell output.
xgboost.fit(X_train, y_train)

CPU times: user 44 s, sys: 63.3 ms, total: 44 s
Wall time: 44 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [30]:
# Dump the estimator's instance attributes (hyper-parameters plus fitted state).
# NOTE(review): the recorded output shows 'n_jobs': 1 while the constructor
# cell passes n_jobs=-1, so this output appears to come from an earlier run;
# re-execute after fitting, or remove this debugging cell.
vars(xgboost)

{'max_depth': 3,
 'learning_rate': 0.1,
 'n_estimators': 100,
 'silent': True,
 'objective': 'multi:softprob',
 'booster': 'gbtree',
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'missing': nan,
 'kwargs': {},
 '_Booster': <xgboost.core.Booster at 0x7fef1b661ba8>,
 'seed': None,
 'random_state': 0,
 'nthread': None,
 'n_jobs': 1,
 'importance_type': 'gain',
 'classes_': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
 'n_classes_': 32,
 '_le': LabelEncoder(),
 '_features_count': 17}

In [34]:
# Predicted label for every row of the held-out test features.
predict = xgboost.predict(X_test)

In [35]:
# Root-mean-squared error between true and predicted labels; the class labels
# are treated as plain numbers here.
mse = mean_squared_error(y_test, predict)
np.sqrt(mse)

0.6822799805316976

In [37]:
# Per-class precision / recall / F1 on the test split. The report is a
# pre-formatted string, so it is printed rather than displayed.
print(classification_report(y_true=y_test, y_pred=predict))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     14864
           1       0.50      0.42      0.46       857
           2       0.40      0.31      0.35       516
           3       0.22      0.09      0.13       178
           4       0.26      0.18      0.21       130
           5       0.11      0.01      0.02        80
           6       0.25      0.28      0.26        67
           7       0.07      0.03      0.05        29
           8       0.28      0.15      0.20        33
           9       0.23      0.25      0.24        12
          10       0.00      0.00      0.00        17
          11       0.00      0.00      0.00        11
          12       0.00      0.00      0.00        10
          13       0.00      0.00      0.00         4
          14       0.00      0.00      0.00         6
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.00    

In [40]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_true=y_test, y_pred=predict)

array([[14680,   145,    37,     0,     2,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [  445,   361,    46,     3,     1,     0,     1,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [  182,   143,   161,    11,    10,     0,     7,     0,     0,
            1,     1,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [   36,    54,    59,    16,     8,     2,     2,     1,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [   14,    17,    48,    14,    24,     2,    10,     0,     0,
            0,     1,     0,     0,     0,     0,     0,     0,     0,
  