# Bike sharing (Kaggle) (Moritz)
- https://www.kaggle.com/c/184702-tu-ml-ws-18-bike-sharing#_=_
- large samples (train = 8690), small dimension (15)
- attribute characteristics: numeric, date?

## with preprocessing
- preprocessing: scale (standardize)

### Gradient Boosted Decision Tree
- with preprocessing
- with 50 samples: 
    - 13.559 s
    - {'max_depth': 5, 'min_samples_split': 15, 'n_estimators': 350}
    - RMSE: 89.9571
    - Kaggle: 143.91026
- with 150 samples: 
    - 25.025 s
    - {'max_depth': 5, 'min_samples_split': 15, 'n_estimators': 350}
    - RMSE: 118.99678
    - Kaggle: 128.42365
- with 500 samples: 
    - 74.803 s
    - {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 350}
    - RMSE: 78.88731
    - Kaggle: 128.39004
- with 2000 samples:
    - 431.691 s
    - {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 350}
    - RMSE: 54.24448
    - Kaggle: 54.09653
- with all samples:
    - 4361.643 s
    - {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 350}
    - RMSE: 43.12057
    - __Kaggle: 43.16327__

In [1]:
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import datetime as dt
%run './base.ipynb'

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [2]:
# Load the training split: 'dteday' is discarded and 'id' becomes the row index.
train = (
    pd.read_csv('./data/bike_sharing_kaggle/bikeSharing.shuf.train.csv')
    .drop(columns=['dteday'])
    .set_index(['id'])
)
# Separate the target column 'cnt' from the features.
train_target = train[['cnt']]
train = train.drop(columns=['cnt'])

# Load the test split the same way (there is no target column to split off here).
test = (
    pd.read_csv('./data/bike_sharing_kaggle/bikeSharing.shuf.test.csv')
    .drop(columns=['dteday'])
    .set_index(['id'])
)

# scale_data is not defined in the cells shown here (presumably it comes from
# base.ipynb -- confirm); it hands back the processed train and test sets.
train_s, test_s = scale_data(train, test)

# Names used by the model cells below.
X_train, y_train, X_test = train_s, train_target, test_s

#display(train_s)
#display(X_train.shape)

In [3]:
# Linear Regression
# linear_reg is an external helper (not defined in the cells shown here); the
# object it returns is only used through .predict().
reg = linear_reg(X_train, y_train)
predicted_cnt = pd.DataFrame(reg.predict(X_test), columns=['cnt'])

# Put the 'id' column (taken from the test index) in front of the predictions.
test_ids = X_test.reset_index()[['id']]
result = pd.concat([test_ids, predicted_cnt], axis='columns')

# Save the result under a timestamped name.
# NOTE: the timestamp is the default str() of datetime.now(), so the file name
# contains a space and colons (see the recorded "Saved as ..." output).
timestamp = dt.datetime.now()
filename = f'lr_{timestamp}.csv'
out_path = './predictions/bike_sharing_kaggle/' + filename

result.to_csv(out_path, sep=',', index=False)
print(f'Saved as {filename}')

#display(result)

R^2 value for model: 0.38403134493881663
Saved as lr_2019-01-02 10:27:49.671617.csv


  linalg.lstsq(X, y)


In [4]:
# SVR
# Hyper-parameter grid handed to the grid search. Only the linear kernel is
# active; 'rbf', 'sigmoid' and 'poly' (very slow) as well as gamma='scale'
# were switched off by the author.
# NOTE(review): np is not imported in this notebook's import cell; presumably
# it is made available by `%run './base.ipynb'` -- confirm.
param_grid = dict(
    C=np.linspace(.2, 1, 5),
    kernel=['linear'],
    epsilon=np.linspace(0, .5, 6),
    gamma=['auto'],
)

# run_svr is an external helper; the object it returns exposes best_estimator_.
gs = run_svr(X_train, y_train.values.ravel(), cv=5, param_grid=param_grid)
best_svr = gs.best_estimator_

# Predict with the selected estimator and put the 'id' column in front.
predicted_cnt = pd.DataFrame(best_svr.predict(X_test), columns=['cnt'])
test_ids = X_test.reset_index()[['id']]
result = pd.concat([test_ids, predicted_cnt], axis='columns')
display(result)

# Build the SVR file name from the selected hyper-parameters and a timestamp.
filename = '_'.join([
    'svr',
    f'C-{best_svr.C}',
    f'k-{best_svr.kernel}',
    f'e-{best_svr.epsilon}',
    f'g-{best_svr.gamma}',
    f'{dt.datetime.now()}.csv',
])
out_path = './predictions/bike_sharing_kaggle/' + filename

result.to_csv(out_path, sep=',', index=False)
print(f'Saved as {filename}')

GridSearch initializing...
SVR model in training...
MSE: 21350.03313, RMSE: 146.11651, C: 1.0, kernel: linear, epsilon: 0.5, gamma: auto 


Unnamed: 0,id,cnt
0,13504,239.345903
1,15714,273.630612
2,1658,153.493340
3,10461,180.969537
4,10875,85.641983
5,14811,121.603837
6,1474,50.083726
7,5809,174.882963
8,4655,267.913224
9,7707,110.349696


Saved as svr_C-1.0_k-linear_e-0.5_g-auto_2019-01-02 10:34:39.413427.csv


In [21]:
# Gradient Boosted Decision Tree
# Fixed (non-searched) settings passed to run_boosted_tree as param_fix.
# NOTE(review): presumably these are forwarded to scikit-learn's
# GradientBoostingRegressor; newer scikit-learn releases name this loss
# 'squared_error' instead of 'ls' -- confirm against the installed version.
param_fix = {
    'learning_rate': .01,
    'loss': 'ls'
}

# Grid that is searched; the larger values (500 / 50 / 50) were disabled by the author.
param_grid = {
    'n_estimators': (1, 10, 100, 200, 350),# 500),
    'max_depth': (1, 5, 10, 25),# 50),
    'min_samples_split': (2, 5, 15),# 50)
}

# Number of leading training rows used for the search.
# None = use the complete training set (this is what the recorded run used);
# set an int (e.g. 50, 150, 500, 2000) for a faster run on a subset.
num_samples = None

if num_samples is None:
    X = X_train
    y = y_train.values.ravel()
else:
    X = X_train.iloc[:num_samples, :]
    y = y_train.iloc[:num_samples, :].values.ravel()

# run_boosted_tree is an external helper; the two empty lists fill its third
# and fourth positional parameters (their meaning is not visible from this cell).
gs = run_boosted_tree(X, y, [], [], param_fix=param_fix, cv=10, param_grid=param_grid)

# NOTE(review): the disabled calls below use gbt, test_data and test_target,
# none of which are defined in the cells shown here.
#plot_scores(gbt.cv_results_)
#plot_training_deviance(gbt, test_data, test_target)

# predict
result = pd.DataFrame(gs.best_estimator_.predict(X_test), columns=['cnt'])

# join id col
result = pd.concat([X_test.reset_index()[['id']], result], axis='columns')
#display(result)

# Create gradient boosted tree filename from the selected hyper-parameters
filename = f'''gbdtree_'''\
           f'''ne-{gs.best_estimator_.n_estimators}_'''\
           f'''md-{gs.best_estimator_.max_depth}_'''\
           f'''mss-{gs.best_estimator_.min_samples_split}_'''\
           f'''{dt.datetime.now()}.csv'''

result.to_csv('./predictions/bike_sharing_kaggle/' + filename, sep = ",", index=False)
print(f'''Saved as {filename}''')

GridSearch initializing...
GradientBoostedRegressor model in training...
GradientBoostedRegressor model selected and fitted in 4361.643 s

MSE: 1859.38335, RMSE: 43.12057
Best parameters selected by GridSearch: {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 350}
Saved as gbdtree_ne-350_md-10_mss-15_2019-01-02 01:53:02.486364.csv
