### Modeling Bike Share Data

**Imports**

In [81]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import warnings 
warnings.filterwarnings("ignore")
import matplotlib as plt
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
now = datetime.datetime.now()

In [4]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

df_train['id'] = 0

df_test['count'] = 0
df_test['id'] = 1

df_train = df_train.drop('casual', axis = 1)
df_train = df_train.drop('registered', axis = 1)

In [5]:
datetime_test = df_test.datetime

In [6]:
dataframes = [df_train, df_test]
df = pd.concat(dataframes)

**General data Info**

In [7]:
df.shape

(17379, 11)

In [8]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,id
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1,0


We see that all of the variables are numeric except for the datetime column which is datatime object. We can parse this column and create new columns for the year, month, day and time. 

In [9]:
# no missing variables
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17379 entries, 0 to 6492
Data columns (total 11 columns):
datetime      17379 non-null object
season        17379 non-null int64
holiday       17379 non-null int64
workingday    17379 non-null int64
weather       17379 non-null int64
temp          17379 non-null float64
atemp         17379 non-null float64
humidity      17379 non-null int64
windspeed     17379 non-null float64
count         17379 non-null int64
id            17379 non-null int64
dtypes: float64(3), int64(7), object(1)
memory usage: 1.6+ MB


**Handling datetime**

In [10]:
df['date'] = df.datetime.apply(lambda x: x.split()[0])

The time only gives the hour of the day (0-24), so we can remove the colon notation and just leave the hour number. 

In [11]:
df['time'] = df.datetime.apply(lambda x: int(x.split()[1].split(":")[0]))

In [12]:
df['year'] = df.date.apply(lambda x: datetime.strptime(x,'%Y-%m-%d').year)
df['month'] = df.date.apply(lambda x: datetime.strptime(x,'%Y-%m-%d').month)
df['day'] = df.date.apply(lambda x: datetime.strptime(x,'%Y-%m-%d').day)
df['weekday'] = df.date.apply(lambda x: datetime.strptime(x,'%Y-%m-%d').weekday())

Add a feature for normal vs. odd hours

In [13]:
df['Odd_hours'] = np.where((df.time >=0) & (df.time <=6), 1,0)

Next we want to change some of the nominal variables to be categories so that they do not get misinterpreted as ordinal

In [14]:
# nominal_variables = ["time","weekday","month","season","weather","holiday","workingday"]
# for var in nominal_variables:
#     df_train[var] = df_train[var].astype("category")

In [15]:
time_d = pd.DataFrame(pd.get_dummies(list(df.time), prefix = 'time_'))
weekday_d = pd.DataFrame(pd.get_dummies(list(df.weekday), prefix='weekday_'))
month_d = pd.DataFrame(pd.get_dummies(list(df.month), prefix='month_'))
season_d = pd.DataFrame(pd.get_dummies(list(df.season), prefix='season_'))
weather_d = pd.DataFrame(pd.get_dummies(list(df.weather), prefix='weather_'))
year_d = pd.DataFrame(pd.get_dummies(list(df.year), prefix='year_'))

In [16]:
with_dummies = [weather_d, month_d, time_d, season_d, weekday_d, year_d]
df = df.join(with_dummies)

**Summary Statistics**

In [17]:
df['count'].describe()

count    17379.000000
mean       119.999770
std        170.711941
min          0.000000
25%          0.000000
50%         28.000000
75%        192.000000
max        977.000000
Name: count, dtype: float64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17379 entries, 0 to 10885
Data columns (total 71 columns):
datetime      17379 non-null object
season        17379 non-null int64
holiday       17379 non-null int64
workingday    17379 non-null int64
weather       17379 non-null int64
temp          17379 non-null float64
atemp         17379 non-null float64
humidity      17379 non-null int64
windspeed     17379 non-null float64
count         17379 non-null int64
id            17379 non-null int64
date          17379 non-null object
time          17379 non-null int64
year          17379 non-null int64
month         17379 non-null int64
day           17379 non-null int64
weekday       17379 non-null int64
Odd_hours     17379 non-null int32
weather__1    17379 non-null uint8
weather__2    17379 non-null uint8
weather__3    17379 non-null uint8
weather__4    17379 non-null uint8
month__1      17379 non-null uint8
month__2      17379 non-null uint8
month__3      17379 non-null uint8
month__4

In [19]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'count', 'id', 'date', 'time', 'year',
       'month', 'day', 'weekday', 'Odd_hours', 'weather__1', 'weather__2',
       'weather__3', 'weather__4', 'month__1', 'month__2', 'month__3',
       'month__4', 'month__5', 'month__6', 'month__7', 'month__8', 'month__9',
       'month__10', 'month__11', 'month__12', 'time__0', 'time__1', 'time__2',
       'time__3', 'time__4', 'time__5', 'time__6', 'time__7', 'time__8',
       'time__9', 'time__10', 'time__11', 'time__12', 'time__13', 'time__14',
       'time__15', 'time__16', 'time__17', 'time__18', 'time__19', 'time__20',
       'time__21', 'time__22', 'time__23', 'season__1', 'season__2',
       'season__3', 'season__4', 'weekday__0', 'weekday__1', 'weekday__2',
       'weekday__3', 'weekday__4', 'weekday__5', 'weekday__6', 'year__2011',
       'year__2012'],
      dtype='object')

Since temp and atemp are so highly correlated, we can drop temp from the data set. 

**Helper Functions**

In [35]:
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

def rmsle(y_true, y_preds):
    return np.sqrt(mean_squared_log_error(y_true, y_preds))

# Defining 'coz mean_squared_log_error removed from scikit-learn ver. 0.2
def rmsle_2(y_true, y_pred):
    return np.sqrt(np.mean((np.log(y_true) - np.log(y_pred))**2))

**Remove cols**

In [21]:
df_train = df[df['id'] == 0]
df_test = df[df['id'] == 1]

df_test = df_test.drop('count', axis=1)

In [22]:
X = df_train.drop('count', axis=1)
y = df_train['count']

**Simple Regression - Base Model**

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [25]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)

ValueError: could not convert string to float: '2012-05-19 20:00:00'

In [None]:
preds = reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, preds)

Because linear regression can predict negative values, we cannot calcualte the RMLSE for the predictions, and this is thus not a valid method.

**XGBoost**

In [None]:
def eval_rmsle(preds, dtrain):
    labels = dtrain.get_label()
    return 'rmsle', float(np.sqrt(mean_squared_log_error(preds, labels)))

In [None]:
import xgboost as xgb

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

def run_xgb(param):
    
    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_val, y_val)
    dtest = xgb.DMatrix(X_test, y_test)
    
    num_round = 5000
    watchlist = [(dval, 'eval'), (dtrain, 'train')]
    evals_result = {}

    bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=10, 
                    evals_result=evals_result, feval=eval_rmsle)


In [None]:
xgb.plot_importance(bst)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf = rf.fit(X_train, y_train)
preds = rf.predict(X_test)


In [None]:
rmsle(preds, y_test)

**Run Model**

In [None]:
feature_selector = [
    
    'id',
    'count',
    
    'atemp', 
    'windspeed',
    'humidity',
    'weather',
    
    'season__1',
    'season__2',
    'season__3',
    'season__4'

    
]

In [None]:
df.columns

In [None]:
my_df = df[feature_selector]

df_train = my_df[my_df['id'] == 0]
df_test = my_df[my_df['id'] == 1]

df_test = df_test.drop('count', axis=1)

In [None]:
param = {
    'silent':1, 
    'objective':'count:poisson', 
}

run_xgb(param)

In [None]:
train_error = evals_result['train']['rmsle']
val_error = evals_result['eval']['rmsle']
df_error = pd.DataFrame([train_error, val_error]).T
df_error.columns = ['train', 'val']

df_error.plot(title="XGBoost learning curves", ylim=(0,.5))

In [None]:
preds = bst.predict(dtest, ntree_limit = bst.best_ntree_limit)
labels = dtest.get_label()
rmsle(preds, y_test)

**Baseline Test: 0.44**

In [None]:
dtest_final = xgb.DMatrix(df_test)

preds = bst.predict(dtest_final)

In [None]:
preds = pd.Series(preds)

In [None]:
dt = pd.Series(datetime_test)

In [None]:
submission = pd.concat([dt, preds], axis=1)
submission = submission.rename(index=str, columns={"datetime": "datetime", 0: "count"})

In [None]:
submission.to_csv(r'Submissions/submission2.csv', index=False)

**Random Forest**

In [44]:
# Imports
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

In [70]:
# Process data to match required format
## Copy data frame to separate different model workflows
df_rf = df

# Make list of columns to be removed
## Columns for which dummy var is created
remove_dummy_cols_rf = ['weather', 'month', 'time', 'season', 'year', 'weekday']
## Other columns to be removed
remove_other_cols_rf = ['temp','datetime', 'date', 'holiday', 'day']
## Merging columns to be removed
remove_cols_rf = remove_dummy_cols_rf + remove_other_cols_rf
df_rf = df_rf.drop(remove_cols_rf, axis = 1)
# df_rf.columns

# Separate global test and train
df_train_rf = df_rf[df_rf['id'] == 0]
df_test_rf = df_rf[df_rf['id'] == 1]
df_test_rf = df_test_rf.drop('count', axis=1)

#Separate Training set into X (features) and y(targets)
X_rf = df_train_rf.drop(['count','id'], axis=1)
y_rf = df_train_rf['count']

## Split training set in train and test
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, test_size=0.2)
## Further split train set into train and validation
X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(X_train_rf, y_train_rf, test_size=0.2)

# Convert every pandas df to nparray
X_train_rf = np.array(X_train_rf.values.tolist())
X_test_rf = np.array(X_test_rf.values.tolist())
X_val_rf = np.array(X_val_rf.values.tolist())
y_train_rf = np.array(y_train_rf.values.tolist())
y_test_rf = np.array(y_test_rf.values.tolist())
y_val_rf = np.array(y_val_rf.values.tolist())

# Preparing global test matrix
test_rf = df_test_rf.drop(['id'], axis=1)
test_rf = np.array(test_rf.values.tolist())


In [75]:
##############CAUTION - Following takes ~7Mins to run##############
# Grid Search for Random Forest Model
param_grid = { 
                'n_estimators': [40, 50, 60, 70],
                'max_depth' : list(range(20,30,2)),
                'min_samples_split' : list(range(2,12,2))
            }

CV_rfc = GridSearchCV(estimator=rf_regr, 
                      param_grid=param_grid,
                      cv= 5,
                      scoring='neg_mean_squared_log_error')
CV_rfc.fit(X_train_rf, y_train_rf)
CV_rfc.best_params_

{'max_depth': 26, 'min_samples_split': 2, 'n_estimators': 70}

In [69]:
# Randomized Search
param_grid = { 
                'n_estimators': randint(40,100),
                'max_depth' : randint(10,100),
                'min_samples_split' : randint(2,10)
            }

CV_rfc = RandomizedSearchCV(estimator=rf_regr,
                            n_iter=5,
                            param_distributions=param_grid,
                            cv= 5,
                            scoring='neg_mean_squared_log_error')
CV_rfc.fit(X_train_rf, y_train_rf)
CV_rfc.best_params_

{'max_depth': 97, 'min_samples_split': 3, 'n_estimators': 40}

In [76]:
# Define model

## Initialize Regressor
rf_regr = RandomForestRegressor(n_estimators=70,
                                max_depth=26, 
                                random_state=0,
                                min_samples_split=2,
                               criterion='mse')
rf_regr.fit(X_train_rf, y_train_rf)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=26,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [93]:
# Train error
rf_train_predictions = rf_regr.predict(X_train_rf)
rf_train_error = rmsle_2(y_train_rf,rf_train_predictions)
rf_train_error

# Validation error
rf_val_predictions = rf_regr.predict(X_val_rf)
rf_val_error = rmsle_2(y_val_rf,rf_val_predictions)
rf_val_error

# Test error
rf_test_predictions = rf_regr.predict(X_test_rf)
rf_test_error = rmsle_2(y_test_rf,rf_test_predictions)
rf_test_error

print('Train error:{:.4f} \nValidation error:{:.4f} \nTest error:{:.4f}'.format(rf_train_error, rf_val_error, rf_test_error))

Train error:0.2020 
Validation error:0.4045 
Test error:0.4262


In [88]:
# Test on submission set
rf_submission_predictions = rf_regr.predict(test_rf)
preds = pd.Series(rf_submission_predictions)
dt = pd.Series(datetime_test)

# Generate submission file
submission = pd.concat([dt, preds], axis=1)
submission = submission.rename(index=str, columns={"datetime": "datetime", 0: "count"})
time_str = now.strftime("%Y-%m-%d_%H-%M")
file_name = 'Submissions/submission_' + time_str + '.csv'
submission.to_csv(file_name, index=False)