# Bike sharing


### Understanding the Dataset

##### Independent Variables
- datetime:   date and hour in "mm/dd/yyyy hh:mm" format
- season:     Four categories-> 1 = spring, 2 = summer, 3 = fall, 4 = winter
- holiday:    whether the day is a holiday or not (1/0)
- workingday: whether the day is neither a weekend nor holiday (1/0)
- weather:    Four Categories of weather
            1-> Clear, Few clouds, Partly cloudy
            2-> Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
            3-> Light Snow and Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
            4-> Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp:       hourly temperature in Celsius
- atemp:      "feels like" temperature in Celsius
- humidity:   relative humidity
- windspeed:  wind speed

##### Dependent Variables
- registered: number of rentals by registered users
- casual:     number of rentals by non-registered users
- count:      number of total rentals (registered + casual)


In [None]:
# Show which files are present in the dataset directory.
import os

dataset_dir = "../dataset/Bike-Sharing-Dataset/"
print(os.listdir(dataset_dir))

In [None]:
#Loading all the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Load the day.csv file and split it by row order into train and test parts.
df = pd.read_csv("../dataset/Bike-Sharing-Dataset/day.csv")

# Positional slicing is end-exclusive, so row 600 belongs to the test split
# only. (Label-based `.loc[0:600]` / `.loc[600:]` are both end-inclusive and
# would place row 600 in train AND test, leaking a test row into training.)
# `.copy()` makes each split an independent frame so that columns can be
# added to it later without pandas' SettingWithCopyWarning.
train_df = df.iloc[:600].copy()
test_df = df.iloc[600:].copy()

In [None]:
# Preview the first rows of the training split.
train_df.head()


In [None]:
# Print the (rows, columns) shape of the full frame, then of each split.
for frame in (df, train_df, test_df):
    print(frame.shape)

In [None]:
# Count the number of distinct values in each column of the training split.

train_df.nunique()


In [None]:
# Descriptive summary of the training split's columns.
train_df.describe()

In [None]:
# Check whether there are any null values in the columns, and their data types.
train_df.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# test_df["casual"] = np.nan
# test_df["registered"] = np.nan
# test_df["cnt"] = np.nan

In [None]:
# Column data types of the test split.
print(test_df.dtypes)

In [None]:
# Column data types of the training split.
print(train_df.dtypes)

In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
# Convert the 'dteday' column to pandas datetimes in both splits so the
# `.dt` accessor can be used on it.
for split in (train_df, test_df):
    split['dteday'] = pd.to_datetime(split['dteday'])

### Feature engineering 

In [None]:
# Derive calendar features from 'dteday' for both splits.
# NOTE(review): the data is read from day.csv; if 'dteday' carries no time
# component, 'hour' is 0 on every row and adds no information - confirm.
for split in (test_df, train_df):
    stamp = split['dteday'].dt
    split['year'] = stamp.year
    split['month'] = stamp.month
    split['hour'] = stamp.hour
    split['DOW'] = stamp.dayofweek

In [None]:
# Preview the training split again, now including the derived date columns.
train_df.head()

In [None]:
# Columns used as model inputs (independent variables).
ind_variable_selected = [
    'workingday',
    'temp',
    'year',
    'month',
    'hour',
    'DOW',
]

In [None]:
# Target ('cnt') and selected feature columns of the training split.
y_orig_train = train_df['cnt']
x_orig_train = train_df[ind_variable_selected]


In [None]:
# Target ('cnt') and selected feature columns of the test split.
y_orig_test = test_df['cnt']
x_orig_test = test_df[ind_variable_selected]


In [None]:
# Preview the training feature columns.
x_orig_train.head()

In [None]:
# Preview the training target values.
y_orig_train.head()

In [None]:
# Load hour.csv from the same dataset directory.
# NOTE(review): within this notebook hour_df is only previewed, not modelled.
hour_df = pd.read_csv("../dataset/Bike-Sharing-Dataset/hour.csv")

In [None]:
# Preview the first rows of the hour.csv data.
hour_df.head()

### Create smaller training and validation sets

In [None]:
# Hold out 25% of the training rows as a validation set; the fixed
# random_state keeps the split reproducible between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_orig_train,
    y_orig_train,
    test_size=0.25,
    random_state=40,
)

In [None]:
# Preview the features of the reduced training set.
x_train.head()

In [None]:
# Preview the features of the validation set.
x_valid.head()

### Set up the scoring rule

In [None]:
def RMSLE(predictions, realizations):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(p + 1) - log(r + 1)) ** 2))`` where ``p`` are the
    predictions with negative values clipped to 0 and ``r`` are the
    realizations.

    Parameters
    ----------
    predictions : array-like
        Predicted values (list, NumPy array or pandas Series).
    realizations : array-like
        Observed values, in the same order as ``predictions``.

    Returns
    -------
    numpy.float64
        The RMSLE; ``nan`` if a realization is below -1.
    """
    # Convert to plain arrays so that lists are accepted and so that two
    # pandas Series are compared position by position rather than being
    # re-aligned on their index labels.
    predicted = np.clip(np.asarray(predictions, dtype=float), 0, None)
    observed = np.asarray(realizations, dtype=float)
    # log1p(x) == log(1 + x), but stays accurate for x close to 0.
    log_error = np.log1p(predicted) - np.log1p(observed)
    return np.sqrt(np.mean(log_error ** 2))

### Fit a regression tree
A regression tree is tuned here with `min_samples_split`, the minimum number of samples a node must contain before it can be split. scikit-learn's default is 2; below it is set to 25, which means that once a node has fewer than 25 samples in it, no further split will occur.

In [None]:
# Select the test features and target ('cnt') used to evaluate the tree.
# NOTE(review): these two statements repeat an earlier cell verbatim.
x_orig_test = test_df[ind_variable_selected]
y_orig_test = test_df['cnt']

In [None]:
# Fit a regression tree on the reduced training set and predict the test set.
# NOTE(review): this model is scored on the test split, whereas the random
# forest and boosted model below are scored on the validation split.
rt = DecisionTreeRegressor(
    min_samples_split=25,
    random_state=40,
)
rt_model = rt.fit(x_train, y_train)
rt_pred = rt_model.predict(x_orig_test)

In [None]:
# Display the regression tree's predictions for the test split.
rt_pred

In [None]:
# Score the regression tree's test predictions with RMSLE.
RMSLE(rt_pred,y_orig_test)


In [None]:
# Tabulate the fitted tree's feature importances, one row per input column.
pd.DataFrame(rt_model.feature_importances_, index=ind_variable_selected)

In [None]:
# Scatter the regression tree's predictions against the observed 'cnt' values
# of the test split; points on the red diagonal are perfect predictions.
plt.figure(figsize=(6, 6))
plt.scatter(rt_pred, y_orig_test, s=0.2)
plt.xlim(400, 9000)
plt.ylim(400, 9000)
plt.plot([400, 8500], [400, 8500], color='r', linestyle='-', linewidth=2)
# Label the axes, as the random-forest plot does, so it is clear which axis
# holds predictions and which holds observations.
plt.xlabel('rt_pred', fontsize=18)
plt.ylabel('y_orig_test', fontsize=18)
plt.show()

### Random Forest

In [None]:
# Fit a 500-tree random forest on the reduced training set and predict the
# validation set.
rf = RandomForestRegressor(
    n_estimators=500,
    max_features=4,
    min_samples_leaf=5,
    random_state=201,
)
rf_model = rf.fit(x_train, y_train)
rf_pred = rf.predict(x_valid)

In [None]:
# Score the random forest's validation predictions with RMSLE.
RMSLE(rf_pred,y_valid)

In [None]:
# Scatter the random forest's predictions against the observed validation
# targets; points on the red diagonal are perfect predictions.
axis_limits = (400, 9000)
diagonal = [400, 8500]

plt.figure(figsize=(6, 6))
plt.scatter(rf_pred, y_valid, s=0.2)
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)
plt.plot(diagonal, diagonal, color='r', linestyle='-', linewidth=2)
plt.xlabel('rf_pred', fontsize=18)
plt.ylabel('y_valid', fontsize=18)
plt.show()

### Boosted Tree Model

In [None]:
import xgboost as xgb

# Wrap the data in xgboost's DMatrix container: the training matrix carries
# its labels, the validation matrix holds features only.
xgb_train = xgb.DMatrix(data=x_train, label=y_train)
xgb_valid = xgb.DMatrix(data=x_valid)

In [None]:
# Number of boosting rounds to run during cross-validation.
num_round_for_cv = 500
# Booster settings. 'reg:squarederror' is the current name of the
# squared-error regression objective; the older alias 'reg:linear' is
# deprecated in XGBoost.
param = {
    'max_depth': 5,
    'eta': 0.1,
    'seed': 201,
    'objective': 'reg:squarederror',
}

Running xgboost in cross-validation mode to find the best number of boosting rounds to use.

In [None]:
# 5-fold cross-validation over the boosting rounds; the per-round metrics
# are printed (verbose_eval) and returned as plain Python data rather than
# a DataFrame (as_pandas=False).
xgb.cv(
    param,
    xgb_train,
    num_boost_round=num_round_for_cv,
    nfold=5,
    show_stdv=False,
    verbose_eval=True,
    as_pandas=False,
)

In [None]:
# Train the boosted model for a fixed number of rounds and predict the
# validation set.
# NOTE(review): num_round is hard-coded rather than taken from the
# cross-validation output - confirm 500 is the intended value.
num_round = 500
xgb_model = xgb.train(param, xgb_train, num_boost_round=num_round)
xgb_pred = xgb_model.predict(xgb_valid)

In [None]:
# Show the per-feature importance scores of the boosted model.
xgb_model.get_fscore()

In [None]:
# Plot the boosted model's feature importances.
xgb.plot_importance(xgb_model)

In [None]:
# Score the boosted model's validation predictions with RMSLE.
RMSLE(xgb_pred,y_valid)