In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from ml_metrics import rmsle
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LassoLars
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the training frames: the raw version and its standardized counterpart
saved_train = pd.read_pickle("./pickles/saved_trainset.pkl")
saved_train_std = pd.read_pickle("./pickles/saved_trainset_std.pkl")

# Load the test frames. The timestamps are kept aside (they identify the rows
# of the submission) and removed from the feature columns of both versions.
saved_test = pd.read_pickle("./pickles/saved_testset.pkl")
saved_test_std = pd.read_pickle("./pickles/saved_testset_std.pkl")
saved_datetime = saved_test['datetime']
saved_test = saved_test.drop('datetime', axis=1)
saved_test_std = saved_test_std.drop('datetime', axis=1)

# Feature matrices: every column except the three target columns
x_train = saved_train.drop(['casual', 'registered', 'count'], axis=1)
x_train_std = saved_train_std.drop(['casual', 'registered', 'count'], axis=1)

# Targets taken from the raw training frame
y_train_casual, y_train_registered, y_train_count = (
    saved_train[target] for target in ('casual', 'registered', 'count')
)

# Targets taken from the standardized training frame
y_train_casual_std, y_train_registered_std, y_train_count_std = (
    saved_train_std[target] for target in ('casual', 'registered', 'count')
)

In [4]:
# Preview the first five rows of the training feature matrix
x_train.head(5)

Unnamed: 0,temp,atemp,humidity,windspeed,day,hour,minute,weekofyear,season_spring,season_summer,...,month_Sep,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday,time_of_day_evening,time_of_day_morning,time_of_day_night
0,9.84,14.395,81,0.0,1,0,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
1,9.02,13.635,80,0.0,1,1,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
2,9.02,13.635,80,0.0,1,2,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
3,9.84,14.395,75,0.0,1,3,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1
4,9.84,14.395,75,0.0,1,4,0,52,1,0,...,0,0,1,0,0,0,0,0,0,1


In [8]:
# List the feature columns left in x_train_std after the targets were dropped
x_train_std.columns

Index(['temp', 'atemp', 'humidity', 'windspeed', 'day', 'hour', 'minute',
       'weekofyear', 'season_spring', 'season_summer', 'season_winter',
       'workingday_Yes', 'year_2012', 'month_Aug', 'month_Dec', 'month_Feb',
       'month_Jan', 'month_Jul', 'month_Jun', 'month_Mar', 'month_May',
       'month_Nov', 'month_Oct', 'month_Sep', 'dayofweek_Monday',
       'dayofweek_Saturday', 'dayofweek_Sunday', 'dayofweek_Thursday',
       'dayofweek_Tuesday', 'dayofweek_Wednesday', 'time_of_day_evening',
       'time_of_day_morning', 'time_of_day_night'],
      dtype='object')

In [7]:
# The following are functions that will run the models. 

def rmsle_custom(actual, predicted):
    """Root mean squared logarithmic error between two array-likes.

    Computes ``sqrt(mean((log(1 + actual) - log(1 + |predicted|)) ** 2))``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Predicted values. Their absolute value is used, so a negative
        prediction is folded onto its magnitude before the logarithm.

    Returns
    -------
    numpy.float64
        The RMSLE of the two inputs (NaN for empty inputs).
    """
    # log1p keeps precision for values close to zero, where computing
    # log(x + 1) would first round x + 1 to exactly 1.0.
    log_actual = np.log1p(np.asarray(actual))
    log_predicted = np.log1p(np.abs(np.asarray(predicted)))
    return np.sqrt(np.mean(np.square(log_actual - log_predicted)))

def return_parameters(gridsearch, verbose=False):
    """Extract the winning parameters and score from a fitted grid search.

    Parameters
    ----------
    gridsearch : object
        Fitted search exposing ``best_params_`` and ``best_score_``.
    verbose : bool, default False
        When true, print the best parameters and the absolute best score.

    Returns
    -------
    list
        ``[best_params, abs(best_score)]``.
    """
    best_params = gridsearch.best_params_
    # The sign of the score is dropped so the magnitude is reported directly.
    best_score = np.abs(gridsearch.best_score_)

    if verbose:
        for template, value in (
            ('{0} were the best parameters to use', best_params),
            ('{0} was the accuracies', best_score),
        ):
            print(template.format(value))

    return [best_params, best_score]

def gridsearch_cv(model, params, Xs_df, y_df, cv=5):
    """Run a cross-validated grid search scored with ``rmsle_custom``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid to explore.
    Xs_df, y_df : objects exposing ``.values``
        Features and target; their ``.values`` are passed to ``fit``.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    The object returned by ``GridSearchCV.fit``.
    """
    features = Xs_df.values
    target = y_df.values

    # greater_is_better=False marks the metric as a loss for the search.
    scorer = make_scorer(rmsle_custom, greater_is_better=False)
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=scorer,
        cv=cv,
        n_jobs=-1,
    )
    return search.fit(features, target)

In [8]:
def run_model(models_params, Xs_df, y_df, test_df, test_ids):
    """Fit an estimator, predict the test set and build a submission frame.

    Parameters
    ----------
    models_params : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    Xs_df : array-like
        Training features, passed to ``fit`` unchanged.
    y_df : array-like
        Training target, passed to ``fit`` unchanged.
    test_df : array-like
        Test features, passed to ``predict`` unchanged.
    test_ids : array-like
        Row identifiers written to the ``Id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id`` and ``Target``; ``Target`` holds the back-transformed
        predictions cast to ``int``.
    """
    models_params.fit(Xs_df, y_df)

    # Predictions are mapped back with exp(p) - 1.
    # NOTE(review): this presumes the training target is on a log(1 + y)
    # scale -- confirm against the preprocessing that produced the pickles.
    predictions = np.expm1(models_params.predict(test_df))

    # astype(int) truncates toward zero rather than rounding.
    return pd.DataFrame({'Id': test_ids, 'Target': predictions.astype(int)})

In [5]:
# Targets from the raw training frame, keyed by a short label.
# Insertion order (casual, register, count) is the order the search loops use.
Y_TRAIN_SETS = dict(
    y_casual=y_train_casual,
    y_register=y_train_registered,
    y_count=y_train_count,
)

# Same labels, targets from the standardized training frame.
Y_TRAIN_SETS_STD = dict(
    y_casual=y_train_casual_std,
    y_register=y_train_registered_std,
    y_count=y_train_count_std,
)

### Ridge Model

In [6]:
# Ridge regression on the non-standardized features:
# alpha is tuned separately for each target with 10-fold cross-validation.

for y_name, y in Y_TRAIN_SETS.items():
    print(y_name)

    # Candidate regularization strengths and a fresh estimator per target
    ridge_params = dict(alpha=[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75])
    ridge_model = Ridge()

    gridcv_ridge = gridsearch_cv(
        ridge_model, ridge_params, x_train, y, cv=10
    )
    best_params_ridge, _ = return_parameters(gridcv_ridge, verbose=True)
    print("-------------------------\n")

y_casual
{'alpha': 30} were the best parameters to use
0.326383458165148 was the accuracies
-------------------------

y_register
{'alpha': 1} were the best parameters to use
0.20576026152780735 was the accuracies
-------------------------

y_count
{'alpha': 5} were the best parameters to use
0.19512459762604878 was the accuracies
-------------------------



In [7]:
# Ridge regression on the standardized features:
# alpha is tuned separately for each target with 10-fold cross-validation,
# and the winning parameters are collected in Y_TRAIN_SETS_STD order.
best_params_list = []
for y_name, y in Y_TRAIN_SETS_STD.items():
    print(y_name)

    # Candidate regularization strengths and a fresh estimator per target
    ridge_params = dict(alpha=[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75])
    ridge_model = Ridge()

    gridcv_ridge = gridsearch_cv(
        ridge_model, ridge_params, x_train_std, y, cv=10
    )
    best_params_ridge, _ = return_parameters(gridcv_ridge, verbose=True)
    best_params_list.append(best_params_ridge)
    print("-------------------------\n")

y_casual
{'alpha': 50} were the best parameters to use
0.32604002704501933 was the accuracies
-------------------------

y_register
{'alpha': 5} were the best parameters to use
0.2057563504407891 was the accuracies
-------------------------

y_count
{'alpha': 15} were the best parameters to use
0.1951064012400066 was the accuracies
-------------------------



In [8]:
# Refit Ridge on the standardized features and build the submission frame.
# best_params_list was filled while iterating Y_TRAIN_SETS_STD
# (y_casual, y_register, y_count), so index 2 holds the parameters tuned for
# 'y_count' -- this relies on dict insertion order.
# NOTE(review): the search used y_train_count_std while the fit below uses
# y_train_count -- confirm the two series are interchangeable.
# NOTE(review): the XGBoost search cell rebinds best_params_list, so this
# cell gives different parameters if it is run after that one.
ridge_model = Ridge(**best_params_list[2])
ridge_submission_df = run_model(ridge_model, x_train_std, y_train_count, saved_test_std, saved_datetime)
#ridge_submission_df.to_csv("./saved_submissions/ridge_train_v2-16_08_2018.csv", index=False)
ridge_submission_df.head(5)

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,4
1,2011-01-20 01:00:00,5
2,2011-01-20 02:00:00,6
3,2011-01-20 03:00:00,6
4,2011-01-20 04:00:00,7


### Random Forest

In [9]:
# Random Forest Regression on the non-standardized features (total count).
# random_state is fixed so the forest, and therefore the submission, is the
# same after every Restart & Run All.
rf_model = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)
rf_submission_df = run_model(rf_model, x_train, y_train_count, saved_test, saved_datetime)
#rf_submission_df.to_csv("./saved_submissions/rf_train_v2-28_08_2018.csv", index=False)
rf_submission_df.head(5)

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,11
1,2011-01-20 01:00:00,5
2,2011-01-20 02:00:00,3
3,2011-01-20 03:00:00,3
4,2011-01-20 04:00:00,2


### XGBoost

In [15]:
# XGBoost on the non-standardized features: the learning rate is tuned
# separately for each target with 10-fold cross-validation.
# NOTE: this rebinds best_params_list, replacing the Ridge results that were
# stored under the same name by the Ridge search cell.
best_params_list = []
for y_name, y in Y_TRAIN_SETS.items():
    print("{0}".format(y_name))

    # Grid search over the XGBoost learning rate (eta). Only values inside
    # the documented range [0, 1] are searched; larger step sizes are outside
    # that range and only add wasted cross-validation fits.
    xgb_model = xgb.XGBRegressor()
    xgb_params = {
        'learning_rate': [0.05, 0.1, 0.3, 1]
    }

    xgb_gridcv = gridsearch_cv(xgb_model, xgb_params, x_train, y, cv=10)
    xgb_best_params, _ = return_parameters(xgb_gridcv, verbose=True)
    best_params_list.append(xgb_best_params)
    print("-------------------------\n")

y_casual
{'learning_rate': 0.3} were the best parameters to use
0.23297893813907422 was the accuracies
-------------------------

y_register
{'learning_rate': 0.3} were the best parameters to use
0.09767920999354975 was the accuracies
-------------------------

y_count
{'learning_rate': 0.3} were the best parameters to use
0.09186284337610981 was the accuracies
-------------------------



In [9]:
# Final XGBoost model for the total count, with the learning rate set by hand
xgb_params = dict(learning_rate=.3)

xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_submission_df = run_model(
    xgb_model,
    x_train,
    y_train_count,
    saved_test,
    saved_datetime,
)
#xgb_submission_df.to_csv("./saved_submissions/xgb_train_v2-29_08_2018.csv", index=False)
xgb_submission_df.head(5)

Unnamed: 0,Id,Target
0,2011-01-20 00:00:00,7
1,2011-01-20 01:00:00,4
2,2011-01-20 02:00:00,2
3,2011-01-20 03:00:00,1
4,2011-01-20 04:00:00,1


In [14]:
# List the feature columns of the non-standardized training matrix
x_train.columns

Index(['temp', 'atemp', 'humidity', 'windspeed', 'day', 'hour', 'minute',
       'weekofyear', 'season_spring', 'season_summer', 'season_winter',
       'workingday_Yes', 'year_2012', 'month_Aug', 'month_Dec', 'month_Feb',
       'month_Jan', 'month_Jul', 'month_Jun', 'month_Mar', 'month_May',
       'month_Nov', 'month_Oct', 'month_Sep', 'dayofweek_Monday',
       'dayofweek_Saturday', 'dayofweek_Sunday', 'dayofweek_Thursday',
       'dayofweek_Tuesday', 'dayofweek_Wednesday', 'time_of_day_evening',
       'time_of_day_morning', 'time_of_day_night'],
      dtype='object')

**Scoring: .41643 (397/3251)**

In [17]:
# Show every feature value of the row labelled 0
x_train.loc[0, :]


temp                    9.840
atemp                  14.395
humidity               81.000
windspeed               0.000
day                     1.000
hour                    0.000
minute                  0.000
weekofyear             52.000
season_spring           1.000
season_summer           0.000
season_winter           0.000
workingday_Yes          0.000
year_2012               0.000
month_Aug               0.000
month_Dec               0.000
month_Feb               0.000
month_Jan               1.000
month_Jul               0.000
month_Jun               0.000
month_Mar               0.000
month_May               0.000
month_Nov               0.000
month_Oct               0.000
month_Sep               0.000
dayofweek_Monday        0.000
dayofweek_Saturday      1.000
dayofweek_Sunday        0.000
dayofweek_Thursday      0.000
dayofweek_Tuesday       0.000
dayofweek_Wednesday     0.000
time_of_day_evening     0.000
time_of_day_morning     0.000
time_of_day_night       1.000
Name: 0, d