In [1]:
# Todo: Update the modeling function. for some weird reason, the cross validation seems to be a weird one
# Since we are adding a function ("make_scorer")


In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
import xgboost as xgb

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer
from ml_metrics import rmsle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

In [3]:
# Importing the datasets that will be used

training_set = pd.read_csv("./archive/train.csv", parse_dates=True)
testing_set = pd.read_csv("./archive/test.csv", parse_dates=True)
train_test_set = pd.concat([training_set, testing_set], sort=True)

In [4]:
train_test_set['count_log'] = np.log(train_test_set['count']+1)
train_test_set['casual_log'] = np.log(train_test_set['casual']+1)
train_test_set['registered_log'] = np.log(train_test_set['registered']+1)
train_test_set['is_test'] = train_test_set['count'].isnull()

In [5]:
def remove_unwanted_vars(data, unwanted_list):
    feats = [feat for feat in data.columns if feat not in unwanted_list]
    return feats

def get_Xs_y(data, unwanted_list, target='count'):
    feats = remove_unwanted_vars(data, unwanted_list)
    return data[feats].values, data[target].values


In [6]:
# The following are the functions that can expediate the code
def rmsle_custom(actual, predicted):
    sle = (np.power(np.log(np.array((actual))+1) - 
            np.log(np.array(np.abs(predicted))+1), 2))
    msle = np.mean(sle)
    return (np.sqrt(msle))

def regression_models(fitted_model, Xs, y, cv=10):
    rmsle_score = make_scorer(rmsle_custom, greater_is_better=False)
    model_score = cross_val_score(fitted_model, Xs.values, y.values, cv=cv, scoring=rmsle_score)
    return np.abs(np.mean(model_score))

def return_parameters(gridsearch, verbose=False):
    params = gridsearch.best_params_
    accuracy = gridsearch.best_score_
    if verbose:
        print('{0} were the best parameters to use'.format(params))
        print('{0} was the accuracies'.format(np.abs(accuracy)))
    return [params, np.abs(accuracy)]

def gridsearch_cv(model, params, cv_iters, X, y):
    rmsle_score = make_scorer(rmsle_custom, greater_is_better=False)
    grid_search = GridSearchCV(estimator=model(),
                               param_grid=params,
                               scoring=rmsle_score,
                               cv=2, n_jobs=-1)
    fitted_model = grid_search.fit(X, y)
    return fitted_model


In [7]:
# Noticed that we have date that can be parsed in many ways

def feature_enginering(data):
    data['datetime'] = pd.to_datetime(data['datetime'])
    data['year'] = data['datetime'].dt.year
    data['month'] = data['datetime'].dt.month
    data['day'] = data['datetime'].dt.day
    data['hour'] = data['datetime'].dt.hour
    data['minute'] = data['datetime'].dt.minute
    data['dayofweek'] = data['datetime'].dt.dayofweek
    data['weekofyear'] = data['datetime'].dt.weekofyear

    data['weekend'] = data['dayofweek'].map(lambda x: int(x in [5,6]))

    # Conditional on time of day, morning=1, afternoon=2, evening=3, night=4
    conditions = [
        ((data['hour'] >=  5) & (data['hour'] < 12)),
        ((data['hour'] >=  12) & (data['hour'] < 17)),
        ((data['hour'] >=  17) & (data['hour'] < 21))
    ]
    choices = [1, 2, 3]
    data['time_of_day'] = np.select(conditions, choices, default=4)


    data['rush_hour'] = training_set['hour'].apply(lambda x: int(x in [8,9,10,17,18,19]))
    data['rush_workday'] = 0
    data.loc[data['weekend'] == 0, 'rush_workday'] = 1
    
    
    data['holiday'] = data[['month', 'day', 'holiday', 'year']].apply(
        lambda x: (x['holiday'], 1)[x['year'] == 2012 and x['month'] == 10 and (x['day'] in [30])], axis = 1)

    data['holiday'] = data[['month', 'day', 'holiday']].apply(
        lambda x: (x['holiday'], 1)[x['month'] == 12 and (x['day'] in [24, 26, 31])], axis = 1)


    return data

In [8]:
# From the BikeAnalysis: I will check the performance of the 15-variable feature selection

features_1 = ['season', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 
              'month', 'day', 'hour', 'dayofweek', 'weekofyear', 'time_of_day', 'rush_hour']
features_xgb = ['season','holiday','workingday','weather','temp','atemp','humidity', 'windspeed',
                'year','month','hour','dayofweek','weekofyear','time_of_day','rush_hour']

In [9]:
# One good method will be to compute the count, registered, and casual

y_count = training_set['count'].astype(float)
y_reg = training_set['casual'].astype(float)
y_cas = training_set['registered'].astype(float)

### Grid Search

In [10]:
training_set = feature_enginering(training_set)
Xs, y = training_set[features_1], training_set['count'].astype(float)
y_reg, y_cas = training_set['registered'], training_set['casual']



- From Analysis, we found the extra_tree is the best model.

In [None]:
# GradientBoostingRegressor
# Learning rate: .5, min_samples_split=5, loss=lad, max_depth=5, max_depth=6

params = { 
    'n_estimators': [1000,2000,3000,4000],
    'max_features': ["auto","sqrt","log2",0.6,0.8],
    'min_samples_leaf':[30,40,50,60,70],
    'min_samples_split':[150,200,250,300],
    'max_depth' : [10,15,20,25],
    'subsample': [0.4,0.6,0.8],
    'learning_rate':[0.1,0.01,0.001]
}

gridcv_gradboost = gridsearch_cv(GradientBoostingRegressor, params, 10, Xs, y)
return_parameters(gridcv_gradboost)

- Predicted and actual are using the absolute values
    - 0.6785850210551115
- actual is absolute values
    - nan
- Predicted is absolute values
    - 0.6799188779582174

In [91]:
# Fitting the model with the best parameters

gbm_params = {
    'n_estimators': 50, 'max_depth': 6, 'random_state': 0, 'min_samples_split': 5, 
    'learning_rate': 0.5, 'loss': 'lad'
}

gbm_model = GradientBoostingRegressor(**gbm_params)
gbm_model_mean = regression_models(gbm_model, Xs, y)

gbm_model_mean

0.6863817055451075

In [14]:
# Xgboost Regressor
# Grid Search Step 1 -> max_depth = 5, learning_rate=.1, n_estimators=50

# Using a new set of features
# accuracy was .58 with "features_1"

xgboost_params = {
    'max_depth': [7,8,9], 'learning_rate': [.1], 
    'n_estimators': [50]
}

# gridcv_xgboost = gridsearch_cv(xgb.XGBRegressor, xgboost_params, 10, Xs, y)
gridcv_xgboost = gridsearch_cv(xgb.XGBRegressor, xgboost_params, 10, training_set[features_xgb], y)
return_parameters(gridcv_xgboost)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50} were the best parameters to use
-0.5868346520967258 was the accuracies


In [20]:
# Fitting the model with the best parameters

xgb_params = {
    'max_depth': 7, 'learning_rate': .1, 'random_state': 0, 'n_estimators': 50
}
 
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model_mean = regression_models(xgb_model, Xs, y)
xgb_model_mean

0.5867960084359407

In [28]:
# xgb_model_mean_cas->-0.6851131606707945
# xgb_model_mean_cas->-0.6489501931980225
# xgb_model_mean->-0.6195904516161043

In [37]:
# Extratree regressor

extratree_parms = {
    'n_estimators': [10,20,30], 'max_depth': [5,7,9], 'min_samples_split': [2,4,6],
    'min_samples_leaf': [1,3,5], 'max_features': ['auto', 'sqrt', 'log2']
}

gridcv_extratrees = gridsearch_cv(ExtraTreesRegressor, extratree_parms, 10, Xs, y)
return_parameters(gridcv_extratrees)

[{'max_depth': 9,
  'max_features': 'auto',
  'min_samples_leaf': 3,
  'min_samples_split': 4,
  'n_estimators': 30},
 0.5382425911023736]

In [36]:
# Fitting the model with the best parameters

extratree_params = {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 
              'min_samples_split': 4, 'n_estimators': 20}
 
extratree_model = ExtraTreesRegressor(**extratree_params)
extratree_model_mean = regression_models(extratree_model, Xs, y)
extratree_model_mean

-0.5434157102695787

In [24]:
# Does splitting the data btw casual and registered make any differenced (what is the average split?)
# It does not! (atleast using the analysis)

sum_count = sum(training_set['count'])
sum_casual = sum(training_set['casual'])
sum_registered = sum(training_set['registered'])

sum_casual/sum_count, sum_registered/sum_count

(0.188031413451893, 0.811968586548107)

### Predict

In [29]:
testing_set = feature_enginering(testing_set)
xgb_model = xgb_model.fit(training_set[features_1], training_set['count'])

In [32]:
# Test Predictions
predict = xgb_model.predict(testing_set[features_1])
predict[predict < 0] = 1.5
testing_set['count'] = predict

In [33]:
testing_set[['datetime', 'count']].to_csv('final_submit_6.30.csv', index=False)