In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
    
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse 

import pandas as pd
import numpy as np
import datetime as dt
from math import *

# =====================================================================

# Input data set, the column layout of the score table, and the results log.
# The log file name carries the timestamp taken when this module is loaded.
ifile = "D:/Capston/2013_Hour_By_Tract.csv"
scores_cols = ['Test', 'Test_score']
statfile = "D:/Capston/RegTest_%s.txt" % dt.datetime.now().strftime("%Y%m%d%H%M%S")

# Feature set with the categorical columns kept as single columns.
features = [
    'tractID', 'mday', 'hr', 'temp',
    'season', 'weather', 'weekday', 'mth',
    # 'tempCluster', 'hrCluster',
    'holiday', 'workingday', 'windspeed', 'humidity',
]

# Feature set with season/weather/weekday/mth expanded into one-hot columns
# (4, 4, 7 and 12 indicator columns respectively) plus the two cluster columns.
features2 = (
    ['tractID', 'mday', 'hr']
    + ['season_%d' % i for i in range(4)]
    + ['weather_%d' % i for i in range(4)]
    + ['weekday_%d' % i for i in range(7)]
    + ['mth_%d' % i for i in range(12)]
    + ['tempCluster', 'hrCluster',
       'holiday', 'workingday', 'windspeed', 'humidity']
)

n_job = 4
# score = 'neg_mean_squared_error'

# =====================================================================

def read_data():
    """Load the CSV named by ``ifile`` and prepare the target columns.

    Steps, in order:
      * parse ``dteday`` as a ``%Y-%m-%d`` date and derive ``mday`` (day of
        the month) from it;
      * replace ``casual``, ``registered`` and ``cnt`` by ``log1p`` of their
        values;
      * drop the ``dteday`` and ``yr`` columns.

    Returns:
        The prepared DataFrame.
    """
    frame = pd.read_csv(ifile, sep=',', header=0)

    parsed_dates = pd.to_datetime(frame['dteday'], format='%Y-%m-%d')
    frame['dteday'] = parsed_dates
    frame['mday'] = parsed_dates.dt.day

    # The three count columns are modelled on the log(1 + x) scale.
    for target in ('casual', 'registered', 'cnt'):
        frame[target] = [log1p(value) for value in frame[target]]

    return remove_columns(frame, ['dteday', 'yr'])


def remove_columns(ds, drop_cols):
    """Return a copy of ``ds`` without the columns listed in ``drop_cols``.

    The input frame itself is left untouched; ``DataFrame.drop`` raises
    ``KeyError`` if a listed column is missing.
    """
    return ds.drop(labels=drop_cols, axis='columns')

# =====================================================================

def temp_cluster(temp):
    """Map a temperature reading to one of five ordinal buckets (0-4).

    Bucket upper bounds are inclusive: <=1 -> 0, <=15 -> 1, <=22 -> 2,
    <=31 -> 3, anything else (including NaN) -> 4.
    """
    if temp <= 1.0:
        return 0
    if temp <= 15.0:
        return 1
    if temp <= 22.0:
        return 2
    if temp <= 31.0:
        return 3
    return 4


def hr_cluster(hr):
    """Map an hour of the day to one of seven ordinal buckets (0-6).

    Bucket upper bounds are inclusive: <=6.5 -> 0, <=7.5 -> 1, <=8.5 -> 2,
    <=16 -> 3, <=18 -> 4, <=20 -> 5, anything else (including NaN) -> 6.

    The bounds are kept in a single ascending table so that each bucket is
    defined by its upper bound only. The previous hand-written chain carried
    a mistyped lower bound (``hr > .5`` instead of ``hr > 7.5``) that only
    produced the right answer because of branch ordering.
    """
    upper_bounds = (6.5, 7.5, 8.5, 16, 18, 20)
    for bucket, upper in enumerate(upper_bounds):
        if hr <= upper:
            return bucket
    return len(upper_bounds)
     

def transform1_data(ds):
    """Add one-hot columns for season/weather/weekday/mth, then finish preprocessing.

    For each of the four categorical columns a fresh one-hot encoding is
    fitted and its columns are stored as ``<name>_0 .. <name>_k``. The frame
    is then passed through ``transform2_data`` and the original ``season``,
    ``weather``, ``weekday``, ``mth`` and ``temp`` columns are dropped.

    ``ds`` is modified in place by the column assignments; the returned frame
    is the result of the final column drop.
    """

    enc = OneHotEncoder(sparse=False)  
    n = ds['season'].shape[0]
    # NOTE(review): the csr_matrix -> todense -> reshape(n, 1, k) round trip looks
    # redundant (the encoder is already asked for dense output) — confirm before
    # simplifying. The hard-coded widths assume exactly 4 seasons, 4 weather
    # codes, 7 weekdays and 12 months are present in the data; presumably the
    # reshape fails otherwise — TODO confirm.
    # NOTE(review): each one-column DataFrame is assigned into ds, which presumably
    # aligns on the index, so ds is assumed to carry the default 0..n-1 index.
    enc_array = pd.DataFrame(sparse.csr_matrix(enc.fit_transform(ds['season'].values.reshape(-1, 1))).todense().reshape(n, 1, 4))
    ds['season_0'] = enc_array[[0]]
    ds['season_1'] = enc_array[[1]] 
    ds['season_2'] = enc_array[[2]] 
    ds['season_3'] = enc_array[[3]]     

    enc_array = pd.DataFrame(sparse.csr_matrix(enc.fit_transform(ds['weather'].values.reshape(-1, 1))).todense().reshape(n, 1, 4))
    ds['weather_0'] = enc_array[[0]]
    ds['weather_1'] = enc_array[[1]] 
    ds['weather_2'] = enc_array[[2]] 
    ds['weather_3'] = enc_array[[3]]     

    enc_array = pd.DataFrame(sparse.csr_matrix(enc.fit_transform(ds['weekday'].values.reshape(-1, 1))).todense().reshape(n, 1, 7))
    ds['weekday_0'] = enc_array[[0]]
    ds['weekday_1'] = enc_array[[1]] 
    ds['weekday_2'] = enc_array[[2]] 
    ds['weekday_3'] = enc_array[[3]]     
    ds['weekday_4'] = enc_array[[4]]
    ds['weekday_5'] = enc_array[[5]] 
    ds['weekday_6'] = enc_array[[6]] 
    
    enc_array = pd.DataFrame(sparse.csr_matrix(enc.fit_transform(ds['mth'].values.reshape(-1, 1))).todense().reshape(n, 1, 12))
    ds['mth_0'] = enc_array[[0]]
    ds['mth_1'] = enc_array[[1]] 
    ds['mth_2'] = enc_array[[2]] 
    ds['mth_3'] = enc_array[[3]]     
    ds['mth_4'] = enc_array[[4]]
    ds['mth_5'] = enc_array[[5]] 
    ds['mth_6'] = enc_array[[6]] 
    ds['mth_7'] = enc_array[[7]]
    ds['mth_8'] = enc_array[[8]] 
    ds['mth_9'] = enc_array[[9]] 
    ds['mth_10'] = enc_array[[10]]     
    ds['mth_11'] = enc_array[[11]]

    # The one-hot columns must be built before this call: transform2_data
    # overwrites season/weather/weekday/mth with standardised values.
    ds = transform2_data(ds)
    
    # The raw categorical columns (and temp, replaced by tempCluster) are not
    # part of the one-hot feature list.
    return remove_columns(ds, ['season', 'weather', 'weekday', 'mth', 'temp'])

        
def transform2_data(df):
    """Add the cluster columns and rescale the numeric features in place.

    * ``hrCluster`` / ``tempCluster`` are derived from the raw ``hr`` and
      ``temp`` values via ``hr_cluster`` / ``temp_cluster``.
    * ``tractID`` is min-max scaled; ``mday``, ``hr``, ``weekday``,
      ``weather``, ``temp``, ``mth`` and ``season`` are standardised.
    * ``windspeed`` and ``humidity`` are replaced by ``log1p`` of their values.

    Args:
        df: frame containing all of the columns named above; it is modified
            in place.

    Returns:
        The same ``df`` object.
    """
    # The buckets are defined on the raw scale, so they have to be computed
    # before ``hr`` and ``temp`` are standardised below. A column-wise apply
    # avoids materialising every row as a Series just to read one field.
    df['hrCluster'] = df['hr'].apply(hr_cluster)
    df['tempCluster'] = df['temp'].apply(temp_cluster)

    df['tractID'] = MinMaxScaler().fit_transform(
        df['tractID'].astype(float).values.reshape(-1, 1)).ravel()

    # Each column gets its own freshly fitted scaler.
    for col in ('mday', 'hr', 'weekday', 'weather', 'temp', 'mth', 'season'):
        df[col] = StandardScaler().fit_transform(
            df[col].astype(float).values.reshape(-1, 1)).ravel()

    # math.log1p is kept on purpose: it raises on values <= -1 instead of
    # silently producing NaN.
    for col in ('windspeed', 'humidity'):
        df[col] = [log1p(x) for x in df[col]]

    return df


def rmsle(predicted, actual, TestScores, testname):
    """Compute the RMSLE of one test, log it, and add it to the score table.

    Args:
        predicted: predictions; must end up as a column named ``pred`` when
            concatenated with ``actual`` (e.g. a Series named ``pred``).
        actual: true values; must provide a column named ``cnt``.
        TestScores: DataFrame of the scores collected so far.
        testname: label stored in the first score column.

    Returns:
        A new DataFrame holding the previous rows of ``TestScores`` plus the
        row for this test. ``TestScores`` itself is not modified, so callers
        must use the return value.

    Raises:
        ValueError: if any actual or predicted value is <= -1, for which
            log1p is undefined.

    Side effects:
        Appends ``testname|score`` as one line to ``statfile``.
    """
    df = pd.concat([actual, predicted], axis = 1)

    cnt = df['cnt'].astype(float)
    pred = df['pred'].astype(float)

    # Fail loudly on out-of-domain inputs instead of letting numpy turn them
    # into NaN / -inf and poison the score.
    if ((cnt <= -1) | (pred <= -1)).any():
        raise ValueError("log1p is undefined for values <= -1")

    # NOTE(review): read_data() already stores log1p(cnt), so this takes
    # log1p of values that are on the log scale already — confirm that the
    # double transform is intended before comparing scores with other work.
    err = (np.log1p(cnt) - np.log1p(pred)) ** 2

    #mean_squared_log_error(actual, predicted)
    tmp = pd.DataFrame([[testname, sqrt(err.sum() / len(df))]],
                       columns = scores_cols)

    tmp.to_csv(statfile, mode='a', header = False, index = False, sep = '|')

    # DataFrame.append no longer exists in pandas 2.x; concat is the
    # replacement. Skipping the concat for an empty table avoids pandas'
    # warning about concatenating empty frames.
    if TestScores.empty:
        return tmp
    return pd.concat([TestScores, tmp], ignore_index = True)

# =====================================================================

def linearregression(X_data, y_data, TestScores):
    """Score an ordinary least-squares model with 10-fold cross-validation.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by a ``LinearRegression`` row (see ``rmsle``).
    """
    from sklearn.linear_model import LinearRegression
    
    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
          " - Start processing LinearRegression")

    # Predictions are kept out of the feature frame: writing them into
    # X_data would make every later model train on this model's output.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    # `normalize` is not passed: False was its default, and the argument has
    # been removed from current scikit-learn releases.
    model = LinearRegression()
    pred = pd.Series(np.ravel(cross_val_predict(model, train_X, y_data, cv=10)),
                     index=X_data.index, name='pred')

    return rmsle(pred, y_data, TestScores, 'LinearRegression')

# =====================================================================

def ridgeregression(X_data, y_data, TestScores):
    """Score Ridge regression with 10-fold cross-validation, once per solver.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by one ``RidgeRegression-<solver>`` row per
        solver (see ``rmsle``).
    """
    from sklearn.linear_model import Ridge
    
    # Predictions are kept out of the feature frame: writing them into
    # X_data would feed one solver's output to the next solver as a feature.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    for sv in ['auto', 'svd', 'lsqr', 'sparse_cg', 'saga']:
        print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
              " - Start processing RidgeRegression for solver: %s" %(sv))

        # `normalize` is not passed: False was its default, and the argument
        # has been removed from current scikit-learn releases.
        model = Ridge(solver=sv, random_state=212)
        pred = pd.Series(np.ravel(cross_val_predict(model, train_X, y_data, cv=10)),
                         index=X_data.index, name='pred')

        TestScores = rmsle(pred, y_data, TestScores, 'RidgeRegression-' + sv)

    return TestScores

# =====================================================================

def bayesianridge(X_data, y_data, TestScores):
    """Score Bayesian ridge regression with 10-fold cross-validation.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by a ``BayesianRidge`` row (see ``rmsle``).
    """
    from sklearn.linear_model import BayesianRidge
        
    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
            " - Start processing BayesianRidge")

    # Predictions are kept out of the feature frame: writing them into
    # X_data would make every later model train on this model's output.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    # The iteration cap is called `max_iter` in newer scikit-learn and
    # `n_iter` in older releases; an unknown keyword raises TypeError at
    # construction, so try the new name first. `normalize` is not passed:
    # False was its default and the argument has since been removed.
    try:
        model = BayesianRidge(max_iter=1000)
    except TypeError:
        model = BayesianRidge(n_iter=1000)

    pred = pd.Series(
        np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
        index=X_data.index, name='pred')

    return rmsle(pred, y_data, TestScores, 'BayesianRidge')
          
# =====================================================================

def sgdregressor(X_data, y_data, TestScores):
    """Score an SGD-trained linear regressor with 10-fold cross-validation.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by an ``SGDRegressor`` row (see ``rmsle``).
    """
    from sklearn.linear_model import SGDRegressor
    
    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
            " - Start processing SGDRegressor")

    # Predictions are kept out of the feature frame: writing them into
    # X_data would make every later model train on this model's output.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    model = SGDRegressor(max_iter=1000, random_state=212)
    pred = pd.Series(
        np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
        index=X_data.index, name='pred')

    return rmsle(pred, y_data, TestScores, 'SGDRegressor')
          
# =====================================================================

def gradientboostingregression(X_data, y_data, TestScores):
    """Grid-score GradientBoostingRegressor with 10-fold cross-validation.

    The grid is n_estimators x max_depth at a fixed learning rate of 0.1.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by one
        ``GradientBoostingRegressor <n_estimators>-<max_depth>`` row per grid
        point (see ``rmsle``).
    """
    from sklearn.ensemble import GradientBoostingRegressor

    # Predictions are kept out of the feature frame: writing them into
    # X_data would feed one grid point's output to the next as a feature.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    # A wider grid (50..400 in steps of 50) was used earlier; only the upper
    # end is searched now. 300 used to be listed twice, which repeated five
    # of the fits for no benefit.
    for ne in [250, 300, 350, 400]:
        for md in [30, 35, 40, 45, 50]:
            print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                    " - Start processing GradientBoostingRegressor for n_estimator:%s  max_depth:%s" %(ne, md))
            model = GradientBoostingRegressor(n_estimators = ne, max_depth = md,
                                            learning_rate = 0.1)
            pred = pd.Series(
                np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
                index=X_data.index, name='pred')

            TestScores = rmsle(pred, y_data, TestScores, "GradientBoostingRegressor %s-%s" %(ne, md))  

    return TestScores

# =====================================================================

def randomforestregression(X_data, y_data, TestScores):
    """Grid-score RandomForestRegressor with 10-fold cross-validation.

    The grid is n_estimators x max_depth with a fixed random seed.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by one
        ``RandomForestRegressor <n_estimators>-<max_depth>`` row per grid
        point (see ``rmsle``).
    """
    # Predictions are kept out of the feature frame: writing them into
    # X_data would feed one grid point's output to the next as a feature.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    for ne in [300, 350, 400,450, 500, 550, 600]:
        for md in [30, 35, 40, 45, 50]:
            print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                    " - Start processing RandomForestRegressor for n_estimator:%s  max_depth:%s" %(ne, md))
            # max_features=None considers all features at each split, which
            # is what 'auto' meant for regressors; 'auto' itself is rejected
            # by current scikit-learn releases.
            model = RandomForestRegressor(n_estimators = ne, max_depth = md, 
                                        max_features = None, random_state = 120)
            pred = pd.Series(
                np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
                index=X_data.index, name='pred')

            TestScores = rmsle(pred, y_data, TestScores, "RandomForestRegressor %s-%s" %(ne, md))                                    

    return TestScores

# =====================================================================

def extratreesregression(X_data, y_data, TestScores):
    """Grid-score ExtraTreesRegressor with 10-fold cross-validation.

    The grid is n_estimators x max_depth with a fixed random seed.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by one
        ``ExtraTreesRegressor <n_estimators>-<max_depth>`` row per grid point
        (see ``rmsle``).
    """
    # Predictions are kept out of the feature frame: writing them into
    # X_data would feed one grid point's output to the next as a feature.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    for ne in [100, 150, 200, 250, 300, 350, 400]:
        for md in [20, 25, 30, 35, 40, 45, 50]:
            print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                  " - Start processing ExtraTreesRegressor for n_estimator:%s  max_depth:%s" %(ne, md))
            # max_features=None considers all features at each split, which
            # is what 'auto' meant for regressors; 'auto' itself is rejected
            # by current scikit-learn releases.
            model = ExtraTreesRegressor(n_estimators = ne, max_depth = md, 
                                        max_features = None, random_state = 120)
            pred = pd.Series(
                np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
                index=X_data.index, name='pred')

            TestScores = rmsle(pred, y_data, TestScores, "ExtraTreesRegressor %s-%s" %(ne, md))                                       

    return TestScores

# =====================================================================

def baggingregression(X_data, y_data, TestScores):
    """Score BaggingRegressor with 10-fold cross-validation over n_estimators.

    Both samples and features are drawn with replacement
    (``bootstrap=True``, ``bootstrap_features=True``).

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by one ``BaggingRegressor <n_estimators>``
        row per setting (see ``rmsle``).
    """
    from sklearn.ensemble import BaggingRegressor
    
    # Predictions are kept out of the feature frame: writing them into
    # X_data would feed one setting's output to the next as a feature.
    train_X = X_data.drop('pred', axis=1, errors='ignore')

    for ne in [100, 150, 200, 250, 300, 350, 400]:
        print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                      " - Start processing BaggingRegressor for n_estimator:%s" %(ne))
        model = BaggingRegressor(n_estimators = ne, bootstrap = True,
                               bootstrap_features = True, random_state = 120)
        pred = pd.Series(
            np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
            index=X_data.index, name='pred')

        TestScores = rmsle(pred, y_data, TestScores, "BaggingRegressor %s" %(ne))

    return TestScores

# =====================================================================

def xgboost(X_data, y_data, TestScores):
    """Grid-score XGBRegressor with 10-fold cross-validation.

    The grid is n_estimators x learning_rate x booster with a fixed random
    seed.

    Args:
        X_data: feature DataFrame. It is not modified; a ``pred`` column left
            in it by other code is ignored.
        y_data: target with a ``cnt`` column, row-aligned with ``X_data``.
        TestScores: score table collected so far.

    Returns:
        The score table extended by one
        ``XGBRegressor <n_estimators>-<learning_rate>-<booster>`` row per
        grid point (see ``rmsle``).
    """
    # Predictions are kept out of the feature frame: writing them into
    # X_data would feed one grid point's output to the next as a feature.
    train_X = X_data.drop('pred', axis=1, errors='ignore')
    
    for ne in [200, 250, 300, 350, 400]:
        for lr in [0.05, 0.1]:
            for bt in ['gblinear', 'dart']:
                print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                        " - Start processing XGBRegressor for n_estimator: %s learning-rate: %s booster: %s" %(ne, lr, bt))
                model = xgb.XGBRegressor(n_estimators = ne, learning_rate=lr,
                                         booster=bt, random_state = 120)
                pred = pd.Series(
                    np.ravel(cross_val_predict(model, train_X, y_data.values.ravel(), cv=10)),
                    index=X_data.index, name='pred')

                TestScores = rmsle(pred, y_data, TestScores, "XGBRegressor %s-%s-%s" %(ne, lr, bt))

    return TestScores

# =====================================================================

def stacking(X_data, y_data, TestScores=None):
    """Fit a two-level stacked ensemble and score it on a hold-out split.

    Level 1 (XGBoost, random forest, extra trees) is fitted through
    ``vecstack.stacking`` with 10 folds on the rows where ``mday <= 20``; a
    second XGBoost model is then trained on the level-1 out-of-fold
    predictions and evaluated on the rows where ``mday > 20``.

    Args:
        X_data: feature DataFrame containing every column in ``features2``.
        y_data: target DataFrame with a ``cnt`` column, row-aligned with
            ``X_data``.
        TestScores: optional score table collected so far; a fresh empty
            table is used when omitted.

    Returns:
        The score table extended by a ``Stacking`` row (see ``rmsle``).

    Raises:
        ValueError: if the ``mday > 20`` hold-out split is empty.
    """
    from sklearn.metrics import mean_squared_log_error    
    # Aliased so the library function does not share this function's name.
    from vecstack import stacking as vecstack_stacking

    if TestScores is None:
        TestScores = pd.DataFrame(columns = scores_cols)

    # NOTE(review): transform2_data() standardises ``mday``, so on data that
    # went through it the threshold 20 is compared against z-scores and the
    # hold-out split below comes out empty. The split needs the raw day of
    # month — confirm and carry an unscaled copy if so.
    train_rows = X_data['mday'] <= 20
    test_rows = X_data['mday'] > 20
    if not test_rows.any():
        raise ValueError("stacking: no rows with mday > 20 to evaluate on")

    X_train = X_data.loc[train_rows, features2].values
    y_train = y_data.loc[train_rows, 'cnt'].values.ravel()
    X_test = X_data.loc[test_rows, features2].values
    # Kept as a one-column frame so rmsle() finds its 'cnt' column.
    y_test = y_data.loc[test_rows, ['cnt']]

    # max_features=None considers all features at each split, which is what
    # 'auto' meant for regressors; 'auto' is rejected by current scikit-learn.
    mXgb = xgb.XGBRegressor(n_estimators = 50, learning_rate=0.05,
                            booster='dart', random_state = 0)
    mEtr = ExtraTreesRegressor(n_estimators = 200, max_depth = 35,
                               max_features = None, random_state = 0)
    mReg = RandomForestRegressor(n_estimators = 200, max_depth = 35,
                                 max_features = None, random_state = 120)
    models = [mXgb, mReg, mEtr]

    S_train, S_test = vecstack_stacking(models, X_train, y_train, X_test,
                                        regression = True,
                                        metric=mean_squared_log_error, n_folds = 10)

    # `n_jobs` was previously misspelled as `j_jobs`.
    model = xgb.XGBRegressor(random_state = 0, n_jobs = 2, learning_rate = 0.1,
                             n_estimators = 200, max_depth = 35)
    model = model.fit(S_train, y_train)

    # Predictions get the hold-out index so rmsle() lines them up with y_test.
    pred = pd.Series(np.ravel(model.predict(S_test)), index=y_test.index, name='pred')

    return rmsle(pred, y_test, TestScores, "Stacking")

# =====================================================================

print("Import 2013 Bike Sharing data")
TestScores = pd.DataFrame(columns = scores_cols)

if __name__ == '__main__':
    # Every model runner returns the extended score table instead of
    # updating the one passed in, so the result has to be rebound each time;
    # otherwise TestScores stays empty for the whole run.

    # Linear Regression perform much better without one-hot encode data
    bike = read_data()
    bike = transform2_data(bike)
    y = bike[['cnt']]
    X = bike.loc[:,features]
    TestScores = linearregression(X, y, TestScores)  # LinearRegression|0.2961375389416182

    TestScores = ridgeregression(X, y, TestScores)
    # RidgeRegression-auto|0.29631404684815926
    # RidgeRegression-svd|0.29932419093854423

    TestScores = bayesianridge(X, y, TestScores)  # BayesianRidge|0.2976985295048054

    # The remaining models use the one-hot encoded feature set.
    bike = read_data()
    bike = transform1_data(bike)
    y = bike[['cnt']]
    X = bike.loc[:,features2]
    
    TestScores = sgdregressor(X, y, TestScores)
        
    stacking(X, y)

    TestScores = randomforestregression(X, y, TestScores)

    TestScores = extratreesregression(X, y, TestScores)

    TestScores = baggingregression(X, y, TestScores)

    TestScores = xgboost(X, y, TestScores)

    TestScores = gradientboostingregression(X, y, TestScores)
    


Import 2013 Bike Sharing data
2017-11-26 04:55:19 - Start processing LinearRegression
2017-11-26 04:55:28 - Start processing RidgeRegression for solver: auto
2017-11-26 04:55:36 - Start processing RidgeRegression for solver: svd
2017-11-26 04:55:45 - Start processing RidgeRegression for solver: lsqr
2017-11-26 04:55:54 - Start processing RidgeRegression for solver: sparse_cg
2017-11-26 04:56:02 - Start processing RidgeRegression for solver: saga
2017-11-26 04:56:40 - Start processing BayesianRidge
2017-11-26 04:57:00 - Start processing SGDRegressor
