In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse 

from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_log_error    
from vecstack import stacking
import xgboost as xgb

import pandas as pd
import numpy as np
import datetime as dt
from math import *

# =====================================================================

# Input data set and the per-run results file (timestamped so that
# successive runs never append to each other's scores).
ifile = "D:/Capston/2013_Hour_By_Tract.csv"
scores_cols = ['Test', 'Test_score']
statfile = "D:/Capston/Regression_%s.txt" % dt.datetime.now().strftime("%Y%m%d%H%M%S")
# Not referenced by any code visible in this file.
n_job = 2

# Feature set with the categorical columns kept as single (scaled) columns.
features = [
    'tractID', 'mday', 'hr', 'temp',
    'season', 'weather', 'weekday', 'mth',
    'tempCluster', 'hrCluster',
    'holiday', 'workingday', 'windspeed', 'humidity',
]

# Feature set with season / weather / weekday / mth expanded into one
# indicator column per category (as produced by transform1_data).
features2 = (
    ['tractID', 'mday', 'hr']
    + ['season_%d' % i for i in range(4)]
    + ['weather_%d' % i for i in range(4)]
    + ['weekday_%d' % i for i in range(7)]
    + ['mth_%d' % i for i in range(12)]
    + ['tempCluster', 'hrCluster']
    + ['holiday', 'workingday', 'windspeed', 'humidity']
)

# =====================================================================

def read_data():
    """Load the hourly bike-sharing CSV and prepare it for modelling.

    Reads the file named by the module-level ``ifile``, derives two
    day-of-month columns from ``dteday``, log1p-transforms the three
    ride-count columns and drops the ``dteday`` and ``yr`` columns.

    Returns:
        pandas.DataFrame: the prepared data set.
    """
    ds = pd.read_csv(ifile, sep=',', header=0)
    ds['dteday'] = pd.to_datetime(ds['dteday'], format='%Y-%m-%d')

    # Two copies of the day of month on purpose: ``mday`` is a model feature
    # that transform2_data() rescales, whereas ``sday`` stays raw so that the
    # train/test split on calendar day (<= 20 / > 20) keeps working.
    day_of_month = ds['dteday'].dt.day
    ds['mday'] = day_of_month
    ds['sday'] = day_of_month

    # Vectorised log1p instead of a Python-level loop per column.
    # Unlike math.log1p, values <= -1 produce -inf/NaN rather than raising.
    for count_col in ('casual', 'registered', 'cnt'):
        ds[count_col] = np.log1p(ds[count_col])

    return remove_columns(ds, ['dteday', 'yr'])


def remove_columns(ds, drop_cols):
    """Return a new frame equal to ``ds`` minus the columns in ``drop_cols``.

    ``ds`` itself is left unmodified.
    """
    return ds.drop(labels=drop_cols, axis="columns")

# =====================================================================

def temp_cluster(temp):
    """Map a temperature value to an ordinal band in 0..4.

    Bands are defined by inclusive upper bounds:
    ``<= 1`` -> 0, ``<= 15`` -> 1, ``<= 22`` -> 2, ``<= 31`` -> 3,
    anything else (including NaN) -> 4.
    """
    upper_bounds = (1.0, 15.0, 22.0, 31.0)
    for band, limit in enumerate(upper_bounds):
        if temp <= limit:
            return band
    return len(upper_bounds)


def hr_cluster(hr):
    """Map an hour of day to an ordinal time-of-day band in 0..6.

    Bands are defined by inclusive upper bounds:
    ``<= 6.5`` -> 0, ``<= 7.5`` -> 1, ``<= 8.5`` -> 2, ``<= 16`` -> 3,
    ``<= 18`` -> 4, ``<= 20`` -> 5, anything else (including NaN) -> 6.

    The third band previously tested ``hr > .5`` instead of ``hr > 7.5``;
    the earlier branches masked the typo, so results are unchanged, but the
    bounds are now stated once and correctly.
    """
    if hr <= 6.5:
        return 0
    if hr <= 7.5:
        return 1
    if hr <= 8.5:
        return 2
    if hr <= 16:
        return 3
    if hr <= 18:
        return 4
    if hr <= 20:
        return 5
    return 6


def transform1_data(ds):
    """One-hot encode the categorical columns, then apply transform2_data.

    For each of ``season``, ``weather``, ``weekday`` and ``mth`` this adds
    float indicator columns named ``<column>_<k>``, where ``k`` is the rank
    of the category among the sorted distinct values of that column (the
    same naming/ordering the previous OneHotEncoder-based code produced).
    The indicator columns are added to ``ds`` in place. The frame is then
    passed through transform2_data(), and the original categorical columns
    plus ``temp`` are dropped from the returned frame.

    Args:
        ds: data frame as returned by read_data().

    Returns:
        pandas.DataFrame: transformed frame containing the indicator columns.

    Raises:
        ValueError: if a column holds more distinct values than the number
            of indicator columns reserved for it.
    """
    # Number of indicator columns per categorical feature; these are the
    # columns the one-hot feature list (features2) expects to exist.
    category_counts = (
        ('season', 4),
        ('weather', 4),
        ('weekday', 7),
        ('mth', 12),
    )

    for column, n_categories in category_counts:
        # codes[i] is the rank of row i's value among the sorted distinct
        # values (-1 for missing values, which therefore get all zeros).
        codes, uniques = pd.factorize(ds[column], sort=True)
        if len(uniques) > n_categories:
            raise ValueError(
                "column '%s' has %d distinct values, expected at most %d"
                % (column, len(uniques), n_categories))

        # Always emit the full set of columns: a category that is absent
        # from the data yields an all-zero column instead of a crash.
        for k in range(n_categories):
            ds['%s_%d' % (column, k)] = (codes == k).astype(float)

    ds = transform2_data(ds)

    return remove_columns(ds, ['season', 'weather', 'weekday', 'mth', 'temp'])

        
def transform2_data(df):
    """Add cluster features and rescale the numeric columns of ``df`` in place.

    Steps, in order:
      1. ``hrCluster`` / ``tempCluster`` are derived from the raw ``hr`` and
         ``temp`` values (this must happen before those columns are scaled).
      2. ``tractID`` is min-max scaled; ``mday``, ``hr``, ``weekday``,
         ``weather``, ``temp``, ``mth`` and ``season`` are standardised.
      3. ``windspeed`` and ``humidity`` are log1p-transformed.

    Each scaler is fitted on the whole column of the frame it is given.

    Returns:
        The same ``df`` object, modified.
    """

    def as_matrix(series):
        # The scalers expect a 2-D (n_samples, 1) array.
        return series.values.reshape(-1, 1)

    # Cluster labels come from the unscaled values.
    df['hrCluster'] = df['hr'].map(hr_cluster)
    df['tempCluster'] = df['temp'].map(temp_cluster)

    df['tractID'] = MinMaxScaler().fit_transform(as_matrix(df['tractID'].astype(float)))

    for name in ('mday', 'hr', 'weekday', 'weather'):
        df[name] = StandardScaler().fit_transform(as_matrix(df[name].astype(float)))

    # ``temp`` is passed to the scaler without an explicit float cast.
    df['temp'] = StandardScaler().fit_transform(as_matrix(df['temp']))

    for name in ('mth', 'season'):
        df[name] = StandardScaler().fit_transform(as_matrix(df[name].astype(float)))

    for name in ('windspeed', 'humidity'):
        df[name] = df[name].map(log1p)

    return df


def rmsle(predicted, actual, TestScores, testname):
    """Score one model, log the score to ``statfile`` and record it.

    Computes the root of the mean squared difference between the ``cnt``
    and ``pred`` columns. As ``cnt`` is log1p-transformed by read_data(),
    this is a root-mean-squared *log* error with respect to the raw counts.

    Args:
        predicted: Series named ``pred``, indexed like ``actual``.
        actual: DataFrame holding the true values in a ``cnt`` column.
        TestScores: DataFrame of previously collected scores.
        testname: label written next to the score.

    Returns:
        pandas.DataFrame: ``TestScores`` with the new row appended
        (a new object; the input frame is not modified).

    Side effects:
        Appends one ``testname|score`` line to the file ``statfile``.
    """
    df = pd.concat([actual, predicted], axis=1)

    # Vectorised squared error; divide by the number of rows (not the number
    # of non-NaN rows) exactly as before.
    squared_error = (df['cnt'] - df['pred']) ** 2
    score = sqrt(squared_error.sum() / len(df))

    tmp = pd.DataFrame([[testname, score]], columns=scores_cols)
    tmp.to_csv(statfile, mode='a', header=False, index=False, sep='|')

    # DataFrame.append was removed in pandas 2.x; concat is the equivalent.
    return pd.concat([TestScores, tmp], ignore_index=True)


# =====================================================================

def runRegression(model, testname, Testscores, features):
    """Fit ``model`` on days 1-20 of each month and score it on the rest.

    Uses the module-level ``bike`` frame: rows with ``sday <= 20`` form the
    training set and rows with ``sday > 20`` the test set. The target is the
    ``cnt`` column.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        testname: label under which the score is recorded.
        Testscores: DataFrame of previously collected scores.
        features: list of column names used as model inputs.

    Returns:
        pandas.DataFrame: the score table returned by rmsle(), i.e.
        ``Testscores`` plus one row for this run.
    """
    train = bike.loc[bike['sday'] <= 20]
    test = bike.loc[bike['sday'] > 20]

    X_train = train[features]
    # A plain 1-D array is accepted by every estimator used in this file.
    y_train = train['cnt'].values.ravel()

    X_test = test.loc[:, features]
    y_test = test.loc[:, ['cnt']]

    model.fit(X_train, y_train)

    # Attach the test rows' own index to the predictions. Wrapping them in a
    # fresh DataFrame (index 0..m-1) and assigning that to a column of the
    # test slice made pandas align on labels, which shuffled the predictions
    # onto the wrong rows / filled them with NaN and distorted the score.
    pred = pd.Series(np.ravel(model.predict(X_test)),
                     index=X_test.index, name='pred')

    return rmsle(pred, y_test, Testscores, testname)


# =====================================================================

print("Import 2013 Bike Sharing data")
# Accumulates one (test name, score) row per evaluated model.
TestScores = pd.DataFrame(columns = scores_cols)


if __name__ == '__main__': 
    # ---- Linear models on the scaled, non one-hot feature set -----------
    # NOTE(review): several estimator arguments used below (``normalize``,
    # ``n_iter``, ``max_features='auto'``) appear to have been deprecated or
    # removed in later scikit-learn releases - confirm against the installed
    # version before upgrading.
    bike = read_data()
    bike = transform2_data(bike)
    
    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
          " - Start processing LinearRegression")
    model = LinearRegression(normalize = False)
    TestScores = runRegression(model,'LinearRegression', TestScores, features) 

    # NOTE(review): every solver is recorded under the same
    # 'RidgeRegression' label, so the rows can only be told apart by order.
    for sv in ['auto', 'svd', 'lsqr', 'sparse_cg', 'saga']:
        print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
              " - Start processing RidgeRegression for solver: %s" %(sv))
        model = Ridge(solver=sv, normalize = False, random_state=212)
        TestScores = runRegression(model,'RidgeRegression', TestScores, features)
    

    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
            " - Start processing BayesianRidge")
    model = BayesianRidge(n_iter=1000, normalize = False)
    TestScores = runRegression(model, 'BayesianRidge', TestScores, features)
    
    # ---- Remaining models on the one-hot encoded feature set ------------
    # Reload from disk: transform2_data() modified the previous frame.
    bike = read_data()
    bike = transform1_data(bike)
    
    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
            " - Start processing SGDRegressor")
    model = SGDRegressor(max_iter=1000, random_state=212)
    TestScores = runRegression(model, 'SGDRegressor', TestScores, features2)

# =====================================================================

    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                    " - Start processing RandomForestRegressor")
    model = RandomForestRegressor(n_estimators = 300, max_depth = 35, 
                                  max_features = 'auto', random_state = 120)
    TestScores = runRegression(model, 'RandomForestRegressor', TestScores, features2)
    
    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                      " - Start processing ExtraTreesRegressor")
    model = ExtraTreesRegressor(n_estimators = 300, max_depth =35, 
                                            max_features = 'auto', random_state = 120)
    TestScores = runRegression(model, 'ExtraTreesRegressor', TestScores, features2)


    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                      " - Start processing BaggingRegressor")
    model = BaggingRegressor(n_estimators = 200, bootstrap = True,
                               bootstrap_features = True, random_state = 120)
    TestScores = runRegression(model, 'BaggingRegressor', TestScores, features2)
    

    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                        " - Start processing XGBRegressor")
    model = xgb.XGBRegressor(n_estimators = 250, learning_rate=0.1,
                                         booster='gblinear', random_state = 120)
    TestScores = runRegression(model, 'XGBRegressor', TestScores, features2)
    
# =====================================================================

    # ---- Stacking: three first-level models feed a second-level XGB -----
    tmp = bike.loc[bike['sday'] <= 20]
    X_train = tmp[features2].values
    y_train = tmp['cnt'].values.ravel()

    tmp = bike.loc[bike['sday'] > 20]
    X_test = tmp[features2].values
    # Keep the target as a one-column DataFrame: rmsle() concatenates it with
    # the predictions and reads the 'cnt' column by name.
    y_test = tmp.loc[:, ['cnt']]
    mXgb = xgb.XGBRegressor(n_estimators = 50, learning_rate=0.05,booster='dart', seed = 0)
    mEtr = ExtraTreesRegressor(n_estimators = 200, max_depth = 35,max_features = 'auto', random_state = 0)
    mReg = RandomForestRegressor(n_estimators = 200, max_depth = 35,max_features = 'auto', random_state = 120)
    models = [mXgb, mReg, mEtr]

    # S_train / S_test are the first-level outputs used as inputs below.
    S_train, S_test = stacking(models, X_train, y_train, X_test, regression = True,
                               metric=mean_squared_log_error, n_folds = 10)
    # ``j_jobs`` was a typo for ``n_jobs``.
    model = xgb.XGBRegressor(seed = 0, n_jobs = n_job, learning_rate = 0.1, n_estimators = 200, max_depth = 35)
    
    model = model.fit(S_train, y_train)
    # Index the predictions like the test rows so rmsle() lines them up with
    # the true values (previously a 'pred' entry was written into the target
    # Series itself, which cannot be scored).
    stack_pred = pd.Series(np.ravel(model.predict(S_test)),
                           index = y_test.index, name = 'pred')
    
    TestScores = rmsle(stack_pred, y_test, TestScores, 'Stacking')
    
# =====================================================================

    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 
                    " - Start processing GradientBoostingRegressor")
    model = GradientBoostingRegressor(n_estimators = 200, max_depth = 30,
                                           learning_rate = 0.1)
    TestScores = runRegression(model, 'GradientBoostingRegressor', TestScores, features2)


    





Import 2013 Bike Sharing data
2017-11-26 04:26:30 - Start processing LinearRegression
2017-11-26 04:26:33 - Start processing RidgeRegression for solver: auto
2017-11-26 04:26:35 - Start processing RidgeRegression for solver: svd
2017-11-26 04:26:37 - Start processing RidgeRegression for solver: lsqr
2017-11-26 04:26:39 - Start processing RidgeRegression for solver: sparse_cg
2017-11-26 04:26:41 - Start processing RidgeRegression for solver: saga
2017-11-26 04:26:46 - Start processing BayesianRidge
2017-11-26 04:26:58 - Start processing SGDRegressor
2017-11-26 04:28:10 - Start processing RandomForestRegressor
2017-11-26 04:35:51 - Start processing ExtraTreesRegressor
2017-11-26 04:42:41 - Start processing BaggingRegressor
2017-11-26 04:46:50 - Start processing XGBRegressor
