In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
    
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse 

import pandas as pd
import numpy as np
import datetime as dt
from math import *

# =====================================================================

# Input CSV and the '|'-separated statistics file; the statistics file name
# embeds the timestamp taken when this module is loaded.
ifile = "D:/Capston/2013_Hour_By_Tract.csv"
statfile = "D:/Capston/RegStat_" + dt.datetime.now().strftime("%Y%m%d%H%M%S") + ".txt"

# Column layout of one row of cross-validation results.
scores_cols = ['Test', 'fit_time', 'score_time', 'Test_score', 'Train_score']

# Model inputs: raw columns, the two hand-made cluster columns, and the
# positional one-hot columns (4 seasons, 4 weather codes, 7 weekdays,
# 12 months).
features = (
    ['tractID', 'mday', 'hr', 'tempCluster', 'hrCluster']
    + ['season_%d' % i for i in range(4)]
    + ['weather_%d' % i for i in range(4)]
    + ['weekday_%d' % i for i in range(7)]
    + ['mth_%d' % i for i in range(12)]
    + ['holiday', 'workingday', 'windspeed', 'humidity']
)

# Shared cross_validate settings: worker count and scoring metric.
n_job = 4
score = 'neg_mean_squared_error'

# =====================================================================

def read_data():
    """Load the CSV named by ``ifile`` and derive the day-of-month column.

    The ``dteday`` column is parsed as ``YYYY-MM-DD`` and its day number is
    stored in a new ``mday`` column.

    Returns:
        The loaded DataFrame with ``dteday`` and ``yr`` removed.
    """
    frame = pd.read_csv(ifile, sep=',', header=0)

    parsed_dates = pd.to_datetime(frame['dteday'], format='%Y-%m-%d')
    frame['dteday'] = parsed_dates
    frame['mday'] = parsed_dates.dt.day

    return remove_columns(frame, ['dteday', 'yr'])


def remove_columns(ds, drop_cols):
    """Return a copy of ``ds`` without the columns listed in ``drop_cols``.

    Args:
        ds: source DataFrame (left untouched).
        drop_cols: column labels to drop.
    """
    return ds.drop(labels=drop_cols, axis=1)

# =====================================================================

def store_scores(scores, TestScores, testname):
    """Record the mean results of one cross-validation run.

    One row is appended to the ``statfile`` text file ('|'-separated, no
    header) and the same row is appended to the in-memory results table.

    Args:
        scores: mapping as returned by ``cross_validate`` with
            ``return_train_score=True``; the ``fit_time``, ``score_time``,
            ``test_score`` and ``train_score`` arrays are read.
        TestScores: DataFrame of previously collected rows.
        testname: label stored in the ``Test`` column.

    Returns:
        A new DataFrame holding the previous rows plus the new one. The
        input ``TestScores`` is not modified, so callers must keep the
        returned value.
    """
    # The test/train scores are multiplied by -1 before storing, flipping the
    # sign of whatever the scorer reported.
    df = pd.DataFrame([[testname,
                        scores['fit_time'].mean(),
                        scores['score_time'].mean(),
                        (-1 * scores['test_score'].mean()),
                        (-1 * scores['train_score'].mean())]],
                      columns=scores_cols)

    df.to_csv(statfile, mode='a', header=False, index=False, sep='|')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    # that works on both old and new pandas.
    return pd.concat([TestScores, df], ignore_index=True)

# =====================================================================

def split_list(alist, wanted_parts = 1):
    """Cut ``alist`` into ``wanted_parts`` consecutive slices.

    Slice boundaries are ``i * len(alist) // wanted_parts``, so the slices
    cover the whole sequence in order and their sizes differ by at most one.

    Returns:
        A list containing ``wanted_parts`` slices of ``alist``.
    """
    length = len(alist)
    chunks = []
    for part in range(wanted_parts):
        start = part * length // wanted_parts
        stop = (part + 1) * length // wanted_parts
        chunks.append(alist[start:stop])
    return chunks


def data_cluster(df, grpBy, cluster_num):
    """Cluster the groups of ``df`` by their mean ``cnt`` using KMeans.

    Args:
        df: DataFrame containing a ``cnt`` column and the ``grpBy`` column.
        grpBy: name of the column to group on.
        cluster_num: number of KMeans clusters.

    Returns:
        numpy array with one cluster label per distinct ``grpBy`` value, in
        the order produced by the groupby. KMeans is created without a fixed
        seed, so label numbering may differ between runs.
    """
    # `cluster` is not imported anywhere at file level, so the original call
    # raised NameError; import it here like the other model helpers do.
    from sklearn import cluster

    # Only the mean of `cnt` is needed; aggregating just that column avoids
    # averaging every other column first.
    mean_cnt = df.groupby(grpBy)['cnt'].mean()
    model = cluster.KMeans(n_clusters=cluster_num)

    # KMeans expects a 2-D (n_samples, n_features) input: one feature per group.
    samples = mean_cnt.values.reshape(-1, 1)
    return np.array(model.fit_predict(samples))


def temp_cluster(temp):
    """Map a temperature to a bucket index 0-4.

    Buckets (upper bounds inclusive): <=1.0 -> 0, <=15.0 -> 1, <=22.0 -> 2,
    <=31.0 -> 3, anything else -> 4.
    """
    upper_bounds = (1.0, 15.0, 22.0, 31.0)
    for bucket, upper in enumerate(upper_bounds):
        if temp <= upper:
            return bucket
    return 4


def hr_cluster(hr):
    """Map an hour of day to a cluster label (0, 1 or 2).

    Hours up to 6.5 and after 20 map to 0; (6.5, 9.5] and (16.5, 20] map
    to 2; (9.5, 16.5] maps to 1.
    """
    # (inclusive upper bound, label) pairs, checked in ascending order.
    bands = ((6.5, 0), (9.5, 2), (16.5, 1), (20, 2))
    for upper, label in bands:
        if hr <= upper:
            return label
    return 0
     

def _add_one_hot_columns(ds, column, width):
    """Add ``<column>_0`` .. ``<column>_<width-1>`` 0.0/1.0 columns to ``ds`` in place.

    Position ``k`` corresponds to the k-th smallest distinct value found in
    ``ds[column]``.

    Raises:
        ValueError: if the column does not hold exactly ``width`` distinct
            values, because the positional column names would then not line
            up with the categories.
    """
    categories, codes = np.unique(ds[column].values, return_inverse=True)
    if len(categories) != width:
        raise ValueError("column %r has %d distinct values, expected %d"
                         % (column, len(categories), width))

    codes = codes.reshape(-1)
    for position in range(width):
        # Assigned as a plain array, i.e. by row position, so the result does
        # not depend on the index labels of ds.
        ds['%s_%d' % (column, position)] = (codes == position).astype(float)


def transform1_data(ds):
    """Add the cluster and one-hot feature columns used by the models.

    Adds ``hrCluster`` and ``tempCluster`` (from ``hr`` and ``temp``) and
    positional one-hot columns for ``season`` (4), ``weather`` (4),
    ``weekday`` (7) and ``mth`` (12). The new columns are written into ``ds``
    itself.

    The encoding is done with numpy rather than
    ``OneHotEncoder(sparse=False)``: newer scikit-learn releases no longer
    accept that keyword, and the former dense -> csr -> matrix -> 3-D reshape
    round trip only worked through a numpy.matrix quirk.

    Returns:
        A new DataFrame with the added columns and without ``season``,
        ``weather``, ``weekday``, ``mth`` and ``temp``.

    Raises:
        ValueError: if a categorical column does not contain the expected
            number of distinct values.
    """
    # Column-wise apply: only the one value each helper needs is passed,
    # instead of materialising every full row.
    ds['hrCluster'] = ds['hr'].apply(hr_cluster)
    ds['tempCluster'] = ds['temp'].apply(temp_cluster)

    for column, width in (('season', 4), ('weather', 4),
                          ('weekday', 7), ('mth', 12)):
        _add_one_hot_columns(ds, column, width)

    return remove_columns(ds, ['season', 'weather', 'weekday', 'mth', 'temp'])

        
def transform2_data(df):
    """Rescale the numeric columns of ``df`` in place and return it.

    * ``casual``, ``registered``, ``cnt``, ``windspeed`` and ``humidity`` are
      replaced by ``log1p`` of their values.
    * ``tractID`` is min-max scaled; ``mday`` and ``hr`` are standardised.

    The first rows of the result are printed before returning.

    Returns:
        The same DataFrame object that was passed in.
    """
    for count_col in ('casual', 'registered', 'cnt'):
        df[count_col] = [log1p(value) for value in df[count_col]]

    tract_ids = df['tractID'].astype(float).values.reshape(-1, 1)
    df['tractID'] = MinMaxScaler().fit_transform(tract_ids)

    # Standardising weekday, weather, temp, jday, mth and season was tried in
    # an earlier revision and is no longer applied.
    for time_col in ('mday', 'hr'):
        as_column = df[time_col].astype(float).values.reshape(-1, 1)
        df[time_col] = StandardScaler().fit_transform(as_column)

    for weather_col in ('windspeed', 'humidity'):
        df[weather_col] = [log1p(value) for value in df[weather_col]]

    print(df.head())
    return df

# =====================================================================

def linearregression(X_data, y_data, TestScores):
    """Run 10-fold cross-validation of LinearRegression and record the result.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table returned by ``store_scores`` (previous rows plus
        one row labelled "LinearRegression").
    """
    from sklearn.linear_model import LinearRegression

    started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(started_at + " - Start processing LinearRegression")

    # NOTE(review): newer scikit-learn releases no longer accept `normalize=`
    # on linear models - confirm against the installed version.
    cv_scores = cross_validate(LinearRegression(normalize = True),
                               X_data, y_data,
                               cv=10, scoring=score, n_jobs=n_job,
                               return_train_score=True)

    return store_scores(cv_scores, TestScores, "LinearRegression")

# =====================================================================

def ridgeregression(X_data, y_data, TestScores):
    """Cross-validate Ridge once per solver and record each result.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row per solver, labelled
        "RidgeRegression <solver>".
    """
    from sklearn.linear_model import Ridge

    solvers = ('auto', 'svd', 'lsqr', 'sparse_cg', 'saga')
    for solver_name in solvers:
        started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(started_at + " - Start processing RidgeRegression for solver: %s" % solver_name)

        # NOTE(review): newer scikit-learn releases no longer accept
        # `normalize=` on linear models - confirm against the installed version.
        estimator = Ridge(solver=solver_name, normalize = True, random_state=212)
        cv_scores = cross_validate(estimator, X_data, y_data,
                                   cv=10, scoring=score, n_jobs=n_job,
                                   return_train_score=True)
        TestScores = store_scores(cv_scores, TestScores,
                                  "RidgeRegression %s" % solver_name)

    return TestScores

# =====================================================================

def bayesianridge(X_data, y_data, TestScores):
    """Run 10-fold cross-validation of BayesianRidge and record the result.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row labelled "BayesianRidge".
    """
    from sklearn.linear_model import BayesianRidge

    started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(started_at + " - Start processing BayesianRidge")

    # NOTE(review): `n_iter=` and `normalize=` may be rejected by newer
    # scikit-learn releases - confirm against the installed version.
    estimator = BayesianRidge(n_iter=1000, normalize = True)
    cv_scores = cross_validate(estimator, X_data, y_data,
                               cv=10, scoring=score, n_jobs=n_job,
                               return_train_score=True)

    return store_scores(cv_scores, TestScores, "BayesianRidge")

          
# =====================================================================

def sgdregressor(X_data, y_data, TestScores):
    """Run 10-fold cross-validation of SGDRegressor and record the result.

    Args:
        X_data: feature matrix.
        y_data: target values (1-D).
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row labelled "SGDRegressor".
    """
    from sklearn.linear_model import SGDRegressor

    print(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
            " - Start processing SGDRegressor")

    # `n_iter` was deprecated in scikit-learn 0.19 and later removed;
    # `max_iter` replaces it. `tol=None` switches off the tolerance-based
    # stopping rule so the fixed 1000 epochs that `n_iter=1000` ran are kept.
    model = SGDRegressor(max_iter=1000, tol=None, random_state=212)
    scores = cross_validate(model, X_data, y_data, cv=10,
                            scoring=score,
                            return_train_score=True, n_jobs=n_job)
    TestScores = store_scores(scores, TestScores,
                            "SGDRegressor")

    return TestScores
          
          
# =====================================================================

def gradientboostingregression(X_data, y_data, TestScores):
    """Cross-validate GradientBoostingRegressor over a small parameter grid.

    The grid is n_estimators in (300, 350, 400) x max_depth in (30, 35, 40)
    with a learning rate of 0.1; each combination is scored with 10-fold
    cross-validation.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row per combination, labelled
        "GradientBoostingRegressor <n_estimators>-<max_depth>".
    """
    from sklearn.ensemble import GradientBoostingRegressor

    # An earlier, wider sweep over n_estimators (50..400) was narrowed to this.
    grid = [(n_trees, depth)
            for n_trees in [300, 350, 400]
            for depth in [30, 35, 40]]

    for n_trees, depth in grid:
        started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(started_at + " - Start processing GradientBoostingRegressor for n_estimator:%s  max_depth:%s" %(n_trees, depth))

        estimator = GradientBoostingRegressor(n_estimators = n_trees,
                                              max_depth = depth,
                                              learning_rate = 0.1)
        cv_scores = cross_validate(estimator, X_data, y_data,
                                   cv=10, scoring=score, n_jobs=n_job,
                                   return_train_score=True)
        TestScores = store_scores(cv_scores, TestScores,
                                  "GradientBoostingRegressor %s-%s" %(n_trees, depth))

    return TestScores

# =====================================================================

def randomforestregression(X_data, y_data, TestScores):
    """Cross-validate RandomForestRegressor over a small parameter grid.

    The grid is n_estimators in (300, 400, 500) x max_depth in (30, 35, 40);
    each combination is scored with 10-fold cross-validation.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row per combination, labelled
        "RandomForestRegressor <n_estimators>-<max_depth>".
    """
    grid = [(n_trees, depth)
            for n_trees in [300, 400, 500]
            for depth in [30, 35, 40]]

    for n_trees, depth in grid:
        started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(started_at + " - Start processing RandomForestRegressor for n_estimator:%s  max_depth:%s" %(n_trees, depth))

        # NOTE(review): max_features='auto' may be rejected by newer
        # scikit-learn releases - confirm against the installed version.
        estimator = RandomForestRegressor(n_estimators = n_trees,
                                          max_depth = depth,
                                          max_features = 'auto',
                                          random_state = 120)
        cv_scores = cross_validate(estimator, X_data, y_data,
                                   cv=10, scoring=score, n_jobs=n_job,
                                   return_train_score=True)
        TestScores = store_scores(cv_scores, TestScores,
                                  "RandomForestRegressor %s-%s" %(n_trees, depth))

    return TestScores

# =====================================================================

def extratreesregression(X_data, y_data, TestScores):
    """Cross-validate ExtraTreesRegressor over a small parameter grid.

    The grid is n_estimators in (100, 150, 200) x max_depth in (20, 25, 30)
    x max_features in ('sqrt', 'auto'); each combination is scored with
    10-fold cross-validation.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row per combination, labelled
        "ExtraTreesRegressor <n_estimators>-<max_depth>-<max_features>".
    """
    grid = [(n_trees, depth, feature_rule)
            for n_trees in [100, 150, 200]
            for depth in [20, 25, 30]
            for feature_rule in ['sqrt', 'auto']]

    for n_trees, depth, feature_rule in grid:
        started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(started_at + " - Start processing ExtraTreesRegressor for n_estimator:%s  max_depth:%s  max_features:%s" %(n_trees, depth, feature_rule))

        # NOTE(review): max_features='auto' may be rejected by newer
        # scikit-learn releases - confirm against the installed version.
        estimator = ExtraTreesRegressor(n_estimators = n_trees,
                                        max_depth = depth,
                                        max_features = feature_rule,
                                        random_state = 120)
        cv_scores = cross_validate(estimator, X_data, y_data,
                                   cv=10, scoring=score, n_jobs=n_job,
                                   return_train_score=True)
        TestScores = store_scores(cv_scores, TestScores,
                                  "ExtraTreesRegressor %s-%s-%s" %(n_trees, depth, feature_rule))

    return TestScores

# =====================================================================

def baggingregression(X_data, y_data, TestScores):
    """Cross-validate BaggingRegressor for several ensemble sizes.

    n_estimators takes the values 100, 150 and 200; both samples and
    features are bootstrapped. Each setting is scored with 10-fold
    cross-validation.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row per setting, labelled
        "BaggingRegressor <n_estimators>".
    """
    from sklearn.ensemble import BaggingRegressor

    ensemble_sizes = (100, 150, 200)
    for n_members in ensemble_sizes:
        started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(started_at + " - Start processing BaggingRegressor for n_estimator:%s" % n_members)

        estimator = BaggingRegressor(n_estimators = n_members,
                                     bootstrap = True,
                                     bootstrap_features = True,
                                     random_state = 120)
        cv_scores = cross_validate(estimator, X_data, y_data,
                                   cv=10, scoring=score, n_jobs=n_job,
                                   return_train_score=True)
        TestScores = store_scores(cv_scores, TestScores,
                                  "BaggingRegressor %s" % n_members)

    return TestScores

# =====================================================================

def xgboost(X_data, y_data, TestScores):
    """Cross-validate XGBRegressor over a small parameter grid.

    The grid is n_estimators in (200, 250, 300) x learning_rate in
    (0.05, 0.1) x booster in ('gblinear', 'dart'); each combination is scored
    with 10-fold cross-validation.

    Args:
        X_data: feature matrix.
        y_data: target values.
        TestScores: DataFrame of results collected so far.

    Returns:
        The results table with one added row per combination, labelled
        "XGBRegressor <n_estimators>-<learning_rate>-<booster>".
    """
    grid = [(n_rounds, rate, booster_name)
            for n_rounds in [200, 250, 300]
            for rate in [0.05, 0.1]
            for booster_name in ['gblinear', 'dart']]

    for n_rounds, rate, booster_name in grid:
        started_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(started_at + " - Start processing XGBRegressor for n_estimator: %s learning-rate: %s booster: %s" %(n_rounds, rate, booster_name))

        estimator = xgb.XGBRegressor(n_estimators = n_rounds,
                                     learning_rate=rate,
                                     booster=booster_name,
                                     random_state = 120)
        cv_scores = cross_validate(estimator, X_data, y_data,
                                   cv=10, scoring=score, n_jobs=n_job,
                                   return_train_score=True)
        TestScores = store_scores(cv_scores, TestScores,
                                  "XGBRegressor %s-%s-%s" %(n_rounds, rate, booster_name))

    return TestScores

# =====================================================================

def stacking(X_data, y_data):
    """Fit a two-level stacked model and print its hold-out score.

    20% of the data is held out. Three first-level models (XGBoost 'dart',
    random forest, extra trees) produce 5-fold out-of-fold predictions via
    ``vecstack.stacking``; an XGBRegressor is then fitted on those
    predictions and scored on the hold-out set with mean squared log error.

    Args:
        X_data: feature matrix.
        y_data: target values (1-D).

    Returns:
        None. The final score is printed.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_log_error
    # This import shadows the enclosing function's name: inside this body
    # `stacking` refers to vecstack's helper.
    from vecstack import stacking


    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.2, random_state = 212)

    # First-level models.
    mXgb = xgb.XGBRegressor(n_estimators = 50, learning_rate=0.05,booster='dart', seed = 0)
    mEtr = ExtraTreesRegressor(n_estimators = 200, max_depth = 35,max_features = 'auto', random_state = 0)
    mReg = RandomForestRegressor(n_estimators = 200, max_depth = 35,max_features = 'auto', random_state = 120)
    models = [mXgb, mReg, mEtr]

    S_train, S_test = stacking(models, X_train, y_train, X_test, regression = True,
                               metric=mean_squared_log_error, n_folds = 5)

    # Second-level model. The original passed `j_jobs = 2`, a typo for
    # `n_jobs`, so the intended thread count was never applied.
    model = xgb.XGBRegressor(seed = 0, n_jobs = 2, learning_rate = 0.1, n_estimators = 200, max_depth = 35)

    model = model.fit(S_train, y_train)
    y_pred = model.predict(S_test)

    print ('Final prediction score: [%.8f]' % mean_squared_log_error(y_test, y_pred))

# =====================================================================

# Driver. The data load and the target selection below run whenever this
# module is executed or imported; the model runs are guarded by __main__.
print("Import 2013 Bike Sharing data")
bike = read_data()
TestScores = pd.DataFrame(columns = scores_cols)

print(bike.head())

#X = bike[['tractID','workingday','humidity','hr', 'temp','mthCluster','hrCluster','weather','weekday','mth','mday','windspeed','tempCluster','season_0','season_1','season_2','season_3']]
# NOTE(review): y is taken here, before transform2_data applies log1p to
# 'cnt', so the models appear to be fitted on the untransformed counts -
# confirm this is intended.
y = bike[['cnt']]

if __name__ == '__main__':
    # Each model helper returns the updated results table instead of
    # modifying it in place, so the return value must be rebound; the
    # original discarded it and TestScores stayed empty.
    bike = transform1_data(bike)
    X = bike[features]
    TestScores = linearregression(X, y, TestScores)
    TestScores = ridgeregression(X, y, TestScores)
    TestScores = bayesianridge(X, y.values.ravel(), TestScores)
    TestScores = sgdregressor(X, y.values.ravel(), TestScores)

    # Second pass: same models' inputs after scaling / log transforms.
    bike = transform2_data(bike)
    X = bike[features]
    #stacking(X.values, y.values.ravel())
    #TestScores = randomforestregression(X, y.values.ravel(), TestScores)
    TestScores = extratreesregression(X, y.values.ravel(), TestScores)
    TestScores = baggingregression(X, y.values.ravel(), TestScores)
    TestScores = xgboost(X, y.values.ravel(), TestScores)
    TestScores = gradientboostingregression(X, y.values.ravel(), TestScores)
    
    



Import 2013 Bike Sharing data
   tractID  jday  season  mth  hr  holiday  weekday  workingday  weather  \
0        2     1       1    1   0        1        2           0        2   
1        8     1       1    1   0        1        2           0        2   
2       16     1       1    1   0        1        2           0        2   
3       17     1       1    1   0        1        2           0        2   
4       19     1       1    1   0        1        2           0        2   

   temp  atemp  humidity  windspeed  casual  registered  cnt  mday  
0   3.3    0.4        68       11.1       0           1    1     1  
1   3.3    0.4        68       11.1       0           1    1     1  
2   3.3    0.4        68       11.1       0           1    1     1  
3   3.3    0.4        68       11.1       0           1    1     1  
4   3.3    0.4        68       11.1       0           2    2     1  
2017-11-25 22:07:24 - Start processing LinearRegression
2017-11-25 22:07:29 - Start processing Ridg