In [95]:
## Load helpers

# Python Basic Tools
import pandas as pd
import numpy as np
import seaborn as sn
import os
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Sklearn Basic Tools
import sklearn.metrics
import sklearn.neighbors
from sklearn.model_selection import train_test_split

# Sklearn Regression and Classification Tools
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.naive_bayes import GaussianNB

# Usual cross-val tools and Grid searching 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Set random seed
np.random.seed(47)


In [102]:
# Import Indicators Data
df = pd.read_csv("Regime-Classification-inputs copy.csv")
df.index = pd.to_datetime(df.Date) 
del df['Date']

df

Unnamed: 0_level_0,INDPRO,PCE,PMI,T10YFF,Composite Leading Indicator (CLI),New Privately-Owned Housing Units Started: Total Units,Index of Consumer Sentiment,Chicago Fed National Activity Index,Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1978-01-01,47.9744,1329.5,59.8,1.22,103.38940,1718,83.7,-1.21,Slowdown
1978-02-01,48.2322,1355.1,57.4,1.23,103.26540,1738,84.3,0.99,Slowdown
1978-03-01,49.1693,1377.5,55.9,1.23,103.15830,2032,78.8,1.27,Slowdown
1978-04-01,50.1546,1396.4,55.0,1.29,103.08790,2197,81.6,1.72,Slowdown
1978-05-01,50.3971,1412.0,57.7,0.98,103.08600,2075,82.9,0.35,Slowdown
...,...,...,...,...,...,...,...,...,...
2021-02-01,96.3720,14774.0,58.7,1.18,99.03598,1447,76.8,-1.67,
2021-03-01,98.9400,15515.3,60.8,1.54,99.29778,1725,84.9,2.30,
2021-04-01,98.9660,15656.4,64.7,1.57,99.60093,1514,88.3,-0.16,
2021-05-01,99.6536,15659.3,60.7,1.56,99.94283,1546,82.9,0.26,


In [103]:
# Only use dataset before 2011-05-01

df = df[df.index <= '2011-05-01']
df

Unnamed: 0_level_0,INDPRO,PCE,PMI,T10YFF,Composite Leading Indicator (CLI),New Privately-Owned Housing Units Started: Total Units,Index of Consumer Sentiment,Chicago Fed National Activity Index,Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1978-01-01,47.9744,1329.5,59.8,1.22,103.3894,1718,83.7,-1.21,Slowdown
1978-02-01,48.2322,1355.1,57.4,1.23,103.2654,1738,84.3,0.99,Slowdown
1978-03-01,49.1693,1377.5,55.9,1.23,103.1583,2032,78.8,1.27,Slowdown
1978-04-01,50.1546,1396.4,55.0,1.29,103.0879,2197,81.6,1.72,Slowdown
1978-05-01,50.3971,1412.0,57.7,0.98,103.0860,2075,82.9,0.35,Slowdown
...,...,...,...,...,...,...,...,...,...
2011-01-01,93.4768,10435.5,57.5,3.22,100.8496,630,74.2,0.04,Expansion
2011-02-01,93.0617,10470.1,59.0,3.42,100.8136,517,77.5,-0.39,Expansion
2011-03-01,94.0244,10550.5,59.3,3.28,100.7231,600,67.5,0.39,Expansion
2011-04-01,93.7011,10587.6,59.1,3.36,100.5675,554,69.8,-0.48,Expansion


In [107]:
# Augmented Dickey-Fuller Test (ADF Test)
from statsmodels.tsa.stattools import adfuller

def adfuller_test(sales):
    result = adfuller(sales)
    labels = ['ADF test statistics', 'P-value', '#Lags used', 'Number of observation used']
    for value, label in zip(result, labels):
        print(label+' : '+str(value))
    if result[1] <= 0.05:
        print('Strong evidence against the null hypothesis (Ho), Reject the null hypothesis, Data has no unit root and is stationary')
    else:
        print('Weak evidence against the null hypothesis (Ho), time series has a unit root, indicating it is non stationary. ')
        
        
#adfuller_test(all_data['New Privately-Owned Housing Units Started: Total Units'])
for feature in df.columns:
    print(feature)
    adfuller_test(df[feature])
    print("------------------------------------")
    

INDPRO
ADF test statistics : -0.7466605750538764
P-value : 0.8342904923629519
#Lags used : 4
Number of observation used : 396
Weak evidence against the null hypothesis (Ho), time series has a unit root, indicating it is non stationary. 
------------------------------------
PCE
ADF test statistics : 2.4897951810055554
P-value : 0.9990464858578348
#Lags used : 4
Number of observation used : 396
Weak evidence against the null hypothesis (Ho), time series has a unit root, indicating it is non stationary. 
------------------------------------
PMI
ADF test statistics : -4.944453439590027
P-value : 2.855305891267862e-05
#Lags used : 2
Number of observation used : 398
Strong evidence against the null hypothesis (Ho), Reject the null hypothesis, Data has no unit root and is stationary
------------------------------------
T10YFF
ADF test statistics : -2.963021586241064
P-value : 0.038495011732924515
#Lags used : 8
Number of observation used : 392
Strong evidence against the null hypothesis (Ho),

ValueError: could not convert string to float: 'Slowdown'

In [110]:
# Change Non-Stationary to Stationary
cols = ['INDPRO', 
        'PCE',
        'New Privately-Owned Housing Units Started: Total Units' ]

non_stat = df[cols]
stat = non_stat.diff()[1:]
df = df.copy()[1:]
df[cols] = stat
df

Unnamed: 0_level_0,INDPRO,PCE,PMI,T10YFF,Composite Leading Indicator (CLI),New Privately-Owned Housing Units Started: Total Units,Index of Consumer Sentiment,Chicago Fed National Activity Index,Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1978-02-01,0.2578,25.6,57.4,1.23,103.2654,20.0,84.3,0.99,Slowdown
1978-03-01,0.9371,22.4,55.9,1.23,103.1583,294.0,78.8,1.27,Slowdown
1978-04-01,0.9853,18.9,55.0,1.29,103.0879,165.0,81.6,1.72,Slowdown
1978-05-01,0.2425,15.6,57.7,0.98,103.0860,-122.0,82.9,0.35,Slowdown
1978-06-01,0.3034,13.8,60.2,0.84,103.1671,-5.0,80.0,0.77,Slowdown
...,...,...,...,...,...,...,...,...,...
2011-01-01,-0.1724,43.4,57.5,3.22,100.8496,91.0,74.2,0.04,Expansion
2011-02-01,-0.4151,34.6,59.0,3.42,100.8136,-113.0,77.5,-0.39,Expansion
2011-03-01,0.9627,80.4,59.3,3.28,100.7231,83.0,67.5,0.39,Expansion
2011-04-01,-0.3233,37.1,59.1,3.36,100.5675,-46.0,69.8,-0.48,Expansion


In [111]:
# Convert category variables to dummy variables

df['Regime'] = df['Regime'].replace({'Slowdown':1,'Contraction':2, 'Expansion':3, 'Recovery':4})
df


Unnamed: 0_level_0,INDPRO,PCE,PMI,T10YFF,Composite Leading Indicator (CLI),New Privately-Owned Housing Units Started: Total Units,Index of Consumer Sentiment,Chicago Fed National Activity Index,Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1978-02-01,0.2578,25.6,57.4,1.23,103.2654,20.0,84.3,0.99,1
1978-03-01,0.9371,22.4,55.9,1.23,103.1583,294.0,78.8,1.27,1
1978-04-01,0.9853,18.9,55.0,1.29,103.0879,165.0,81.6,1.72,1
1978-05-01,0.2425,15.6,57.7,0.98,103.0860,-122.0,82.9,0.35,1
1978-06-01,0.3034,13.8,60.2,0.84,103.1671,-5.0,80.0,0.77,1
...,...,...,...,...,...,...,...,...,...
2011-01-01,-0.1724,43.4,57.5,3.22,100.8496,91.0,74.2,0.04,3
2011-02-01,-0.4151,34.6,59.0,3.42,100.8136,-113.0,77.5,-0.39,3
2011-03-01,0.9627,80.4,59.3,3.28,100.7231,83.0,67.5,0.39,3
2011-04-01,-0.3233,37.1,59.1,3.36,100.5675,-46.0,69.8,-0.48,3


In [114]:
# Seperate data with features and labels
X = df[df.columns[0:8]] #features
y = df['Regime']  #labels
cols = list(X.columns)

# Split the dataframe into random train and test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=47) #split by half


In [115]:
# Import package
!pip install xgboost
from xgboost import XGBClassifier





In [116]:
# Define Traning Scores Function
def getTrainScores(gs):
    results = {}
    runs = 0
    for X,y in zip(list(gs.cv_results_['mean_test_score']), gs.cv_results_['params']):
        results[runs] = 'mean:' + str(X) + 'params' + str(y)
        runs += 1
    best = {'best_mean': gs.best_score_, "best_param":gs.best_params_}
    return results, best

In [117]:
# gsearch1: Tune max_depth and min_child_weight

xgb1 = XGBClassifier(learning_rate=0.1,# Fix learning rate
                    n_estimators=1000, # Fix number of estimators
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=47)
param_test1 = {
 'max_depth':range(1,10),
 'min_child_weight':range(1,10)
}
# metrics to consider:  f1_macro score
# best metric to assess the quality of problems with multiple binary labels or multiple classes
gsearch1 = GridSearchCV(estimator = xgb1, param_grid = param_test1, scoring='f1_macro',n_jobs=-1,verbose = 10, cv=5)
gsearch1.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  2



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=5, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=1000, n_jobs=None, nthread=4,
                                     num_class=9, num_parallel_tree=None,
                                     objective='multi:softmax',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weigh

In [119]:
getTrainScores(gsearch1)


({0: "mean:0.6219060588173072params{'max_depth': 1, 'min_child_weight': 1}",
  1: "mean:0.6381241745327998params{'max_depth': 1, 'min_child_weight': 2}",
  2: "mean:0.6275392203620052params{'max_depth': 1, 'min_child_weight': 3}",
  3: "mean:0.6177601915684101params{'max_depth': 1, 'min_child_weight': 4}",
  4: "mean:0.6179044548372221params{'max_depth': 1, 'min_child_weight': 5}",
  5: "mean:0.6187506253357308params{'max_depth': 1, 'min_child_weight': 6}",
  6: "mean:0.6074874091265461params{'max_depth': 1, 'min_child_weight': 7}",
  7: "mean:0.6089836019847817params{'max_depth': 1, 'min_child_weight': 8}",
  8: "mean:0.6094241214685125params{'max_depth': 1, 'min_child_weight': 9}",
  9: "mean:0.6728514912262823params{'max_depth': 2, 'min_child_weight': 1}",
  10: "mean:0.6612065816207495params{'max_depth': 2, 'min_child_weight': 2}",
  11: "mean:0.6587852785088251params{'max_depth': 2, 'min_child_weight': 3}",
  12: "mean:0.6511297026555912params{'max_depth': 2, 'min_child_weight': 4

In [125]:
# gsearch2:Tune gamma

xgb2 = XGBClassifier(learning_rate=0.1,# Fix learning rate
                    n_estimators=1000, # Fix number of estimators
                    max_depth=4,
                    min_child_weight=4,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=47)
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}


                        
                        
# metrics to consider:  f1_macro score
# best metric to assess the quality of problems with multiple binary labels or multiple classes
gsearch2 = GridSearchCV(estimator = xgb2, param_grid = param_test2, scoring='f1_macro',n_jobs=-1,verbose = 10, cv=5)
gsearch2.fit(X_train, y_train)






Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done  21 out of  25 | elapsed:   21.7s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   24.8s finished




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=4, min_child_weight=4,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=1000, n_jobs=None, nthread=4,
                                     num_class=9, num_parallel_tree=None,
                                     objective='multi:softmax',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weigh

In [126]:
getTrainScores(gsearch2)


({0: "mean:0.6797468696356166params{'gamma': 0.0}",
  1: "mean:0.6653979090177196params{'gamma': 0.1}",
  2: "mean:0.6661477771208681params{'gamma': 0.2}",
  3: "mean:0.674114459117452params{'gamma': 0.3}",
  4: "mean:0.6749850456137321params{'gamma': 0.4}"},
 {'best_mean': 0.6797468696356166, 'best_param': {'gamma': 0.0}})

In [132]:
# gsearch3: Tune subsample and colsample_bytree

xgb3 = XGBClassifier(learning_rate=0.1,# Fix learning rate
                    n_estimators=1000, # Fix number of estimators
                    max_depth=4,
                    min_child_weight=4,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=47)
param_test3 = {'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]}

# metrics to consider:  f1_macro score
# best metric to assess the quality of problems with multiple binary labels or multiple classes
gsearch3 = GridSearchCV(estimator = xgb3, param_grid = param_test3, scoring='f1_macro',n_jobs=-1,verbose = 10, cv=5)
gsearch3.fit(X_train, y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   41.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   50.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.1min finished




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=4, min_child_weight=4,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=1000, n_jobs=None, nthread=4,
                                     num_class=9, num_parallel_tree=None,
                                     objective='multi:softmax',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weigh

In [133]:
getTrainScores(gsearch3)



({0: "mean:0.6448144759625255params{'colsample_bytree': 0.6, 'subsample': 0.6}",
  1: "mean:0.6547190680899967params{'colsample_bytree': 0.6, 'subsample': 0.7}",
  2: "mean:0.6526784720203983params{'colsample_bytree': 0.6, 'subsample': 0.8}",
  3: "mean:0.6589494834911067params{'colsample_bytree': 0.6, 'subsample': 0.9}",
  4: "mean:0.6498969074403858params{'colsample_bytree': 0.7, 'subsample': 0.6}",
  5: "mean:0.6763062782606388params{'colsample_bytree': 0.7, 'subsample': 0.7}",
  6: "mean:0.6638918113961939params{'colsample_bytree': 0.7, 'subsample': 0.8}",
  7: "mean:0.6788200422629737params{'colsample_bytree': 0.7, 'subsample': 0.9}",
  8: "mean:0.6597014744681078params{'colsample_bytree': 0.8, 'subsample': 0.6}",
  9: "mean:0.6773128414702045params{'colsample_bytree': 0.8, 'subsample': 0.7}",
  10: "mean:0.6797468696356166params{'colsample_bytree': 0.8, 'subsample': 0.8}",
  11: "mean:0.6719905170664547params{'colsample_bytree': 0.8, 'subsample': 0.9}",
  12: "mean:0.655604958617

In [134]:
# gsearch4:Tuning Regularization Parameters

xgb4 = XGBClassifier(learning_rate=0.1,# Fix learning rate
                    n_estimators=1000, # Fix number of estimators
                    max_depth=4,
                    min_child_weight=4,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=47)
param_test4 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}

# metrics to consider:  f1_macro score
# best metric to assess the quality of problems with multiple binary labels or multiple classes
gsearch4 = GridSearchCV(estimator = xgb4, param_grid = param_test4, scoring='f1_macro',n_jobs=-1,verbose = 10, cv=5)
gsearch4.fit(X_train, y_train)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done  21 out of  25 | elapsed:   18.0s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   19.7s finished




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=4, min_child_weight=4,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=1000, n_jobs=None, nthread=4,
                                     num_class=9, num_parallel_tree=None,
                                     objective='multi:softmax',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weigh

In [135]:
getTrainScores(gsearch4)


({0: "mean:0.6797468696356166params{'reg_alpha': 0}",
  1: "mean:0.6661587472187911params{'reg_alpha': 0.001}",
  2: "mean:0.6664707572378434params{'reg_alpha': 0.005}",
  3: "mean:0.6665398580239359params{'reg_alpha': 0.01}",
  4: "mean:0.672005858045611params{'reg_alpha': 0.05}"},
 {'best_mean': 0.6797468696356166, 'best_param': {'reg_alpha': 0}})

In [136]:
# Apply optimal parameters XGB Classifier

xgb = XGBClassifier(learning_rate=0.1,# Fix learning rate
                    n_estimators=1000, # Fix number of estimators
                    max_depth=4,
                    min_child_weight=4,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    reg_alpha = 0,
                    seed=47)

xgb.fit(X_train, y_train)




# Predict the labels of the test set: preds

predictions = xgb.predict(X_test)

# Compute the accuracy: accuracy

accuracy = float(np.sum(predictions == y_test))/y_test.shape[0]

print("accuracy: %f" % (accuracy*100))
print(predictions)

accuracy: 57.500000
[1 1 3 3 2 1 2 1 1 4 1 2 1 4 3 1 1 3 2 3 2 1 3 3 1 1 2 1 1 3 3 1 1 1 1 1 1
 2 3 3 1 2 3 1 3 2 1 2 2 3 1 3 1 1 2 3 1 1 3 3 1 3 3 3 3 3 3 2 2 1 3 3 2 1
 1 4 2 3 1 1]


In [None]:
# A parameter grid for XGBoost

# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

# xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                     silent=True, nthread=1)

# folds = 3
# param_comb = 5

# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )
# random_search.fit(X, y)

# print('\n All results:')
# print(random_search.cv_results_)
# print('\n Best estimator:')
# print(random_search.best__estimator__)
# print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# print(random_search.best__score__ * 2 - 1)
# print('\n Best hyperparameters:')
# print(random_search.best__params__)
# results = pd.DataFrame(random_search.cv__results__)

# #results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

In [42]:
# # from sklearn.model_selection import KFold, GridSearchCV
# from sklearn.metrics import accuracy_score, make_scorer
# from sklearn.feature_selection import SelectKBest, chi2

# pipe = Pipeline([
#   ('fs', SelectKBest()),
#   ('clf', xgb.XGBClassifier(objective='binary:logistic'))
# ])

# # Define our search space for grid search
# search_space = [
#   {
#     'clf__n_estimators': [50, 100, 150, 200],
#     'clf__learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'clf__max_depth': range(3, 10),
#     'clf__colsample_bytree': [i/10.0 for i in range(1, 3)],
#     'clf__gamma': [i/10.0 for i in range(3)],
#     'fs__score_func': [chi2],
#     'fs__k': [10],
#   }
# ]
# # Define cross validation
# kfold = KFold(n_splits=10, random_state=42)
# # AUC and accuracy as score
# scoring = {'AUC':'roc_auc', 'Accuracy':make_scorer(accuracy_score)}
# # Define grid search
# grid = GridSearchCV(
#   pipe,
#   param_grid=search_space,
#   cv=kfold,
#   scoring=scoring,
#   refit='AUC',
#   verbose=1,
#   n_jobs=-1
# )
# # Fit grid search
# model = grid.fit(X_train, y_train)

AttributeError: 'XGBClassifier' object has no attribute 'XGBClassifier'

In [44]:
# import xgboost as xgb
# xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
#          gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
#          missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
#          reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto')

# kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# param_grid = { 
#     'colsample_bytree':[.75,1],
#     'learning_rate':[0.01,0.05,0.1,0.3,0.5],
#     'max_depth':[1,2,3,5],
#     'subsample':[.75,1],
#     'n_estimators': list(range(50, 400, 50))
# }

# grid_search = GridSearchCV(estimator=xgbcl, scoring='roc_auc', param_grid=param_grid, n_jobs=-1, cv=kfold)
# grid_result = grid_search.fit(X_train, y_train)

# print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')

ValueError: multiclass format is not supported