In [1]:
import pandas as pd
import numpy as np
import xgboost2
import pickle as pkl

In [2]:
with open('adult_study3.pkl', 'rb') as infile:
    study = pkl.load(infile)
    
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value: 0.8265008444649163
  Params: 
    reg: 0.2946302209082853
    gamma: 2.525640121911007
    feature_sel: 0.7697746203729819
    max_depth: 20
    min_child_weight: 4.459295408509322
    lr: 0.4668075716919234
    min_leaf_num: 10
    boosting_rounds: 14


In [39]:
%%time

# import dataframes
X_train = pd.read_csv('./data/X_train_adult.csv')
X_test = pd.read_csv('./data/X_test_adult.csv')
y_train = pd.read_csv('./data/y_train_adult.csv')
y_test = pd.read_csv('./data/y_test_adult.csv')

X_train = X_train.drop(['Unnamed: 0', 'fnlwgt'], axis=1)
X_test = X_test.drop(['Unnamed: 0', 'fnlwgt'], axis=1)

print(X_train.shape)
print(X_test.shape)
print(X_train.columns)

with open('final_census_model.pkl', 'rb') as infile:
    model = pkl.load(infile)
    
display(model.fe_index_array)

n_features = [2,4,6,8,10,12] 
sorted_features = np.argsort(model.fe_index_array[1])  # change the index to what type of fi
display(sorted_features)

train_data = []
for idx, each in enumerate(n_features):
    train_data.append(X_train.iloc[:, sorted_features[:each]].values)

models = [None for i in range(len(train_data))]
accuracies = []
for i in range(len(train_data)):
    print(train_data[i].shape)
    model = xgboost2.XGBoostClassifier()
    model.fit(train_data[i], y_train['income'].values,
              boosting_rounds=14, 
              feature_sel=0.77, 
              min_num_leaf=10, 
              min_child_weight=4.46, 
              max_depth=20, 
              lr=0.47, 
              reg=0.29, 
              gamma=2.53)
    pred = model.predict(X_test.values)
    accuracies.append((y_test['income'].values == pred).sum()/len(pred))
    print(accuracies[i])
    models[i] = model


(26048, 12)
(6513, 12)
Index(['age', 'workclass', 'education.num', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'capital.gain', 'capital.loss',
       'hours.per.week', 'native.country'],
      dtype='object')


[array([ 0,  4, 10,  2,  1,  5,  8,  3,  6, 11,  7,  9], dtype=int64),
 array([ 0,  2,  8,  5, 10,  4,  3,  9,  1,  7, 11,  6], dtype=int64),
 array([ 0,  8, 10,  2,  9,  4,  3, 11,  5,  1,  6,  7], dtype=int64)]

array([ 0,  8,  1,  6,  5,  3, 11,  9,  2,  7,  4, 10], dtype=int64)

(26048, 2)
boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
boosting round 5
boosting round 6
boosting round 7
boosting round 8
boosting round 9
boosting round 10
boosting round 11
boosting round 12
boosting round 13
0.7164133271917703
(26048, 4)
boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
boosting round 5
boosting round 6
boosting round 7
boosting round 8
boosting round 9
boosting round 10
boosting round 11
boosting round 12
boosting round 13
0.7124213112237064
(26048, 6)
boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
boosting round 5
boosting round 6
boosting round 7
boosting round 8
boosting round 9
boosting round 10
boosting round 11
boosting round 12
boosting round 13
0.7147243973591278
(26048, 8)
boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
boosting round 5
boosting round 6
boosting round 7
boosting round 8
boosting round 9
boo

In [40]:
accuracies

[0.7164133271917703,
 0.7124213112237064,
 0.7147243973591278,
 0.691847075080608,
 0.7164133271917703,
 0.7136496238292646]