In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import xgboost as xgb

from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

def find_index(value, li):
    """Return the position of the first element of ``li`` equal to ``value``.

    Elements are compared with ``==`` in order; ``-1`` is returned when no
    element matches (which includes the case of an empty ``li``).
    """
    for position, item in enumerate(li):
        if item == value:
            return position
    return -1


def _run_grid_search(clf, X, y, parameter_name, candidate_values):
    """Fit a 5-fold, ROC-AUC-scored grid search over a single parameter."""
    grid_search = GridSearchCV(estimator=clf,
                               param_grid={parameter_name: candidate_values},
                               scoring='roc_auc', cv=5, verbose=100)
    grid_search.fit(X, y)
    return grid_search


def find_best_value_for_parameter(X, y,
                                  other_parameter_values,
                                  parameter_name,
                                  first_level_values,
                                  second_level_values):
    """Tune one XGBClassifier parameter with a coarse-then-fine grid search.

    Parameters
    ----------
    X, y : training features and labels, passed straight to ``GridSearchCV.fit``.
    other_parameter_values : dict of parameters applied to the classifier
        before searching (``parameter_name`` is overridden by the grid).
    parameter_name : name of the parameter being tuned.
    first_level_values : coarse candidate values.
    second_level_values : mapping (or sequence) from the *position* of a coarse
        value in ``first_level_values`` to the list of refined candidates to
        try when that coarse value wins.

    Returns
    -------
    The candidate value with the best cross-validated ROC AUC. The coarse
    winner is returned when it cannot be located in ``first_level_values``,
    when no (or an empty) refinement list exists for it, or when every
    refined candidate scores worse than the coarse winner.
    """
    clf = xgb.XGBClassifier()
    clf.set_params(**other_parameter_values)

    coarse_search = _run_grid_search(clf, X, y, parameter_name, first_level_values)
    coarse_best = coarse_search.best_params_[parameter_name]

    ind = find_index(coarse_best, first_level_values)
    if ind == -1:
        return coarse_best

    # A missing or empty refinement list used to raise; fall back to the
    # coarse winner instead of aborting the whole tuning run.
    try:
        fine_values = second_level_values[ind]
    except (KeyError, IndexError):
        return coarse_best
    if fine_values is None or len(fine_values) == 0:
        return coarse_best

    fine_search = _run_grid_search(clf, X, y, parameter_name, fine_values)

    # The refinement list is not required to contain the coarse winner, so the
    # refined "best" may actually be worse; keep whichever level scored higher.
    if fine_search.best_score_ < coarse_search.best_score_:
        return coarse_best
    return fine_search.best_params_[parameter_name]
    
    
if __name__ == "__main__":
    # Load labeled data: column 0 holds the target, columns 1..8 the features used.
    train_df = pd.read_csv('train.csv')
    # DataFrame.as_matrix() is deprecated (and removed in later pandas
    # releases); .values returns the same ndarray on every pandas version.
    train_data = train_df.values
    y = train_data[:, 0]
    X = train_data[:, 1:9]

    # Load unlabeled data: column 0 holds the row id, columns 1..8 the features.
    test_df = pd.read_csv('test.csv')
    test_data = test_df.values
    id_test = test_data[:, 0]
    X_test = test_data[:, 1:9]

    # Fit the encoder on the train and test rows stacked together, then
    # one-hot encode the training features (X_test is left untransformed here).
    # NOTE(review): `categorical_features` only exists in older scikit-learn
    # releases -- confirm the pinned version before upgrading.
    X_fitting = np.vstack([X, X_test])
    enc = OneHotEncoder(categorical_features='all')
    enc.fit(X_fitting)
    X = enc.transform(X)

In [None]:
# Cross-validated tuning, one hyper-parameter at a time.
# Previously tried by hand (no longer active): GridSearchCV runs over
# min_child_weight, colsample_bytree, max_delta_step, gamma and n_estimators.
other_parameter_values = dict(learning_rate=0.2,
                              n_estimators=90,
                              max_depth=9,
                              objective='binary:logistic')
parameter_name = 'n_estimators'
# The refinement grids are keyed by the position of the winning coarse value.
# NOTE(review): the refinement list for 800 is [880, 960] and does not contain
# 800 itself -- confirm this is intended.
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    first_level_values=[50, 100, 200, 400, 800],
    second_level_values={
        0: [50, 60, 70, 80, 90],
        1: [100, 120, 140, 160, 180],
        2: [200, 240, 280, 320, 360],
        3: [400, 480, 560, 640, 720],
        4: [880, 960],
    })

In [None]:
# Show the value currently stored for `parameter_name`.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(other_parameter_values[parameter_name])

In [None]:
# Tune max_depth: coarse powers of two first, then a finer sweep keyed by the
# position of the winning coarse value.
parameter_name = 'max_depth'
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    first_level_values=[4, 8, 16, 32],
    second_level_values={
        0: [4, 5, 6, 7],
        1: [8, 9, 10, 11, 12, 13, 14, 15],
        2: [16, 20, 24, 28],
        3: [32, 40, 48, 56],
    })

In [None]:
 print(other_parameter_values[parameter_name])

In [None]:
# Tune colsample_bytree: two coarse candidates, each refined in steps of 0.02.
parameter_name = 'colsample_bytree'
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    first_level_values=[0.1, 0.2],
    second_level_values={
        0: [0.1, 0.12, 0.14, 0.16, 0.18],
        1: [0.2, 0.22, 0.24, 0.26, 0.28],
    })

In [None]:
 print(other_parameter_values)

In [None]:
# Tune min_child_weight: two coarse magnitudes, each refined by doubling.
parameter_name = 'min_child_weight'
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    first_level_values=[0.001, 0.01],
    second_level_values={
        0: [0.001, 0.002, 0.004, 0.008],
        1: [0.01, 0.02, 0.04, 0.08],
    })

In [None]:
# Show every hyper-parameter value gathered so far.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(other_parameter_values)

In [None]:
# Tune max_delta_step: coarse powers of two, then five evenly spaced
# candidates starting at the winning coarse value.
parameter_name = 'max_delta_step'
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    first_level_values=[1, 2, 4, 8],
    second_level_values={
        0: [1, 1.2, 1.4, 1.6, 1.8],
        1: [2, 2.4, 2.8, 3.2, 3.6],
        2: [4, 4.8, 5.6, 6.4, 7.2],
        3: [8, 9.6, 11.2, 12.8, 14.4],
    })

In [None]:
# Show every hyper-parameter value gathered so far.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(other_parameter_values)

In [2]:
# Train the final classifier on the full training set with the chosen
# hyper-parameters hard-coded (gamma is not passed and keeps its default).
xgb_clsf = xgb.XGBClassifier(
    learning_rate=0.2,
    n_estimators=880,
    max_depth=16,
    objective='binary:logistic',
    min_child_weight=0.04,
    colsample_bytree=0.22,
    max_delta_step=4
)
# Alternative (inactive): build the model from the search results with
# xgb.XGBClassifier().set_params(**other_parameter_values).
xgb_clsf.fit(X, y)

# print other_parameter_values

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.22,
       gamma=0, learning_rate=0.2, max_delta_step=4, max_depth=16,
       min_child_weight=0.04, missing=None, n_estimators=880, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [3]:
# Load unlabeled data again from disk so this cell can be re-run safely:
# X_test is always rebuilt from the raw file before being encoded.
test_df = pd.read_csv('test.csv')
# DataFrame.as_matrix() is deprecated (and removed in later pandas releases);
# .values returns the same ndarray on every pandas version.
test_data = test_df.values
id_test = test_data[:, 0]
X_test = test_data[:, 1:9]
X_test = enc.transform(X_test)

# Predict on the unlabeled rows and write the second predict_proba column
# next to each row id.
y_test_pred = xgb_clsf.predict_proba(X_test)
ans = pd.DataFrame({'Id': id_test, 'Action': y_test_pred[:, 1]})
ans.to_csv('XGB-One-Hot.csv', index=False, columns=['Id', 'Action'])

In [None]:
# Score the training rows with the fitted model and save each label next to
# the second predict_proba column.
y_train_pred = xgb_clsf.predict_proba(X)
ans = pd.DataFrame(dict(y=y, y_pred=y_train_pred[:, 1]))
ans.to_csv('hw3p2_train.csv', columns=['y', 'y_pred'], index=False)

In [None]:
# Inspect the encoded test features.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(X_test)

In [None]:
# Inspect the dimensions of the test feature matrix.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(X_test.shape)

In [None]:
# Display the training labels (column 0 of the array loaded from train.csv).
y

In [None]:
# Five manually recorded scores and their mean.
# NOTE(review): presumably the per-fold CV scores of one configuration --
# their mean (0.8776068) equals the value stored under
# Acc[(0.22, 880, 16, 0.04)]; confirm.
A = [0.883687,
     0.882011,
     0.875992,
     0.871452,
     0.874892]
# Written as a call so the cell runs on both Python 2 and Python 3.
print(np.mean(A))

In [None]:
# Baseline hyper-parameters (the same values the tuning run starts from).
other_parameter_values = dict(learning_rate=0.2,
                              n_estimators=90,
                              max_depth=9,
                              objective='binary:logistic')

# Manually recorded scores, keyed by the configuration that produced them.
# NOTE(review): integer keys look like n_estimators values and tuple keys
# presumably follow (colsample_bytree, n_estimators, max_depth,
# min_child_weight), with leading/trailing entries omitted -- confirm.
Acc = {
    200: 0.8436422,
    400: 0.851833,
    800: 0.8564346,
    880: 0.856721,
    960: 0.8564782,
    (880, 16): 0.8569396,
    (0.3, 880, 16): 0.8596584,
    (0.22, 880, 16): 0.8597334,
    (0.22, 880, 16, 0.1): 0.8759102,
    (0.22, 880, 16, 0.04): 0.8776068,
}