In [140]:
import pandas as pd
import xgboost as xgb
import numpy as np
from scipy.stats import uniform, randint
from sklearn.metrics import auc, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split


In [141]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [142]:
data = pd.read_csv('train_final.csv')
data = data.sample(frac=1)

Y = data.Y
X = pd.DataFrame(data.drop('Id', axis=1))
X = X.drop('Y', axis=1)

X = X.drop('f24', axis=1)
X = X.drop('f20', axis=1)
X = X.drop('f11', axis=1)
X = X.drop('f6', axis=1)
X = X.drop('f18', axis=1)
X = X.drop('f9', axis=1)
X = X.drop('f5', axis=1)
X = X.drop('f22', axis=1)
X = X.drop('f2', axis=1)
X = X.drop('f21', axis=1)


[X.iloc[i, j] for i,j in zip(*np.where(pd.isnull(X)))]
X

Unnamed: 0,f1,f3,f4,f7,f8,f10,f12,f13,f14,f15,f16,f17,f19,f23
4119,45903,1.77,117891,117890,117879,0,1,117880,-2.872073,20440,117878,19721,117879,1
1984,27123,1.77,118327,117961,117906,0,1,117908,1.101435,7610,120383,290919,117905,1928
13616,74908,1.77,118386,117961,134559,0,2,117908,1.000000,71186,118522,290919,117905,1
9696,32585,3.54,118291,118290,260942,0,5,120346,1.000000,5320,119598,118424,120344,10
16621,25273,3.54,118300,117961,204593,0,1,118332,0.000000,4736,118301,118331,307024,5
4497,34860,24.78,118386,117961,117906,1,1,117908,1.588628,5511,119954,290919,117905,1
6751,23920,1.77,118343,117961,132719,0,2,118332,0.386275,5323,119598,118331,307024,1
11029,20293,3.54,118150,117916,117913,0,1,117888,1.000000,77372,117884,117887,117885,1
11785,33145,1.77,118300,117961,279443,0,2,118779,3.124701,21033,123749,308574,118777,2
7600,80771,1.77,118052,117961,117906,0,1,118322,1.147093,4500,118881,290919,118321,22


In [143]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X,Y)

yhat = xgb_model.predict(X)

print(confusion_matrix(Y, yhat))

[[  597   653]
 [    0 20097]]


In [144]:
xgb_model = xgb.XGBRegressor()

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, .5),
    "learning_rate": uniform(0.03, 0.3),
    "max_depth": randint(2, 6),
    "n_estimators": randint(100,150)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=7, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X,Y)

report_best_scores(search.cv_results_, 1)

Fitting 7 folds for each of 200 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:  2.7min finished


Model with rank: 1
Mean validation score: 0.488 (std: 0.031)
Parameters: {'colsample_bytree': 0.8609771937961997, 'gamma': 0.46202087955371657, 'learning_rate': 0.10083495993522129, 'max_depth': 5, 'n_estimators': 122}



In [145]:
report_best_scores(search.cv_results_, 1)

Model with rank: 1
Mean validation score: 0.488 (std: 0.031)
Parameters: {'colsample_bytree': 0.8609771937961997, 'gamma': 0.46202087955371657, 'learning_rate': 0.10083495993522129, 'max_depth': 5, 'n_estimators': 122}



In [177]:
xgb_model = xgb.XGBRegressor(objective="binary:logistic", random_state=42, eval_metric="auc", colsample_bytree=.8609771937961997, gamma=.46202087955371657, learning_rate=.10083495993522129, max_depth=5, n_estimators=2500, subsample=.611)
xgb_model.fit(X,Y)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8609771937961997, eval_metric='auc',
       gamma=0.46202087955371657, learning_rate=0.10083495993522129,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=2500, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.611)

In [178]:
train_final = pd.read_csv("final_3000.csv")
train_final_Y = train_final.Y
train_final = train_final.drop('Id', axis=1)
train_final = train_final.drop('Y', axis=1)

train_final = train_final.drop('f24', axis=1)
train_final = train_final.drop('f20', axis=1)
train_final = train_final.drop('f11', axis=1)
train_final = train_final.drop('f6', axis=1)
train_final = train_final.drop('f18', axis=1)
train_final = train_final.drop('f9', axis=1)
train_final = train_final.drop('f5', axis=1)
train_final = train_final.drop('f22', axis=1)
train_final = train_final.drop('f2', axis=1)
train_final = train_final.drop('f21', axis=1)

yguess = xgb_model.predict(train_final)
yguess = pd.DataFrame(yguess)
yguess
    
score = roc_auc_score(train_final_Y, yguess)
score

1.0

In [183]:
x_test = pd.DataFrame(pd.read_csv('test_final.csv'))
x_test = x_test.drop('Id', 1)
x_test = x_test.drop('f24', axis=1)
x_test = x_test.drop('f20', axis=1)
x_test = x_test.drop('f11', axis=1)
x_test = x_test.drop('f6', axis=1)
x_test = x_test.drop('f18', axis=1)
x_test = x_test.drop('f9', axis=1)
x_test = x_test.drop('f5', axis=1)
x_test = x_test.drop('f22', axis=1)
x_test = x_test.drop('f2', axis=1)
x_test = x_test.drop('f21', axis=1)


yhat = xgb_model.predict(x_test)
yhat = pd.DataFrame(yhat)
    
yhat.to_csv("RF_Day2")



In [184]:
yhat

Unnamed: 0,0
0,0.982329
1,0.984725
2,1.000000
3,0.999999
4,0.999244
5,0.999992
6,0.985679
7,0.997986
8,0.999962
9,0.999994


In [109]:
sf

Unnamed: 0,Y
0,1
1,0
2,1
3,1
4,1
5,1
6,0
7,1
8,1
9,1
