In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation, metrics


import xgboost
from xgboost import DMatrix

In [3]:
train = pd.read_csv("train_without_oversampling.csv")
test = pd.read_csv("test.csv")

In [4]:
y = train.target
train.drop("target", axis=1, inplace=True)
train.drop("client_id", axis=1, inplace=True)

In [5]:
train_data, test_data, y_train, y_test = train_test_split(train, y, test_size = 0.4, stratify = y)

In [8]:
dtrain = DMatrix(train_data.to_sparse(), label=y_train)
dtest = DMatrix(test_data.to_sparse(), label=y_test)

In [9]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [10]:
def objective(space):
    print space
    num_round = int(space['n_estimators'])
    max_depth = space['max_depth']
    lr = space['learning_rate']
    xgb = xgboost.XGBClassifier(max_depth=max_depth, learning_rate=lr, n_estimators=num_round)
    xgb.fit(train_data.to_sparse(), y_train)
    y_ans = xgb.predict(test_data.to_sparse())
    res = metrics.roc_auc_score(y_test, y_ans)
    print res
    return{'loss': -res, 'status': STATUS_OK}


space ={
        'n_estimators' : hp.quniform('n_estimators', 100, 600, 1),
        'learning_rate' : hp.uniform('eta', 0.01, 0.2),
        'max_depth' : hp.quniform('max_depth', 3, 8, 1),
}


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=75,
            trials=trials)

print best

{'n_estimators': 119.0, 'learning_rate': 0.12778018840981303, 'max_depth': 3.0}
0.537191010749
{'n_estimators': 453.0, 'learning_rate': 0.18056655455932305, 'max_depth': 7.0}
0.563816185656
{'n_estimators': 181.0, 'learning_rate': 0.14102684819328282, 'max_depth': 8.0}
0.556521194192
{'n_estimators': 373.0, 'learning_rate': 0.17672974675988648, 'max_depth': 5.0}


KeyboardInterrupt: 

In [None]:
xgb = xgboost.XGBClassifier(max_depth=8, learning_rate=0.5, n_estimators=515)
xgb.fit(train.to_sparse(), y)
y_ans = xgb.predict(test.to_sparse())

In [None]:
ans['val'] = y_ans

In [None]:
ans.head()

In [None]:
ans.rename(columns={"client_id":"_ID_", "val":"_VAL_"}, inplace=True)

In [None]:
ans.to_csv("ans.csv", index=False, sep=",", header=False)