In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, metrics
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
#from sklearn.grid_search import GridSearchCV   #Perforing grid search
#import matplotlib.pylab as plt
#%matplotlib inline
from tqdm import tqdm
import datetime
from time import strftime

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials



In [2]:
working_path = "/home/watts/lal/Kaggle/lung_cancer/"

In [3]:
def get_current_date():
    return strftime('%Y%m%d')

In [4]:
num_slices = 16
img_width = 128
img_height = 128

In [5]:
train_fname = 'cache/my_train_%d_%d_%d_%s.csv' % (num_slices, img_width, img_height, get_current_date())
test_fname = 'cache/my_test_%d_%d_%d_%s.csv' % (num_slices, img_width, img_height, get_current_date())
train = pd.read_csv(working_path+train_fname, sep=',')
test = pd.read_csv(working_path+test_fname, sep=',')
target = 'output'
idcol = 'id'
scan_folder = 'scan_folder'

In [7]:
df = train
df = df.drop('output',axis=1)
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X = df
y = train['output']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)


def objective(space):

    clf = xgb.XGBClassifier(learning_rate=0.325,
                            silent=True,
                            objective="binary:logistic",
                            nthread=-1,
                            gamma=0.85,
                            min_child_weight=5,
                            max_delta_step=1,
                            subsample=0.85,
                            colsample_bytree=0.55,
                            colsample_bylevel=1,
                            reg_alpha=0.5,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            base_score=0.5,
                            seed=0,
                            missing=None,
                            n_estimators=1920, max_depth=6)

    
    eval_set  = [( X_train, y_train), (X_test, y_test)]

    clf.fit(X_train, y_train,
            eval_set=eval_set, eval_metric="logloss", 
            early_stopping_rounds=100)

    pred = clf.predict_proba(X_test)[:,1]
    loss = log_loss(y_test, pred)
    print "logloss:", loss

    return{'loss':loss, 'status': STATUS_OK }



In [7]:
space = {
        'max_depth': hp.quniform('max_depth', 1, 13, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 5),
        'silent' : 1
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print best

[0]	validation_0-logloss:0.635405	validation_1-logloss:0.619989
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.608509	validation_1-logloss:0.582556
[2]	validation_0-logloss:0.59554	validation_1-logloss:0.562219
[3]	validation_0-logloss:0.588478	validation_1-logloss:0.550914
[4]	validation_0-logloss:0.584632	validation_1-logloss:0.544298
[5]	validation_0-logloss:0.583211	validation_1-logloss:0.540378
[6]	validation_0-logloss:0.582713	validation_1-logloss:0.538711
[7]	validation_0-logloss:0.582285	validation_1-logloss:0.536989
[8]	validation_0-logloss:0.581909	validation_1-logloss:0.534469
[9]	validation_0-logloss:0.581293	validation_1-logloss:0.535781
[10]	validation_0-logloss:0.581119	validation_1-logloss:0.534005
[11]	validation_0-logloss:0.581089	validation_1-logloss:0.533669
[12]	validation_0-logloss:0.581046	validation_1-logloss:0.532944


In [7]:
df = train
df = df.drop('output',axis=1)
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X = df
y = train['output']

df1 = test
df1 = df1.drop('id',axis=1)
df1 = df1.drop('scan_folder',axis=1)
X_test = df1

clf = xgb.XGBClassifier(learning_rate=0.275,
                        silent=True,
                        objective="binary:logistic",
                        nthread=-1,
                        gamma=0.85,
                        min_child_weight=5,
                        max_delta_step=1,
                        subsample=0.85,
                        colsample_bytree=0.7,
                        colsample_bylevel=1,
                        reg_alpha=0.5,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        base_score=0.5,
                        seed=0,
                        missing=None,
                        n_estimators=360, max_depth=4)


eval_set  = [( X, y)]

clf.fit(X, y,
        eval_set=eval_set, eval_metric="logloss", 
        early_stopping_rounds=100)


#loss = log_loss(y_test, pred)
#print "logloss:", loss

#return{'loss':loss, 'status': STATUS_OK }



[0]	validation_0-logloss:0.639927
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.609495
[2]	validation_0-logloss:0.593302
[3]	validation_0-logloss:0.583186
[4]	validation_0-logloss:0.57759
[5]	validation_0-logloss:0.574911
[6]	validation_0-logloss:0.572954
[7]	validation_0-logloss:0.572295
[8]	validation_0-logloss:0.571583
[9]	validation_0-logloss:0.571388
[10]	validation_0-logloss:0.571285
[11]	validation_0-logloss:0.571317
[12]	validation_0-logloss:0.571287
[13]	validation_0-logloss:0.571246
[14]	validation_0-logloss:0.571253
[15]	validation_0-logloss:0.571146
[16]	validation_0-logloss:0.571119
[17]	validation_0-logloss:0.57112
[18]	validation_0-logloss:0.571117
[19]	validation_0-logloss:0.571117
[20]	validation_0-logloss:0.571117
[21]	validation_0-logloss:0.570998
[22]	validation_0-logloss:0.57101
[23]	validation_0-logloss:0.571011
[24]	validation_0-logloss:0.571017
[25]	validation_0-logloss:0.570778
[26]	validation_0-logloss:0.570564

In [None]:
Y_pred = clf.predict_proba(X_test)[:,0]

In [8]:
print Y_pred

[ 0.21613663  0.21613663  0.22965464  0.26549712  0.26549712  0.26549712
  0.22965464  0.25263828  0.22965464  0.26549712  0.26549712  0.22965464
  0.26549712  0.28767926  0.26549712  0.26549712  0.26549712  0.26549712
  0.26549712  0.26549712  0.26549712  0.26549712  0.26549712  0.26549712
  0.26549712  0.26549712  0.26549712  0.26549712  0.26549712  0.21613663
  0.26549712  0.26549712  0.26549712  0.26549712  0.26549712  0.25263828
  0.26549712  0.26549712  0.26549712  0.22965464  0.26549712  0.22965464
  0.21613663  0.26549712  0.21613663  0.26549712  0.26549712  0.22965464
  0.22965464  0.26549712  0.22965464  0.26549712  0.26549712  0.26549712
  0.26549712  0.22965464  0.26549712  0.22965464  0.22965464  0.26549712
  0.26549712  0.26549712  0.26549712  0.22965464  0.38552463  0.26549712
  0.22965464  0.26549712  0.26549712  0.25263828  0.26549712  0.26549712
  0.26549712  0.26549712  0.26549712  0.18332653  0.26549712  0.26549712
  0.26549712  0.26549712  0.21182558  0.26549712  0

In [9]:
df = train
df = df.drop('output',axis=1)
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X_train = df
Y_train = train['output']
T_train_xgb = xgb.DMatrix(X_train, Y_train)

params = {
    'learning_rate':0.275,
    'objective':"binary:logistic",
    'nthread':-1,
    'gamma':0.85,
    'min_child_weight':5,
    'max_delta_step':1,
    'subsample':0.85,
    'colsample_bytree':0.70,
    'colsample_bylevel':1,
    'reg_alpha':0.5,
    'reg_lambda':1,
    'scale_pos_weight':1,
    'base_score':0.5,
    'seed':0,
    'missing':None,
    'n_estimators':360, 
    'max_depth':4}



In [11]:
xgb.cv(params = params, dtrain = T_train_xgb, num_boost_round = 3000, nfold = 10,
                metrics = ['logloss'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) 

Unnamed: 0,test-logloss-mean,test-logloss-std,train-logloss-mean,train-logloss-std
0,0.639538,0.011503,0.639067,0.001923
1,0.609908,0.019655,0.60919,0.003445
2,0.593905,0.026467,0.592797,0.004123
3,0.585034,0.031405,0.583597,0.004726
4,0.580303,0.035182,0.578658,0.00498
5,0.577871,0.038183,0.575646,0.005003
6,0.576657,0.040257,0.57394,0.004993
7,0.576243,0.042032,0.573009,0.005064
8,0.576195,0.043371,0.572477,0.005114
9,0.576066,0.044488,0.572106,0.005169


In [18]:
bst = xgb.train(dtrain=T_train_xgb,params=params, num_boost_round=36)

In [19]:
df = test
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X_test = df

In [20]:
Y_pred = bst.predict(xgb.DMatrix(X_test))
print Y_pred

[ 0.21670665  0.21670665  0.2298065   0.26640013  0.26640013  0.26640013
  0.2298065   0.25024709  0.2298065   0.26640013  0.26640013  0.2298065
  0.26640013  0.26640013  0.26640013  0.26640013  0.26640013  0.26640013
  0.26640013  0.26640013  0.26640013  0.26640013  0.26640013  0.26640013
  0.26640013  0.26640013  0.26640013  0.26640013  0.26640013  0.21670665
  0.26640013  0.26640013  0.26640013  0.26640013  0.26640013  0.25024709
  0.26640013  0.26640013  0.26640013  0.2298065   0.26640013  0.2298065
  0.21670665  0.26640013  0.21670665  0.26640013  0.26640013  0.2298065
  0.2298065   0.26640013  0.2298065   0.26640013  0.26640013  0.26640013
  0.26640013  0.2298065   0.26640013  0.2298065   0.2298065   0.26640013
  0.26640013  0.26640013  0.26640013  0.2298065   0.33417788  0.26640013
  0.2298065   0.26640013  0.26640013  0.25024709  0.26640013  0.26640013
  0.26640013  0.26640013  0.26640013  0.27518812  0.26640013  0.26640013
  0.26640013  0.26640013  0.23504196  0.26640013  0.26

In [23]:
import string
data = []
cols = ['id', 'cancer']
df = test
for i, row in tqdm(df.iterrows(), total=len(df)):
    scan_folder = row['scan_folder']
    cancer = Y_pred[i]
    t = {
         'id': scan_folder,
         'cancer': cancer
        }
    data.append(t)
df_sub = pd.DataFrame(data)
df_sub = df_sub[cols]
now = str(datetime.datetime.now())
now = now.replace(' ','-')
now = now.replace(':','-')
print now
sub_fname = working_path+'cache/submissions/my_sub_%s.csv' % now
df_sub.to_csv(sub_fname, sep=',', index=False)
print 'Done'
print sub_fname

100%|██████████| 198/198 [00:00<00:00, 4583.43it/s]

2017-04-07-15-23-12.735597
Done
/home/watts/lal/Kaggle/lung_cancer/cache/submissions/my_sub_2017-04-07-15-23-12.735597.csv



