In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, metrics
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
#from sklearn.grid_search import GridSearchCV   #Perforing grid search
#import matplotlib.pylab as plt
#%matplotlib inline
from tqdm import tqdm
import datetime
from time import strftime

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials



In [2]:
working_path = "/home/watts/lal/Kaggle/lung_cancer/"

In [3]:
def get_current_date():
    return strftime('%Y%m%d')

In [4]:
num_slices = 16
img_width = 128
img_height = 128

In [5]:
train_fname = 'cache/my_train_%d_%d_%d_%s.csv' % (num_slices, img_width, img_height, get_current_date())
test_fname = 'cache/my_test_%d_%d_%d_%s.csv' % (num_slices, img_width, img_height, get_current_date())
train = pd.read_csv(working_path+train_fname, sep=',')
test = pd.read_csv(working_path+test_fname, sep=',')
target = 'output'
idcol = 'id'
scan_folder = 'scan_folder'

In [19]:
df = train
df = df.drop('output',axis=1)
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X = df
y = train['output']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)


def objective(space):

    clf = xgb.XGBClassifier(learning_rate=0.325,
                            silent=True,
                            objective="binary:logistic",
                            nthread=-1,
                            gamma=0.85,
                            min_child_weight=5,
                            max_delta_step=1,
                            subsample=0.85,
                            colsample_bytree=0.55,
                            colsample_bylevel=1,
                            reg_alpha=0.5,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            base_score=0.5,
                            seed=0,
                            missing=None,
                            n_estimators=1920, max_depth=6)

    
    eval_set  = [( X_train, y_train), (X_test, y_test)]

    clf.fit(X_train, y_train,
            eval_set=eval_set, eval_metric="logloss", 
            early_stopping_rounds=100)

    pred = clf.predict_proba(X_test)[:,1]
    loss = log_loss(y_test, pred)
    print "logloss:", loss

    return{'loss':loss, 'status': STATUS_OK }



In [20]:
space = {
        'max_depth': hp.quniform('max_depth', 1, 13, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 5),
        'silent' : 1
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print best

[0]	validation_0-logloss:0.625758	validation_1-logloss:0.644953
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.593443	validation_1-logloss:0.628317
[2]	validation_0-logloss:0.576996	validation_1-logloss:0.621776
[3]	validation_0-logloss:0.568412	validation_1-logloss:0.62112
[4]	validation_0-logloss:0.563186	validation_1-logloss:0.622081
[5]	validation_0-logloss:0.561101	validation_1-logloss:0.624142
[6]	validation_0-logloss:0.559874	validation_1-logloss:0.626733
[7]	validation_0-logloss:0.559302	validation_1-logloss:0.627864
[8]	validation_0-logloss:0.559236	validation_1-logloss:0.628247
[9]	validation_0-logloss:0.559074	validation_1-logloss:0.628331
[10]	validation_0-logloss:0.558638	validation_1-logloss:0.629788
[11]	validation_0-logloss:0.558466	validation_1-logloss:0.630697
[12]	validation_0-logloss:0.558272	validation_1-logloss:0.630413


In [19]:
# df = train
# df = df.drop('output',axis=1)
# df = df.drop('id',axis=1)
# df = df.drop('scan_folder',axis=1)

# X = df
# y = train['output']

# df1 = test
# df1 = df1.drop('id',axis=1)
# df1 = df1.drop('scan_folder',axis=1)
# X_test = df1

# clf = xgb.XGBClassifier(learning_rate=0.275,
#                         silent=True,
#                         objective="binary:logistic",
#                         nthread=-1,
#                         gamma=0.85,
#                         min_child_weight=5,
#                         max_delta_step=1,
#                         subsample=0.85,
#                         colsample_bytree=0.7,
#                         colsample_bylevel=1,
#                         reg_alpha=0.5,
#                         reg_lambda=1,
#                         scale_pos_weight=1,
#                         base_score=0.5,
#                         seed=0,
#                         missing=None,
#                         n_estimators=360, max_depth=4)


# eval_set  = [( X, y)]

# clf.fit(X, y,
#         eval_set=eval_set, eval_metric="logloss", 
#         early_stopping_rounds=100)


# #loss = log_loss(y_test, pred)
# #print "logloss:", loss

# #return{'loss':loss, 'status': STATUS_OK }



[0]	validation_0-logloss:0.638777
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.60405
[2]	validation_0-logloss:0.583495
[3]	validation_0-logloss:0.570222
[4]	validation_0-logloss:0.558393
[5]	validation_0-logloss:0.551406
[6]	validation_0-logloss:0.544562
[7]	validation_0-logloss:0.53928
[8]	validation_0-logloss:0.535073
[9]	validation_0-logloss:0.53016
[10]	validation_0-logloss:0.523089
[11]	validation_0-logloss:0.520842
[12]	validation_0-logloss:0.518028
[13]	validation_0-logloss:0.514114
[14]	validation_0-logloss:0.511481
[15]	validation_0-logloss:0.508225
[16]	validation_0-logloss:0.505494
[17]	validation_0-logloss:0.502736
[18]	validation_0-logloss:0.50053
[19]	validation_0-logloss:0.498897
[20]	validation_0-logloss:0.49662
[21]	validation_0-logloss:0.494297
[22]	validation_0-logloss:0.492634
[23]	validation_0-logloss:0.490852
[24]	validation_0-logloss:0.489209
[25]	validation_0-logloss:0.485381
[26]	validation_0-logloss:0.484556
[

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0.85, learning_rate=0.275, max_delta_step=1, max_depth=4,
       min_child_weight=5, missing=None, n_estimators=360, nthread=-1,
       objective='binary:logistic', reg_alpha=0.5, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.85)

In [25]:
# Y_pred = clf.predict_proba(X_test)[:,1]

In [26]:
# print Y_pred

[ 0.08699444  0.22978897  0.0192046   0.16760391  0.69848216  0.06361359
  0.61828774  0.28674236  0.6011076   0.42416048  0.24720684  0.1588967
  0.14680041  0.22030134  0.29356506  0.11591086  0.02419939  0.24720684
  0.78091651  0.24720684  0.38362387  0.15189996  0.21789235  0.11570415
  0.31645468  0.4089081   0.28152153  0.20942698  0.38591024  0.24720684
  0.60422957  0.66303349  0.49712273  0.26599178  0.144554    0.84433132
  0.27494019  0.70779157  0.07171493  0.00707974  0.05020643  0.33456114
  0.19324997  0.6807245   0.24720684  0.12696113  0.0041235   0.24720684
  0.19095615  0.09423231  0.37932897  0.21789235  0.01726827  0.08672291
  0.3112314   0.5257929   0.19031876  0.03569041  0.42476988  0.02619242
  0.62740135  0.13240016  0.59366703  0.17957331  0.08321784  0.80739909
  0.08579277  0.37002903  0.24720684  0.02434442  0.44815499  0.26057982
  0.03770505  0.66303349  0.05098478  0.52095622  0.60422957  0.24720684
  0.09064041  0.24720684  0.26057982  0.20099826  0.

In [6]:
df = train
df = df.drop('output',axis=1)
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X_train = df
Y_train = train['output']
T_train_xgb = xgb.DMatrix(X_train, Y_train)

params = {
    'learning_rate':0.275,
    'objective':"binary:logistic",
    'nthread':-1,
    'gamma':0.85,
    'min_child_weight':5,
    'max_delta_step':1,
    'subsample':0.85,
    'colsample_bytree':0.70,
    'colsample_bylevel':1,
    'reg_alpha':0.5,
    'reg_lambda':1,
    'scale_pos_weight':1,
    'base_score':0.5,
    'seed':0,
    'missing':None,
    'n_estimators':360, 
    'max_depth':4}



In [7]:
xgb.cv(params = params, dtrain = T_train_xgb, num_boost_round = 3000, nfold = 10,
                metrics = ['logloss'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) 

Unnamed: 0,test-logloss-mean,test-logloss-std,train-logloss-mean,train-logloss-std
0,0.639769,0.008425,0.639661,0.001705
1,0.609944,0.016219,0.608066,0.00299
2,0.593903,0.021656,0.590084,0.003811
3,0.585241,0.026361,0.580181,0.004661
4,0.5794,0.029392,0.57406,0.00475
5,0.576126,0.031861,0.569615,0.004389
6,0.574269,0.034183,0.56688,0.004107
7,0.573418,0.03577,0.565054,0.004254
8,0.573197,0.036726,0.564216,0.003982
9,0.57291,0.037605,0.563336,0.004133


In [8]:
bst = xgb.train(dtrain=T_train_xgb,params=params, num_boost_round=24)

In [9]:
df = test
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X_test = df

In [10]:
Y_pred = bst.predict(xgb.DMatrix(X_test))
print Y_pred

[ 0.66201687  0.64009124  0.66201687  0.26508909  0.66201687  0.66201687
  0.3251701   0.66201687  0.66201687  0.3251701   0.3251701   0.66201687
  0.66201687  0.1957234   0.17502128  0.66201687  0.16406827  0.15314201
  0.3251701   0.3251701   0.24459207  0.28945377  0.66201687  0.17502128
  0.1957234   0.16406827  0.66201687  0.27547836  0.66201687  0.64009124
  0.64009124  0.26508909  0.64009124  0.23925266  0.66201687  0.17502128
  0.66201687  0.36071423  0.17502128  0.26508909  0.16406827  0.66201687
  0.3251701   0.22088213  0.66201687  0.66201687  0.24459207  0.24459207
  0.16406827  0.24459207  0.3251701   0.66201687  0.24459207  0.64009124
  0.23925266  0.36071423  0.15314201  0.66201687  0.64009124  0.24459207
  0.17502128  0.64009124  0.15314201  0.24459207  0.64009124  0.66201687
  0.17502128  0.66201687  0.64009124  0.66201687  0.3251701   0.64009124
  0.17502128  0.3251701   0.15314201  0.3251701   0.5637027   0.3251701
  0.24459207  0.66201687  0.1957234   0.23925266  0.

In [11]:
import string
data = []
cols = ['id', 'cancer']
df = test
for i, row in tqdm(df.iterrows(), total=len(df)):
    scan_folder = row['scan_folder']
    cancer = Y_pred[i]
    t = {
         'id': scan_folder,
         'cancer': cancer
        }
    data.append(t)
df_sub = pd.DataFrame(data)
df_sub = df_sub[cols]
now = str(datetime.datetime.now())
now = now.replace(' ','-')
now = now.replace(':','-')
print now
sub_fname = working_path+'cache/submissions/my_sub_%s.csv' % now
df_sub.to_csv(sub_fname, sep=',', index=False)
print 'Done'
print sub_fname

100%|██████████| 506/506 [00:00<00:00, 5563.74it/s]

2017-04-14-20-07-08.430095
Done
/home/watts/lal/Kaggle/lung_cancer/cache/submissions/my_sub_2017-04-14-20-07-08.430095.csv





In [44]:
df0 = train

df1 = test
df1['output'] = Y_pred

df = pd.concat([df0,df1])
df2 = df

df = df.drop('output',axis=1)
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)

X_train = df
Y_train = df2['output']
T_train_xgb = xgb.DMatrix(X_train, Y_train)

params = {
    'learning_rate':0.275,
    'objective':"binary:logistic",
    'nthread':-1,
    'gamma':0.85,
    'min_child_weight':5,
    'max_delta_step':1,
    'subsample':0.85,
    'colsample_bytree':0.70,
    'colsample_bylevel':1,
    'reg_alpha':0.5,
    'reg_lambda':1,
    'scale_pos_weight':1,
    'base_score':0.5,
    'seed':0,
    'missing':None,
    'n_estimators':360, 
    'max_depth':4}



In [45]:
xgb.cv(params = params, dtrain = T_train_xgb, num_boost_round = 3000, nfold = 10,
                metrics = ['logloss'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) 

Unnamed: 0,test-logloss-mean,test-logloss-std,train-logloss-mean,train-logloss-std
0,0.641908,0.004806,0.641298,0.001402
1,0.614422,0.008737,0.612775,0.001823
2,0.598861,0.011504,0.596838,0.002187
3,0.590058,0.013629,0.587819,0.002344
4,0.585408,0.015473,0.582581,0.002291
5,0.582941,0.017018,0.579532,0.002235
6,0.581633,0.017938,0.577726,0.002377
7,0.58119,0.018711,0.576725,0.002289
8,0.581142,0.019506,0.576051,0.002383
9,0.581066,0.020106,0.575731,0.002389


In [46]:
bst = xgb.train(dtrain=T_train_xgb,params=params, num_boost_round=12)

In [49]:
df = test
df = df.drop('id',axis=1)
df = df.drop('scan_folder',axis=1)
df = df.drop('output',axis=1)

X_test = df

In [50]:
Y_pred = bst.predict(xgb.DMatrix(X_test))
print Y_pred

[ 0.27321723  0.27321723  0.27321723  0.26597357  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.13396716  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.13396716  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.26641837  0.27321723  0.23962446  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.26641837  0.13396716  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.13396716  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.23962446  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723  0.27321723
  0.27321723  0.27321723  0.27321723  0.31205079  0

In [51]:
import string
data = []
cols = ['id', 'cancer']
df = test
for i, row in tqdm(df.iterrows(), total=len(df)):
    scan_folder = row['scan_folder']
    cancer = Y_pred[i]
    t = {
         'id': scan_folder,
         'cancer': cancer
        }
    data.append(t)
df_sub = pd.DataFrame(data)
df_sub = df_sub[cols]
now = str(datetime.datetime.now())
now = now.replace(' ','-')
now = now.replace(':','-')
print now
sub_fname = working_path+'cache/submissions/my_sub_%s.csv' % now
df_sub.to_csv(sub_fname, sep=',', index=False)
print 'Done'
print sub_fname

100%|██████████| 506/506 [00:00<00:00, 5856.90it/s]

2017-04-11-03-29-58.484109
Done
/home/watts/lal/Kaggle/lung_cancer/cache/submissions/my_sub_2017-04-11-03-29-58.484109.csv



