In [45]:
from xgboost import plot_importance
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm
import catboost
import xgboost

In [46]:
train= pd.read_csv('./ISIC/train.csv')
test= pd.read_csv('./ISIC/test.csv')
sub   = pd.read_csv('./ISIC/sample_submission.csv')
train.head()

print(train.target.value_counts())
print("### Number of missing values\n", train.isnull().sum())

0    32542
1      584
Name: target, dtype: int64
### Number of missing values
 image_name                         0
patient_id                         0
sex                               65
age_approx                        68
anatom_site_general_challenge    527
diagnosis                          0
benign_malignant                   0
target                             0
dtype: int64


In [47]:
train['sex'] = train['sex'].fillna('na')
train['age_approx'] = train['age_approx'].fillna(0)
train['anatom_site_general_challenge'] = train['anatom_site_general_challenge'].fillna('na')

test['sex'] = test['sex'].fillna('na')
test['age_approx'] = test['age_approx'].fillna(0)
test['anatom_site_general_challenge'] = test['anatom_site_general_challenge'].fillna('na')


In [48]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

train.sex = le.fit_transform(train.sex)
train.anatom_site_general_challenge = le.fit_transform(train.anatom_site_general_challenge)

test.sex = le.fit_transform(test.sex)
test.anatom_site_general_challenge = le.fit_transform(test.anatom_site_general_challenge)
train.head()
print("### Number of missing values\n", train.isnull().sum())
print("### Number of missing values\n", test.isnull().sum())

### Number of missing values
 image_name                       0
patient_id                       0
sex                              0
age_approx                       0
anatom_site_general_challenge    0
diagnosis                        0
benign_malignant                 0
target                           0
dtype: int64
### Number of missing values
 image_name                       0
patient_id                       0
sex                              0
age_approx                       0
anatom_site_general_challenge    0
dtype: int64


In [49]:
x_train = train[['sex', 'age_approx','anatom_site_general_challenge']]
y_train = train['target']


x_test = test[['sex', 'age_approx','anatom_site_general_challenge']]
test['target'] = 0

In [50]:
xgboost_model = xgboost.XGBClassifier()
lightgbm_model = lightgbm.LGBMClassifier()
catboost_model = catboost.CatBoostClassifier()

In [51]:
feature_names = ['sex','age_approx','anatom_site_general_challenge']
ycol = ['target']


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


for fid, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]
    
    lgb_m = lgb_model = lightgbm_model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              eval_metric='auc',
                              early_stopping_rounds=100)
    pred_lgb_m = lgb_m.predict(test[feature_names], num_iteration=lgb_m.best_iteration_)
    
    
    ctb_m = catboost_model.fit(X_train,
                                  Y_train,
                                  eval_set=[(X_train, Y_train), (X_val, Y_val)],
                                  verbose=100,
                                  early_stopping_rounds=100)
    pred_ctb_m = ctb_m.predict(test[feature_names])
    
    test['target'] += (pred_lgb_m + pred_ctb_m) / kfold.n_splits
    

[1]	train's auc: 0.704507	train's binary_logloss: 0.0865366	valid's auc: 0.666301	valid's binary_logloss: 0.0871434
Training until validation scores don't improve for 100 rounds
[2]	train's auc: 0.714342	train's binary_logloss: 0.0854913	valid's auc: 0.678247	valid's binary_logloss: 0.0863249
[3]	train's auc: 0.721323	train's binary_logloss: 0.0847498	valid's auc: 0.688319	valid's binary_logloss: 0.0856666
[4]	train's auc: 0.720707	train's binary_logloss: 0.0841482	valid's auc: 0.686664	valid's binary_logloss: 0.0852354
[5]	train's auc: 0.723547	train's binary_logloss: 0.0837075	valid's auc: 0.687651	valid's binary_logloss: 0.0848755
[6]	train's auc: 0.724178	train's binary_logloss: 0.0833437	valid's auc: 0.686024	valid's binary_logloss: 0.0846534
[7]	train's auc: 0.724362	train's binary_logloss: 0.083036	valid's auc: 0.686019	valid's binary_logloss: 0.0844435
[8]	train's auc: 0.727329	train's binary_logloss: 0.0827915	valid's auc: 0.686002	valid's binary_logloss: 0.0843235
[9]	train's

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[27]	train's auc: 0.742175	train's binary_logloss: 0.0808918	valid's auc: 0.683952	valid's binary_logloss: 0.0839511
[28]	train's auc: 0.742518	train's binary_logloss: 0.0808547	valid's auc: 0.68428	valid's binary_logloss: 0.0839723
[29]	train's auc: 0.742706	train's binary_logloss: 0.0808158	valid's auc: 0.684384	valid's binary_logloss: 0.083973
[30]	train's auc: 0.743089	train's binary_logloss: 0.080776	valid's auc: 0.683438	valid's binary_logloss: 0.0839883
[31]	train's auc: 0.743241	train's binary_logloss: 0.0807436	valid's auc: 0.682451	valid's binary_logloss: 0.0840245
[32]	train's auc: 0.74334	train's binary_logloss: 0.0807158	valid's auc: 0.68218	valid's binary_logloss: 0.0840328
[33]	train's auc: 0.743521	train's binary_logloss: 0.0806879	valid's auc: 0.682092	valid's binary_logloss: 0.0840596
[34]	train's auc: 0.743838	train's binary_logloss: 0.0806584	valid's auc: 0.682608	valid's binary_logloss: 0.0840581
[35]	train's auc: 0.744095	train's binary_logloss: 0.0806378	valid's 

Learning rate set to 0.071195
0:	learn: 0.5651025	test: 0.5651025	test1: 0.5651055	best: 0.5651055 (0)	total: 23.4ms	remaining: 23.4s
100:	learn: 0.0818201	test: 0.0818201	test1: 0.0839000	best: 0.0838430 (77)	total: 1.04s	remaining: 9.24s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.08384304212
bestIteration = 77

Shrink model to first 78 iterations.
[1]	train's auc: 0.712854	train's binary_logloss: 0.0863631	valid's auc: 0.663614	valid's binary_logloss: 0.0873136
Training until validation scores don't improve for 100 rounds
[2]	train's auc: 0.717864	train's binary_logloss: 0.0853105	valid's auc: 0.660757	valid's binary_logloss: 0.0865982
[3]	train's auc: 0.72071	train's binary_logloss: 0.0845735	valid's auc: 0.658661	valid's binary_logloss: 0.0861362
[4]	train's auc: 0.721976	train's binary_logloss: 0.0840114	valid's auc: 0.66003	valid's binary_logloss: 0.0857757
[5]	train's auc: 0.725841	train's binary_logloss: 0.0835499	valid's auc: 0.662735	valid's binary_l

[79]	train's auc: 0.752817	train's binary_logloss: 0.0797451	valid's auc: 0.677998	valid's binary_logloss: 0.0858802
[80]	train's auc: 0.752888	train's binary_logloss: 0.0797321	valid's auc: 0.677964	valid's binary_logloss: 0.0858991
[81]	train's auc: 0.752886	train's binary_logloss: 0.0797298	valid's auc: 0.677749	valid's binary_logloss: 0.085902
[82]	train's auc: 0.753018	train's binary_logloss: 0.079725	valid's auc: 0.67724	valid's binary_logloss: 0.085923
[83]	train's auc: 0.753057	train's binary_logloss: 0.0797206	valid's auc: 0.677381	valid's binary_logloss: 0.0859785
[84]	train's auc: 0.753227	train's binary_logloss: 0.0797078	valid's auc: 0.676864	valid's binary_logloss: 0.0860197
[85]	train's auc: 0.753346	train's binary_logloss: 0.0796956	valid's auc: 0.677034	valid's binary_logloss: 0.0860503
[86]	train's auc: 0.753459	train's binary_logloss: 0.079685	valid's auc: 0.676824	valid's binary_logloss: 0.0860699
[87]	train's auc: 0.753493	train's binary_logloss: 0.0796822	valid's 

[55]	train's auc: 0.749713	train's binary_logloss: 0.0800166	valid's auc: 0.660165	valid's binary_logloss: 0.08611
[56]	train's auc: 0.74997	train's binary_logloss: 0.080001	valid's auc: 0.659501	valid's binary_logloss: 0.0861658
[57]	train's auc: 0.750095	train's binary_logloss: 0.0799885	valid's auc: 0.65951	valid's binary_logloss: 0.0861846
[58]	train's auc: 0.750188	train's binary_logloss: 0.0799787	valid's auc: 0.659871	valid's binary_logloss: 0.0861872
[59]	train's auc: 0.750212	train's binary_logloss: 0.0799736	valid's auc: 0.659611	valid's binary_logloss: 0.0862344
[60]	train's auc: 0.750293	train's binary_logloss: 0.0799476	valid's auc: 0.659684	valid's binary_logloss: 0.0862467
[61]	train's auc: 0.750413	train's binary_logloss: 0.0799359	valid's auc: 0.660266	valid's binary_logloss: 0.0862838
[62]	train's auc: 0.750406	train's binary_logloss: 0.0799314	valid's auc: 0.660557	valid's binary_logloss: 0.0863406
[63]	train's auc: 0.750462	train's binary_logloss: 0.0799278	valid's 

[34]	train's auc: 0.741652	train's binary_logloss: 0.0806805	valid's auc: 0.691739	valid's binary_logloss: 0.0841267
[35]	train's auc: 0.741941	train's binary_logloss: 0.0806544	valid's auc: 0.69117	valid's binary_logloss: 0.084136
[36]	train's auc: 0.742055	train's binary_logloss: 0.0806302	valid's auc: 0.691034	valid's binary_logloss: 0.0841705
[37]	train's auc: 0.742177	train's binary_logloss: 0.0806093	valid's auc: 0.690713	valid's binary_logloss: 0.0842076
[38]	train's auc: 0.742555	train's binary_logloss: 0.0805874	valid's auc: 0.689983	valid's binary_logloss: 0.0842194
[39]	train's auc: 0.742722	train's binary_logloss: 0.08055	valid's auc: 0.690981	valid's binary_logloss: 0.0842341
[40]	train's auc: 0.742947	train's binary_logloss: 0.0805266	valid's auc: 0.690695	valid's binary_logloss: 0.0842556
[41]	train's auc: 0.743386	train's binary_logloss: 0.0805033	valid's auc: 0.690759	valid's binary_logloss: 0.0842353
[42]	train's auc: 0.743594	train's binary_logloss: 0.0804825	valid's

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.08372097369
bestIteration = 86

Shrink model to first 87 iterations.
[1]	train's auc: 0.702692	train's binary_logloss: 0.0866522	valid's auc: 0.675051	valid's binary_logloss: 0.0865594
Training until validation scores don't improve for 100 rounds
[2]	train's auc: 0.71563	train's binary_logloss: 0.0855611	valid's auc: 0.674105	valid's binary_logloss: 0.0857264
[3]	train's auc: 0.7238	train's binary_logloss: 0.084771	valid's auc: 0.681972	valid's binary_logloss: 0.0851661
[4]	train's auc: 0.728408	train's binary_logloss: 0.0841997	valid's auc: 0.680321	valid's binary_logloss: 0.0848001
[5]	train's auc: 0.729787	train's binary_logloss: 0.0837134	valid's auc: 0.676882	valid's binary_logloss: 0.0845218
[6]	train's auc: 0.73149	train's binary_logloss: 0.0833264	valid's auc: 0.675318	valid's binary_logloss: 0.0843049
[7]	train's auc: 0.733036	train's binary_logloss: 0.0830162	valid's auc: 0.677654	valid's binary_logloss: 0.0

[82]	train's auc: 0.751679	train's binary_logloss: 0.0800354	valid's auc: 0.659733	valid's binary_logloss: 0.0854777
[83]	train's auc: 0.75174	train's binary_logloss: 0.0800307	valid's auc: 0.661146	valid's binary_logloss: 0.0855003
[84]	train's auc: 0.751792	train's binary_logloss: 0.0800242	valid's auc: 0.659414	valid's binary_logloss: 0.0855302
[85]	train's auc: 0.751819	train's binary_logloss: 0.0800204	valid's auc: 0.659607	valid's binary_logloss: 0.0855366
[86]	train's auc: 0.751865	train's binary_logloss: 0.0800166	valid's auc: 0.659619	valid's binary_logloss: 0.085556
[87]	train's auc: 0.751878	train's binary_logloss: 0.080012	valid's auc: 0.658909	valid's binary_logloss: 0.0855703
[88]	train's auc: 0.751872	train's binary_logloss: 0.0800094	valid's auc: 0.660617	valid's binary_logloss: 0.0855757
[89]	train's auc: 0.751947	train's binary_logloss: 0.0799917	valid's auc: 0.660431	valid's binary_logloss: 0.085598
[90]	train's auc: 0.751944	train's binary_logloss: 0.079988	valid's 

In [52]:
test.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,target
0,ISIC_0052060,IP_3579794,1,70.0,2,0.0
1,ISIC_0052349,IP_7782715,1,40.0,1,0.0
2,ISIC_0058510,IP_7960270,0,55.0,5,0.0
3,ISIC_0073313,IP_6375035,0,50.0,5,0.0
4,ISIC_0073502,IP_0589375,0,45.0,1,0.0


In [53]:
sub.target = test.target
sub.to_csv('./submission.csv',index=False)