In [20]:
%matplotlib inline
# %matplotlib notebook

import os
import sys
import time
import pickle
import numpy as np
import csv
import pandas as pd
from scipy import ndimage
from skimage import measure
from skimage import feature
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import tensorflow as tf
import xgboost as xgb

from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import Normalizer

import util
import image_aug
import luna_preprocess
import kagl_preprocess
import kagl_test_unet5
import kagl_feature_util
import kagl_feature_set1

# Re-import every project-local module so that edits made to their source
# files are picked up by this kernel session.
for _stale_module in (util, image_aug, luna_preprocess, kagl_preprocess,
                      kagl_test_unet5, kagl_feature_util, kagl_feature_set1):
    reload(_stale_module)
del _stale_module

In [21]:
def flatten_list(a):
    """Concatenate the items of every iterable in `a` into one new list.

    Only one level of nesting is removed; the input is not modified.
    """
    return [item for chunk in a for item in chunk]


def plot_image_and_mask(image, mask):
    """Draw `image` in grayscale and overlay `mask` half-transparently.

    Both layers are drawn with plt.imshow on the current axes; the mask uses
    the 'jet' colormap at alpha 0.5 so the image stays visible underneath.
    """
    layers = (
        (image, {'cmap': 'gray'}),
        (mask, {'cmap': 'jet', 'alpha': 0.5}),
    )
    for layer, options in layers:
        plt.imshow(layer, **options)

    
def plot_nodule(patient, nodule_idx, new_size=48):
    p = patient
    nodule = p._nodule_regions[nodule_idx]
    print 'area =', nodule.area
    print 'shape = ', nodule.filled_image.shape
    t_image = p.nodule_local_image(nodule_idx, [new_size]*3)
    t_mask = p.nodule_local_mask(nodule_idx, [new_size]*3)
    z = new_size//2
    plt.figure()
    plt.imshow(t_image[z], cmap='gray')
    plt.show()
    plt.figure()
    plot_image_and_mask(t_image[z], t_mask[z])
    plt.show()
    
    
def prepare_data(names, meta_patient, feature_set):
    """Assemble the feature matrix and label vector for a list of patients.

    Args:
        names: iterable of patient names; each must be a key of both
            `meta_patient.labels` and `feature_set`.
        meta_patient: object whose `labels` mapping gives each patient's
            label (None for unlabelled patients).
        feature_set: mapping from patient name to that patient's feature
            vector.

    Returns:
        (feature_np, label_np): two float arrays with one row / entry per
        name, in the order given. A None label becomes nan.

    Side effects:
        Prints the feature matrix shape and, when there is at least one
        patient and every label is known, the positive / negative counts and
        fractions. For unlabelled or empty input only the shape is printed.
    """
    names = list(names)  # allow one-shot iterables; we walk the names twice
    # The builtin float is what the removed np.float alias used to point to.
    feature_np = np.asarray(
        [feature_set[name] for name in names], dtype=float)
    label_np = np.asarray(
        [meta_patient.labels[name] for name in names], dtype=float)

    print('feature.shape =  %s' % (feature_np.shape,))

    # Class statistics only make sense when every label is known. Checking
    # this explicitly replaces a bare `except: pass` that silently swallowed
    # the nan formatting error (and anything else that went wrong).
    num_labels = len(label_np)
    if num_labels > 0 and not np.isnan(label_np).any():
        num_pos = np.sum(label_np)
        pos_fraction = num_pos * 1.0 / num_labels
        print('pos = %d, %f' % (num_pos, pos_fraction))
        print('neg = %d, %f' % (num_labels - num_pos, 1.0 - pos_fraction))

    return feature_np, label_np

In [46]:
# Nodule properties that are aggregated over all nodules of a patient.
_NODULE_PROPERTIES = [
    'nodule_area',
    'nodule_pos_z',
    'nodule_pos_r',
    'nodule_pos_br',
    'nodule_mean_image',
    'nodule_std_image',
    'nodule_boundary_diff',
    'nodule_convex_area',
    'nodule_eccentricity',
    'nodule_equivalent_diameter',
    'nodule_major_axis_length',
    'nodule_minor_axis_length',
    'nodule_orientation',
    'nodule_perimeter',
    'nodule_solidity',
]

# One derived feature per (property, suffix) pair. The '_kertosis' spelling
# is kept as-is because these strings serve as lookup keys.
_NODULE_STAT_SUFFIXES = [
    '_mean', '_min', '_max', '_sum', '_std', '_skew', '_kertosis',
]

# Ten lung-level features first, then the per-property nodule aggregates
# (all suffixes of the first property, then all of the second, and so on).
feature_names = [
    'lung_volume',
    'lung_mean_hu',
    'lung_std_hu',
    'lung_min_dz',
    'lung_max_dz',
    'lung_min_r',
    'lung_max_r',
    'lung_mean_r',
    'lung_std_r',
    'num_nodules',
]
feature_names += [
    prop + suffix
    for prop in _NODULE_PROPERTIES
    for suffix in _NODULE_STAT_SUFFIXES
]

# Reverse lookup: feature name -> position in feature_names.
feature_name_ids = {name: idx for idx, name in enumerate(feature_names)}

In [40]:
# Sanity check: how many feature names were generated.
print(len(feature_names))

115


In [33]:
stage = 'stage1'
meta_patient = kagl_preprocess.MetaPatient(stage)

# Fraction of the labelled patients held out for validation.
# NOTE(review): the output recorded under this cell (279 of 1397 held out)
# corresponds to a fraction of 0.2, not 0.1 -- confirm which one is intended.
VALID_FRACTION = 0.1
# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 0

# Patients with a label form the training pool; patients whose label is None
# are the ones to predict. Names are visited in sorted order so the result
# does not depend on dict iteration order.
train_names = []
test_names = []
for name in sorted(meta_patient.labels.keys()):
    if meta_patient.labels[name] is not None:
        train_names.append(name)
    else:
        test_names.append(name)

# Shuffle with a private, seeded generator, then carve the first
# VALID_FRACTION of the shuffled names off as the validation set.
split_rng = np.random.RandomState(SPLIT_SEED)
train_names = split_rng.permutation(train_names)
num_valid_names = int(len(train_names) * VALID_FRACTION)
valid_names = train_names[0:num_valid_names]
train_names = train_names[num_valid_names:]

print('train = %d' % len(train_names))
print('valid = %d' % len(valid_names))
print('test = %d' % len(test_names))

train = 1118
valid = 279
test = 198


In [34]:
# Fixed seed so that the oversampling is reproducible.
OVERSAMPLE_SEED = 0

# Split the training patients by label. Working from the distinct names makes
# this cell idempotent: re-running it after train_names has already been
# oversampled starts again from the same set of patients.
pos_train_names = []
neg_train_names = []
for name in sorted(set(train_names)):
    if meta_patient.labels[name] >= 0.5:
        pos_train_names.append(name)
    else:
        neg_train_names.append(name)
print('#pos = %d' % len(pos_train_names))
print('#neg = %d' % len(neg_train_names))

if neg_train_names and not pos_train_names:
    raise ValueError('cannot oversample: no positive training patients')

# Balance the classes by drawing, with replacement, as many positive names as
# there are negative ones, then shuffle the combined list.
oversample_rng = np.random.RandomState(OVERSAMPLE_SEED)
pos_train_names = list(oversample_rng.choice(
    pos_train_names, size=len(neg_train_names), replace=True))
train_names = oversample_rng.permutation(pos_train_names + neg_train_names)

print('train = %d' % len(train_names))

#pos = 280
#neg = 838
train = 1676


In [35]:
# The file holds a single pickled Python object, which .item() unwraps
# (presumably a dict keyed by patient name, given how prepare_data indexes it).
# NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed.
# NOTE(security): unpickling can execute arbitrary code -- only load feature
# files that were produced by this project.
feature_set = np.load(
    'kagl_output_feature_set1.npy', allow_pickle=True).item()

train_feature_np, train_label_np = prepare_data(
    train_names, meta_patient, feature_set)

valid_feature_np, valid_label_np = prepare_data(
    valid_names, meta_patient, feature_set)

# Test patients have no labels, so the label array is discarded.
test_feature_np, _ = prepare_data(
    test_names, meta_patient, feature_set)

feature.shape =  (1676, 115)
pos = 838, 0.500000
neg = 838, 0.500000
feature.shape =  (279, 115)
pos = 82, 0.293907
neg = 197, 0.706093
feature.shape =  (198, 115)


In [36]:
# Short aliases used by the model cells: x_* hold features, y_* hold labels.
x_train, y_train = train_feature_np, train_label_np
x_valid, y_valid = valid_feature_np, valid_label_np
x_test = test_feature_np

In [37]:
# Fixed seed so that, for the same training data, the fitted forest and the
# metrics printed below are reproducible across kernel restarts.
RF_SEED = 0

clf_rf = RF(n_estimators=10000, criterion='entropy', max_depth=None,
            n_jobs=1, random_state=RF_SEED)
clf_rf.fit(x_train, y_train)

# predict_proba returns one column per class; column 1 is used as the
# predicted probability of the positive label throughout.
y_pred_train = clf_rf.predict_proba(x_train)[:, 1]
print('logloss(train) = ' +
      str(kagl_feature_util.logloss(y_train, y_pred_train)))

y_pred_valid = clf_rf.predict_proba(x_valid)[:, 1]
print('logloss(valid) = ' +
      str(kagl_feature_util.logloss(y_valid, y_pred_valid)))

# Hard predictions for the report use a 0.5 probability threshold.
print('Random forest')
print(classification_report(
        y_valid, y_pred_valid >= 0.5, target_names=["No Cancer", "Cancer"]))

logloss(train) = 0.0745810455981
logloss(valid) = 0.543295201
Random forest
             precision    recall  f1-score   support

  No Cancer       0.74      0.93      0.83       197
     Cancer       0.59      0.23      0.33        82

avg / total       0.70      0.73      0.68       279



In [19]:
# Score the unlabelled patients with the fitted forest and write the
# submission file; the helper's return value is left as the cell output.
print('Write submission: Random forest')
test_proba = clf_rf.predict_proba(x_test)
y_pred_test = test_proba[:, 1]
kagl_feature_util.write_submission_file(test_names, y_pred_test, 'rf-bal')

Write submission: Random forest
Write lcad-rf-bal.csv


'lcad-rf-bal.csv'

In [43]:
# (importance rounded to 4 decimals, feature name) pairs, most important
# first. Pairs whose rounded importances tie are ordered by name, descending.
feature_imp = sorted(
    ((round(score, 4), label)
     for score, label in zip(clf_rf.feature_importances_, feature_names)),
    reverse=True)

In [64]:
# Indices (positions in feature_names) of the 30 top-ranked features,
# printed as one comma-separated line.
t = list(feature_name_ids[label] for _, label in feature_imp[:30])
print(','.join(map(str, t)))

14,12,10,54,89,13,91,15,1,23,2,26,28,59,22,75,32,77,61,16,17,25,82,103,63,114,42,19,68,56


In [58]:
# Shape of the training matrix restricted to the columns listed in t.
x_train[:, t].shape

(1676, 20)