In [64]:
import argparse
import csv
import os

from keras import applications
from keras.applications.mobilenet import preprocess_input
from keras.preprocessing import image
import numpy as np
import pandas
import time
import h5py
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,confusion_matrix

# Set TF_CPP_MIN_LOG_LEVEL to '3' to quiet the native (C++) logging of the
# backend (presumably TensorFlow -- confirm).
# NOTE(review): this assignment runs after the keras imports above; the
# variable is normally set before the backend is first imported -- verify
# that it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [14]:
# MobileNet used purely as a fixed feature extractor: ImageNet weights, no
# classification head, and global average pooling so that every 224x224 RGB
# image is reduced to one flat feature vector.
model = applications.mobilenet.MobileNet(
    input_shape=(224, 224, 3),
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

In [15]:
# Load the training metadata table; later cells read its `image_name`,
# `gender` and `detected` columns.
data = pandas.read_csv('input/train.csv')

In [81]:
# Show the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,row_id,age,gender,view_position,image_name,detected
0,id_0,45,M,0,scan_0000.png,class_3
1,id_1,57,F,0,scan_0001.png,class_3
2,id_10,58,M,0,scan_00010.png,class_3
3,id_1000,64,M,0,scan_0001000.png,class_6
4,id_10000,33,M,1,scan_00010000.png,class_3


In [25]:
# Run every training image through the MobileNet extractor and collect one
# flattened feature vector per image that exists on disk.
n = len(data)

df_features = []    # feature vectors, in row order, for the images that were found
missing_rows = []   # positional indices of rows whose image file is absent
start_time = time.time()
for i in range(n):
    # Look the file name up by column label instead of a hard-coded position.
    img_path = os.path.join('input', 'train_', data['image_name'].iloc[i])
    if not os.path.isfile(img_path):
        # Record the gap instead of dropping it silently: a skipped row leaves
        # df_features shorter than `data`, so anything that pairs the two
        # row-by-row (features vs. labels) would be misaligned.
        missing_rows.append(i)
        continue

    if i % 100 == 0:
        print('is file: {}'.format(img_path))
        print(i)
        print ('time elapsed: ' + str((time.time()-start_time)/60)+ ' min')

    # load the image, resized to the 224 x 224 input the model was built with
    img = image.load_img(img_path, target_size=(224, 224))
    # convert to a numpy array of shape (224, 224, 3) (channels last, matching
    # the model's input_shape)
    x = image.img_to_array(img)
    # add a leading batch axis -> (1, 224, 224, 3), since predict() expects a
    # batch of images
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    # extract the pooled features and store them as a 1-D vector
    features = model.predict(x)
    features = features.flatten()
    df_features.append(features)

if missing_rows:
    print('WARNING: skipped {} of {} rows with no image file; first row positions: {}'.format(
        len(missing_rows), n, missing_rows[:10]))

is file: input/train_/scan_0000.png
0
time elapsed: 7.62939453125e-06 min
is file: input/train_/scan_00010145.png
100
time elapsed: 0.23191694418589273 min
is file: input/train_/scan_00010286.png
200
time elapsed: 0.47322530349095665 min
is file: input/train_/scan_00010446.png
300
time elapsed: 0.7173392613728841 min
is file: input/train_/scan_00010604.png
400
time elapsed: 0.972936753431956 min
is file: input/train_/scan_00010762.png
500
time elapsed: 1.2198948780695598 min
is file: input/train_/scan_00010907.png
600
time elapsed: 1.4603290518124898 min
is file: input/train_/scan_00011061.png
700
time elapsed: 1.7053731679916382 min
is file: input/train_/scan_00011206.png
800
time elapsed: 1.9440357168515523 min
is file: input/train_/scan_00011363.png
900
time elapsed: 2.1940085172653196 min
is file: input/train_/scan_00011508.png
1000
time elapsed: 2.4366525212923684 min
is file: input/train_/scan_00011669.png
1100
time elapsed: 2.6931780854860943 min
is file: input/train_/scan_00011

is file: input/train_/scan_00025102.png
10100
time elapsed: 24.47081842025121 min
is file: input/train_/scan_00025248.png
10200
time elapsed: 24.708692113558453 min
is file: input/train_/scan_00025390.png
10300
time elapsed: 24.951989897092183 min
is file: input/train_/scan_00025545.png
10400
time elapsed: 25.196199786663055 min
is file: input/train_/scan_00025679.png
10500
time elapsed: 25.434403860569 min
is file: input/train_/scan_00025825.png
10600
time elapsed: 25.677616035938264 min
is file: input/train_/scan_00025981.png
10700
time elapsed: 25.92256119251251 min
is file: input/train_/scan_00026145.png
10800
time elapsed: 26.1655113418897 min
is file: input/train_/scan_00026290.png
10900
time elapsed: 26.405546708901724 min
is file: input/train_/scan_00026448.png
11000
time elapsed: 26.64571396112442 min
is file: input/train_/scan_0002659.png
11100
time elapsed: 26.883683824539183 min
is file: input/train_/scan_00026728.png
11200
time elapsed: 27.120201754570008 min
is file: inpu

In [41]:
# save features
# The context manager closes the HDF5 file even if building or writing the
# dataset raises, so no half-open handle is left behind.
with h5py.File('feature', 'w') as h5f_data:
    h5f_data.create_dataset('dataset_1', data=np.array(df_features))

In [54]:
# Target column as a plain Python list. Series.tolist() replaces the
# deprecated Series.as_matrix().tolist() chain, which no longer exists in
# current pandas releases.
labels = data['detected'].tolist()

In [55]:
# Map the class-name strings to integer codes; `le` stays fitted on the
# target classes afterwards.
le = LabelEncoder()
labels = le.fit_transform(labels)

In [82]:
# Build the design matrix: the extracted image features side by side with the
# remaining metadata columns of `data`.
# concat(axis=1) aligns on the index, so if any image was skipped during
# extraction the feature rows would be shifted against their metadata (and
# labels) and padded with NaN. Fail loudly instead of training on that.
if len(df_features) != len(data):
    raise ValueError(
        'feature/metadata row mismatch: {} feature vectors for {} rows in data'.format(
            len(df_features), len(data)))
X = pandas.DataFrame(df_features)
X = pandas.concat([X,data.drop(['row_id','detected','image_name','gender'],axis=1)],axis=1)

In [83]:
# Encode gender with its own encoder. Re-fitting the shared `le` here would
# overwrite the target-class mapping learned earlier, making it impossible to
# decode predicted class ids back to their names.
gender_le = LabelEncoder()
X['gender'] = gender_le.fit_transform(data['gender'])

In [84]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every score below, reproducible across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(X,labels,test_size=0.3,random_state=42)

In [None]:
# Depth-limited decision tree on the combined features. random_state is fixed
# so that repeated fits on the same data give the same tree.
tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)
tree_clf.fit(X_train,Y_train)

In [None]:
# Accuracy on the held-out set. Arguments follow sklearn's
# (y_true, y_pred) order; the score itself is the same either way.
Y_pred = tree_clf.predict(X_test)
print (accuracy_score(Y_test,Y_pred))

In [None]:
# Confusion matrix on the held-out set. sklearn expects (y_true, y_pred):
# rows are then the true classes and columns the predicted ones. Passing the
# arguments the other way round prints the transposed matrix.
Y_pred = tree_clf.predict(X_test)
print (confusion_matrix(Y_test,Y_pred))

In [73]:
# List the distinct class ids the model actually predicted on the test set.
np.unique(Y_pred)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])