In [2]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn import datasets

from keras.models import Model,Sequential
from keras.layers import Activation, Dense,Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from scipy.sparse import issparse
import numpy as np
import pandas as pd

import os
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [0]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed over all elements of the given tensors.

    The numerator is the number of elements where the clipped, rounded
    product ``y_true * y_pred`` equals 1; the denominator is the number of
    rounded positives in ``y_true`` plus ``K.epsilon()``, which keeps the
    division defined when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    hits = K.sum(hit_mask)
    positive_mask = K.round(K.clip(y_true, 0, 1))
    positives = K.sum(positive_mask)
    return hits / (positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed over all elements of the given tensors.

    The numerator is the number of elements where the clipped, rounded
    product ``y_true * y_pred`` equals 1; the denominator is the number of
    rounded positives in ``y_pred`` plus ``K.epsilon()``, which keeps the
    division defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    hits = K.sum(hit_mask)
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    predicted = K.sum(predicted_mask)
    return hits / (predicted + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m``.

    Returns ``2 * (p * r) / (p + r + K.epsilon())``; the epsilon keeps the
    result defined (and equal to 0) when both precision and recall are 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

    

In [0]:
# Model builder implementation
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K
from keras.applications import ResNet50,VGG16,VGG19,InceptionV3

from keras import metrics

def create_model_vgg16(units=(64,32),num_classe=9,optimizer='rmsprop',final_act='sigmoid'):
  """Build and compile a VGG16-based classifier with a two-layer dense head.

  Args:
    units: sizes of the two hidden Dense layers; only ``units[0]`` and
      ``units[1]`` are read.
    num_classe: number of units in the output layer.
    optimizer: optimizer handed to ``model.compile``.
    final_act: activation of the output layer.

  Returns:
    A compiled ``Model`` that takes 128x128x3 inputs, uses binary
    cross-entropy as loss and reports 'acc', f1_m, precision_m and recall_m.
  """
  backbone = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

  # Classification head: Flatten -> (Dense + Dropout) x 2 -> output Dense
  features = Flatten()(backbone.output)
  hidden_a = Dense(units[0], activation='relu')(features)
  hidden_a = Dropout(0.2)(hidden_a)
  hidden_b = Dense(units[1], activation='relu')(hidden_a)
  hidden_b = Dropout(0.2)(hidden_b)
  predictions = Dense(num_classe, activation=final_act)(hidden_b)

  classifier = Model(inputs=backbone.input, outputs=predictions)

  # Freeze everything except the last seven layers of the assembled model
  # (the six head layers created above plus the layer just before them).
  for frozen_layer in classifier.layers[:-7]:
    frozen_layer.trainable = False

  classifier.compile(
      loss='binary_crossentropy',
      optimizer=optimizer,
      metrics=['acc', f1_m, precision_m, recall_m],
  )
  return classifier

In [0]:
# Read the CSV and load it into a pandas DataFrame
import ast

path_name="drive/My Drive/"
images_dir=path_name+"condor/images/"
# The result of os.path.exists() used to be computed and thrown away;
# report a missing images directory instead of silently ignoring it.
if not os.path.exists(images_dir):
  print("Warning: images directory not found: {}".format(images_dir))

data_frame=pd.read_csv(path_name+'dataset/condor_data.csv', index_col=[0])
# Parse the textual label collection of each row and drop duplicate labels.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code embedded in the CSV.
# assumes every 'labels' cell is a plain literal such as "['a', 'b']" — TODO confirm against the CSV
data_frame["labels"] = data_frame["labels"].apply(lambda x: list(set(ast.literal_eval(x))))

In [6]:
# Binarize the labels into multi-hot rows, e.g. [[0 0 1 0]]
from sklearn.preprocessing import MultiLabelBinarizer  
# y is reassigned two lines below with the binarized label matrix
X,y=data_frame.filename, data_frame.labels
mbl=MultiLabelBinarizer()
y=mbl.fit_transform(data_frame.labels)
# Class names, in the same order as the columns of y
columns= mbl.classes_

# Add one indicator column per class to data_frame (modifies it in place)
for id,column in enumerate(columns):
  data_frame[column]=y[:,id]

#data_frame=data_frame.query('arma==1 or bandeja==1 or fonte==1 or tomada==1 and municao==0')
print(data_frame.describe())

# Total number of rows in the dataset
total=len(data_frame.index)
print(total)
def analizar_pesos(data_frame,columns):
  """Compute an inverse-frequency weight for each label column.

  For the column at position ``i`` of ``columns`` the weight is
  ``len(data_frame) / (number of rows where that column equals 1)``.
  A column with no positive rows gets weight 0.

  Args:
    data_frame: DataFrame containing one indicator column per entry of
      ``columns``.
    columns: iterable of column names; the position of each name becomes
      its key in the result.

  Returns:
    dict mapping each column position to its weight.

  Raises:
    KeyError: if a name in ``columns`` is not a column of ``data_frame``.
  """
  # Count the rows of the frame that was passed in rather than reading a
  # notebook-level global, so the function also works on other frames.
  n_rows = len(data_frame.index)
  class_weights = {}
  for idx, column in enumerate(columns):
    # Boolean mask instead of DataFrame.query(): also works for column names
    # that are not valid Python identifiers.
    positives = int((data_frame[column] == 1).sum())
    # Explicit zero check instead of a bare ``except`` around the division,
    # which would have hidden unrelated errors as well.
    class_weights[idx] = n_rows / positives if positives else 0

  return class_weights

# Compute the per-class weights and print them next to the class names
class_weights=analizar_pesos(data_frame,columns)
print(columns,class_weights)

         adptador     bandeja     bateria  ...      coldre    pendrive       spark
count  244.000000  244.000000  244.000000  ...  244.000000  244.000000  244.000000
mean     0.196721    0.258197    0.172131  ...    0.049180    0.135246    0.073770
std      0.398337    0.438542    0.378270  ...    0.216689    0.342689    0.261934
min      0.000000    0.000000    0.000000  ...    0.000000    0.000000    0.000000
25%      0.000000    0.000000    0.000000  ...    0.000000    0.000000    0.000000
50%      0.000000    0.000000    0.000000  ...    0.000000    0.000000    0.000000
75%      0.000000    1.000000    0.000000  ...    0.000000    0.000000    0.000000
max      1.000000    1.000000    1.000000  ...    1.000000    1.000000    1.000000

[8 rows x 9 columns]
244
['adptador' 'bandeja' 'bateria' 'cabo' 'carregador' 'cartucho' 'coldre'
 'pendrive' 'spark'] {0: 5.083333333333333, 1: 3.873015873015873, 2: 5.809523809523809, 3: 11.090909090909092, 4: 6.256410256410256, 5: 8.413793103448276, 

In [7]:
from keras_preprocessing.image import ImageDataGenerator
# Augmenting generator: multiplies pixel values by 1/255 and applies small
# random zooms, rotations, shifts and flips.
datagen =  ImageDataGenerator(
  rescale=1. / 255,
  zoom_range = 0.05, # random zoom
  rotation_range= 10,
  width_shift_range=0.1,  # horizontal shift
  height_shift_range=0.1,  # vertical shift
  horizontal_flip=True,
  vertical_flip=True,)

path="drive/My Drive/"
# subset='training' is requested, but no validation_split is passed to the
# generator above; the recorded run found all 244 rows of the frame.
img_iter = datagen.flow_from_dataframe(
    data_frame,
    shuffle=True,
    directory=path,
    x_col='filename',
    y_col=columns ,
    class_mode='other',  # 'other' for already-binarized label columns; 'categorical' when a single column holds a list of labels
    target_size=(128, 128),
    batch_size=24,
    subset='training'
)

# Number of augmented batches to materialise in memory
n_batches = 20
# Collect the batches first and concatenate once; calling np.concatenate
# inside the loop re-copied the growing arrays on every iteration.
batches = [img_iter.next() for _ in range(n_batches)]
X_train = np.concatenate([images for images, _ in batches])
y_train = np.concatenate([targets for _, targets in batches])

print(type(X_train),type(y_train),X_train.shape,y_train.shape)

Found 244 validated image filenames.
<class 'numpy.ndarray'> <class 'numpy.ndarray'> (460, 128, 128, 3) (460, 9)


In [8]:
from time import time
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
start= time()

# scikit-learn compatible wrapper; create_model_vgg16 is called with the
# parameters of each grid candidate.
model = KerasClassifier(build_fn=create_model_vgg16,epochs=20,
                        batch_size=16,verbose=1)

# Search space: 4 head sizes x 5 optimizers = 20 candidates
units=[(64,32),(128,32),(32,16),(128,64)]
optimizers = ['rmsprop', 'adam','adagrad','adadelta','nadam']
param_grid = dict(optimizer=optimizers,units=units)
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score() (for KerasClassifier that is the accuracy metric, not precision).
# `refit` was previously the string 'precision_m', which names no scorer in
# this single-metric setup and only acted as a truthy flag; True states the
# actual behaviour: refit the best candidate on all of X_train.
# NOTE(review): X_train holds 460 augmented samples drawn from 244 images, so
# variants of the same image can fall in both the train and the test part of
# a fold, which inflates the cross-validation scores.
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    return_train_score=True,
                    #scoring=['precision_macro','recall_macro','f1_macro'],
                    refit=True,
                    cv=3,
                    verbose=3)

grid_result = grid.fit(X_train, y_train)

# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

print("total time:",time()-start)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] optimizer=rmsprop, units=(64, 32) ...............................




Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.









Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[CV]  optimizer=rmsprop, units=(64, 32), score=(train=0.971, test=0.918), total=  25.8s
[CV] optimizer=rmsprop, units=(64, 32) ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.1s remaining:    0.0s


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[CV]  optimizer=rmsprop, units=(64, 32), score=(train=0.987, test=0.946), total=   9.5s
[CV] optimizer=rmsprop, units=(64, 32) ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   36.0s remaining:    0.0s


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[CV]  optimizer=rmsprop, units=(64, 32), score=(train=0.978, test=0.946), total=   9.2s
[CV] optimizer=rmsprop, units=(128, 32) ..............................
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[CV]  optimizer=rmsprop, units=(128, 32), score=(train=0.986, test=0.942), total=   9.5s
[CV] optimizer=rmsprop, units=(128, 32) ..............................
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 17.6min finished


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Best: 0.946881 using {'optimizer': 'adam', 'units': (128, 32)}
0.936272 (0.013098) with: {'optimizer': 'rmsprop', 'units': (64, 32)}
0.944209 (0.002218) with: {'optimizer': 'rmsprop', 'units': (128, 32)}
0.913767 (0.014826) with: {'optimizer': 'rmsprop', 'units': (32, 16)}
0.927509 (0.009712) with: {'optimizer': 'rmsprop', 'units': (128, 64)}
0.937701 (0.007233) with: {'optimizer': 'adam', 'units': (64, 32)}
0.946881 (0.006831) with: {'optimizer': 'adam', 'units': (128, 32)}
0.920284 (0.002733) with: {'optimizer': 'adam', 'units': (32, 16)}
0.945914 (0.007840) with: {'optimizer': 'adam', 'units': (128, 64)}
0.926816 (0.004071) with: {'optimizer': 'adagrad', 'units': (64, 32)}
0.934578 (0.012508) with: {'optimizer': 'adagrad', 'units': (128, 32)}
0.916691 (0.0

In [9]:
from sklearn.metrics import multilabel_confusion_matrix,classification_report

# Evaluation generator: only rescales pixel values, no augmentation
datagen_test =  ImageDataGenerator(
  rescale=1. / 255)

# NOTE(review): this iterator reads from the same data_frame that supplied the
# training batches, so the metrics computed from it are not held-out results.
img_iter_test = datagen_test.flow_from_dataframe(
    data_frame,
    directory=path,
    x_col='filename',
    y_col=columns,
    class_mode='other',
    target_size=(128, 128),
    batch_size=100
)
def prob_to_binary(predict,threshold):
  """Turn probabilities into 0/1 decisions by thresholding.

  Args:
    predict: array-like of probabilities (NumPy array or nested lists).
    threshold: values greater than or equal to this become 1, others 0.

  Returns:
    Integer NumPy array with the same shape as ``predict``.
  """
  # np.asarray lets plain (nested) lists through as well; comparing a list
  # with a number directly raises TypeError.
  return (np.asarray(predict) >= threshold).astype(int)

# Take a single batch from the evaluation iterator
X_test, y_test = img_iter_test.next()
print(X_test.shape, y_test.shape)

# Per-label probabilities from the grid-search estimator
proba = grid.predict_proba(X_test)
print(proba)

t = 0.3  # threshold value
y_pred_new = prob_to_binary(proba, t)
print(len(y_test), len(proba), len(y_pred_new))

cm = multilabel_confusion_matrix(y_test, y_pred_new)

# Short explanation of each column of the report (runtime text is in Portuguese)
for explanation in (
    "O recall é intuitivamente a capacidade do classificador encontrar todas as amostras positivas.\n",
    "A precisão é intuitivamente a capacidade do classificador não rotular como positiva uma amostra negativa.\n",
    "A pontuação F1 pode ser interpretada como uma média ponderada da precisão e recall.\n",
    "Support é a quantidade de ocorrencia da classe.\n",
):
  print(explanation)

print(classification_report(y_test, y_pred_new, target_names=mbl.classes_))

# One confusion matrix per label, in class order
for label, label_cm in zip(mbl.classes_, cm):
  print("Matrix confusão do(a) {}".format(label))
  print(label_cm, '\n')

Found 244 validated image filenames.
(100, 128, 128, 3) (100, 9)
[[1.07288361e-06 2.98023224e-07 2.08616257e-07 2.02953815e-05
  1.54078007e-05 2.38418579e-05 3.57627869e-07 1.22487545e-05
  2.42888927e-05]
 [1.30236149e-05 9.08374786e-05 8.45491886e-05 3.95774841e-05
  5.88774681e-04 9.93951023e-01 7.70390034e-05 4.05788422e-04
  8.13722610e-04]
 [2.68220901e-07 5.96046448e-07 9.23871994e-07 2.47359276e-05
  8.67247581e-06 1.14977360e-04 2.38418579e-07 3.38554382e-05
  2.38418579e-06]
 [8.40425491e-06 2.86102295e-06 7.86781311e-06 1.25676394e-04
  9.03606415e-05 1.44335628e-03 4.00543213e-05 1.71542168e-04
  1.11192465e-04]
 [4.10404801e-03 9.98603106e-01 1.15281343e-02 1.13715529e-02
  1.10040307e-02 1.31726265e-04 2.13146210e-04 1.31845474e-04
  7.21210241e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.34110451e-06
  5.96046448e-08 0.00000000e+00 0.00000000e+00 1.19209290e-07
  1.19209290e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 7.80820847e-06
  3.57627869e-07 2.98

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:

from sklearn.metrics import hamming_loss

# Hamming loss between the evaluation labels and the current thresholded
# predictions; as the last expression it is the value the cell displays.
hamming_loss(y_test,y_pred_new)

0.025555555555555557

In [18]:
# Train a fresh model with fixed hyper-parameters on the augmented training arrays.
# NOTE(review): units=(128,64) is not the combination the grid search reported
# as best ((128, 32) with adam) — confirm this choice is intentional.
# NOTE(review): class_weight is keyed by class index; with multi-hot targets
# Keras may not weight each label independently — confirm it has the intended effect.
model_vgg16=create_model_vgg16(num_classe=len(mbl.classes_),units=(128,64),optimizer='adam')
model_vgg16.fit(X_train,y_train,epochs=500,class_weight=class_weights,verbose=2)

Epoch 1/500
 - 8s - loss: 3.0233 - acc: 0.7952 - f1_m: 0.1847 - precision_m: 0.2539 - recall_m: 0.1711
Epoch 2/500
 - 0s - loss: 2.0604 - acc: 0.8698 - f1_m: 0.3654 - precision_m: 0.5778 - recall_m: 0.2707
Epoch 3/500
 - 0s - loss: 1.6253 - acc: 0.8884 - f1_m: 0.4733 - precision_m: 0.7030 - recall_m: 0.3663
Epoch 4/500
 - 0s - loss: 1.4378 - acc: 0.9065 - f1_m: 0.6118 - precision_m: 0.7169 - recall_m: 0.5431
Epoch 5/500
 - 1s - loss: 1.2583 - acc: 0.9097 - f1_m: 0.6152 - precision_m: 0.7607 - recall_m: 0.5271
Epoch 6/500
 - 0s - loss: 1.0717 - acc: 0.9176 - f1_m: 0.6800 - precision_m: 0.7471 - recall_m: 0.6367
Epoch 7/500
 - 0s - loss: 0.9280 - acc: 0.9345 - f1_m: 0.7417 - precision_m: 0.8231 - recall_m: 0.6892
Epoch 8/500
 - 0s - loss: 0.8063 - acc: 0.9403 - f1_m: 0.7739 - precision_m: 0.8179 - recall_m: 0.7430
Epoch 9/500
 - 0s - loss: 0.7228 - acc: 0.9440 - f1_m: 0.7934 - precision_m: 0.8368 - recall_m: 0.7625
Epoch 10/500
 - 0s - loss: 0.6023 - acc: 0.9551 - f1_m: 0.8303 - precisio

<keras.callbacks.History at 0x7f7d66141358>

In [19]:
# Per-label probabilities from the separately trained model
proba = model_vgg16.predict(X_test)

t = 0.37  # threshold value
y_pred_new = prob_to_binary(proba, t)
print(len(y_test), len(proba), len(y_pred_new))

cm = multilabel_confusion_matrix(y_test, y_pred_new)

# Short explanation of each column of the report (runtime text is in Portuguese)
for explanation in (
    "O recall é intuitivamente a capacidade do classificador encontrar todas as amostras positivas.\n",
    "A precisão é intuitivamente a capacidade do classificador não rotular como positiva uma amostra negativa.\n",
    "A pontuação F1 pode ser interpretada como uma média ponderada da precisão e recall.\n",
    "Support é a quantidade de ocorrencia da classe.\n",
):
  print(explanation)

print(classification_report(y_test, y_pred_new, target_names=mbl.classes_))

# One confusion matrix per label, in class order
for label, label_cm in zip(mbl.classes_, cm):
  print("Matrix confusão do(a) {}".format(label))
  print(label_cm, '\n')

# Last expression of the cell: displayed as its output value
hamming_loss(y_test, y_pred_new)

100 100 100
O recall é intuitivamente a capacidade do classificador encontrar todas as amostras positivas.

A precisão é intuitivamente a capacidade do classificador não rotular como positiva uma amostra negativa.

A pontuação F1 pode ser interpretada como uma média ponderada da precisão e recall.

Support é a quantidade de ocorrencia da classe.

              precision    recall  f1-score   support

    adptador       0.95      1.00      0.97        19
     bandeja       1.00      1.00      1.00        26
     bateria       1.00      1.00      1.00        17
        cabo       1.00      0.89      0.94         9
  carregador       1.00      0.88      0.93        16
    cartucho       0.91      0.83      0.87        12
      coldre       0.75      0.60      0.67         5
    pendrive       0.92      0.79      0.85        14
       spark       1.00      1.00      1.00         5

   micro avg       0.97      0.92      0.94       123
   macro avg       0.95      0.89      0.91       123
w

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.015555555555555555

In [16]:
# Persist the trained model: architecture as JSON, weights as HDF5.
# NOTE(review): this cell's execution count (16) is lower than those of the
# training and evaluation cells above it (18, 19), so the files written here
# may come from an earlier training run — re-run the notebook in order before
# relying on them.
dataset_path=path_name+'dataset/'
print(dataset_path)
# serialize model to JSON
model_json = model_vgg16.to_json()
with open(dataset_path+"model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model_vgg16.save_weights(dataset_path+"model.h5")
print("Saved model to disk")
# The class names are only printed here, not written to disk
print('mbl.classes_ ',mbl.classes_)

drive/My Drive/dataset/
Saved model to disk
mbl.classes_  ['adptador' 'bandeja' 'bateria' 'cabo' 'carregador' 'cartucho' 'coldre'
 'pendrive' 'spark']
