In [1]:
from utils.pre_processed import pre_processamento, potencia, alargamento, logaritmico, negativo
from utils.extraction import descritor_histograma, extract_features
from utils.utils import read_json, read_img, save
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier as mlp
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , precision_score, cohen_kappa_score
from tqdm.notebook import tqdm_notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from skimage.io import imshow, imread
from skimage.color import rgb2gray
import os

In [2]:
# Silence every warning for the rest of the session.
# Both calls below install a blanket 'ignore' filter, so the second one is redundant.
# NOTE(review): this also hides library warnings raised during model fitting.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# 1 Aquisição dos dados

In [3]:
# Path configuration; later cells read its 'metadata', 'img' and 'pickle' entries.
path = read_json('../paths.json')

In [4]:
# Parameter values for the intensity transforms (keys 'k', 'e', 'c' and 'gama').
params = read_json('params.json')

In [5]:
# Image metadata: one row per image with its file name ('image') and class ('label').
# The path is built by plain concatenation, so path['metadata'] must end with a separator.
df = pd.read_csv(path['metadata']+'metadataVarroa.csv')
df.head()

Unnamed: 0,image,label
0,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
1,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
2,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
3,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
4,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0


In [6]:
# 1 = infected
# 0 = not infected
# Class distribution of the full dataset.
df['label'].value_counts()

0    9562
1    3947
Name: label, dtype: int64

In [7]:
# Labels of the full dataset, in the same row order as df.
y = df['label'].to_numpy()

In [8]:
# Sanity check: one label per image.
y.shape

(13509,)

In [9]:
# Non-infected (label 0) images, ordered by file name and re-indexed 0..n-1
# so that positional and label-based slicing agree in the cells below.
dfSaf = df[df['label'] == 0].sort_values(by='image').reset_index(drop=True)
dfSaf

Unnamed: 0,image,label
0,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
1,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
2,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
3,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
4,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
...,...,...
9557,2017-10-17_16-41-10.mp4-bee_id_7359-16410-1.png,0
9558,2017-10-17_16-41-10.mp4-bee_id_7360-16440-1.png,0
9559,2017-10-17_16-41-10.mp4-bee_id_7361-16455-1.png,0
9560,2017-10-17_16-41-10.mp4-bee_id_7364-16545-1.png,0


In [10]:
# Infected (label 1) images, ordered by file name and re-indexed 0..n-1
# so that positional and label-based slicing agree in the cells below.
dfInf = df[df['label'] == 1].sort_values(by='image').reset_index(drop=True)
dfInf

Unnamed: 0,image,label
0,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
1,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
2,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
3,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
4,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
...,...,...
3942,2017-10-17_16-41-10.mp4-bee_id_7358-16335-1.png,1
3943,2017-10-17_16-41-10.mp4-bee_id_7362-16455-1.png,1
3944,2017-10-17_16-41-10.mp4-bee_id_7366-16575-1.png,1
3945,2017-10-17_16-41-10.mp4-bee_id_7367-16620-1.png,1


In [11]:
# Balanced subset: the same number of images from each class, merged,
# ordered by file name and re-indexed 0..n-1.
# NOTE(review): taking the *first* rows by file name is not a random sample
# (the subset only reaches the earlier recordings); consider a seeded
# DataFrame.sample if that bias matters.
N_PER_CLASS = 2000
dfBal = (
    pd.concat([dfSaf.head(N_PER_CLASS), dfInf.head(N_PER_CLASS)])
    .sort_values(by='image')
    .reset_index(drop=True)
)
dfBal

Unnamed: 0,image,label
0,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
1,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
2,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
3,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1
4,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0
...,...,...
3995,2017-09-20_19-24-55.mp4-bee_id_3601-21915-1.png,1
3996,2017-09-20_19-24-55.mp4-bee_id_3608-22185-1.png,1
3997,2017-09-20_19-24-55.mp4-bee_id_3616-22425-1.png,1
3998,2017-09-20_19-24-55.mp4-bee_id_3619-22530-1.png,1


In [12]:
# Labels of the balanced subset, in the same row order as dfBal.
yBal = dfBal['label'].to_numpy()
yBal.shape

(4000,)

# 2 Pré-processamento

In [13]:
# Show the parameter values used by the transforms below.
params

{'k': [5, 16, 20, 35],
 'e': [1.25, 1.55, 1.8, 2],
 'c': [9, 13, 30, 64],
 'gama': [0.57, 0.66, 0.84, 0.92]}

In [14]:
# Containers for the pre-processed image sets; the dicts are keyed by parameter setting.
imgsAl = {}
imgsLog = {}
imgsPot = {}
# Placeholder only: it is rebound to the result of pre_processamento in the 'negativo' cell.
imgsNeg = []

In [15]:
# 'alargamento' transform (presumably contrast stretching — confirm in utils.pre_processed).
# zip pairs k[i] with e[i], so this runs one setting per position, not the full k x e grid.
for _k,_e in tqdm_notebook(zip(params['k'], params['e']), total=len(params['k'])):
    imgsAl[f'k{_k}_e{_e}'] = pre_processamento(path['img'], dfBal, alargamento, [_k, _e])

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

In [16]:
# Confirm there is one entry per (k, e) pair.
imgsAl.keys()

dict_keys(['k5_e1.25', 'k16_e1.55', 'k20_e1.8', 'k35_e2'])

In [17]:
# Logarithmic transform: one pre-processed image set per value of c.
for _c in tqdm_notebook(params['c']):
    imgsLog[f'c{_c}'] = pre_processamento(path['img'], dfBal, logaritmico, [_c])

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

In [18]:
# Confirm there is one entry per value of c.
imgsLog.keys()

dict_keys(['c9', 'c13', 'c30', 'c64'])

In [19]:
# Power transform (presumably gamma correction — confirm in utils.pre_processed).
# zip pairs c[i] with gama[i], so this runs one setting per position, not the full c x gama grid.
for _c, _gama in tqdm_notebook(zip(params['c'], params['gama']), total=len(params['c'])):
    imgsPot[f'c{_c}_gama{_gama}'] = pre_processamento(path['img'], dfBal, potencia, [_c, _gama])

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

In [20]:
# Confirm there is one entry per (c, gama) pair.
imgsPot.keys()

dict_keys(['c9_gama0.57', 'c13_gama0.66', 'c30_gama0.84', 'c64_gama0.92'])

In [21]:
# Negative transform: takes no parameters, so a single image set is produced.
imgsNeg = pre_processamento(path['img'], dfBal, negativo)

  0%|          | 0/4000 [00:00<?, ?it/s]

# 3 Extração de características

In [22]:
# Persist the label vectors of the full and the balanced datasets with the project's save helper.
# The file names are built by plain concatenation, so path['pickle'] must end with a separator.
save(y, path['pickle']+'labels.pickle')
save(yBal, path['pickle']+'labels_balancead.pickle')

Saved!
Saved!


In [23]:
# Sanity check: label counts of the balanced and the full datasets.
yBal.shape, y.shape

((4000,), (13509,))

In [24]:
# Original (untransformed) images: features for every row of the full dataset.
# NOTE(review): op=0 presumably makes extract_features read the images from `path` — confirm in utils.extraction.
xOriginal = extract_features(df, path=path['img'], op=0)

  0%|          | 0/13509 [00:00<?, ?it/s]

In [25]:
# Persist the full-dataset features.
save(xOriginal, path['pickle']+'feats_original.pickle')

Saved!


In [26]:
# Same extraction on the untransformed images of the balanced subset, then persist it.
xBalancead = extract_features(dfBal, path=path['img'], op=0)
save(xBalancead, path['pickle']+'feats_original_balancead.pickle')

  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


In [27]:
# 'alargamento' images: extract features for each (k, e) image set and
# save one pickle per setting. The keys must match those built when imgsAl was filled.
xAlargamento = {}
for _k,_e in tqdm_notebook(zip(params['k'], params['e']), total=len(params['k'])):
    xAlargamento[f'k{_k}_e{_e}'] = extract_features(imgsAl[f'k{_k}_e{_e}'])
    save(xAlargamento[f'k{_k}_e{_e}'] ,path['pickle']+f'feats_alargamento_k{_k}_e{_e}.pickle')

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


In [28]:
# Logarithmic images: extract features for each value of c and save one pickle per setting.
xLogaritmo = {}
for _c in tqdm_notebook(params['c']):
    xLogaritmo[f'c{_c}'] = extract_features(imgsLog[f'c{_c}'])
    save(xLogaritmo[f'c{_c}'] , path['pickle']+f'feats_logaritmo_c{_c}.pickle')

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


In [29]:
# Power-transform images: extract features for each (c, gama) pair and save one pickle per setting.
xPotencia = {}
for _c, _gama in tqdm_notebook(zip(params['c'], params['gama']), total=len(params['c'])):
    xPotencia[f'c{_c}_gama{_gama}'] = extract_features(imgsPot[f'c{_c}_gama{_gama}'])
    save(xPotencia[f'c{_c}_gama{_gama}'], path['pickle']+f'feats_potencia_c{_c}_gama{_gama}.pickle')

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


In [30]:
# Negative images: extract features and persist them.
xNegativo = extract_features(imgsNeg)
save(xNegativo, path['pickle']+'feats_negative.pickle')

  0%|          | 0/4000 [00:00<?, ?it/s]

Saved!


## 3.1 Scatter plots

In [31]:
# Names of the six feature columns, defined once so both tables stay in sync.
# NOTE(review): assumed to match the column order produced by extract_features — confirm.
FEATURE_COLUMNS = ['media', 'variancia', 'skewness', 'entropia', 'energia', 'kurtosis']

# Feature tables for the full and the balanced datasets.
dfFeats = pd.DataFrame(xOriginal, columns=FEATURE_COLUMNS)
dfFeatsBal = pd.DataFrame(xBalancead, columns=FEATURE_COLUMNS)

In [32]:
# Attach the feature columns to the metadata of each dataset.
# concat(axis=1) aligns rows on the index, so each pair of frames must share the same 0..n-1 index.
dfNew = pd.concat([df, dfFeats], axis=1)
dfNewBal = pd.concat([dfBal,dfFeatsBal], axis=1)

In [33]:
# Inspect the balanced metadata + feature table.
dfNewBal

Unnamed: 0,image,label,media,variancia,skewness,entropia,energia,kurtosis
0,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0,212.322275,93370.445902,1.887126,4.520240,29213202,2.930613
1,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1,199.111111,51936.969877,0.993701,4.773233,20605996,-0.460175
2,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0,224.000000,78013.570000,1.429383,4.610245,25637914,1.007680
3,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,1,195.633188,90620.415667,2.581699,4.609847,29516442,6.669476
4,2017-08-28_09-30-00-1_500_dirty_glass.mp4-bee_...,0,201.801802,93535.600357,1.946727,4.525004,29805624,3.012690
...,...,...,...,...,...,...,...,...
3995,2017-09-20_19-24-55.mp4-bee_id_3601-21915-1.png,1,208.372093,97318.903407,3.290610,4.703474,30258634,11.243058
3996,2017-09-20_19-24-55.mp4-bee_id_3608-22185-1.png,1,193.939394,46946.507149,1.949301,4.907691,19533128,4.741169
3997,2017-09-20_19-24-55.mp4-bee_id_3616-22425-1.png,1,202.714932,117723.597469,2.553607,4.452596,35098544,6.331540
3998,2017-09-20_19-24-55.mp4-bee_id_3619-22530-1.png,1,202.714932,36480.249053,1.036624,4.978020,17143764,0.075920


# 4 Classificação

In [139]:
def classification(X, y, random_state=None):
    """
    Train an MLP classifier on a hold-out split and return its test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix; each row holds the attributes computed by the
        descriptor for one image.
    y : array-like
        Class label of each image, e.g. [0, 1, 0, 0, 1, 2, 2, 0, 0, 0, ..., 1].
    random_state : int or None, optional
        Seed forwarded to both the train/test split and the classifier.
        The default (None) keeps the original unseeded behaviour; pass an
        int to make the reported score reproducible.

    Returns
    -------
    float
        Accuracy measured on the held-out 20% of the samples.
    """
    # Shuffled 80/20 hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, shuffle=True, random_state=random_state
    )

    # Fit the classifier with its default hyper-parameters and predict the test set.
    clf = mlp(random_state=random_state)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Only accuracy is reported. Kappa and precision used to be computed here
    # and thrown away; precision_score with its default binary averaging also
    # raises on multiclass labels such as the example in the docstring.
    return accuracy_score(y_test, pred)

In [140]:
# Test accuracy of each experiment, keyed by image variant.
result = {}

In [None]:
# Baselines: untransformed images (full and balanced datasets) and the negative transform.
# Note: the full dataset is about 71% class 0 (9562 of 13509 images), so its accuracy
# is not directly comparable with the runs on the balanced subset.
result['original'] = classification(xOriginal,y)
result['original_balanceamento'] = classification(xBalancead, yBal)
result['negativo'] = classification(xNegativo,yBal)

In [None]:
# One run per (k, e) setting of the 'alargamento' features, on the balanced labels.
for _k,_e in tqdm_notebook(zip(params['k'], params['e']), total=len(params['k'])):
    result[f'alargamento_k{_k}_e{_e}'] = classification(xAlargamento[f'k{_k}_e{_e}'],yBal)

In [None]:
# One run per value of c of the logarithmic features, on the balanced labels.
for _c in tqdm_notebook(params['c']):
    result[f'Logaritmo_c{_c}'] = classification(xLogaritmo[f'c{_c}'],yBal)

In [None]:
# One run per (c, gama) setting of the power-transform features, on the balanced labels.
for _c, _gama in tqdm_notebook(zip(params['c'], params['gama']), total=len(params['c'])):
    result[f'potencia_c{_c}_gama{_gama}'] = classification(xPotencia[f'c{_c}_gama{_gama}'],yBal)

In [None]:
# Summary table: one row per experiment with its image variant and test accuracy.
# Building it from (key, value) pairs keeps 'acuracia' numeric; transposing
# [keys, values] left both columns as object dtype.
dfRes = pd.DataFrame(list(result.items()), columns=['tipo_imagem', 'acuracia'])
dfRes