In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob, os
import smartbee as sb

In [2]:
# Importing the dataset

'''O link em url possui os datasets do projeto, verificar qual se quer usar para passar como índice do datasets no read_csv
e qual pasta se colocará junto da url no get_files_paths!'''

base='https://raw.githubusercontent.com/rhanielmx/smartbee-files/master/Datasets/Bayer/'
url='https://github.com/rhanielmx/smartbee-files/tree/master/Datasets/'
ext=['csv']
datasets=sb.get_files_paths(url+'Bayer', ext)

#Colunas que vamos importar do dataset
#weight_col='hive_weight';temp_col='hive_temperature';humidity_col='hive_humidity';labels_col='KMeansLabel'
weight_col='Weight(lbs)';temp_hive_col='Temp-Hive(F)';temp_brood_col='Temp-Brood(F)';humidity_hive_col='HRH(%)';humidity_brood_col='BRH(%)';labels_col='Code_Label'

In [3]:
data = pd.read_csv(base+datasets[0][0],usecols=(weight_col, temp_hive_col, temp_brood_col, humidity_hive_col, humidity_brood_col, labels_col))
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [27]:
data[data.Code_Label==1]

Unnamed: 0,Temp-Brood(F),Temp-Hive(F),BRH(%),HRH(%),Weight(lbs),Code_Label
36,87.8,81.2,55.9,55.8,48.8,1.0
37,88.4,85.4,59.9,60.3,51.6,1.0
38,81.5,76.2,57.4,57.8,52.6,1.0
39,77.9,73.1,57.1,58.3,51.7,1.0
40,80.6,76.4,54.0,54.9,51.0,1.0
41,84.8,81.9,53.9,53.8,50.4,1.0


In [4]:
#Splitting the data into train, validate e test sets

last_col=data.shape[1]-1
train_set, validate_set, test_set = sb.separate_for_classes(data,train_size=0.6,validate_size=0.2,test_size=0.2)
X_train, y_train = train_set.iloc[:,0:last_col], train_set.iloc[:,last_col]
X_validate, y_validate = validate_set.iloc[:,0:last_col], validate_set.iloc[:,last_col]
X_test, y_test = test_set.iloc[:,0:last_col], test_set.iloc[:,last_col]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
training_set=pd.DataFrame(X_train)
training_set.columns=data.columns[0:5]
training_set=training_set.assign(Code=y_train)
test_set=pd.DataFrame(X_test)
test_set.columns=data.columns[0:5]
test_set=test_set.assign(Code=y_test)

In [7]:
training_set.to_csv('training_HT101.csv',index=False)
test_set.to_csv('test_HT101.csv',index=False)

In [8]:
data.columns

Index(['Temp-Brood(F)', 'Temp-Hive(F)', 'BRH(%)', 'HRH(%)', 'Weight(lbs)',
       'Code_Label'],
      dtype='object')

In [9]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_validate = sc.fit_transform(X_validate)
X_test = sc.transform(X_test)

In [10]:
#Ajustando o classificador com o conjunto de treino
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')

In [11]:
#Fazendo as predições com o conjunto de teste
y_pred = classifier.predict(X_test)

In [12]:
#Fazendo a Matriz de Confusão
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
sr = 100*(cm.diagonal().sum()/cm.sum())

In [13]:
n_neighbors = classifier.n_neighbors
print('Parâmetros Utilizados:')
print(f'Número de Vizinhos = {n_neighbors}\n')

print(f'Taxa de Acerto: {sr:0.2f}%')
print(f'Matriz de Confusão: \n {cm}')

Parâmetros Utilizados:
Número de Vizinhos = 2

Taxa de Acerto: 73.33%
Matriz de Confusão: 
 [[ 2  0  0]
 [ 2 28  3]
 [ 1 10 14]]


In [14]:
from sklearn.model_selection import GridSearchCV

k_range = list(range(1,16))
metric = ['minkowski']
p_range = list(range(1,4))

param_grid = [dict(n_neighbors = k_range, metric = metric, p = p_range)]

In [15]:
grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')
grid.fit(X_validate,y_validate);



In [16]:
best_clf = grid.best_estimator_
best_clf.fit(sc.transform(X),y)
opt_y_pred = best_clf.predict(X_test)

In [17]:
# Making the New Confusion Matrix
from sklearn.metrics import confusion_matrix
new_cm = confusion_matrix(y_test, opt_y_pred)
new_sr = 100*(new_cm.diagonal().sum()/new_cm.sum())

In [18]:
n_neighbors = best_clf.n_neighbors
print('Melhores paramêtros definidos pós-CV')
print(f'Número de Vizinhos = {n_neighbors}\n')

print(f'Taxa de Acerto: {new_sr:0.2f}%')
print(f'Matriz de Confusão: \n {new_cm}')

Melhores paramêtros definidos pós-CV
Número de Vizinhos = 3

Taxa de Acerto: 91.67%
Matriz de Confusão: 
 [[ 2  0  0]
 [ 1 29  3]
 [ 0  1 24]]


In [19]:
'''
from sklearn.externals import joblib
joblib.dump(best_clf,'knn.cls')
''';

In [20]:
Params={'metric':grid.best_estimator_.get_params()['metric'],'n_neighbors': grid.best_estimator_.get_params()['n_neighbors'],'p': grid.best_estimator_.get_params()['p']}
grid_values = []

for i in range(0,len(grid.grid_scores_)):
    if Params == grid.grid_scores_[i][0]:
        best_mean, best_std = grid.grid_scores_[i][1], grid.grid_scores_[i][2].std()



In [21]:
for _ in range(0,len(grid.grid_scores_)):
    print(grid.grid_scores_[i][0])

{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 3}
{'metric': 'minkowski', 'n_neig



In [22]:
f'Taxa de acerto = ({best_mean*100:0.2f} ± {best_std*200:0.2f})%'

'Taxa de acerto = (77.14 ± 38.62)%'

In [23]:
means = []
for i in range(0,len(grid.grid_scores_)):
    for k in k_range:
        Params={'metric':grid.best_estimator_.get_params()['metric'],'n_neighbors': k,'p': grid.best_estimator_.get_params()['p']}
        if Params == grid.grid_scores_[i][0]:
            means.append(grid.grid_scores_[i][1])







