In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob, os
import smartbee as sb

In [2]:
# Importing the dataset

'''O link em url possui os datasets do projeto, verificar qual se quer usar para passar como índice do datasets no read_csv
e qual pasta se colocará junto da url no get_files_paths!'''

base='https://raw.githubusercontent.com/rhanielmx/smartbee-files/master/Datasets/Bayer/'
url='https://github.com/rhanielmx/smartbee-files/tree/master/Datasets/'
ext=['csv']
datasets=sb.get_files_paths(url+'Bayer', ext)

#Colunas que vamos importar do dataset
#weight_col='hive_weight';temp_col='hive_temperature';humidity_col='hive_humidity';labels_col='KMeansLabel'
weight_col='Weight(lbs)';temp_hive_col='Temp-Hive(F)';temp_brood_col='Temp-Brood(F)';humidity_hive_col='HRH(%)';humidity_brood_col='BRH(%)';labels_col='Code_Label'

In [3]:
data = pd.read_csv(base+datasets[0][0],usecols=(weight_col, temp_hive_col, temp_brood_col, humidity_hive_col, humidity_brood_col, labels_col))
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [4]:
#Splitting the data into train, validate e test sets

last_col=data.shape[1]-1
train_set, validate_set, test_set = sb.separate_for_classes(data,train_size=0.6,validate_size=0.2,test_size=0.2)
X_train, y_train = train_set.iloc[:,0:last_col], train_set.iloc[:,last_col]
X_validate, y_validate = validate_set.iloc[:,0:last_col], validate_set.iloc[:,last_col]
X_test, y_test = test_set.iloc[:,0:last_col], test_set.iloc[:,last_col]

In [5]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_validate = sc.fit_transform(X_validate)
X_test = sc.transform(X_test)

In [6]:
# Fitting classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=96, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [7]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
sr = 100*(cm.diagonal().sum()/cm.sum())

In [9]:
n_trees = classifier.n_estimators
n_splits=int(round(np.sqrt(len(data.columns)-1)))
print('Parâmetros Utilizados:')
print(f'Número de Árvores = {n_trees} \tNúmero de características por divisão = {n_splits}\n')

print(f'Taxa de Acerto: {sr:0.2f}%')
print(f'Matriz de Confusão: \n {cm}')

Parâmetros Utilizados:
Número de Árvores = 96 	Número de características por divisão = 2

Taxa de Acerto: 74.29%
Matriz de Confusão: 
 [[ 0  1  0]
 [ 0 17  2]
 [ 0  6  9]]


In [10]:
from sklearn.model_selection import GridSearchCV

ntree = list(range(75,101))
max_feat = list(range(1,4))

param_grid = [dict(n_estimators = ntree, max_features  = max_feat)]

In [11]:
grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid.fit(X_validate,y_validate)



GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'n_estimators': [75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], 'max_features': [1, 2, 3]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [12]:
best_clf = grid.best_estimator_
best_clf.fit(sc.fit_transform(X),y)
opt_y_pred = best_clf.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix
new_cm = confusion_matrix(y_test, opt_y_pred)
new_sr = 100*(new_cm.diagonal().sum()/new_cm.sum())

In [14]:
n_trees = best_clf.n_estimators
n_feat = best_clf.max_features
print('Melhores paramêtros definidos pós-CV:')
print('Número de Árvores = {} \tNúmero de características por divisão = {}\n'.format(n_trees, n_feat))

print(f'Taxa de Acerto: {new_sr:0.2f}%')
print(f'Matriz de Confusão: \n {new_cm}')

Melhores paramêtros definidos pós-CV:
Número de Árvores = 76 	Número de características por divisão = 3

Taxa de Acerto: 88.57%
Matriz de Confusão: 
 [[ 1  0  0]
 [ 0 18  1]
 [ 0  3 12]]


In [15]:
Params={'max_features':grid.best_estimator_.get_params()['max_features'],'n_estimators': grid.best_estimator_.get_params()['n_estimators']}
grid_values = []

for i in range(0,len(grid.grid_scores_)):
    if Params == grid.grid_scores_[i][0]:
        best_mean, best_std = grid.grid_scores_[i][1], grid.grid_scores_[i][2].std()



In [16]:
f'Taxa de acerto = ({best_mean*100:0.2f} ± {best_std*200:0.2f})%'

'Taxa de acerto = (71.43 ± 32.82)%'