In [27]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import model_selection
from sklearn import linear_model as lm
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn import neighbors
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn import ensemble
from sklearn import tree
import pickle
from sklearn import decomposition


import os
os.environ['PATH'] = os.environ['PATH'] + ';C:/Program Files (x86)/Graphviz2.38/bin'

%store -r WORKDIR

#if 'WORKDIR' not in dir():
WORKDIR = 'C:/Users/cammy/OneDrive/MIT IA/git/projeto_dogs/dogs_brand'

pandas.set_option("display.precision", 2)

input_table = '../../Data/Raw/dogs.parquet'

model_score_file = WORKDIR + '/Data/Modeling/model_scores.parquet'
model_file = WORKDIR + '/Data/Modeling/trained_models.jbl'



target_variable = ['breed']

# Carga dos Dados

In [28]:
data = pandas.read_parquet(input_table)
print('data shape:', data.shape)

data shape: (159, 4097)


# Treino / Teste

In [29]:
X = data.drop(target_variable, axis = 1).values
Y = data[target_variable].values

In [30]:
classes = data.breed.unique()
classes

array(['affenpinscher', 'african', 'airedale', 'akita'], dtype=object)

In [31]:
Xtrain, Xtest, Ytrain, Ytest = model_selection.train_test_split(X,
                                                                Y,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=Y)

# Objeto de validacao cruzada
cvfold = model_selection.StratifiedKFold(n_splits = 3, random_state = 0)



# Random Forest

In [32]:
# Configure Pipeline
conf_train_pipe = [
    ('Scaler', StandardScaler()),
    ('PCA', decomposition.PCA(n_components=20)),
    ('Model', ensemble.RandomForestClassifier()),
]
model_pipe = pipeline.Pipeline(conf_train_pipe)

# Grid-Search
param_grid = {
    'Model__n_estimators': [10, 30, 40],
    'Model__max_depth': [4, 6, 8],
}
model_pipe = model_selection.GridSearchCV(model_pipe,
                                          param_grid,
                                          scoring=None,
                                          cv=cvfold)

model_pipe.fit(Xtrain, Ytrain.ravel())


decision_threshold = 0.5
Yhat = model_pipe.predict_proba(Xtest)[:,1]
Ypred= Yhat >= decision_threshold

print('Yhat.shape:', Yhat.shape)

Yhat.shape: (32,)


In [33]:
preds = model_pipe.predict(Xtest)
print("Accuracy:", metrics.accuracy_score(Ytest,preds))

Accuracy: 0.4375


In [34]:
preds = model_pipe.predict(Xtrain)
print("Accuracy:", metrics.accuracy_score(Ytrain,preds))

Accuracy: 0.7559055118110236


# Exportar modelo

In [35]:
with open(model_file, 'wb') as fid:
    pickle.dump(model_pipe, fid)