In [2]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
from collections import Counter
from src.databases import Postgresql 
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier


import warnings
warnings.filterwarnings("ignore")

In [None]:
def random_RF(X,y,n_iter):
    
    print('> Procurando os melhores parametros...')
    n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 10)]
    max_features = ['auto', 'sqrt','log2']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    criterion = ['gini','entropy']
    min_samples_split = [2, 5, 10, 12]
    min_samples_leaf = [1, 2, 4, 6, 8]
    bootstrap = [True, False]
    
    random_grid = {'max_features':max_features,
                   'max_depth': max_depth,
                   'min_sample_split': min_sample_split,
                   'min_samples_leaf': min_sample_leaf,
                   'criterion': criterion,
                   'bootstrap': bootstrap}
    
    random_state = 2
    rfc = RandomForestClassifier(n_jobs=-1)
    rf_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, 
                                   n_iter=n_iter, cv=5, verbose=0, random_state=random_state, 
                                   n_jobs=-1, scoring={'AUC':'roc_auc'},refit='AUC')

    # Fit the random search model
    print('> Fitting Modelo...')
    rf_random.fit(X, y)
  
    return rf_random.best_estimator_, rf_random.cv_results_, rf_random.best_params_

In [22]:
# Processo de treinamento.

query = """ SELECT * FROM dataset_final;"""
bd = Postgresql(user='postgres' , password='bruno22#' , host= 'localhost', port= '5432', database = 'brunods')
datasetv1= bd.retrieve_data(query=query)
datasetv1.set_index('sku', inplace=True)
            

    
#divisão dos dados
X= datasetv1.drop('rating', axis=1)
Y= datasetv1['rating']


k_best = SelectKBest(score_func=f_classif, k=18)
selected  = k_best.fit(X,Y)
index = selected.get_support(indices=True)


X1 = X.iloc[:,index]
Y1 = Y


#Divisão entre treino e teste
X_train, X_test, y_train, y_test = train_test_split(X1,Y1,test_size = 0.2, random_state=0)

# Balanciando o dataset
x1, y1 = SMOTE().fit_resample(X_train, y_train)
print('Balanciamento realizado: ', sorted(Counter(y1).items()))

#trained_model, results, params=random_RF(x1,y1,10)  


#trained_model = LogisticRegression(solver = 'liblinear', penalty = 'l1', C = 0.14)  #0.14
trained_model = RandomForestClassifier(n_estimators = 5000, max_features = 'sqrt', criterion = 'entropy', 
                                        bootstrap = True, min_samples_split = 10 , max_depth = 50 , min_samples_leaf= 10)


resultado = []
kf = RepeatedKFold(n_splits = 2, n_repeats=10, random_state=1)

for linhas_treino, linhas_valid in kf.split(x1):

    X_treino, X_valid = x1.iloc[linhas_treino], x1.iloc[linhas_valid]
    y_treino, y_valid = y1.iloc[linhas_treino], y1.iloc[linhas_valid]


    trained_model.fit(X_treino,y_treino)
    p= trained_model.predict(X_valid)
    acc = np.mean(y_valid==p)
    resultado.append(acc)
print(f'ROCAUC com crossvalidação de {roc_auc_score(y_valid,p)}')

#Em dados de teste
trained_model.fit(x1,y1)
p1 = trained_model.predict(X_test)
print(f'ROCAUC em dados de teste de {roc_auc_score(y_test,p1)}')

import pickle
file = 'model_v4.sav'
pickle.dump(trained_model, open(file,'wb'))

Buscando os dados!!!
Conexão com Postgresql fechada
Balanciamento realizado:  [('0', 333), ('1', 333)]
ROCAUC com crossvalidação de 0.9070774114421759
ROCAUC em dados de teste de 0.48214285714285715


In [23]:
print(classification_report(p1,y_test))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94        89
           1       0.00      0.00      0.00         3

    accuracy                           0.88        92
   macro avg       0.48      0.46      0.47        92
weighted avg       0.93      0.88      0.91        92



In [24]:
proba = trained_model.predict_proba(X_test)
proba

data = np.column_stack((y_test, proba))
df = pd.DataFrame(data, columns = ['real', 'classe_0', 'classe_1'])
df.loc[df['classe_1'] > 0.9,:]

Unnamed: 0,real,classe_0,classe_1


In [None]:
# variaveis
import pickle
potencial_de_guarda = int(input('Potencial de Guarda:'))
inox = int(input('Inox:'))
franca = int(input('França:'))
chile = int(input('Chile:'))
chardonnay = int(input('Chardonnay:'))

#transformar as variaveis em DF
data = np.column_stack((potencial_de_guarda,inox, franca, chile, chardonnay))
df = pd.DataFrame(data, columns = ['potencial_de_guarda','inox', 'franca', 'chile', 'chardonnay'])

#Load do modelo
model = pickle.load(open('model_v1.sav','rb'))
result = model.predict(df)

# 1.Rating com mais categorias

In [None]:
#df_final
query = """ SELECT * FROM dataset_finalv1;"""
bd = Postgresql(user='postgres' , password='bruno22#' , host= 'localhost', port= '5432', database = 'brunods')
datasetv1= bd.retrieve_data(query=query)

#rating
query1 = """SELECT sku, CASE WHEN RATING = 3.5 THEN 3
            WHEN RATING = 1.5 THEN 1
            WHEN RATING = 2.5 THEN 2
            WHEN RATING = 4.5 THEN 4
            WHEN RATING = 1 THEN 1
            WHEN RATING = 2 THEN 2
            WHEN RATING = 3 THEN 3
            WHEN RATING = 4 THEN 4
            WHEN RATING = 5 THEN 5
        END AS RATING
FROM rating;"""
bd = Postgresql(user='postgres' , password='bruno22#' , host= 'localhost', port= '5432', database = 'brunods')
rating= bd.retrieve_data(query=query1)

In [None]:
datasetv1.head()

In [None]:
rating[rating.sku == 12815]

In [None]:
def check_len(x):
    if len(x)!=6:
        return x
    else:
        return 'filter'

rating['mask'] = rating['sku'].apply(check_len)

In [None]:
rating_new = rating[rating['mask'] != 'filter']
rating_new['sku'] = rating_new.sku.astype('int64')
rating_new.drop('mask', inplace=True, axis=1)

In [None]:
dataset = pd.merge(datasetv1,rating_new, left_on='sku', right_on='sku', how='left')

In [None]:
dataset.head()