<a href="https://colab.research.google.com/github/angeruzzi/CompeticoesML/blob/main/4a_Flai_202106/submissao5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Bibliotecas

In [26]:
#Bibliotecas Base
import pandas as pd
import numpy as np

#Modelos
from sklearn.ensemble import GradientBoostingClassifier

#Tunagem
from sklearn.model_selection import GridSearchCV

#Validação
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score

#Dados

In [7]:
#Base de Dados
fonte_treino = 'https://raw.githubusercontent.com/angeruzzi/Datasource/main/DesafioDDS202106_treino.csv'

treino = pd.read_csv(fonte_treino)
treino['Cliente']  = treino['Cliente'].map({'Premium':1, 'Normal':0})

treinoX = treino.drop(['Cliente'], axis = 1) 
treinoy = treino['Cliente']

#TRANSFORMAÇÃO

In [8]:
def Transf(dados):
  dados.drop(['gender'], axis = 1, inplace = True)
  dados.drop(['Dependents'], axis = 1, inplace = True)
  dados.drop(['SeniorCitizen'], axis = 1, inplace = True)
  dados.drop(['PaperlessBilling'], axis = 1, inplace = True)  

  dados['Partner']          = dados['Partner'].map({'Yes':1, 'No':0})
  dados['PhoneService']     = dados['PhoneService'].map({'Yes':1, 'No':0})

  dados['MultipleLines']    = dados['MultipleLines'].map({'Yes':1, 'No':0, 'No phone service':0})
  
  dados['OnlineSecurity']   = dados['OnlineSecurity'].map({'Yes':1, 'No':0, 'No internet service':0})
  dados['OnlineBackup']     = dados['OnlineBackup'].map({'Yes':1, 'No':0, 'No internet service':0})
  dados['DeviceProtection'] = dados['DeviceProtection'].map({'Yes':1, 'No':0, 'No internet service':0})
  dados['TechSupport']      = dados['TechSupport'].map({'Yes':1, 'No':0, 'No internet service':0})
  dados['StreamingTV']      = dados['StreamingTV'].map({'Yes':1, 'No':0, 'No internet service':0})
  dados['StreamingMovies']  = dados['StreamingMovies'].map({'Yes':1, 'No':0, 'No internet service':0})

  dados['tenure']           = dados['tenure']/100
      
  variaveis_toDummie        = ['Contract', 'InternetService', 'PaymentMethod']

  dados = pd.get_dummies(dados, columns = variaveis_toDummie, drop_first = True)
  return dados 

In [9]:
treinoX = Transf(treinoX)

#Hipertunagem

In [22]:
def Tunagem(modelo, treino, targets, parametros, validacao, score):    
    search = GridSearchCV(modelo, param_grid = parametros,
                                  scoring = score, cv = validacao, 
                                  verbose = 1, n_jobs = -1)
    search.fit(treino, targets) 
    bestModel = search.best_estimator_
    bestScore = search.best_score_
    bestParam = search.best_params_

    return {
            'bestModel': bestModel,
            'bestScore': bestScore,
            'bestParam': bestParam
    }

In [28]:
validacao = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 20)
#validacao = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 1)

testeTunModels = [GradientBoostingClassifier()]
testeTunParams = [
                  {
                   'min_samples_split': [2, 5, 10],
                   'min_samples_leaf': [1, 3, 5],
                   'max_depth' : [2, 4, 6],
                   'n_estimators': [50, 100, 150],
                   'learning_rate': np.arange(0.1, 1.1, 0.1)
                  }
]

In [None]:
for i in range(len(testeTunModels)):
  ret = Tunagem(testeTunModels[i], treinoX, treinoy, testeTunParams[i], validacao, 'f1')
  print(ret)

#Validação de Modelos

In [12]:
def CompareML(X, y, lista_de_modelos, nome_dos_modelos, validacao):
  lista_de_medidas = ['accuracy', 'recall', 'precision', 'balanced_accuracy', 'f1']
  nome_das_medidas = ['acurácia', 'sensibilidade', 'precisão', 'eficiência', 'f1-score']
  resultados0 = {}
  
  for i in range(len(lista_de_modelos)):
      print('Rodando modelo: ' + nome_dos_modelos[i])
      accs_vc = cross_validate(lista_de_modelos[i], X, y, cv = validacao, scoring = lista_de_medidas)

      acc = accs_vc['test_accuracy'].mean()
      sen = accs_vc['test_recall'].mean()
      vpp = accs_vc['test_precision'].mean()
      bac = accs_vc['test_balanced_accuracy'].mean()
      f1s = accs_vc['test_f1'].mean()

      resultados0[nome_dos_modelos[i]] = [acc, sen, vpp, f1s, bac]
      
  resultados = pd.DataFrame(resultados0, index = nome_das_medidas).T
  return resultados

In [13]:
validacao = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 20)

#Comparação de modelos GBC
mod_gbc     = GradientBoostingClassifier()
mod_gbc50A  = GradientBoostingClassifier(learning_rate=0.2, max_depth=2, n_estimators=50)
mod_gbc50B  = GradientBoostingClassifier(learning_rate=0.4, max_depth=2, n_estimators=50)
mod_gbc100A = GradientBoostingClassifier(learning_rate=0.2, max_depth=2, n_estimators=100, min_samples_leaf=1, min_samples_split=2)
mod_gbc100B = GradientBoostingClassifier(learning_rate=0.4, max_depth=2, n_estimators=100)
mod_gbc200  = GradientBoostingClassifier(learning_rate=0.2, max_depth=2, n_estimators=200, min_samples_leaf=1, min_samples_split=2)

In [14]:
lista_de_modelos = [
                      mod_gbc     ,                    
                      mod_gbc50A  ,                    
                      mod_gbc50B  ,
                      mod_gbc100A ,
                      mod_gbc100B ,
                      mod_gbc200  
                    ]

nome_dos_modelos = [
                        'GBC',
                        'GBC-50-A' ,
                        'GBC-50-B' ,                    
                        'GBC-100-A',
                        'GBC-100-B',
                        'GBC-200'  
                    ]

In [15]:
resultados = CompareML(treinoX, treinoy, lista_de_modelos, nome_dos_modelos, validacao)
resultados.sort_values(by = 'f1-score', ascending = False)

Rodando modelo: GBC
Rodando modelo: GBC-50-A
Rodando modelo: GBC-50-B
Rodando modelo: GBC-100-A
Rodando modelo: GBC-100-B
Rodando modelo: GBC-200


Unnamed: 0,acurácia,sensibilidade,precisão,eficiência,f1-score
GBC-100-A,0.98704,0.985941,0.986642,0.986229,0.986983
GBC-50-B,0.98662,0.985642,0.986039,0.985786,0.986564
GBC-100-B,0.98656,0.985094,0.986456,0.985711,0.986481
GBC-200,0.986,0.984111,0.986244,0.985111,0.985895
GBC,0.98484,0.984326,0.983657,0.983911,0.984811
GBC-50-A,0.98408,0.984366,0.982022,0.983116,0.984097


In [17]:
dadosSelecionados = treinoX

In [18]:
modelo_decidido = mod_gbc100A
modelo_decidido.fit(dadosSelecionados,treinoy)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [21]:
#preditos Treino
preditos = modelo_decidido.predict(dadosSelecionados)
f1 = f1_score(treinoy, preditos)
f1

0.9949066213921902

#ARQUIVO SUBMISSÃO

In [None]:
fonte_saida  = 'https://raw.githubusercontent.com/angeruzzi/Datasource/main/DesafioDDS202106_teste.csv'
saida  = pd.read_csv(fonte_saida)

saida = Transf(saida)

pred = modelo_decidido.predict(saida)

subm = pd.DataFrame()
subm['Cliente'] = pred
subm.to_csv('submissao5.csv', index = False)

In [None]:
from google.colab import files
files.download("submissao5.csv")