## Teste 2
##### Agora vamos fazer testes e validações mais rigorosas dos 3 melhores modelos

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from warnings import simplefilter

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Load the semicolon-separated, UTF-16 encoded dataset and show its dimensions.
df = pd.read_csv(
    'datasets/dataset_cler.csv',
    sep=";",
    encoding='utf-16',
)
df.shape

(5802, 412)

In [5]:
# Keep only the rows whose status is 'Loss' or 'Gain'; everything else is
# removed from `df` in place. Then show the class balance.
is_labelled = df['status'].isin(['Loss', 'Gain'])
df.drop(df.index[~is_labelled].values, inplace=True)
df['status'].value_counts()

Gain    3787
Loss    2015
Name: status, dtype: int64

In [6]:
# Target label and predictor matrix (everything except the dropped columns).
y = df['status']
X = df.drop(columns=['take', 'status', 'oper'])  # df[train_features]

# Remember the feature names.
x_columns = X.columns

In [8]:
# Integer-encode every object-typed (categorical) feature column.
# A dedicated LabelEncoder is fitted and kept per column: re-fitting a single
# shared encoder would discard the category mapping of every column but the
# last, making it impossible to decode the integer codes afterwards.
columns_categorical = X.select_dtypes(include=['object']).columns
encoders = {}
for col_cat in columns_categorical:
    encoder = LabelEncoder()
    X[col_cat] = encoder.fit_transform(X[col_cat])
    encoders[col_cat] = encoder  # retained for inverse_transform()

In [32]:
# Min-max scale every feature. fit_transform() already fits the scaler, so a
# separate fit() call beforehand is redundant.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the scaling — consider fitting on the
# training split only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [33]:
# Hold out 30% of the rows for testing. `random_state` is an integer seed;
# a boolean True is only accepted because bool is an int subclass and would be
# used as seed 1, so the seed is written explicitly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

## Training

#### GradientBoostingClassifier

In [24]:
# Baseline: 5-fold cross-validation of a default GradientBoostingClassifier.
# The estimator is seeded so the fold scores are reproducible between runs.
gbc = GradientBoostingClassifier(random_state=0)
scores = cross_val_score(gbc, X, y, cv=5)
scores, scores.mean()

(array([0.83979328, 0.84926787, 0.84827586, 0.86551724, 0.83534483]),
 0.8476398170423833)

#### MLPClassifier

In [25]:
# Baseline: 5-fold cross-validation of a default MLPClassifier.
# Weight initialisation and batch shuffling are random, so the estimator is
# seeded to make the fold scores reproducible between runs.
dl = MLPClassifier(random_state=0)
scores = cross_val_score(dl, X, y, cv=5)
scores, scores.mean()

(array([0.86218777, 0.86821705, 0.86724138, 0.85258621, 0.84482759]),
 0.8590119991683745)

#### RandomForestClassifier

In [26]:
# Baseline: 5-fold cross-validation of a default RandomForestClassifier.
# Bootstrapping and feature sub-sampling are random, so the estimator is
# seeded to make the fold scores reproducible between runs.
rfc = RandomForestClassifier(random_state=0)
scores = cross_val_score(rfc, X, y, cv=5)
scores, scores.mean()

(array([0.73901809, 0.79242033, 0.7637931 , 0.77844828, 0.77068966]),
 0.7688738899284209)

## Otimizando o modelo GBC

In [29]:
def optimize(n_estimators, learning_rate, min_samples_split, min_samples_leaf, max_depth, 
             max_features, subsample, params, cv = 5):
    """Grid-search GradientBoostingClassifier hyper-parameters by ROC AUC.

    A template classifier is built from the fixed arguments; GridSearchCV then
    overrides the entries named in ``params`` for every candidate, so the
    hyper-parameter currently being tuned may be passed as a ``None``
    placeholder. The search is fitted on the notebook-level ``X_train`` and
    ``y_train``, which must already exist when this function is called.

    Parameters
    ----------
    n_estimators, learning_rate, min_samples_split, min_samples_leaf,
    max_depth, max_features, subsample
        Values forwarded unchanged to ``GradientBoostingClassifier``.
    params : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    cv : int, default 5
        Cross-validation setting handed to ``GridSearchCV``.

    Returns
    -------
    gbc : GradientBoostingClassifier
        The *unfitted* template estimator, updated with the best parameters
        found by the search.
    best_params : dict
        ``GridSearchCV.best_params_``.
    best_score : float
        ``GridSearchCV.best_score_`` (mean cross-validated ROC AUC).
    """
    import inspect

    np.random.seed(0)

    gbc = GradientBoostingClassifier(n_estimators = n_estimators,
                                     learning_rate = learning_rate,
                                     min_samples_split = min_samples_split,
                                     min_samples_leaf = min_samples_leaf,
                                     max_depth = max_depth,
                                     max_features = max_features,
                                     subsample = subsample,
                                     random_state = 0)

    search_kwargs = dict(estimator = gbc, param_grid = params, scoring = 'roc_auc',
                         n_jobs = -1, cv = cv)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises a TypeError. Only forward it to versions that still
    # accept it, so the unweighted fold average is used everywhere.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    grid_search = GridSearchCV(**search_kwargs)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(best_params, best_score)

    # GridSearchCV works on clones, so the template still holds the caller's
    # placeholder values; apply the winning values so the returned estimator
    # is a usable configuration.
    gbc.set_params(**best_params)

    return gbc, best_params, best_score

In [35]:
# Containers that accumulate the results of each optimisation round
models = np.empty(0)   # estimators, one per round
opt_params = {}        # best hyper-parameters found so far
scores = np.empty(0)   # best cross-validation score of each round

In [30]:
# MODEL 0: tune the number of boosting stages (n_estimators)
params = {'n_estimators': range(50, 151, 10)}
n_estimators = None  # placeholder; the value comes from the grid above

# Fixed settings for this round
learning_rate = 0.1
max_depth = 8
min_samples_split = 250
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8

In [34]:
# Run the grid search for MODEL 0 (optimize prints the best parameters and score)
gbc, opt_param, score = optimize(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
    max_features=max_features,
    subsample=subsample,
    params=params,
)

{'n_estimators': 100} 0.8568400598118184


In [36]:
# Record this round's estimator, best parameters and best score
models = np.append(models, gbc)
opt_params = dict(opt_params, **opt_param)
scores = np.append(scores, score)

In [37]:
# MODEL 1: tune max_depth and min_samples_split
params = {
    'max_depth': range(3, 12, 2),
    'min_samples_split': range(150, 401, 50),
}
max_depth = None          # placeholder; searched through `params`
min_samples_split = None  # placeholder; searched through `params`

# Carried over from the previous round / kept fixed
n_estimators = opt_params['n_estimators']
min_samples_leaf = 20
max_features = 'sqrt'
subsample = 0.8

In [38]:
# Run the grid search for MODEL 1 (optimize prints the best parameters and score)
gbc, opt_param, score = optimize(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
    max_features=max_features,
    subsample=subsample,
    params=params,
)

{'max_depth': 9, 'min_samples_split': 150} 0.8641101606535772


In [39]:
# Record this round's estimator, best parameters and best score
models = np.append(models, gbc)
opt_params = dict(opt_params, **opt_param)
scores = np.append(scores, score)

In [40]:
# MODEL 2: tune min_samples_leaf
params = {'min_samples_leaf': range(25, 61, 5)}
min_samples_leaf = None  # placeholder; searched through `params`

# Carried over from the previous round / kept fixed
max_depth, min_samples_split = opt_params['max_depth'], opt_params['min_samples_split']
max_features = 'sqrt'
subsample = 0.8

In [41]:
# Run the grid search for MODEL 2 (optimize prints the best parameters and score)
gbc, opt_param, score = optimize(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
    max_features=max_features,
    subsample=subsample,
    params=params,
)

{'min_samples_leaf': 25} 0.8686614133161905


In [42]:
# Record this round's estimator, best parameters and best score
models = np.append(models, gbc)
opt_params = dict(opt_params, **opt_param)
scores = np.append(scores, score)

In [43]:
# MODEL 3: tune max_features
params = {'max_features': range(21, 31, 1)}
max_features = None  # placeholder; searched through `params`

# Carried over from the previous round / kept fixed
min_samples_leaf = opt_params['min_samples_leaf']
subsample = 0.8

In [44]:
# Run the grid search for MODEL 3 (optimize prints the best parameters and score)
gbc, opt_param, score = optimize(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
    max_features=max_features,
    subsample=subsample,
    params=params,
)

{'max_features': 27} 0.8828589389246367


In [45]:
# Record this round's estimator, best parameters and best score
models = np.append(models, gbc)
opt_params = dict(opt_params, **opt_param)
scores = np.append(scores, score)

In [46]:
# Display the best hyper-parameters accumulated so far
opt_params

{'n_estimators': 100,
 'max_depth': 9,
 'min_samples_split': 150,
 'min_samples_leaf': 25,
 'max_features': 27}

In [47]:
# Tune subsample
max_features = opt_params['max_features']  # best value from the previous round
subsample = None  # placeholder; searched through `params`
# Nine evenly spaced candidates from 0.6 to 1.0 (step 0.05). linspace is used
# instead of a float-step arange + append because arange's element count is
# sensitive to floating-point rounding when the step is not an integer.
params = {'subsample': np.linspace(0.6, 1.0, 9)}

In [48]:
# Run the grid search over subsample (optimize prints the best parameters and score)
gbc, opt_param, score = optimize(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
    max_features=max_features,
    subsample=subsample,
    params=params,
)

{'subsample': 1.0} 0.8929108283977165


In [49]:
# Record this round's estimator, best parameters and best score
models = np.append(models, gbc)
opt_params = dict(opt_params, **opt_param)
scores = np.append(scores, score)

In [50]:
# Display the final set of best hyper-parameters
opt_params

{'n_estimators': 100,
 'max_depth': 9,
 'min_samples_split': 150,
 'min_samples_leaf': 25,
 'max_features': 27,
 'subsample': 1.0}