<a href="https://colab.research.google.com/github/ascle/colab_tcc2/blob/dev/6_melhorando_modelo_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importando bibliotecas

In [13]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Outras configurações<hr style='border: 2px solid red;'>

In [14]:
pd.set_option('display.max_columns', None)
SEED = 7
np.random.seed(SEED)

### Conectando no ambiente que será utilizado

In [15]:
# Importando do Drive
from google.colab import drive
from os import chdir

drive.mount('/content/drive', force_remount=True)
chdir('/content/drive/MyDrive/UFS/tcc-2-ascle-ufs/bases/linkage_marco/')

Mounted at /content/drive


### Leitura dos dados <hr style='border: 2px solid red;'>

In [16]:
data_raw = pd.read_csv('dn_etl.csv', sep=';', low_memory=False)
data_raw.sort_index(axis=1, inplace=True)

In [17]:
data_raw.shape

(192484, 24)

In [18]:
data_raw.head()

Unnamed: 0,ANO,APGAR1,APGAR5,CODANOMAL_QTD,CONSPRENAT,ESCMAEETL_VALOR,ESTCIVMAE_COMPANHEIRO,GRAVIDEZ_VALOR,LOCNASC_DOMICILIO,LOCNASC_ESTAB_SAUDE,LOCNASC_HOSPITAL,LOCNASC_OUTROS,OBT_NEONATAL,PARTO_CESAREO,PARTO_VAGINALL,PESO,QTDFILMORT,QTDPARTCES,SEMAGESTAC,SEXO_FEM,SEXO_MASC,TPAPRESENT_CEFALICO,TPAPRESENT_PELVICO,TPAPRESENT_TRANSVERSA
0,2015,2.0,4.0,0.0,5.0,1,0,1,0,0,1,0,0,1,0,820.0,1.0,,26.0,0,1,1,0,0
1,2015,9.0,10.0,0.0,8.0,4,1,1,0,0,1,0,0,1,0,4200.0,1.0,,39.0,0,1,1,0,0
2,2015,7.0,8.0,0.0,10.0,11,1,1,0,0,1,0,0,0,1,3700.0,0.0,0.0,40.0,1,0,1,0,0
3,2015,7.0,8.0,0.0,7.0,4,0,1,0,0,1,0,0,0,1,3550.0,1.0,0.0,37.0,0,1,1,0,0
4,2015,9.0,10.0,0.0,3.0,11,0,1,0,0,1,0,0,1,0,3800.0,0.0,,39.0,1,0,1,0,0


## Dados X e y<hr style='border: 2px solid red;'>

In [19]:
X_raw= data_raw.drop(columns=['QTDPARTCES', 'ANO', 'OBT_NEONATAL'])
y_raw= data_raw["OBT_NEONATAL"]

## Dados desbalanceados<hr style='border: 2px solid red;'>

In [20]:
from imblearn.over_sampling import SMOTE

In [21]:
smt = SMOTE(random_state=SEED)
X_bal, y_bal = smt.fit_resample(X_raw, y_raw)

##Dados de treino e de teste<hr style='border: 2px solid red;'>

In [22]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X_bal, y_bal, test_size =0.3,
                                                        random_state=SEED,
                                                        stratify=y_bal)
print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(X_treino), len(X_teste)))

Treinaremos com 267512 elementos e testaremos com 114648 elementos


## Random Forest

In [23]:
modelo_rfc = RandomForestClassifier(n_estimators = 100)
modelo_rfc.fit(X_treino, y_treino)
predito_rfc = modelo_rfc.predict(X_teste)

In [24]:
print("A acurácia do modelo Random Forest foi %.2f%%" % (metrics.accuracy_score(y_teste, predito_rfc) * 100))
print("A Precisão do modelo Random Forest foi %.2f%%" % (metrics.precision_score(y_teste, predito_rfc) * 100))
print("A Recall do modelo Random Forest foi %.2f%%" % (metrics.recall_score(y_teste, predito_rfc) * 100))
print("A F1 do modelo Random Forest foi %.2f%%" % (metrics.f1_score(y_teste, predito_rfc) * 100))

A acurácia do modelo Random Forest foi 99.74%
A Precisão do modelo Random Forest foi 99.82%
A Recall do modelo Random Forest foi 99.67%
A F1 do modelo Random Forest foi 99.74%


### K-Best <hr style='border: 2px solid red;'>

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [21]:
df_kbest = pd.DataFrame(columns=['K', 'Acurácia', 'Precisão', 'Recall', 'F1'])

In [15]:
count = 1
for count in range(1, X_raw.shape[1]):
  selecionar_kmelhores = SelectKBest(chi2, k=count)
  selecionar_kmelhores.fit(X_treino, y_treino)

  X_treino_kbest = selecionar_kmelhores.transform(X_treino)
  X_teste_kbest = selecionar_kmelhores.transform(X_teste)

  modelo_kb = RandomForestClassifier(n_estimators = 100)
  modelo_kb.fit(X_treino_kbest, y_treino)
  predito_kb = modelo_kb.predict(X_teste_kbest)

  metricas = [count,
              round((metrics.accuracy_score(y_teste, predito_kb) * 100), 2),
              round((metrics.precision_score(y_teste, predito_kb) * 100), 2),
              round((metrics.recall_score(y_teste, predito_kb) * 100), 2),
              round((metrics.f1_score(y_teste, predito_kb) * 100), 2)]

  df_kbest.loc[count] = metricas


In [16]:
df_kbest

Unnamed: 0,K,Acurácia,Precisão,Recall,F1
1,1.0,95.88,98.17,93.49,95.78
2,2.0,99.25,99.51,98.99,99.25
3,3.0,99.5,99.64,99.35,99.5
4,4.0,99.66,99.7,99.61,99.66
5,5.0,99.7,99.75,99.65,99.7
6,6.0,99.72,99.76,99.68,99.72
7,7.0,99.72,99.76,99.68,99.72
8,8.0,99.71,99.74,99.69,99.71
9,9.0,99.72,99.76,99.68,99.72
10,10.0,99.72,99.76,99.68,99.72


### RFE <hr style='border: 2px solid red;'>

In [17]:
from sklearn.feature_selection import RFE

In [18]:
df_rfe = pd.DataFrame(columns=['K', 'Acurácia', 'Precisão', 'Recall', 'F1'])

In [23]:
count = 1
for count in range(1, X_raw.shape[1]):
  selecionador_rfe = RFE(estimator = modelo_rfc, n_features_to_select = count, step = 1)
  selecionador_rfe.fit(X_treino, y_treino)

  X_treino_rfe = selecionador_rfe.transform(X_treino)
  X_teste_rfe = selecionador_rfe.transform(X_teste)

  modelo_rfe = RandomForestClassifier(n_estimators = 100)
  modelo_rfe.fit(X_treino_rfe, y_treino)
  predito_rfe = modelo_rfe.predict(X_teste_rfe)

  metricas = [count,
              round((metrics.accuracy_score(y_teste, predito_rfe) * 100), 2),
              round((metrics.precision_score(y_teste, predito_rfe) * 100), 2),
              round((metrics.recall_score(y_teste, predito_rfe) * 100), 2),
              round((metrics.f1_score(y_teste, predito_rfe) * 100), 2)]

  df_rfe.loc[count] = metricas

In [24]:
df_rfe

Unnamed: 0,K,Acurácia,Precisão,Recall,F1
1,1.0,96.3,98.69,93.85,96.21
2,2.0,98.23,99.49,96.96,98.21
3,3.0,99.5,99.64,99.35,99.5
4,4.0,99.66,99.72,99.6,99.66
5,5.0,99.67,99.72,99.63,99.67
6,6.0,99.72,99.76,99.68,99.72
7,7.0,99.73,99.74,99.71,99.73
8,8.0,99.73,99.75,99.71,99.73
9,9.0,99.74,99.81,99.68,99.74
10,10.0,99.73,99.78,99.68,99.73


### Cross Validation <hr style='border: 2px solid red;'>

In [25]:
from sklearn.feature_selection import RFECV

In [26]:
df_cv = pd.DataFrame(columns=['K', 'Acurácia', 'Precisão', 'Recall', 'F1'])
count = 2

In [37]:
count = 10

In [None]:
selecionar_cv = RFECV(estimator = modelo_rfc, cv = count, scoring = "accuracy", step = 1)
selecionar_cv.fit(X_treino, y_treino)

X_treino_cv = selecionar_cv.transform(X_treino)
X_teste_cv = selecionar_cv.transform(X_teste)

modelo_cv = RandomForestClassifier(n_estimators = 100)
modelo_cv.fit(X_treino_cv, y_treino)
predito_cv = modelo_cv.predict(X_teste_cv)

metricas = [count,
            round((metrics.accuracy_score(y_teste, predito_cv) * 100), 2),
            round((metrics.precision_score(y_teste, predito_cv) * 100), 2),
            round((metrics.recall_score(y_teste, predito_cv) * 100), 2),
            round((metrics.f1_score(y_teste, predito_cv) * 100), 2)]

In [35]:
df_cv.loc[count] = metricas

In [36]:
df_cv

Unnamed: 0,K,Acurácia,Precisão,Recall,F1
2,2.0,99.74,99.82,99.67,99.74
3,3.0,99.74,99.82,99.66,99.74


In [None]:
count = 2
for count in range(2, X_raw.shape[1]):
  selecionar_cv = RFECV(estimator = modelo_rfc, cv = count, scoring = "accuracy", step = 1)
  selecionar_cv.fit(X_treino, y_treino)

  X_treino_cv = selecionar_cv.transform(X_treino)
  X_teste_cv = selecionar_cv.transform(X_teste)

  modelo_cv = RandomForestClassifier(n_estimators = 100)
  modelo_cv.fit(X_treino_cv, y_treino)
  predito_cv = modelo_cv.predict(X_teste_cv)

  metricas = [count,
              round((metrics.accuracy_score(y_teste, predito_cv) * 100), 2),
              round((metrics.precision_score(y_teste, predito_cv) * 100), 2),
              round((metrics.recall_score(y_teste, predito_cv) * 100), 2),
              round((metrics.f1_score(y_teste, predito_cv) * 100), 2)]
  print(count)

  df_cv.loc[count] = metricas

In [None]:
df_cv

In [None]:
matriz_confusao = confusion_matrix(teste_y, classificador.predict(teste_rfecv))
sns.heatmap(matriz_confusao, annot = True, fmt = "d").set(xlabel = "Predição", ylabel = "Real")

print("Resultado da classificação %.2f%%" % (classificador.score(teste_rfecv, teste_y)* 100))

In [None]:
plt.xlabel("Número de exames")
plt.ylabel("Acurácia")

plt.plot(range(1, len(selecionador_rfecv.cv_results_['mean_test_score'] ) + 1), selecionador_rfecv.cv_results_['mean_test_score'])
plt.show