# Dados de Internações COVID-19

# Bibliotecas

In [117]:
# Manipulação e tratamento das bases
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)
from scipy import stats
#!pip install -U scikit-learn

#Visualização gráfica
%matplotlib inline
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import plotly.express as px
import io
from sklearn import tree

#Pré-Processamento das bases
!pip install imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
#!pip install Boruta
from boruta import BorutaPy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel

#Modelagem de Dados
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.calibration import CalibratedClassifierCV
from sklearn.dummy import DummyClassifier
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
! pip install XGboost
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
! pip install catboost
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree, ExtraTreeClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ConfusionMatrix



#  1- Leitura de Dados e Pré- Processamento



## **A-** Premissas:



Data Frame de Brasil:(**B-** df_Brasil) 
- Convertendo as variáveis com datas para datetime
- Filtro: Criação da coluna IDADE
- DATA LIMITE: 16/01/2021(Primeiro dia de vacinação no Brasil/sp)
- Remoção de duplicadas
- Remoção de variáveis não relevantes à análise (combinadas em reunião) e das que não agregam valor nenhum à Análise (nunique = 1)
- Filtro: HOSPITAL = 1 

Data Frame da cidade de São Paulo:(**C-**df_sp)
- Reset no INDEX
- Remoção de colunas sobre o cód da cidade, estado, UF.. ou seja, com unique() = 1
- Redução de categorias nas colunas 'UTI'  de 1,2,3,9 e missing foram para 1 ou 0. 
-  

## **B**- Análises dos Internados Brasil

Inclusão da Base "antes" do período da Vacina - Primeira parte do estudo

In [118]:
df_brasil_antes= pd.read_csv('INFLUD-13-09-2021.csv', sep= ';')

  exec(code_obj, self.user_global_ns, self.user_ns)


Filtro: Criação da coluna IDADE e drop nas outras .
OBS: No campo de 'NU_IDADE_N','TP_IDADE','COD_IDADE',devemos dar atenção maior se as idades forem menores que 1 ano:

Ex: Paciente apresenta, na coluna da idade, o valor 8, porém na coluna seguinte (que classifica em 1:dias, 2:meses, 3: anos) apresenta o valor 2 para este paciente. Então não são 8 anos e sim, 8 meses.( 0,67 anos)

Vamos então criar uma coluna com idade em float e não int.

In [119]:
idade_aux = []

for i,j in enumerate (df_brasil_antes.TP_IDADE):
  if j == 2:
    idade_aux.append(df_brasil_antes.NU_IDADE_N[i]/12)
  elif j == 1:
    idade_aux.append(df_brasil_antes.NU_IDADE_N[i]/360)
  else:
    idade_aux.append(df_brasil_antes.NU_IDADE_N[i])
    
df_brasil_antes["IDADE_ANOS"] = idade_aux


Seleção das 20 maiores cidades para o período "antes da vacina"

Aplicação do filtro para seleção somente dos pacientes internados

Seleção somente dos pacientes que foram curados ou vieram a óbito por Covid

In [120]:
df_mcid1= df_brasil_antes.loc[df_brasil_antes['CO_MUN_NOT'].isin([355030, 330455, 530010, 292740, 230440, 310620, 130260, 410690,261160, 431490, 520870, 150140, 351880, 350950, 211130, 330490,270430, 330170, 240810, 500270])]
df_mcid1=df_mcid1[(df_mcid1['HOSPITAL']==1.0)] 
df_mcid1= df_mcid1.loc[(df_mcid1['EVOLUCAO']<=2.0)]
df_mcid1=df_mcid1.assign(Periodo=1)
df_mcid1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440293 entries, 0 to 1198535
Columns: 156 entries, DT_NOTIFIC to Periodo
dtypes: float64(88), int64(10), object(58)
memory usage: 527.4+ MB


- Convertendo as variáveis com datas para datetime

In [121]:
df_mcid1['DT_NOTIFIC'] = pd.to_datetime(df_mcid1['DT_NOTIFIC'], format= "%d/%m/%Y")
df_mcid1['DT_NASC'] = pd.to_datetime(df_mcid1['DT_NASC'], format= "%d/%m/%Y")
df_mcid1['DT_EVOLUCA'] = pd.to_datetime(df_mcid1['DT_EVOLUCA'], format= "%d/%m/%Y")
df_mcid1['DT_ENCERRA'] = pd.to_datetime(df_mcid1['DT_ENCERRA'], format= "%d/%m/%Y")
df_mcid1['DT_ENTUTI'] = pd.to_datetime(df_mcid1['DT_ENCERRA'], format= "%d/%m/%Y")
df_mcid1['DT_SAIDUTI'] = pd.to_datetime(df_mcid1['DT_ENCERRA'], format= "%d/%m/%Y")
df_mcid1[['DT_NOTIFIC','DT_NASC', 'DT_EVOLUCA','DT_ENCERRA','DT_ENTUTI', 'DT_SAIDUTI']].info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440293 entries, 0 to 1198535
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   DT_NOTIFIC  440293 non-null  datetime64[ns]
 1   DT_NASC     439618 non-null  datetime64[ns]
 2   DT_EVOLUCA  431944 non-null  datetime64[ns]
 3   DT_ENCERRA  435530 non-null  datetime64[ns]
 4   DT_ENTUTI   435530 non-null  datetime64[ns]
 5   DT_SAIDUTI  435530 non-null  datetime64[ns]
dtypes: datetime64[ns](6)
memory usage: 23.5 MB


In [122]:
df_mcid1['DT_NOTIFIC'].max()

Timestamp('2021-09-08 00:00:00')

In [123]:
df_mcid1['DT_NOTIFIC'].min()

Timestamp('2019-12-29 00:00:00')

Inclusão da Base "durante" o período da Vacina - Segunda parte do estudo

In [124]:
df_brasil_durante= pd.read_csv('INFLUD21-13-09-2021.csv', sep= ';')

  exec(code_obj, self.user_global_ns, self.user_ns)


- Filtro: Criação da coluna IDADE e drop nas outras . 

OBS: No campo de 'NU_IDADE_N','TP_IDADE','COD_IDADE',devemos dar atenção maior se as idades forem menores que 1 ano:

Ex: Paciente apresenta, na coluna da idade, o valor 8, porém na coluna seguinte (que classifica em 1:dias, 2:meses, 3: anos) apresenta o valor 2 para este paciente. Então não são 8 anos e sim, 8 meses.( 0,67 anos) 

Vamos então criar uma coluna com idade em float e não int.



In [125]:
idade_aux = []

for i,j in enumerate (df_brasil_durante.TP_IDADE):
  if j == 2:
    idade_aux.append(df_brasil_durante.NU_IDADE_N[i]/12)
  elif j == 1:
    idade_aux.append(df_brasil_durante.NU_IDADE_N[i]/360)
  else:
    idade_aux.append(df_brasil_durante.NU_IDADE_N[i])
    
df_brasil_durante["IDADE_ANOS"] = idade_aux


Seleção das 20 maiores cidades para o período "durante a vacina"

Aplicação do filtro para seleção somente dos pacientes internados

Seleção somente dos pacientes que foram curados ou vieram a óbito por Covid

In [126]:
df_mcid2= df_brasil_durante.loc[df_brasil_durante['CO_MUN_NOT'].isin([355030, 330455, 530010, 292740, 230440, 310620, 130260, 410690,261160, 431490, 520870, 150140, 351880, 350950, 211130, 330490,270430, 330170, 240810, 500270])]
df_mcid2=df_mcid2[(df_mcid2['HOSPITAL']==1.0)] 
df_mcid2= df_mcid2.loc[(df_mcid2['EVOLUCAO']<=2.0)]
df_mcid2=df_mcid2.assign(Periodo=2)
df_mcid2

Unnamed: 0,DT_NOTIFIC,SEM_NOT,DT_SIN_PRI,SEM_PRI,SG_UF_NOT,ID_REGIONA,CO_REGIONA,ID_MUNICIP,CO_MUN_NOT,ID_UNIDADE,CO_UNI_NOT,CS_SEXO,DT_NASC,NU_IDADE_N,TP_IDADE,COD_IDADE,CS_GESTANT,CS_RACA,CS_ETINIA,CS_ESCOL_N,ID_PAIS,CO_PAIS,SG_UF,ID_RG_RESI,CO_RG_RESI,ID_MN_RESI,CO_MUN_RES,CS_ZONA,SURTO_SG,NOSOCOMIAL,AVE_SUINO,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,OUTRO_SIN,OUTRO_DES,PUERPERA,FATOR_RISC,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,ASMA,DIABETES,NEUROLOGIC,...,HISTO_VGM,PAIS_VGM,CO_PS_VGM,LO_PS_VGM,DT_VGM,DT_RT_VGM,PCR_SARS2,PAC_COCBO,PAC_DSCBO,OUT_ANIM,DOR_ABD,FADIGA,PERD_OLFT,PERD_PALA,TOMO_RES,TOMO_OUT,DT_TOMO,TP_TES_AN,DT_RES_AN,RES_AN,POS_AN_FLU,TP_FLU_AN,POS_AN_OUT,AN_SARS2,AN_VSR,AN_PARA1,AN_PARA2,AN_PARA3,AN_ADENO,AN_OUTRO,DS_AN_OUT,TP_AM_SOR,SOR_OUT,DT_CO_SOR,TP_SOR,OUT_SOR,DT_RES,RES_IGG,RES_IGM,RES_IGA,ESTRANG,VACINA_COV,DOSE_1_COV,DOSE_2_COV,LAB_PR_COV,LOTE_1_COV,LOTE_2_COV,FNT_IN_COV,IDADE_ANOS,Periodo
1,07/01/2021,1,04/01/2021,1,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030,HOSP STA CRUZ,2082624,M,29/06/1950,70,3,3070,6,4,,9.0,BRASIL,1,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,,2.0,1,1.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2,,,,,,,,,,2.0,1.0,2.0,2.0,2.0,,07/01/2021,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70.0,2
3,08/01/2021,1,05/01/2021,1,SP,GVE XVII CAMPINAS,1342.0,CAMPINAS,350950,HOSPITAL DAS CLINICAS DA UNICAMP DE CAMPINAS,2079798,F,21/10/1960,60,3,3060,5,1,,9.0,BRASIL,1,SP,GVE XX PIRACICABA,1345.0,ELIAS FAUSTO,351490.0,1.0,2.0,2.0,9.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,9.0,,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2,,,,,,,,,,2.0,1.0,2.0,2.0,6.0,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60.0,2
4,11/01/2021,2,07/01/2021,1,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030,HOSP STA MAGGIORE BELA VISTA,2084341,F,27/06/1936,84,3,3084,6,4,,,BRASIL,1,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030.0,1.0,,,,,1.0,,1.0,,1.0,1.0,,1.0,CEFALEIA,,1,,,,,,1.0,,...,0,,,,,,,,,,,,,,5.0,RAD COVID1,10/01/2021,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84.0,2
7,13/01/2021,2,10/01/2021,2,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030,HOSP MUN DR BENEDICTO MONTENEGRO,2084139,F,15/05/1938,82,3,3082,5,1,,,BRASIL,1,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,,2.0,1,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,2,,,,,,1.0,,,,2.0,2.0,2.0,2.0,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,2
8,11/01/2021,2,07/01/2021,1,SP,GVE XVII CAMPINAS,1342.0,CAMPINAS,350950,COMPLEXO HOSPITALAR PREFEITO EDIVALDO ORSI,6053858,M,28/03/1949,71,3,3071,6,4,,,BRASIL,1,SP,GVE XVII CAMPINAS,1342.0,CAMPINAS,350950.0,,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,"CORIZA, MIALGIA",2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2,,,,,,1.0,,,,2.0,2.0,2.0,2.0,5.0,,09/01/2021,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,71.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1471517,16/08/2021,33,14/08/2021,32,MG,BELO HORIZONTE,1449.0,BELO HORIZONTE,310620,HOSPITAL JOAO XXIII,26921,M,27/10/1974,46,3,3046,6,2,,9.0,BRASIL,1,MG,BELO HORIZONTE,1449.0,BELO HORIZONTE,310620.0,1.0,,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,,,2,,,,,,,,...,0,,,,,,,,,,1.0,1.0,1.0,2.0,5.0,TUBERCULOSE E PNEUMONIA,15/08/2021,,,4.0,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,1.0,46.0,2
1471522,15/07/2021,28,10/07/2021,27,DF,,,BRASILIA,530010,HOME HOSPITAL ORTOPEDICO E MEDICINA ESPECIALIZADA,6243495,M,15/03/1931,90,3,3090,6,4,,,BRASIL,1,DF,,,BRASILIA - ASA SUL,530015.0,1.0,,,,1.0,1.0,,1.0,1.0,2.0,2.0,2.0,,,,2,,,,,,,,...,0,,,,,,1.0,,,,2.0,2.0,2.0,2.0,1.0,,15/07/2021,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,90.0,2
1471530,05/07/2021,27,07/05/2021,18,PE,001,1497.0,RECIFE,261160,PRONTO ATENDIMENTO CAXANGA,9643486,M,04/02/1995,26,3,3026,6,9,,,BRASIL,1,PE,001,1497.0,RECIFE,261160.0,1.0,,,,1.0,1.0,1.0,1.0,,1.0,,,1.0,CORIZA,,1,,,,,,,,...,0,,,,,,1.0,,,,,,,,1.0,,17/05/2021,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26.0,2
1471555,02/09/2021,35,29/08/2021,35,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030,HOSPITAL DO SERV PUB EST FCO MORATO DE OLIVEIR...,2058502,M,02/01/2015,6,3,3006,6,1,,,BRASIL,1,SP,GVE I CAPITAL,1331.0,SAO PAULO,355030.0,1.0,,9.0,9.0,,1.0,,,1.0,1.0,,,,,,1,,,,,1.0,,,...,0,,,,,,,,,,,,,,6.0,,,,,5.0,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,6.0,2


- Convertendo as variáveis com datas para datetime

In [127]:
df_mcid2['DT_NOTIFIC'] = pd.to_datetime(df_mcid2['DT_NOTIFIC'], format= "%d/%m/%Y")
df_mcid2['DT_NASC'] = pd.to_datetime(df_mcid2['DT_NASC'], format= "%d/%m/%Y")
df_mcid2['DT_EVOLUCA'] = pd.to_datetime(df_mcid2['DT_EVOLUCA'], format= "%d/%m/%Y")
df_mcid2['DT_ENCERRA'] = pd.to_datetime(df_mcid2['DT_ENCERRA'], format= "%d/%m/%Y")
df_mcid2['DT_ENTUTI'] = pd.to_datetime(df_mcid2['DT_ENCERRA'], format= "%d/%m/%Y")
df_mcid2['DT_SAIDUTI'] = pd.to_datetime(df_mcid2['DT_ENCERRA'], format= "%d/%m/%Y")
df_mcid2[['DT_NOTIFIC','DT_NASC', 'DT_EVOLUCA','DT_ENCERRA','DT_ENTUTI', 'DT_SAIDUTI']].info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 386363 entries, 1 to 1471561
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   DT_NOTIFIC  386363 non-null  datetime64[ns]
 1   DT_NASC     386206 non-null  datetime64[ns]
 2   DT_EVOLUCA  382378 non-null  datetime64[ns]
 3   DT_ENCERRA  381254 non-null  datetime64[ns]
 4   DT_ENTUTI   381254 non-null  datetime64[ns]
 5   DT_SAIDUTI  381254 non-null  datetime64[ns]
dtypes: datetime64[ns](6)
memory usage: 20.6 MB


In [128]:
df_mcid2['DT_NOTIFIC'].max()

Timestamp('2021-09-12 00:00:00')

In [129]:
df_mcid2['DT_NOTIFIC'].min()

Timestamp('2021-01-03 00:00:00')

- DATA LIMITE: 16/01/2021  
df_mcid1 = Base antes da Vacina 

  df_mcid2 = Base durante a Vacina

In [130]:
df_mcid1=df_mcid1.loc[df_mcid1['DT_NOTIFIC']<='2021-01-16']
df_mcid1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435028 entries, 0 to 1198535
Columns: 156 entries, DT_NOTIFIC to Periodo
dtypes: datetime64[ns](6), float64(88), int64(10), object(52)
memory usage: 521.1+ MB


In [131]:
df_mcid2=df_mcid2.loc[df_mcid2['DT_NOTIFIC']>'2021-01-16']
df_mcid2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376201 entries, 104 to 1471561
Columns: 164 entries, DT_NOTIFIC to Periodo
dtypes: datetime64[ns](6), float64(90), int64(12), object(56)
memory usage: 473.6+ MB


In [132]:
df_mcid3=pd.concat([df_mcid1, df_mcid2])
df_mcid3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 811229 entries, 0 to 1471561
Columns: 164 entries, DT_NOTIFIC to FNT_IN_COV
dtypes: datetime64[ns](6), float64(91), int64(10), object(57)
memory usage: 1021.2+ MB


- Variáveis : Combinamos de manter as variáveis que tenham pelo menos um SIM [nesta tabela](https://docs.google.com/spreadsheets/d/17-jccz8DWl_fW7NpA0d6A3wQ3Ntc8qKX/edit#gid=1202247063) a partir da interpretação do Dicionário.

OBS: Nesta base atualizada existem 8 variáveis a mais do que a antiga:

'ESTRANG', 'VACINA_COV', 'DOSE_1_COV', 'DOSE_2_COV', 'LAB_PR_COV', 'LOTE_1_COV', 'LOTE_2_COV', 'FNT_IN_COV'.

Essas colunas também foram dropadas.




In [133]:
df_mcid3.drop(columns=['DT_NOTIFIC','SG_UF_NOT','NU_IDADE_N','TP_IDADE','COD_IDADE','DT_NASC','SEM_NOT','DT_SIN_PRI','SEM_PRI','ID_REGIONA','CO_REGIONA', 'ID_UNIDADE', 'CO_UNI_NOT','ID_PAIS','CO_PAIS',
                      'SG_UF', 'ID_RG_RESI','CO_RG_RESI','ID_MN_RESI','CO_MUN_RES','CS_ZONA', 'HOSPITAL',
                      'CS_ETINIA', 'PAC_COCBO','PAC_DSCBO','PAIS_VGM','CO_PS_VGM','LO_PS_VGM','DT_VGM','DT_RT_VGM',
                      'OUTRO_DES','MORB_DESC','DT_UT_DOSE','MAE_VAC', 'DT_VAC_MAE', 'ID_MUNICIP',
                      'M_AMAMENTA','DT_DOSEUNI', 'DT_1_DOSE','DT_2_DOSE','TP_ANTIVIR','OUT_ANTIV','DT_ANTIVIR','DT_INTERNA',
                      'ID_RG_INTE','CO_RG_INTE', 'ID_MN_INTE','RAIOX_OUT','DT_RAIOX','TOMO_OUT','DT_TOMO', 'DT_ENTUTI', 'DT_SAIDUTI',
                      'AMOSTRA','DT_COLETA','TP_AMOSTRA','OUT_AMOST', 'DT_PCR','POS_PCRFLU','TP_FLU_PCR','PCR_FLUASU','FLUASU_OUT',
                      'PCR_FLUBLI','FLUBLI_OUT','POS_PCROUT','PCR_SARS2', 'PCR_VSR','PCR_PARA1','PCR_PARA2','PCR_PARA3','PCR_PARA4',
                      'PCR_ADENO','PCR_METAP','PCR_BOCA','PCR_RINO','PCR_OUTRO', 'DS_PCR_OUT','CLASSI_FIN','CLASSI_OUT','CRITERIO', 'AVE_SUINO','OUTRO_SIN','OUT_MORBI','CO_MU_INTE','RAIOX_RES','TOMO_RES',
                      'TP_TES_AN','DT_RES_AN','RES_AN','POS_AN_FLU','TP_FLU_AN','POS_AN_OUT', 'AN_SARS2','AN_VSR','AN_PARA1', 'SG_UF_INTE', 'CO_MU_INTE', 'DT_ENCERRA',
                      'AN_PARA2','AN_PARA3','AN_ADENO','AN_OUTRO','DS_AN_OUT','TP_AM_SOR','SOR_OUT','DT_CO_SOR', 'TP_SOR','OUT_SOR', 'RES_IGG', 'RES_IGM', 'RES_IGA', 'DT_EVOLUCA',
                      'DT_RES','DT_DIGITA','OBES_IMC', 'OUT_ANIM', 'ESTRANG', 'VACINA_COV', 'DOSE_1_COV', 'DOSE_2_COV', 'LAB_PR_COV', 'LOTE_1_COV', 'LOTE_2_COV', 'FNT_IN_COV'], inplace=True)

In [134]:
df_mcid3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 811229 entries, 0 to 1471561
Data columns (total 41 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   CO_MUN_NOT  811229 non-null  int64  
 1   CS_SEXO     811229 non-null  object 
 2   CS_GESTANT  811229 non-null  int64  
 3   CS_RACA     785969 non-null  float64
 4   CS_ESCOL_N  532926 non-null  float64
 5   SURTO_SG    349023 non-null  float64
 6   NOSOCOMIAL  641445 non-null  float64
 7   FEBRE       671410 non-null  float64
 8   TOSSE       694735 non-null  float64
 9   GARGANTA    539056 non-null  float64
 10  DISPNEIA    707031 non-null  float64
 11  DESC_RESP   636299 non-null  float64
 12  SATURACAO   668813 non-null  float64
 13  DIARREIA    533990 non-null  float64
 14  VOMITO      523577 non-null  float64
 15  PUERPERA    278417 non-null  float64
 16  FATOR_RISC  811229 non-null  object 
 17  CARDIOPATI  399133 non-null  float64
 18  HEMATOLOGI  279638 non-null  float64
 19  S

- Linhas duplicadas


In [135]:
print(f'Temos {df_mcid3.duplicated().sum()} linhas duplicadas.')

Temos 3407 linhas duplicadas.


In [136]:
df_mcid3=df_mcid3.drop_duplicates()

Foi decidido pelo grupo dropar as 3407 linhas duplicadas. Que corresponde a menos de 1%



- Missing

In [137]:
# Porcentagem de missing
#df_mcid3.isnull().sum()/len(df_mcid3)

- Avaliação de Variáveis com unique()= 1

In [138]:
# Analise das colunas que provavelmente vamos dropar:
print(list(df_mcid3.nunique()))
#As que estão com 1 são constantes em todo df_sp.

[20, 3, 8, 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 6, 2, 4, 3, 3, 3, 3, 173, 2]


In [139]:
df_mcid3.to_csv(r'Base_Tratada.csv',index = False)

## **C-** PRÉ-PROCESSAMENTO

###- ADEQUAÇÃO DAS CATEGORIAS E SUBSTITUIÇÃO DOS MISSINGS 


In [140]:
df_mcid3['CS_GESTANT'].replace({1.0: 1, 2.0: 1, 3.0  :1, 4.0 : 1}, inplace= True)
df_mcid3['CS_GESTANT'].replace({5.0: 0, 6.0:0, 9.0:0}, inplace= True)
df_mcid3['CS_RACA'].fillna(9,inplace= True)
df_mcid3['CS_ESCOL_N'].fillna(9,inplace= True)
df_mcid3['SURTO_SG'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['SURTO_SG'].fillna(0,inplace= True)
df_mcid3['NOSOCOMIAL'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['NOSOCOMIAL'].fillna(0,inplace= True)
df_mcid3['FEBRE'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['FEBRE'].fillna(0,inplace= True)
df_mcid3['TOSSE'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['TOSSE'].fillna(0,inplace= True)
df_mcid3['GARGANTA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['GARGANTA'].fillna(0,inplace= True)
df_mcid3['DISPNEIA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['DISPNEIA'].fillna(0,inplace= True)
df_mcid3['DESC_RESP'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['DESC_RESP'].fillna(0,inplace= True)
df_mcid3['SATURACAO'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['SATURACAO'].fillna(0,inplace= True)
df_mcid3['DIARREIA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['DIARREIA'].fillna(0,inplace= True)
df_mcid3['VOMITO'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['VOMITO'].fillna(0,inplace= True)
df_mcid3['PUERPERA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['PUERPERA'].fillna(0,inplace= True)
df_mcid3['CARDIOPATI'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['CARDIOPATI'].fillna(0,inplace= True)
df_mcid3['HEMATOLOGI'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['HEMATOLOGI'].fillna(0,inplace= True)
df_mcid3['SIND_DOWN'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['SIND_DOWN'].fillna(0,inplace= True)
df_mcid3['HEPATICA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['HEPATICA'].fillna(0,inplace= True)
df_mcid3['ASMA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['ASMA'].fillna(0,inplace= True)
df_mcid3['DIABETES'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['DIABETES'].fillna(0,inplace= True)
df_mcid3['NEUROLOGIC'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['NEUROLOGIC'].fillna(0,inplace= True)
df_mcid3['PNEUMOPATI'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['PNEUMOPATI'].fillna(0,inplace= True)
df_mcid3['IMUNODEPRE'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['IMUNODEPRE'].fillna(0,inplace= True)
df_mcid3['RENAL'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['RENAL'].fillna(0,inplace= True)
df_mcid3['OBESIDADE'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['OBESIDADE'].fillna(0,inplace= True)
df_mcid3['ASMA'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['ASMA'].fillna(0,inplace= True)
df_mcid3['ANTIVIRAL'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['ANTIVIRAL'].fillna(0,inplace= True)
df_mcid3['UTI'].replace({2.0: 0, 9.0: 0}, inplace= True)
df_mcid3['UTI'].fillna(0,inplace= True)
df_mcid3['SUPORT_VEN'].replace({3.0: 0, 9.0: 0}, inplace= True)
df_mcid3['SUPORT_VEN'].fillna(0,inplace= True)
df_mcid3['PCR_RESUL'].fillna(4,inplace= True)
df_mcid3['HISTO_VGM'].replace({0: 2}, inplace= True)
df_mcid3['DOR_ABD'].replace({9.0: 0, 2.0 :0}, inplace= True)
df_mcid3['DOR_ABD'].fillna(0,inplace= True)
df_mcid3['FADIGA'].replace({9.0: 0, 2.0 :0}, inplace= True)
df_mcid3['FADIGA'].fillna(0,inplace= True)
df_mcid3['PERD_OLFT'].replace({9.0: 0, 2.0 :0}, inplace= True)
df_mcid3['PERD_OLFT'].fillna(0,inplace= True)
df_mcid3['PERD_PALA'].replace({9.0: 0, 2.0 :0}, inplace= True)
df_mcid3['PERD_PALA'].fillna(0,inplace= True)
df_mcid3['VACINA'].fillna(0,inplace= True)
df_mcid3['FATOR_RISC'].replace({'S': 1, 'N':2, '1':1, '2':2}, inplace= True)
df_mcid3['FATOR_RISC'].fillna(0,inplace= True)


Conferência das categorias

- Resetando o Index novamente.

In [141]:
df_mcid3= df_mcid3.reset_index(drop=True)
df_mcid3.head()

Unnamed: 0,CO_MUN_NOT,CS_SEXO,CS_GESTANT,CS_RACA,CS_ESCOL_N,SURTO_SG,NOSOCOMIAL,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,DIARREIA,VOMITO,PUERPERA,FATOR_RISC,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,ASMA,DIABETES,NEUROLOGIC,PNEUMOPATI,IMUNODEPRE,RENAL,OBESIDADE,VACINA,ANTIVIRAL,UTI,SUPORT_VEN,PCR_RESUL,EVOLUCAO,HISTO_VGM,DOR_ABD,FADIGA,PERD_OLFT,PERD_PALA,IDADE_ANOS,Periodo
0,530010,M,0,4.0,5.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,2,0.0,0.0,0.0,0.0,0.583333,1
1,530010,M,0,4.0,5.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,4.0,1.0,2,0.0,0.0,0.0,0.0,1.0,1
2,230440,M,0,4.0,5.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,2,0.0,0.0,0.0,0.0,0.75,1
3,530010,F,0,4.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,2.0,1.0,2,0.0,0.0,0.0,0.0,55.0,1
4,410690,F,1,1.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,1.0,1.0,2,0.0,0.0,0.0,0.0,29.0,1


In [142]:
#df_mcid3.isnull().sum()/len(df_mcid3)

Aplicação da Dummy nas Features Categóricas

In [143]:
df_mcid3=pd.get_dummies(df_mcid3, columns=['CS_SEXO', 'CS_GESTANT', 'CS_RACA', 'CS_ESCOL_N',
       'SURTO_SG', 'NOSOCOMIAL', 'FEBRE', 'TOSSE', 'GARGANTA', 'DISPNEIA',
       'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO', 'PUERPERA',
       'FATOR_RISC', 'CARDIOPATI', 'HEMATOLOGI', 'SIND_DOWN', 'HEPATICA',
       'ASMA', 'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI', 'IMUNODEPRE', 'RENAL',
       'OBESIDADE', 'VACINA', 'ANTIVIRAL', 'UTI', 'SUPORT_VEN', 'PCR_RESUL',
       'HISTO_VGM', 'DOR_ABD', 'FADIGA', 'PERD_OLFT', 'PERD_PALA'], drop_first=True)
df_mcid3.head()

Unnamed: 0,CO_MUN_NOT,EVOLUCAO,IDADE_ANOS,Periodo,CS_SEXO_I,CS_SEXO_M,CS_GESTANT_1,CS_RACA_2.0,CS_RACA_3.0,CS_RACA_4.0,CS_RACA_5.0,CS_RACA_9.0,CS_ESCOL_N_1.0,CS_ESCOL_N_2.0,CS_ESCOL_N_3.0,CS_ESCOL_N_4.0,CS_ESCOL_N_5.0,CS_ESCOL_N_9.0,SURTO_SG_1.0,NOSOCOMIAL_1.0,FEBRE_1.0,TOSSE_1.0,GARGANTA_1.0,DISPNEIA_1.0,DESC_RESP_1.0,SATURACAO_1.0,DIARREIA_1.0,VOMITO_1.0,PUERPERA_1.0,FATOR_RISC_2,CARDIOPATI_1.0,HEMATOLOGI_1.0,SIND_DOWN_1.0,HEPATICA_1.0,ASMA_1.0,DIABETES_1.0,NEUROLOGIC_1.0,PNEUMOPATI_1.0,IMUNODEPRE_1.0,RENAL_1.0,OBESIDADE_1.0,VACINA_1.0,VACINA_2.0,VACINA_9.0,ANTIVIRAL_1.0,UTI_1.0,SUPORT_VEN_1.0,SUPORT_VEN_2.0,PCR_RESUL_2.0,PCR_RESUL_3.0,PCR_RESUL_4.0,PCR_RESUL_5.0,PCR_RESUL_9.0,HISTO_VGM_2,HISTO_VGM_9,DOR_ABD_1.0,FADIGA_1.0,PERD_OLFT_1.0,PERD_PALA_1.0
0,530010,1.0,0.583333,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,530010,1.0,1.0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0
2,230440,1.0,0.75,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,530010,1.0,55.0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
4,410690,1.0,29.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0


CONFERÊNCIA DO BALANEAMENTO DA BASE

In [144]:
df_mcid3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 807822 entries, 0 to 807821
Data columns (total 59 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   CO_MUN_NOT      807822 non-null  int64  
 1   EVOLUCAO        807822 non-null  float64
 2   IDADE_ANOS      807822 non-null  float64
 3   Periodo         807822 non-null  int64  
 4   CS_SEXO_I       807822 non-null  uint8  
 5   CS_SEXO_M       807822 non-null  uint8  
 6   CS_GESTANT_1    807822 non-null  uint8  
 7   CS_RACA_2.0     807822 non-null  uint8  
 8   CS_RACA_3.0     807822 non-null  uint8  
 9   CS_RACA_4.0     807822 non-null  uint8  
 10  CS_RACA_5.0     807822 non-null  uint8  
 11  CS_RACA_9.0     807822 non-null  uint8  
 12  CS_ESCOL_N_1.0  807822 non-null  uint8  
 13  CS_ESCOL_N_2.0  807822 non-null  uint8  
 14  CS_ESCOL_N_3.0  807822 non-null  uint8  
 15  CS_ESCOL_N_4.0  807822 non-null  uint8  
 16  CS_ESCOL_N_5.0  807822 non-null  uint8  
 17  CS_ESCOL_N

In [145]:
df_mcid3["EVOLUCAO"].value_counts(normalize=True)

1.0    0.705055
2.0    0.294945
Name: EVOLUCAO, dtype: float64

Split da Base

In [146]:
X = df_mcid3.drop(columns=["CO_MUN_NOT", "Periodo", 'EVOLUCAO', 'CS_SEXO_I', 'CS_RACA_9.0', 'CS_ESCOL_N_9.0', 'VACINA_9.0',
                           'PCR_RESUL_9.0', 'HISTO_VGM_9'])
y = df_mcid3['EVOLUCAO']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

BALANCEAMENTO DA BASE

In [147]:
Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((565475, 50), (242347, 50), (565475,), (242347,))

In [148]:
smote = SMOTE(sampling_strategy = 'minority', random_state = 42)
Xtrain_over, ytrain_over = smote.fit_resample(Xtrain,ytrain)

Xtest_over, ytest_over = smote.fit_resample(Xtest,ytest)
Xtrain_over.shape, ytrain_over.shape, Xtest_over.shape, ytest_over.shape

((797540, 50), (797540,), (341578, 50), (341578,))

In [149]:
Xtest_over.head()

Unnamed: 0,IDADE_ANOS,CS_SEXO_M,CS_GESTANT_1,CS_RACA_2.0,CS_RACA_3.0,CS_RACA_4.0,CS_RACA_5.0,CS_ESCOL_N_1.0,CS_ESCOL_N_2.0,CS_ESCOL_N_3.0,CS_ESCOL_N_4.0,CS_ESCOL_N_5.0,SURTO_SG_1.0,NOSOCOMIAL_1.0,FEBRE_1.0,TOSSE_1.0,GARGANTA_1.0,DISPNEIA_1.0,DESC_RESP_1.0,SATURACAO_1.0,DIARREIA_1.0,VOMITO_1.0,PUERPERA_1.0,FATOR_RISC_2,CARDIOPATI_1.0,HEMATOLOGI_1.0,SIND_DOWN_1.0,HEPATICA_1.0,ASMA_1.0,DIABETES_1.0,NEUROLOGIC_1.0,PNEUMOPATI_1.0,IMUNODEPRE_1.0,RENAL_1.0,OBESIDADE_1.0,VACINA_1.0,VACINA_2.0,ANTIVIRAL_1.0,UTI_1.0,SUPORT_VEN_1.0,SUPORT_VEN_2.0,PCR_RESUL_2.0,PCR_RESUL_3.0,PCR_RESUL_4.0,PCR_RESUL_5.0,HISTO_VGM_2,DOR_ABD_1.0,FADIGA_1.0,PERD_OLFT_1.0,PERD_PALA_1.0
0,70.0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1
1,76.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0
2,40.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0
3,29.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,53.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0


Seleção de Features via Feature Importance

In [150]:
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(Xtrain_over, ytrain_over)

RandomForestClassifier(random_state=42)

In [151]:
previsoes = random_forest.predict(Xtest_over)
previsoes

array([1., 1., 2., ..., 1., 2., 1.])

In [152]:
accuracy_score(ytest_over, previsoes)

0.7500453776297068

In [153]:
Xtest_over.columns

Index(['IDADE_ANOS', 'CS_SEXO_M', 'CS_GESTANT_1', 'CS_RACA_2.0', 'CS_RACA_3.0', 'CS_RACA_4.0', 'CS_RACA_5.0', 'CS_ESCOL_N_1.0', 'CS_ESCOL_N_2.0', 'CS_ESCOL_N_3.0', 'CS_ESCOL_N_4.0', 'CS_ESCOL_N_5.0', 'SURTO_SG_1.0', 'NOSOCOMIAL_1.0', 'FEBRE_1.0', 'TOSSE_1.0', 'GARGANTA_1.0', 'DISPNEIA_1.0', 'DESC_RESP_1.0', 'SATURACAO_1.0', 'DIARREIA_1.0', 'VOMITO_1.0', 'PUERPERA_1.0', 'FATOR_RISC_2', 'CARDIOPATI_1.0', 'HEMATOLOGI_1.0', 'SIND_DOWN_1.0', 'HEPATICA_1.0', 'ASMA_1.0', 'DIABETES_1.0', 'NEUROLOGIC_1.0', 'PNEUMOPATI_1.0', 'IMUNODEPRE_1.0', 'RENAL_1.0', 'OBESIDADE_1.0', 'VACINA_1.0', 'VACINA_2.0', 'ANTIVIRAL_1.0', 'UTI_1.0', 'SUPORT_VEN_1.0', 'SUPORT_VEN_2.0', 'PCR_RESUL_2.0', 'PCR_RESUL_3.0', 'PCR_RESUL_4.0', 'PCR_RESUL_5.0', 'HISTO_VGM_2', 'DOR_ABD_1.0', 'FADIGA_1.0', 'PERD_OLFT_1.0', 'PERD_PALA_1.0'], dtype='object')

In [154]:
x = random_forest.feature_importances_
np.set_printoptions(suppress=True)
print(x)

[0.31070363 0.02927715 0.00173459 0.00787578 0.00279867 0.02587794
 0.00034614 0.00922029 0.00887299 0.01207835 0.00633338 0.00642121
 0.01342383 0.00496942 0.02194834 0.0200926  0.0127873  0.01949143
 0.0192871  0.02126978 0.01066228 0.00880577 0.00067609 0.02091687
 0.01748897 0.00213283 0.00070913 0.00238349 0.00747066 0.01764349
 0.00753305 0.00723807 0.00624393 0.00792971 0.00831589 0.00853445
 0.01727968 0.01175183 0.04843125 0.11560373 0.03218872 0.02229644
 0.00139069 0.01091168 0.0071867  0.01526416 0.00546164 0.01240784
 0.00523649 0.00509452]


Seleção e teste de performance das 13 features

In [155]:
X = Xtrain_over[['IDADE_ANOS','CS_SEXO_M','CS_RACA_4.0','FEBRE_1.0','DISPNEIA_1.0','SATURACAO_1.0','UTI_1.0',
                 'SUPORT_VEN_1.0', 'SUPORT_VEN_2.0', 'PCR_RESUL_2.0','TOSSE_1.0','DESC_RESP_1.0', 'FATOR_RISC_2']]

y = ytrain_over

X_test = Xtest_over[['IDADE_ANOS','CS_SEXO_M','CS_RACA_4.0','FEBRE_1.0','DISPNEIA_1.0','SATURACAO_1.0','UTI_1.0',
                 'SUPORT_VEN_1.0', 'SUPORT_VEN_2.0', 'PCR_RESUL_2.0','TOSSE_1.0','DESC_RESP_1.0', 'FATOR_RISC_2']]

y_test = ytest_over


In [156]:
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X, y)

RandomForestClassifier(random_state=42)

In [157]:
previsoes = random_forest.predict(X_test)
previsoes

array([1., 1., 2., ..., 1., 2., 2.])

In [158]:
accuracy_score(y_test, previsoes)

0.7301055688598211

In [164]:
print(classification_report(y_test, previsoes))

              precision    recall  f1-score   support

         1.0       0.71      0.78      0.74    170789
         2.0       0.75      0.68      0.72    170789

    accuracy                           0.73    341578
   macro avg       0.73      0.73      0.73    341578
weighted avg       0.73      0.73      0.73    341578



# Seleção da Base Pré-Processada com as Features Selecionadas

In [159]:
Base_Final = df_mcid3[['Periodo','CO_MUN_NOT','IDADE_ANOS','CS_SEXO_M','CS_RACA_4.0','FEBRE_1.0','DISPNEIA_1.0',
                       'SATURACAO_1.0','UTI_1.0','SUPORT_VEN_1.0','SUPORT_VEN_2.0','PCR_RESUL_2.0','TOSSE_1.0',
                       'DESC_RESP_1.0', 'FATOR_RISC_2', 'EVOLUCAO']]


In [57]:
Base_Final.to_csv(r'Base_Final.csv',index = False)