# Importações e Preparação do Dataset

In [23]:
# Importações necessárias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.linear_model import LinearRegression, Perceptron
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Carregar o dataset
df = pd.read_csv("Covid Data.csv")

In [4]:
# Usei para não errar o nome das colunas no pré-processamento
print(df.columns)

Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')


# Pré-processamento

In [5]:
# Substituir valores específicos por NaN
df.replace({97: None, 99: None, '9999-99-99': None}, inplace=True)

# Verificar e imputar valores faltantes
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [6]:
# Seleção de atributos e target
X = df_imputed.drop(columns=['DATE_DIED', 'CLASIFFICATION_FINAL'])
y = df_imputed['CLASIFFICATION_FINAL'].apply(lambda x: 1 if x < 4 else 0)  # 1: risco alto, 0: risco baixo

In [7]:
# Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Imputation of missing values before scaling
imputer = SimpleImputer(strategy='mean') # You can choose a different strategy if needed
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [9]:
# Normalização
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Verificar se há valores faltantes após a imputação
print("Verificação de valores faltantes:")
print("Valores faltantes em X_train:", pd.isnull(X_train).sum().sum())
print("Valores faltantes em X_test:", pd.isnull(X_test).sum().sum())

Verificação de valores faltantes:
Valores faltantes em X_train: 0
Valores faltantes em X_test: 0


In [11]:
# Função para treinar e avaliar os modelos
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model_name}")
    print(classification_report(y_test, y_pred))

# Árvore de Decisão CART (padrão)

In [12]:
cart = DecisionTreeClassifier()
train_and_evaluate_model(cart, X_train, y_train, X_test, y_test, "Árvore de Decisão CART")


Árvore de Decisão CART
              precision    recall  f1-score   support

           0       0.54      0.64      0.59      3614
           1       0.62      0.52      0.57      4084

    accuracy                           0.58      7698
   macro avg       0.58      0.58      0.58      7698
weighted avg       0.59      0.58      0.58      7698



# Support Vector Machine (padrão)

In [13]:
svm = SVC()
train_and_evaluate_model(svm, X_train, y_train, X_test, y_test, "Support Vector Machine (padrão)")

Support Vector Machine (padrão)
              precision    recall  f1-score   support

           0       0.61      0.60      0.61      3614
           1       0.65      0.66      0.66      4084

    accuracy                           0.63      7698
   macro avg       0.63      0.63      0.63      7698
weighted avg       0.63      0.63      0.63      7698



# Support Vector Machine (mudando o kernel por outro)

In [14]:
svm_poly = SVC(kernel='poly')
train_and_evaluate_model(svm_poly, X_train, y_train, X_test, y_test, "Support Vector Machine (kernel poli)")

Support Vector Machine (kernel poli)
              precision    recall  f1-score   support

           0       0.59      0.64      0.61      3614
           1       0.65      0.61      0.63      4084

    accuracy                           0.62      7698
   macro avg       0.62      0.62      0.62      7698
weighted avg       0.62      0.62      0.62      7698



# KNN (padrão)

In [15]:
knn = KNeighborsClassifier()
train_and_evaluate_model(knn, X_train, y_train, X_test, y_test, "KNN (padrão)")

KNN (padrão)
              precision    recall  f1-score   support

           0       0.57      0.54      0.55      3614
           1       0.61      0.64      0.62      4084

    accuracy                           0.59      7698
   macro avg       0.59      0.59      0.59      7698
weighted avg       0.59      0.59      0.59      7698



# KNN (mudando o valor de k)

In [16]:
knn_k5 = KNeighborsClassifier(n_neighbors=5)
train_and_evaluate_model(knn_k5, X_train, y_train, X_test, y_test, "KNN (k=5)")

KNN (k=5)
              precision    recall  f1-score   support

           0       0.57      0.54      0.55      3614
           1       0.61      0.64      0.62      4084

    accuracy                           0.59      7698
   macro avg       0.59      0.59      0.59      7698
weighted avg       0.59      0.59      0.59      7698



# KNN (mudando a medida de distância)

In [17]:
knn_minkowski = KNeighborsClassifier(metric='minkowski')
train_and_evaluate_model(knn_minkowski, X_train, y_train, X_test, y_test, "KNN (métrica minkowski)")

KNN (métrica minkowski)
              precision    recall  f1-score   support

           0       0.57      0.54      0.55      3614
           1       0.61      0.64      0.62      4084

    accuracy                           0.59      7698
   macro avg       0.59      0.59      0.59      7698
weighted avg       0.59      0.59      0.59      7698



# Gaussian Naive Bayes (padrão)

In [18]:
gnb = GaussianNB()
train_and_evaluate_model(gnb, X_train, y_train, X_test, y_test, "Gaussian Naive Bayes (padrão)")

Gaussian Naive Bayes (padrão)
              precision    recall  f1-score   support

           0       0.60      0.50      0.55      3614
           1       0.62      0.71      0.66      4084

    accuracy                           0.61      7698
   macro avg       0.61      0.60      0.60      7698
weighted avg       0.61      0.61      0.61      7698



# Categorical Naive Bayes (padrão), considerando todos os atributos categóricos

In [28]:
# Pré-processamento para Categorical Naive Bayes
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

X_cat_imputed = categorical_transformer.fit_transform(X_cat)

# Divisão em treino e teste para os dados categóricos
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_cat_imputed, y, test_size=0.2, random_state=42)

cat_nb = CategoricalNB()
train_and_evaluate_model(cat_nb, X_train_cat, y_train_cat, X_test_cat, y_test_cat, "Categorical Naive Bayes (padrão)")

Categorical Naive Bayes (padrão)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3614
           1       1.00      1.00      1.00      4084

    accuracy                           1.00      7698
   macro avg       1.00      1.00      1.00      7698
weighted avg       1.00      1.00      1.00      7698



# 1.9 Regressão Linear (quadrados mínimos), regressão Linear não é adequada para classificação binária, então usamos LinearDiscriminantAnalysis

In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
train_and_evaluate_model(lda, X_train, y_train, X_test, y_test, "Linear Discriminant Analysis (LDA)")

Linear Discriminant Analysis (LDA)
              precision    recall  f1-score   support

           0       0.58      0.63      0.60      3614
           1       0.65      0.60      0.62      4084

    accuracy                           0.61      7698
   macro avg       0.61      0.61      0.61      7698
weighted avg       0.61      0.61      0.61      7698



# Perceptron

In [21]:
perceptron = Perceptron()
train_and_evaluate_model(perceptron, X_train, y_train, X_test, y_test, "Perceptron")

Perceptron
              precision    recall  f1-score   support

           0       0.45      0.43      0.44      3614
           1       0.52      0.53      0.52      4084

    accuracy                           0.49      7698
   macro avg       0.48      0.48      0.48      7698
weighted avg       0.48      0.49      0.49      7698

