In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.linear_model import LinearRegression, Perceptron
from sklearn.metrics import accuracy_score, mean_squared_error

# Carregar o dataset
df = pd.read_csv('Covid Data.csv')

# Exibir as primeiras linhas do dataset para inspeção
df.head()


Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


In [2]:
# Exibir informações do dataset
df.info()

# Garantir que a coluna 'CLASIFFICATION_FINAL' é categórica
df['CLASIFFICATION_FINAL'] = df['CLASIFFICATION_FINAL'].astype(str)

target_column = 'CLASIFFICATION_FINAL'

# Separar atributos qualitativos e quantitativos
qualitative = df.select_dtypes(include=['object']).columns
quantitative = df.select_dtypes(include=[np.number]).columns

# Tratar a coluna 'DATE_DIED' como qualitativa, codificando-a
label_encoders = {}
for column in qualitative:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Normalizar atributos quantitativos
scaler = StandardScaler()
df[quantitative] = scaler.fit_transform(df[quantitative])

# Separar variáveis independentes (X) e dependentes (y)
X = df.drop(target_column, axis=1)
y = df[target_column]

# Dividir dataset em conjunto de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   USMER                 1048575 non-null  int64 
 1   MEDICAL_UNIT          1048575 non-null  int64 
 2   SEX                   1048575 non-null  int64 
 3   PATIENT_TYPE          1048575 non-null  int64 
 4   DATE_DIED             1048575 non-null  object
 5   INTUBED               1048575 non-null  int64 
 6   PNEUMONIA             1048575 non-null  int64 
 7   AGE                   1048575 non-null  int64 
 8   PREGNANT              1048575 non-null  int64 
 9   DIABETES              1048575 non-null  int64 
 10  COPD                  1048575 non-null  int64 
 11  ASTHMA                1048575 non-null  int64 
 12  INMSUPR               1048575 non-null  int64 
 13  HIPERTENSION          1048575 non-null  int64 
 14  OTHER_DISEASE         1048575 non-null  int64 
 15

In [4]:
# Função para treinar e avaliar classificadores
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)

# 1. Árvore de Decisão (CART)
print("Treinando Decision Tree...")
clf_cart = DecisionTreeClassifier()
accuracy_cart = train_and_evaluate(clf_cart, X_train, X_test, y_train, y_test)
print(f'Decision Tree accuracy: {accuracy_cart}\n')

# 2. Support Vector Machine (padrão)
# print("Treinando SVM (default)...")
# clf_svc_default = SVC()
# accuracy_svc_default = train_and_evaluate(clf_svc_default, X_train, X_test, y_train, y_test)
# print(f'SVM (default) accuracy: {accuracy_svc_default}')

# 3. Support Vector Machine (mudando o kernel por outro)
# print("Treinando SVM (rbf kernel)...")
# clf_svc_rbf = SVC(kernel='rbf')
# accuracy_svc_rbf = train_and_evaluate(clf_svc_rbf, X_train, X_test, y_train, y_test)
# print(f'SVM (rbf kernel) accuracy: {accuracy_svc_rbf}')

# 4. Knn (padrão)
# print("Treinando KNN (default)...")
# clf_knn_default = KNeighborsClassifier()
# accuracy_knn_default = train_and_evaluate(clf_knn_default, X_train, X_test, y_train, y_test)
# print(f'KNN (default) accuracy: {accuracy_knn_default}')

# 5. Knn (mudando o valor de k)
# print("Treinando KNN (k=5)...")
# clf_knn_k5 = KNeighborsClassifier(n_neighbors=5)
# accuracy_knn_k5 = train_and_evaluate(clf_knn_k5, X_train, X_test, y_train, y_test)
# print(f'KNN (k=5) accuracy: {accuracy_knn_k5}')

# 6. Knn (mudando a medida de distância)
# print("Treinando KNN (minkowski)...")
# clf_knn_minkowski = KNeighborsClassifier(metric='minkowski')
# accuracy_knn_minkowski = train_and_evaluate(clf_knn_minkowski, X_train, X_test, y_train, y_test)
# print(f'KNN (minkowski) accuracy: {accuracy_knn_minkowski}')

# 7. Gaussian Naive Bayes
print("Treinando Gaussian Naive Bayes...")
clf_gnb = GaussianNB()
accuracy_gnb = train_and_evaluate(clf_gnb, X_train, X_test, y_train, y_test)
print(f'Gaussian Naive Bayes accuracy: {accuracy_gnb}\n')

# 8. Categorical Naive Bayes
print("Treinando Categorical Naive Bayes...")
# Remover atributos quantitativos
X_cat = df[qualitative]
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, y, test_size=0.2, random_state=42)
clf_cnb = CategoricalNB()
accuracy_cnb = train_and_evaluate(clf_cnb, X_cat_train, X_cat_test, y_cat_train, y_cat_test)
print(f'Categorical Naive Bayes accuracy: {accuracy_cnb}\n')

# 9. Regressão Linear
print("Treinando Linear Regression...")
# Ajustar para regressão
X_reg = df.drop(target_column, axis=1)
y_reg = df[target_column].astype(float)  # Convertendo para float para regressão
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
clf_lr = LinearRegression()
clf_lr.fit(X_reg_train, y_reg_train)
y_reg_pred = clf_lr.predict(X_reg_test)
mse_lr = mean_squared_error(y_reg_test, y_reg_pred)
print(f'Linear Regression MSE: {mse_lr}\n')

# 10. Perceptron
print("Treinando Perceptron...")
clf_perceptron = Perceptron()
accuracy_perceptron = train_and_evaluate(clf_perceptron, X_train, X_test, y_train, y_test)
print(f'Perceptron accuracy: {accuracy_perceptron}\n')

# Exibir resultados
results = {
    'Decision Tree': accuracy_cart,
    # 'SVM (default)': accuracy_svc_default,
    # 'SVM (rbf kernel)': accuracy_svc_rbf,
    # 'KNN (default)': accuracy_knn_default,
    # 'KNN (k=5)': accuracy_knn_k5,
    # 'KNN (minkowski)': accuracy_knn_minkowski,
    'GaussianNB': accuracy_gnb,
    'CategoricalNB': accuracy_cnb,
    'Linear Regression (MSE)': mse_lr,
    'Perceptron': accuracy_perceptron
}

for clf, score in results.items():
    print(f'{clf}: {score}')

Treinando Decision Tree...
Decision Tree accuracy: 0.5194478220442028

Treinando Gaussian Naive Bayes...
Gaussian Naive Bayes accuracy: 0.5203919605178456

Treinando Categorical Naive Bayes...
Categorical Naive Bayes accuracy: 1.0

Treinando Linear Regression...
Linear Regression MSE: 3.321897907993462

Treinando Perceptron...
Perceptron accuracy: 0.4311422645018239

Decision Tree: 0.5194478220442028
GaussianNB: 0.5203919605178456
CategoricalNB: 1.0
Linear Regression (MSE): 3.321897907993462
Perceptron: 0.4311422645018239
