In [142]:
# 1. Carregar dados

import pandas as pd

# Carregar o dataset
df = pd.read_csv('./data/dataset.csv')
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [143]:
# 2. Preparação e divisão de dados

from sklearn.model_selection import train_test_split

df['Fever'] = df['Fever'].map({'Yes': 1, 'No': 0})
df['Cough'] = df['Cough'].map({'Yes': 1, 'No': 0})
df['Fatigue'] = df['Fatigue'].map({'Yes': 1, 'No': 0})
df['Difficulty Breathing'] = df['Difficulty Breathing'].map({'Yes': 1, 'No': 0})
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Blood Pressure'] = df['Blood Pressure'].map({'Low': 0, 'Normal': 1, 'High': 2})
df['Cholesterol Level'] = df['Cholesterol Level'].map({'Low': 0, 'Normal': 1, 'High': 2})
df['Outcome Variable'] = df['Outcome Variable'].map({'Positive': 1, 'Negative': 0})

# Filter rows where Disease is 'Influenza'
df = df[df['Disease'] == 'Influenza']

X = df.drop('Disease', axis=1)
y = df['Outcome Variable']

# Dividir o dataset em treino e teste
# Training set: 80% of the data
# Testing set: 20% of the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

In [144]:
# Verificação de tipos de valores

df.dtypes

Disease                 object
Fever                    int64
Cough                    int64
Fatigue                  int64
Difficulty Breathing     int64
Age                      int64
Gender                   int64
Blood Pressure           int64
Cholesterol Level        int64
Outcome Variable         int64
dtype: object

In [145]:
# 3. Normalização

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = (scaler.transform(X_test))

In [146]:
# 4. Treinar modelo - Árvore de Decisão

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(
    random_state=42, max_depth=30, min_samples_split=2, min_samples_leaf=1)
model.fit(X_train, y_train)

In [147]:
# 5. Avalição do modelo

from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Exemplo de avaliação com accuracy, classification_report e outras métricas
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Calculando precision, recall e f1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Imprimindo as métricas
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.67      0.80         3

    accuracy                           0.75         4
   macro avg       0.75      0.83      0.73         4
weighted avg       0.88      0.75      0.77         4

Precision: 1.0
Recall: 0.6666666666666666
F1-score: 0.8


In [148]:
import numpy as np

from sklearn.model_selection import StratifiedKFold

# Definir o número de dobras e a estratificação
n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Realizar a validação cruzada manualmente
scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Treinar e avaliar seu modelo aqui
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores.append(score)

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))


Cross-validation scores: [0.5, 0.75]
Mean cross-validation score: 0.625


In [149]:
# 7. Grid Search
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = {'max_depth': [10, 20, 30, 40], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
strat_kfold = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(model, param_grid, cv=strat_kfold)
grid_search.fit(X_resampled, y_resampled)

# Print best parameters and best cross-validation score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best cross-validation score: 0.9
