André Bamberg Pan, RA: 2079844

In [2]:
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

#dataset de classificação para doença de alzheimer, com paciente de 4751 até 6900
file_path = '/content/drive/My Drive/datasets/alzheimers_disease_data.csv'

df = pd.read_csv(file_path)
df.head()


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [3]:
#removendo as variaveis que não são importantes para a classificação
X = df.drop(columns=['PatientID', 'DoctorInCharge', 'Diagnosis'])

#variavel alvo
y = df['Diagnosis']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [5]:
clf = RandomForestClassifier(random_state=42)

In [6]:
#hiperparâmetros para GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

#K-Fold Cross Validation com 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

#GridSearchCV com K-Fold Cross Validation
grid_search = GridSearchCV(clf, param_grid, cv=kf, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [7]:
#melhor combinação de hiperparâmetros
best_params = grid_search.best_params_
print("Melhores hiperparâmetros encontrados:", best_params)

Melhores hiperparâmetros encontrados: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}


In [8]:
#modelo com os melhores hiperparâmetros encontrados
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

In [9]:
#acurácia
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f'Acurácia no conjunto de treino: {accuracy_train:.2f}')
print(f'Acurácia no conjunto de teste: {accuracy_test:.2f}')


Acurácia no conjunto de treino: 0.97
Acurácia no conjunto de teste: 0.89


In [10]:
#diferença entre a acurácia de treino e teste
gap = accuracy_train - accuracy_test
print(f"A diferença é: {gap:.2f}")

#verificação de overfitting
threshold = 0.1
if gap > threshold:
    print("Overfitting detected!")
else:
    print("No overfitting detected.")

A diferença é: 0.08
No overfitting detected.


In [11]:
cv_scores = cross_val_score(best_model, X, y, cv=kf, scoring='accuracy')

#acurácia média do K-Fold
mean_accuracy = cv_scores.mean()
print(f"A acurácia média do modelo com K-Fold é: {mean_accuracy:.2f}")


A acurácia média do modelo com K-Fold é: 0.94


In [12]:
#desvio padrão das acurácias do K-Fold
cv_std = cv_scores.std()

#margem de erro usando a distribuição normal padrão
margin_error = norm.ppf(1 - 0.05 / 2) * cv_std

#limites inferior e superior do intervalo de confiança
lower_bound = mean_accuracy - margin_error
upper_bound = mean_accuracy + margin_error

print(f"95% Confidence interval: [{lower_bound:.2f}, {upper_bound:.2f}]")

95% Confidence interval: [0.90, 0.97]
