<a href="https://colab.research.google.com/github/andre-arantes/ia/blob/master/tp1/etapa4/MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projeto de Previsão de Diabetes

Este projeto tem como objetivo prever a probabilidade de uma pessoa ter diabetes com base em vários fatores, como idade, nível de HbA1c, nível de glicose no sangue e gênero. O conjunto de dados usado neste projeto é o Diabetes Prediction Dataset.

Usaremos uma rede neural Multi-Layer Perceptron (`MLPClassifier` do scikit-learn) para treinar nosso modelo e prever a probabilidade de diabetes. Também usaremos várias técnicas de pré-processamento de dados, como subamostragem e padronização dos atributos, para melhorar o desempenho do nosso modelo.

In [None]:
# Importando bibliotecas
import pandas as pd
from sklearn import preprocessing
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from the Colab sample_data folder into a DataFrame.
DATA_PATH = '/content/sample_data/diabetes_prediction_dataset.csv'
base = pd.read_csv(DATA_PATH)

# Ler dados da base

In [None]:
# Preview the first five rows of the raw table.
base.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Count missing values per column.
base.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [None]:
# Inspect the distribution of values in every column.
for column_name, column_values in base.items():
    # One print per column; the three trailing newlines reproduce the blank
    # separator lines between consecutive columns.
    print(column_values.value_counts(), end='\n\n\n')

Female    58552
Male      41430
Other        18
Name: gender, dtype: int64


80.00    5621
51.00    1619
47.00    1574
48.00    1568
53.00    1542
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name: age, Length: 102, dtype: int64


0    92515
1     7485
Name: hypertension, dtype: int64


0    96058
1     3942
Name: heart_disease, dtype: int64


No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64


27.32    25495
23.00      103
27.12      101
27.80      100
24.96      100
         ...  
58.23        1
48.18        1
55.57        1
57.07        1
60.52        1
Name: bmi, Length: 4247, dtype: int64


6.6    8540
5.7    8413
6.5    8362
5.8    8321
6.0    8295
6.2    8269
6.1    8048
3.5    7662
4.8    7597
4.5    7585
4.0    7542
5.0    7471
8.8     661
8.2     661
9.0     654
7.5     643
6.8     642
7.0     634
Name: HbA1c_level, dtype: int6

**Separando os atributos de entrada e de classe**

In [None]:
# Split the table into the target column (y) and the remaining input features (X).
# Both are new objects, so `base` itself is left untouched.
y = base['diabetes'].copy()
X = base.drop(columns='diabetes')

**Identificação de outliers**

Outliers de age

In [None]:
# IQR fences for 'age': values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR are flagged
# as outliers by the cells that follow.
# `lower` / `upper` are notebook-level names that are overwritten for each feature.
Q1, Q3 = X['age'].quantile(0.25), X['age'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
# Show quartiles, IQR and both fences, one value per line.
print(Q1, Q3, IQR, lower, upper, sep='\n')

24.0
60.0
36.0
-30.0
114.0


In [None]:
# Row positions whose 'age' is at or above the upper fence.
# `upper` must still hold the 'age' value from the preceding cell, because the
# same name is reassigned for every feature.
upper_array_age = np.flatnonzero(X['age'] >= upper)
upper_array_age

array([], dtype=int64)

Outliers de bmi

In [None]:
# IQR fences for 'bmi': values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR are flagged
# as outliers by the cells that follow.
# `lower` / `upper` are notebook-level names that are overwritten for each feature.
Q1, Q3 = X['bmi'].quantile(0.25), X['bmi'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
# Show quartiles, IQR and both fences, one value per line.
print(Q1, Q3, IQR, lower, upper, sep='\n')

23.63
29.58
5.949999999999999
14.705
38.504999999999995


In [None]:
# Row positions whose 'bmi' is at or above the upper fence.
# `upper` must still hold the 'bmi' value from the preceding cell, because the
# same name is reassigned for every feature.
upper_array_bmi = np.flatnonzero(X['bmi'] >= upper)
upper_array_bmi

array([   11,    39,    59, ..., 99953, 99960, 99993])

In [None]:
# Row positions whose 'bmi' is at or below the lower fence.
# `lower` must still hold the 'bmi' value from the IQR cell above, because the
# same name is reassigned for every feature.
lower_array_bmi = np.flatnonzero(X['bmi'] <= lower)
lower_array_bmi

array([   24,   155,   221, ..., 99841, 99906, 99933])

Outliers de HbA1c_level

In [None]:
# IQR fences for 'HbA1c_level': values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR are
# flagged as outliers by the cells that follow.
# `lower` / `upper` are notebook-level names that are overwritten for each feature.
Q1, Q3 = X['HbA1c_level'].quantile(0.25), X['HbA1c_level'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
# Show quartiles, IQR and both fences, one value per line.
print(Q1, Q3, IQR, lower, upper, sep='\n')

4.8
6.2
1.4000000000000004
2.6999999999999993
8.3


In [None]:
# Row positions whose 'HbA1c_level' is at or above the upper fence.
# `upper` must still hold the 'HbA1c_level' value from the preceding cell,
# because the same name is reassigned for every feature.
upper_array_HbA1c_level = np.flatnonzero(X['HbA1c_level'] >= upper)
upper_array_HbA1c_level

array([   40,    55,    59, ..., 99706, 99740, 99929])

In [None]:
# Row positions whose 'HbA1c_level' is at or below the lower fence.
# `lower` must still hold the 'HbA1c_level' value from the IQR cell above,
# because the same name is reassigned for every feature.
lower_array_HbA1c_level = np.flatnonzero(X['HbA1c_level'] <= lower)
lower_array_HbA1c_level


array([], dtype=int64)

Outliers de blood_glucose_level

In [None]:
# IQR fences for 'blood_glucose_level': values beyond Q1 - 1.5*IQR or
# Q3 + 1.5*IQR are flagged as outliers by the cells that follow.
# `lower` / `upper` are notebook-level names that are overwritten for each feature.
Q1, Q3 = X['blood_glucose_level'].quantile(0.25), X['blood_glucose_level'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
# Show quartiles, IQR and both fences, one value per line.
print(Q1, Q3, IQR, lower, upper, sep='\n')

100.0
159.0
59.0
11.5
247.5


In [None]:
# Row positions whose 'blood_glucose_level' is at or above the upper fence.
# `upper` must still hold the 'blood_glucose_level' value from the preceding
# cell, because the same name is reassigned for every feature.
upper_array_blood_glucose_level = np.flatnonzero(X['blood_glucose_level'] >= upper)
upper_array_blood_glucose_level

array([   38,    94,   104, ..., 99867, 99938, 99957])

In [None]:
# Row positions whose 'blood_glucose_level' is at or below the lower fence.
# `lower` must still hold the 'blood_glucose_level' value from the IQR cell
# above, because the same name is reassigned for every feature.
lower_array_blood_glucose_level = np.flatnonzero(X['blood_glucose_level'] <= lower)
lower_array_blood_glucose_level

array([], dtype=int64)

**tratamento de outliers**

In [None]:
"""
temp = np.unique(np.concatenate((upper_array_age,lower_array_bmi),0))
temp = np.unique(np.concatenate((temp,upper_array_bmi),0))
temp = np.unique(np.concatenate((temp,upper_array_HbA1c_level),0))
temp = np.unique(np.concatenate((temp,lower_array_HbA1c_level),0))
temp = np.unique(np.concatenate((temp,upper_array_blood_glucose_level),0))
temp = np.unique(np.concatenate((temp,lower_array_blood_glucose_level),0))
X.drop(index=temp, inplace=True)
y.drop(index=temp, inplace=True)
"""

# Pré-processamento dos dados

In [None]:
# Drop 'smoking_history' because of the amount of missing information
# (its value counts show "No Info" for 35816 rows).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
X = X.drop(columns='smoking_history', errors='ignore')

from sklearn.preprocessing import LabelEncoder
# Not used by any visible cell; kept so code that references this name keeps working.
label_encoder = LabelEncoder()

# Encode the categorical 'gender' column as integer codes.
# The column is addressed by name rather than by position (X.iloc[:, 0]) so the
# step does not depend on column order, and so it avoids the warning recorded in
# this cell's stored output for the positional assignment (presumably pandas'
# in-place-setting deprecation warning — confirm on re-run).
label_encoder_gender = LabelEncoder()
X['gender'] = label_encoder_gender.fit_transform(X['gender'])


  X.iloc[:,0] = label_encoder_gender.fit_transform(X.iloc[:,0])


**Divisão dos dados em treino e teste**

foi feita a divisão dos dados em 80\% para treino e 20\% para teste.

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the proportion of diabetes / non-diabetes rows the same in the
# training and test partitions. The target is heavily imbalanced (the stored
# report shows 901 positives among 18078 test rows), so an unstratified split
# can shift the class ratio between partitions.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

**Balanceamento dos dados**

Para o balanceamento de dados, foi verificado que havia muitas instâncias da classe majoritária, então foi decidido implementar o undersampling na base de treino.

In [None]:
# Undersample the majority class in the training split only; the test split
# keeps its original class distribution.
# sampling_strategy=0.4 is the target ratio minority / majority after resampling,
# i.e. the majority class is reduced to 2.5 times the size of the minority class.
# random_state fixes which majority rows are discarded, so a fresh run of the
# notebook reproduces the same training set.
us = RandomUnderSampler(sampling_strategy=0.4, random_state=42)
X_treino, y_treino = us.fit_resample(X_treino, y_treino)

**normalização de dados**

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted statistics are then applied to both splits, so no information
# from the test rows enters the scaling.
sc_X = StandardScaler().fit(X_treino)
X_trainscaled = sc_X.transform(X_treino)
X_testscaled = sc_X.transform(X_teste)

**treinamento do modelo**

In [None]:
# Multi-layer perceptron with four hidden layers (256, 128, 64 and 32 units),
# ReLU activations and a fixed seed for the weight initialisation.
clf = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64, 32),
    activation="relu",
    random_state=1,
)
# Fit on the scaled training split, then predict labels for the scaled test split.
clf.fit(X_trainscaled, y_treino)
y_pred = clf.predict(X_testscaled)

  


In [None]:
# Per-class precision, recall and F1 on the held-out test split.
print(classification_report(y_true=y_teste, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95     17177
           1       0.31      0.75      0.44       901

    accuracy                           0.91     18078
   macro avg       0.65      0.83      0.70     18078
weighted avg       0.95      0.91      0.92     18078

