# Dataset Exploratory Data Analysis and Pre-processing Pipeline

In [None]:
# Import needed libraries and modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Fetch dataset from UCI Repository
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
heart_disease = fetch_ucirepo(id=45)



In [None]:
# Dataset overview
# Build `df` as a NEW frame with incomplete rows removed. Using
# dropna(inplace=True) here would mutate the frame stored inside
# `heart_disease`, making the raw data unrecoverable without refetching.
# The original row labels are kept (no reset_index), as before.
df = heart_disease.data.original.dropna()
display(df.head())

# DataFrame.info() prints its report and returns None, so call it directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df.info()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    int64  
 1   sex       297 non-null    int64  
 2   cp        297 non-null    int64  
 3   trestbps  297 non-null    int64  
 4   chol      297 non-null    int64  
 5   fbs       297 non-null    int64  
 6   restecg   297 non-null    int64  
 7   thalach   297 non-null    int64  
 8   exang     297 non-null    int64  
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    int64  
 11  ca        297 non-null    float64
 12  thal      297 non-null    float64
 13  num       297 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 34.8 KB


None

In [None]:
# Per-column description table (name, role, type, units, missing values),
# kept under `variables` for later reference
variables = heart_disease.variables

# Dataset-level metadata dictionary (UCI id, name, URLs, target column, ...)
print(heart_disease.metadata)

# Render the variables table
display(variables)


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,years,no
1,sex,Feature,Categorical,Sex,,,no
2,cp,Feature,Categorical,,,,no
3,trestbps,Feature,Integer,,resting blood pressure (on admission to the ho...,mm Hg,no
4,chol,Feature,Integer,,serum cholestoral,mg/dl,no
5,fbs,Feature,Categorical,,fasting blood sugar > 120 mg/dl,,no
6,restecg,Feature,Categorical,,,,no
7,thalach,Feature,Integer,,maximum heart rate achieved,,no
8,exang,Feature,Categorical,,exercise induced angina,,no
9,oldpeak,Feature,Integer,,ST depression induced by exercise relative to ...,,no


## Binarizing the target:

In [None]:
# Distinct codes that occur in the raw target column `num`,
# listed in order of first appearance
different_values = pd.unique(df["num"])
print(different_values)

[0 2 1 3 4]


In [None]:
# Collapse the multi-valued `num` code into a binary label:
# 0 stays 0, every non-zero value becomes 1.
# A vectorized comparison replaces the row-by-row apply(lambda ...);
# the explicit int64 cast keeps the dtype the apply-based version produced.
df["num_binarized"] = (df["num"] != 0).astype("int64")

# Show only the first rows instead of dumping the whole frame
display(df.head())

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
297   57    0   4       140   241    0        0      123      1      0.2   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   

     slope   ca  thal  num  num_binarized  
0        3  0.0   6.0    0              0  

### Train-test split:

In [None]:
# Fraction of rows held out for the test set (passed as test_size to train_test_split)
Test_Size = 0.2
# Seed passed as random_state to the train/test split and to the model, for repeatable runs
Random_Seed = 82024
# NOTE(review): not referenced in the visible cells — presumably a label for the dataset; confirm usage
dataset_name = "df"

In [None]:
# Column names for the label (y) and the model inputs (X).
# Features are selected BY NAME: every column except the raw target `num`
# and its binarized version. A positional slice such as df.columns[:-2]
# would leak a target column into X as soon as columns are added or reordered.
# Index.drop raises KeyError if either column is missing.
target = "num_binarized"
features = df.columns.drop(["num", target])

# Check that the column names were picked up as intended
print("Features:", features)
print("Target:", target)

# Split the row labels (not the frame itself) into train and test sets
index = df.index
train_index, test_index = train_test_split(index, test_size=Test_Size, random_state=Random_Seed)

# Materialize the train and test frames from the split labels
train_df = df.loc[train_index]
test_df = df.loc[test_index]

# Every row must land in exactly one of the two partitions
assert len(train_df) + len(test_df) == len(df)

# Extract features and target as NumPy arrays for scikit-learn.
# Plain column selection raises on a missing column, whereas reindex
# would silently create it filled with NaN.
X_train = train_df[features].to_numpy()
y_train = train_df[target].to_numpy()
X_test = test_df[features].to_numpy()
y_test = test_df[target].to_numpy()

# Report the shapes of the resulting arrays
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)





Features: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')
Target: num_binarized
X_train shape: (237, 13)
y_train shape: (237,)
X_test shape: (60, 13)
y_test shape: (60,)


### Gaussian Process Model:

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Build the model.
# RBF(length_scale=1.0) uses a single length scale shared by all features,
# but the raw columns live on very different scales (e.g. `chol` and
# `thalach` are in the hundreds while `sex`/`fbs` are 0/1). Without scaling,
# the large-valued columns dominate the kernel distance.
# NOTE(review): this is the likely cause of the near-chance AUC (~0.51)
# recorded for the unscaled model — re-run and confirm.
# Standardizing inside the pipeline fixes the scale mismatch, and the scaler
# is fit on the training data only when model_gp.fit(...) is called.
# The pipeline exposes the same fit / predict / predict_proba / score methods.
model_gp = make_pipeline(
    StandardScaler(),
    GaussianProcessClassifier(kernel=RBF(length_scale=1.0), random_state=Random_Seed),
)

In [None]:

# Fit the Gaussian process model on the training split
model_gp.fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = model_gp.predict(X_test)

# Predicted class probabilities, one column per class
y_pred_proba = model_gp.predict_proba(X_test)

# Standard deviation of the class probabilities within each row.
# NOTE(review): with two classes this equals |p - 0.5|, so it is LARGER for
# confident predictions and 0 for a 50/50 prediction — it behaves as a
# confidence score, not as an uncertainty.
y_pred_std = y_pred_proba.std(axis=1)

# Average of that per-row spread over the test set
mean_std = y_pred_std.mean()

# Area under the ROC curve, scored with the probabilities in column 1
roc_auc_gp = roc_auc_score(y_test, y_pred_proba[:, 1])

print(f"AUC-ROC of the gaussian process model: {roc_auc_gp}")

AUC-ROC of the gaussian process model: 0.510662177328844
