# 1. Limpiar y normalizar datos

In [1]:
import pandas as pd

 ## Attribute Information

* **Age**: age of the patient [years]

* **Sex**: sex of the patient [M: Male, F: Female]

* **ChestPainType**: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]

* **RestingBP**: resting blood pressure [mm Hg]

* **Cholesterol**: serum cholesterol [mg/dl]

* **FastingBS**: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]

* **RestingECG**: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]

* **MaxHR**: maximum heart rate achieved [Numeric value between 60 and 202]

* **ExerciseAngina**: exercise-induced angina [Y: Yes, N: No]

* **Oldpeak**: oldpeak = ST [Numeric value measured in depression]. Depresión del segmento ST inducida por el ejercicio con respecto al reposo (oldpeak), medida sobre el electrocardiograma.


* **ST_Slope**: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]. La pendiente que sigue el electrocardiograma después de un latido, debe ser plano para considerarse normal.

* **HeartDisease**: output class [1: heart disease, 0: Normal]

* **Link**: [DataSet de Kaggle](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

# Cargamos el Dataframe:

In [2]:
# Load the heart-disease dataset from the CSV file in the working directory.
df = pd.read_csv("heart.csv")
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# Comprobamos que no haya ni valores nulos ni duplicados:

In [3]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [3]:
# drop_duplicates() returns a new DataFrame and leaves `df` untouched, so the
# result has to be assigned back for duplicate rows to actually be removed.
# ignore_index=True renumbers the rows 0..n-1 so no gaps are left behind.
df = df.drop_duplicates(ignore_index=True)
# Display the de-duplicated DataFrame as the cell output.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


Gracias Dios

# Asignamos valores numéricos a las columnas con cadenas de texto:
<h3>

* Asignamos 1 o 0 a las columnas con valores True o False, hombre o mujer...etc, respectivamente.

* Asignamos valores ponderados de forma ascendente a las columnas como "ChestPainType", "RestingECG"... en función de su gravedad e influencia negativa sobre nuestra variable target, "HeartDisease". (A mayor gravedad, mayor será el número asignado)

### SEX 
(Mujer = 0, hombre = 1)

In [4]:
# List the distinct labels present in 'Sex' before encoding them.
df['Sex'].unique()

array(['M', 'F'], dtype=object)

In [5]:
# Encode sex as a numeric flag: 'M' -> 1, 'F' -> 0.
# The labels are first rewritten as digit strings, then the column is converted
# to a numeric dtype in a single assignment.
sex_digits = df['Sex'].str.replace('M', "1").str.replace('F', "0")
df['Sex'] = pd.to_numeric(sex_digits)

In [6]:
# Check the distinct values of 'Sex' after the numeric encoding.
df['Sex'].unique()

array([1, 0], dtype=int64)

### ChestPainType

In [7]:
# List the distinct labels present in 'ChestPainType' before encoding them.
df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [8]:
# Encode chest pain type by matching whole values instead of substrings.
# Substring replacement (`.str.replace`) is order-sensitive here because 'TA'
# is contained in 'ATA': replacing 'TA' first turns 'ATA' into 'A4', not '3'.
# Series.replace with a dict only swaps values that match a key exactly.
# The codes are kept as digit strings, exactly as this cell produced before;
# the conversion to a numeric dtype is a separate step.
chest_pain_codes = {'TA': "4", 'ATA': "3", 'ASY': "2", 'NAP': "1"}
df['ChestPainType'] = df['ChestPainType'].replace(chest_pain_codes)

In [9]:
# Any value left as 'A4' (what a substring replacement of 'TA' makes out of
# 'ATA') is mapped to '3'; the digit strings are then converted to numbers.
chest_pain_digits = df['ChestPainType'].str.replace('A4', "3")
df['ChestPainType'] = pd.to_numeric(chest_pain_digits)

In [10]:
# Check the distinct values of 'ChestPainType' after the numeric encoding.
df['ChestPainType'].unique()

array([3, 1, 2, 4], dtype=int64)

### RestingECG

In [11]:
# List the distinct labels present in 'RestingECG' before encoding them.
df["RestingECG"].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [12]:
# Encode the resting ECG result: 'Normal' -> 1, 'ST' -> 2, 'LVH' -> 3.
# Replacements are applied in that order on the string labels, and the
# resulting digit strings are converted to a numeric dtype.
ecg_digits = (
    df['RestingECG']
    .str.replace('Normal', "1")
    .str.replace('ST', "2")
    .str.replace('LVH', "3")
)
df['RestingECG'] = pd.to_numeric(ecg_digits)


In [13]:
# Check the distinct values of 'RestingECG' after the numeric encoding.
df["RestingECG"].unique()

array([1, 2, 3], dtype=int64)

### ExerciseAngina

In [14]:
# List the distinct labels present in 'ExerciseAngina' before encoding them.
df['ExerciseAngina'].unique()

array(['N', 'Y'], dtype=object)

In [15]:
# Encode exercise-induced angina as a numeric flag: 'N' -> 0, 'Y' -> 1.
angina_digits = df['ExerciseAngina'].str.replace('N', "0").str.replace('Y', "1")
df['ExerciseAngina'] = pd.to_numeric(angina_digits)


In [16]:
# Check the distinct values of 'ExerciseAngina' after the numeric encoding.
df['ExerciseAngina'].unique()

array([0, 1], dtype=int64)

### ST_Slope

In [17]:
# List the distinct labels present in 'ST_Slope' before encoding them.
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [18]:
# Encode the ST segment slope: 'Up' -> 2, 'Flat' -> 1, 'Down' -> 3.
# Replacements are applied in that order on the string labels, and the
# resulting digit strings are converted to a numeric dtype.
slope_digits = (
    df['ST_Slope']
    .str.replace('Up', "2")
    .str.replace('Flat', "1")
    .str.replace('Down', "3")
)
df['ST_Slope'] = pd.to_numeric(slope_digits)

* Down = isquemia miocárdica 

* Up = posible isquemia miocárdica

* Flat = Bien


In [19]:
# Check the distinct values of 'ST_Slope' after the numeric encoding.
df['ST_Slope'].unique()

array([2, 1, 3], dtype=int64)

In [20]:
# Display the DataFrame after all text columns have been encoded as numbers.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,3,140,289,0,1,172,0,0.0,2,0
1,49,0,1,160,180,0,1,156,0,1.0,1,1
2,37,1,3,130,283,0,2,98,0,0.0,2,0
3,48,0,2,138,214,0,1,108,1,1.5,1,1
4,54,1,1,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,4,110,264,0,1,132,0,1.2,1,1
914,68,1,2,144,193,1,1,141,0,3.4,1,1
915,57,1,2,130,131,0,1,115,1,1.2,1,1
916,57,0,3,130,236,0,3,174,0,0.0,1,1


# Estandarizamos los datos (media 0, desviación típica 1)

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Fit the scaler on the whole encoded DataFrame, then standardize it.
# Every column is passed to the scaler, including the encoded categorical
# columns and the 'HeartDisease' target.
# NOTE(review): confirm that downstream modelling expects a scaled target.
ss = StandardScaler().fit(df)
std_data = ss.transform(df)
# Map the standardized values back to the original scale (round trip).
origin_data = ss.inverse_transform(std_data)

In [23]:
# Show the column labels of the encoded DataFrame.
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [24]:
# Wrap the standardized array in a DataFrame. The array was computed from `df`,
# so its columns are in the same order; reuse `df.columns` instead of a
# hand-copied list of names that could drift out of sync with the data.
df_std = pd.DataFrame(std_data, columns=df.columns)

In [25]:
# Wrap the inverse-transformed array (values back on the original scale) in a
# DataFrame. The array derives from `df`, so its columns are in the same order;
# reuse `df.columns` instead of a hand-copied list of names.
df_normal = pd.DataFrame(origin_data, columns=df.columns)

In [26]:
# Display the standardized DataFrame.
df_std

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,-1.433140,0.515952,1.198363,0.410909,0.825070,-0.551341,-0.749180,1.382928,-0.823556,-0.832432,0.699073,-1.113115
1,-0.478484,-1.938163,-1.371957,1.491752,-0.171961,-0.551341,-0.749180,0.754157,-0.823556,0.105664,-0.917423,0.898380
2,-1.751359,0.515952,1.198363,-0.129513,0.770188,-0.551341,0.492241,-1.525138,-0.823556,-0.832432,0.699073,-1.113115
3,-0.584556,-1.938163,-0.086797,0.302825,0.139040,-0.551341,-0.749180,-1.132156,1.214246,0.574711,-0.917423,0.898380
4,0.051881,0.515952,-1.371957,0.951331,-0.034755,-0.551341,-0.749180,-0.581981,-0.823556,-0.832432,0.699073,-1.113115
...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,0.515952,2.483522,-1.210356,0.596393,-0.551341,-0.749180,-0.188999,-0.823556,0.293283,-0.917423,0.898380
914,1.536902,0.515952,-0.086797,0.627078,-0.053049,1.813758,-0.749180,0.164684,-0.823556,2.357094,-0.917423,0.898380
915,0.370100,0.515952,-0.086797,-0.129513,-0.620168,-0.551341,-0.749180,-0.857069,1.214246,0.293283,-0.917423,0.898380
916,0.370100,-1.938163,1.198363,-0.129513,0.340275,-0.551341,1.733661,1.461525,-0.823556,-0.832432,-0.917423,0.898380


In [27]:
# Display the DataFrame rebuilt from the inverse-transformed values.
df_normal

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40.0,1.0,3.0,140.0,289.0,0.0,1.0,172.0,0.0,0.0,2.0,0.0
1,49.0,0.0,1.0,160.0,180.0,0.0,1.0,156.0,0.0,1.0,1.0,1.0
2,37.0,1.0,3.0,130.0,283.0,0.0,2.0,98.0,0.0,0.0,2.0,0.0
3,48.0,0.0,2.0,138.0,214.0,0.0,1.0,108.0,1.0,1.5,1.0,1.0
4,54.0,1.0,1.0,150.0,195.0,0.0,1.0,122.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,1.0,4.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,1.0
914,68.0,1.0,2.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,1.0
915,57.0,1.0,2.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0
916,57.0,0.0,3.0,130.0,236.0,0.0,3.0,174.0,0.0,0.0,1.0,1.0


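# Comprobamos que los datos no se han alterado con la normalización

La correlación entre columnas no cambia al estandarizar (es una transformación lineal de cada columna), por lo que las tres matrices de correlación siguientes deben coincidir.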

In [28]:
# Pairwise column correlations of the encoded (unscaled) DataFrame.
df.corr()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,0.05575,-0.085635,0.254399,-0.095282,0.198039,0.213152,-0.382045,0.215793,0.258612,-0.093424,0.282039
Sex,0.05575,1.0,-0.047979,0.005133,-0.200092,0.120076,-0.018343,-0.189186,0.190664,0.105734,-0.066831,0.305445
ChestPainType,-0.085635,-0.047979,1.0,0.026814,0.089145,-0.034611,-0.01984,0.111837,-0.134245,-0.057019,0.090097,-0.119143
RestingBP,0.254399,0.005133,0.026814,1.0,0.100893,0.070193,0.097661,-0.112135,0.155101,0.164803,-0.083418,0.107589
Cholesterol,-0.095282,-0.200092,0.089145,0.100893,1.0,-0.260974,0.112095,0.235792,-0.034166,0.050148,0.00711,-0.232741
FastingBS,0.198039,0.120076,-0.034611,0.070193,-0.260974,1.0,0.050707,-0.131438,0.060451,0.052698,-0.043534,0.267291
RestingECG,0.213152,-0.018343,-0.01984,0.097661,0.112095,0.050707,1.0,0.048552,0.036119,0.114428,-0.005282,0.061011
MaxHR,-0.382045,-0.189186,0.111837,-0.112135,0.235792,-0.131438,0.048552,1.0,-0.370425,-0.160691,0.246927,-0.400421
ExerciseAngina,0.215793,0.190664,-0.134245,0.155101,-0.034166,0.060451,0.036119,-0.370425,1.0,0.408752,-0.253181,0.494282
Oldpeak,0.258612,0.105734,-0.057019,0.164803,0.050148,0.052698,0.114428,-0.160691,0.408752,1.0,-0.097323,0.403951


In [29]:
# Pairwise column correlations of the inverse-transformed DataFrame.
df_normal.corr()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,0.05575,-0.085635,0.254399,-0.095282,0.198039,0.213152,-0.382045,0.215793,0.258612,-0.093424,0.282039
Sex,0.05575,1.0,-0.047979,0.005133,-0.200092,0.120076,-0.018343,-0.189186,0.190664,0.105734,-0.066831,0.305445
ChestPainType,-0.085635,-0.047979,1.0,0.026814,0.089145,-0.034611,-0.01984,0.111837,-0.134245,-0.057019,0.090097,-0.119143
RestingBP,0.254399,0.005133,0.026814,1.0,0.100893,0.070193,0.097661,-0.112135,0.155101,0.164803,-0.083418,0.107589
Cholesterol,-0.095282,-0.200092,0.089145,0.100893,1.0,-0.260974,0.112095,0.235792,-0.034166,0.050148,0.00711,-0.232741
FastingBS,0.198039,0.120076,-0.034611,0.070193,-0.260974,1.0,0.050707,-0.131438,0.060451,0.052698,-0.043534,0.267291
RestingECG,0.213152,-0.018343,-0.01984,0.097661,0.112095,0.050707,1.0,0.048552,0.036119,0.114428,-0.005282,0.061011
MaxHR,-0.382045,-0.189186,0.111837,-0.112135,0.235792,-0.131438,0.048552,1.0,-0.370425,-0.160691,0.246927,-0.400421
ExerciseAngina,0.215793,0.190664,-0.134245,0.155101,-0.034166,0.060451,0.036119,-0.370425,1.0,0.408752,-0.253181,0.494282
Oldpeak,0.258612,0.105734,-0.057019,0.164803,0.050148,0.052698,0.114428,-0.160691,0.408752,1.0,-0.097323,0.403951


In [30]:
# Pairwise column correlations of the standardized DataFrame.
df_std.corr()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,0.05575,-0.085635,0.254399,-0.095282,0.198039,0.213152,-0.382045,0.215793,0.258612,-0.093424,0.282039
Sex,0.05575,1.0,-0.047979,0.005133,-0.200092,0.120076,-0.018343,-0.189186,0.190664,0.105734,-0.066831,0.305445
ChestPainType,-0.085635,-0.047979,1.0,0.026814,0.089145,-0.034611,-0.01984,0.111837,-0.134245,-0.057019,0.090097,-0.119143
RestingBP,0.254399,0.005133,0.026814,1.0,0.100893,0.070193,0.097661,-0.112135,0.155101,0.164803,-0.083418,0.107589
Cholesterol,-0.095282,-0.200092,0.089145,0.100893,1.0,-0.260974,0.112095,0.235792,-0.034166,0.050148,0.00711,-0.232741
FastingBS,0.198039,0.120076,-0.034611,0.070193,-0.260974,1.0,0.050707,-0.131438,0.060451,0.052698,-0.043534,0.267291
RestingECG,0.213152,-0.018343,-0.01984,0.097661,0.112095,0.050707,1.0,0.048552,0.036119,0.114428,-0.005282,0.061011
MaxHR,-0.382045,-0.189186,0.111837,-0.112135,0.235792,-0.131438,0.048552,1.0,-0.370425,-0.160691,0.246927,-0.400421
ExerciseAngina,0.215793,0.190664,-0.134245,0.155101,-0.034166,0.060451,0.036119,-0.370425,1.0,0.408752,-0.253181,0.494282
Oldpeak,0.258612,0.105734,-0.057019,0.164803,0.050148,0.052698,0.114428,-0.160691,0.408752,1.0,-0.097323,0.403951


# Exportamos el Dataframe con los datos normalizados: (Para el modelo)

In [31]:
# Write the standardized DataFrame to "Heart_std.csv" without the row index.
df_std.to_csv("Heart_std.csv", index = False)

# Exportamos el Dataframe limpio: (Para el análisis)

In [32]:
# Write the encoded, original-scale DataFrame to "Heart_limpio.csv" without the row index.
df_normal.to_csv("Heart_limpio.csv", index = False)