In [1]:
import numpy as np
import pandas as pd

## Cargar desde CSV
Cargar en dataframes los siguientes archivos:
- diabetes_v2/diabetes_train_analysis.csv
- diabetes_v2/diabetes_train_info.csv

In [42]:
# Read the analysis table and the patient-info table into two DataFrames.
df_ana, df_inf = (
    pd.read_csv(csv_path)
    for csv_path in (
        "diabetes_v2/diabetes_train_analysis.csv",
        "diabetes_v2/diabetes_train_info.csv",
    )
)

## Ver las primeras filas de ambos dataframes

In [43]:
# Show the first five rows of the analysis table.
df_ana.head(n=5)

Unnamed: 0,id,cholesterol,gluc,smoke,alco,active,pressure,diabetes
0,62538,low,low,0,0,1,100/80,0
1,49159,low,low,0,0,1,120/82,0
2,60683,low,low,0,0,1,120/80,0
3,42924,low,low,0,0,0,120\80,0
4,52888,low,low,0,0,0,120/80,0


In [44]:
# Show the first five rows of the patient-info table.
df_inf.head(n=5)

Unnamed: 0,id,age,height,weight,gender
0,0,50,168,62.0,f
1,1,55,156,85.0,m
2,2,18857,165,64.0,male
3,3,17623,169,82.0,f
4,4,47,156,56.0,m


## Unir dataframes
Unir ambos dataframes por medio del campo id y eliminar dicho campo

In [45]:
# Join both tables on their shared "id" key (default inner join), then discard
# the key column since it is only needed for the join.
df = df_ana.merge(df_inf, on="id").drop(columns="id")

In [46]:
# Preview the merged frame (the "id" column should be gone).
df.head(n=5)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,pressure,diabetes,age,height,weight,gender
0,low,low,0,0,1,100/80,0,54,169,76.0,f
1,low,low,0,0,1,120/82,0,49,165,65.0,m
2,low,low,0,0,1,120/80,0,21962,170,56.0,m
3,low,low,0,0,0,120\80,0,20287,169,62.0,m
4,low,low,0,0,0,120/80,0,16202,166,67.0,male


## Corregir strings de pressure
El campo pressure a veces usa el carácter \ como separador y a veces el carácter /.
Unificar el formato para que siempre se use /.

In [47]:
# Replace every literal backslash with a forward slash so that all pressure
# readings use the same separator. regex=False keeps the backslash literal.
df["pressure"] = df["pressure"].str.replace("\\", "/", regex=False)

In [48]:
# Preview: the pressure column should now use "/" in every row shown.
df.head(n=5)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,pressure,diabetes,age,height,weight,gender
0,low,low,0,0,1,100/80,0,54,169,76.0,f
1,low,low,0,0,1,120/82,0,49,165,65.0,m
2,low,low,0,0,1,120/80,0,21962,170,56.0,m
3,low,low,0,0,0,120/80,0,20287,169,62.0,m
4,low,low,0,0,0,120/80,0,16202,166,67.0,male


## Separar el campo pressure en dos
Utilizar el separador del campo pressure para dividir dicho campo en dos (pressure1 y pressure2) y luego eliminar el campo original

In [49]:
# Split "pressure" on "/" into one column per part and store the parts as
# pressure1 / pressure2; the combined column is then dropped.
# NOTE(review): str.split yields strings, so both new columns stay object
# dtype here — cast them if numeric values are needed downstream.
pressure_parts = df["pressure"].str.split("/", expand=True)
df[["pressure1", "pressure2"]] = pressure_parts
df = df.drop(columns="pressure")

In [50]:
# Preview: pressure1 and pressure2 replace the original pressure column.
df.head(n=5)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,diabetes,age,height,weight,gender,pressure1,pressure2
0,low,low,0,0,1,0,54,169,76.0,f,100,80
1,low,low,0,0,1,0,49,165,65.0,m,120,82
2,low,low,0,0,1,0,21962,170,56.0,m,120,80
3,low,low,0,0,0,0,20287,169,62.0,m,120,80
4,low,low,0,0,0,0,16202,166,67.0,male,120,80


## Unificar el formato del campo gender
Femenino a veces se encuentra como female y a veces como f.
Masculino a veces se encuentra como male y a veces como m.
Unificar la notación usando f y m.

In [51]:
# Collapse the long spellings onto the short codes by matching WHOLE values.
# The previous two-step substring str.replace depended on call order and
# rewrote any value merely containing "male" (e.g. "Female" became "Fem");
# a value-level mapping only touches exact "female" / "male" entries and
# leaves anything unexpected untouched so it stays visible.
df["gender"] = df["gender"].replace({"female": "f", "male": "m"})

In [52]:
# Preview: gender should only show the short codes in the rows displayed.
df.head(n=5)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,diabetes,age,height,weight,gender,pressure1,pressure2
0,low,low,0,0,1,0,54,169,76.0,f,100,80
1,low,low,0,0,1,0,49,165,65.0,m,120,82
2,low,low,0,0,1,0,21962,170,56.0,m,120,80
3,low,low,0,0,0,0,20287,169,62.0,m,120,80
4,low,low,0,0,0,0,16202,166,67.0,m,120,80


## Unificar el campo edad
El campo age (edad) a veces se encuentra expresado en días y a veces en años. Unificarlo expresando todas las edades en días.

In [53]:
# Rows whose age is below 150 are treated as being expressed in years and are
# scaled by 365 so that the whole column ends up in days.
age_in_years = df["age"] < 150
df.loc[age_in_years, "age"] *= 365

In [54]:
# Preview: ages that were small numbers before are now multiplied by 365.
df.head(n=5)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,diabetes,age,height,weight,gender,pressure1,pressure2
0,low,low,0,0,1,0,19710,169,76.0,f,100,80
1,low,low,0,0,1,0,17885,165,65.0,m,120,82
2,low,low,0,0,1,0,21962,170,56.0,m,120,80
3,low,low,0,0,0,0,20287,169,62.0,m,120,80
4,low,low,0,0,0,0,16202,166,67.0,m,120,80


## Verificar si hay valores faltantes en algún campo

In [55]:
# Count missing values per column.
df.isnull().sum()

cholesterol       0
gluc              0
smoke             0
alco              0
active            0
diabetes          0
age               0
height            0
weight         1998
gender            0
pressure1         0
pressure2         0
dtype: int64

## Reemplazar valores faltantes
Reemplazar los valores faltantes del campo weight por la media de dicho campo

In [56]:
# Fill missing weights with the mean of the column and assign the result back.
# Calling fillna(inplace=True) on the `df.weight` accessor is chained
# assignment: it emits a FutureWarning on pandas 2.x and, with Copy-on-Write
# enabled, modifies only a temporary object and leaves `df` unchanged.
df["weight"] = df["weight"].fillna(df["weight"].mean())

In [57]:
# Preview the frame after filling the missing weight values.
df.head(n=5)

Unnamed: 0,cholesterol,gluc,smoke,alco,active,diabetes,age,height,weight,gender,pressure1,pressure2
0,low,low,0,0,1,0,19710,169,76.0,f,100,80
1,low,low,0,0,1,0,17885,165,65.0,m,120,82
2,low,low,0,0,1,0,21962,170,56.0,m,120,80
3,low,low,0,0,0,0,20287,169,62.0,m,120,80
4,low,low,0,0,0,0,16202,166,67.0,m,120,80


## Convertir variables categóricas a numéricas
Convertir las variables categóricas (cholesterol, gluc y gender) a numéricas utilizando la función `pd.get_dummies`

In [58]:
# One-hot encode the three categorical columns. dtype=int pins the indicator
# columns to 0/1 integers: since pandas 2.0 get_dummies returns bool columns
# by default, which would not match the numeric output this step is meant to
# produce.
df = pd.get_dummies(df, columns=["cholesterol", "gluc", "gender"], dtype=int)

In [59]:
# Preview: cholesterol, gluc and gender are replaced by indicator columns.
df.head(n=5)

Unnamed: 0,smoke,alco,active,diabetes,age,height,weight,pressure1,pressure2,cholesterol_high,cholesterol_low,cholesterol_medium,gluc_high,gluc_low,gluc_medium,gender_f,gender_m
0,0,0,1,0,19710,169,76.0,100,80,0,1,0,0,1,0,1,0
1,0,0,1,0,17885,165,65.0,120,82,0,1,0,0,1,0,0,1
2,0,0,1,0,21962,170,56.0,120,80,0,1,0,0,1,0,0,1
3,0,0,0,0,20287,169,62.0,120,80,0,1,0,0,1,0,0,1
4,0,0,0,0,16202,166,67.0,120,80,0,1,0,0,1,0,0,1
