# Data Transformation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


## Data Preprocessing

Cleaning the Dataset.

### Data Summarization

In [2]:
data = pd.read_csv("Metabolic Syndrome.csv", index_col="seqn")
data["MetabolicSyndrome"] = data["MetabolicSyndrome"].astype("bool")
categorical_variables = ["Sex", "Marital", "Race"]

### Replace the missing values, Imputer

Replacing the missing values by the mean of the column. 

Warning : does not take in count the Metabolic Syndrom state of the patient

In [3]:
data_imputed = data

for col in data_imputed.select_dtypes(include="number").columns:
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    imputer.fit(data_imputed[[col]])
    data_imputed[col] = imputer.transform(data_imputed[[col]])
data_imputed.head()

Unnamed: 0_level_0,Age,Sex,Marital,Income,Race,WaistCirc,BMI,Albuminuria,UrAlbCr,UricAcid,BloodGlucose,HDL,Triglycerides,MetabolicSyndrome
seqn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
62161,22.0,Male,Single,8200.0,White,81.0,23.3,0.0,3.88,4.9,92.0,41.0,84.0,False
62164,44.0,Female,Married,4500.0,White,80.1,23.2,0.0,8.55,4.5,82.0,28.0,56.0,False
62169,21.0,Male,Single,800.0,Asian,69.6,20.1,0.0,5.07,5.4,107.0,43.0,78.0,False
62172,43.0,Female,Single,2000.0,Black,120.4,33.3,0.0,5.22,5.0,104.0,73.0,141.0,False
62177,51.0,Male,Married,4005.25394,Asian,81.1,20.1,0.0,8.13,5.0,95.0,43.0,126.0,False


### Transform continuous variables into categorical variable and create an ”hot one encoding” structure

In [4]:
albuminurie = {0: "zero", 1: "one", 2:"two"}
data_encoded = data_imputed.replace({"Albuminuria":albuminurie})
data_encoded = pd.get_dummies(data_encoded)
data_encoded.head()

Unnamed: 0_level_0,Age,Income,WaistCirc,BMI,UrAlbCr,UricAcid,BloodGlucose,HDL,Triglycerides,MetabolicSyndrome,...,Marital_Widowed,Race_Asian,Race_Black,Race_Hispanic,Race_MexAmerican,Race_Other,Race_White,Albuminuria_one,Albuminuria_two,Albuminuria_zero
seqn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62161,22.0,8200.0,81.0,23.3,3.88,4.9,92.0,41.0,84.0,False,...,False,False,False,False,False,False,True,False,False,True
62164,44.0,4500.0,80.1,23.2,8.55,4.5,82.0,28.0,56.0,False,...,False,False,False,False,False,False,True,False,False,True
62169,21.0,800.0,69.6,20.1,5.07,5.4,107.0,43.0,78.0,False,...,False,True,False,False,False,False,False,False,False,True
62172,43.0,2000.0,120.4,33.3,5.22,5.0,104.0,73.0,141.0,False,...,False,False,True,False,False,False,False,False,False,True
62177,51.0,4005.25394,81.1,20.1,8.13,5.0,95.0,43.0,126.0,False,...,False,True,False,False,False,False,False,False,False,True


### Scale the continuous variables

Warning : does not take in count the multiple outliers

In [5]:
data_scaled = data_encoded
for col in data_scaled.select_dtypes(include="number").columns:
    scaler = StandardScaler()
    scaler.fit_transform(data_scaled[[col]])
data_scaled.head()

Unnamed: 0_level_0,Age,Income,WaistCirc,BMI,UrAlbCr,UricAcid,BloodGlucose,HDL,Triglycerides,MetabolicSyndrome,...,Marital_Widowed,Race_Asian,Race_Black,Race_Hispanic,Race_MexAmerican,Race_Other,Race_White,Albuminuria_one,Albuminuria_two,Albuminuria_zero
seqn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62161,22.0,8200.0,81.0,23.3,3.88,4.9,92.0,41.0,84.0,False,...,False,False,False,False,False,False,True,False,False,True
62164,44.0,4500.0,80.1,23.2,8.55,4.5,82.0,28.0,56.0,False,...,False,False,False,False,False,False,True,False,False,True
62169,21.0,800.0,69.6,20.1,5.07,5.4,107.0,43.0,78.0,False,...,False,True,False,False,False,False,False,False,False,True
62172,43.0,2000.0,120.4,33.3,5.22,5.0,104.0,73.0,141.0,False,...,False,False,True,False,False,False,False,False,False,True
62177,51.0,4005.25394,81.1,20.1,8.13,5.0,95.0,43.0,126.0,False,...,False,True,False,False,False,False,False,False,False,True


## Data Saving

save the dataset into a csv file.

In [6]:
data_scaled.to_csv("metabo_encoded.csv", index = True)

In [7]:
data_scaled.isna().count()

Age                  2401
Income               2401
WaistCirc            2401
BMI                  2401
UrAlbCr              2401
UricAcid             2401
BloodGlucose         2401
HDL                  2401
Triglycerides        2401
MetabolicSyndrome    2401
Sex_Female           2401
Sex_Male             2401
Marital_Divorced     2401
Marital_Married      2401
Marital_Separated    2401
Marital_Single       2401
Marital_Widowed      2401
Race_Asian           2401
Race_Black           2401
Race_Hispanic        2401
Race_MexAmerican     2401
Race_Other           2401
Race_White           2401
Albuminuria_one      2401
Albuminuria_two      2401
Albuminuria_zero     2401
dtype: int64