# Data Transformation

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


## Data Preprocessing

Cleaning the Dataset.

### Data Loading

In [None]:
# Column names grouped under "categorical" by the author.
categorical_variables = ["Sex", "Marital", "Race"]

# Read the raw CSV with "seqn" as the row index and cast the
# MetabolicSyndrome column to a boolean dtype in the same chain.
data = (
    pd.read_csv("Metabolic Syndrome.csv", index_col="seqn")
    .astype({"MetabolicSyndrome": "bool"})
)

### Replace the missing values, Imputer

Replace the missing values in each numeric column with the mean of that column.

Warning: this does not take into account the Metabolic Syndrome status of the patient.

In [None]:
# Work on a copy: a plain `data_imputed = data` only creates a second name
# for the same DataFrame, so the imputation would silently overwrite the raw
# `data` frame and make this cell non-idempotent.
data_imputed = data.copy()

# Only numeric-dtype columns are imputed; other columns are left as they are.
numeric_columns = data_imputed.select_dtypes(include="number").columns

if len(numeric_columns) > 0:
    # A single imputer learns one mean per column and fills each column's
    # NaN entries with that column's own mean.
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    data_imputed[numeric_columns] = imputer.fit_transform(
        data_imputed[numeric_columns]
    )

data_imputed.head()

### Transform numerically coded variables into categorical variables and create a "one-hot encoding" structure

In [None]:
# Text labels substituted for the numeric Albuminuria codes.
albuminurie = {0: "zero", 1: "one", 2: "two"}

# Relabel the Albuminuria codes first, then hand the resulting frame to
# pd.get_dummies to build the indicator columns.
data_encoded = (
    data_imputed
    .replace({"Albuminuria": albuminurie})
    .pipe(pd.get_dummies)
)
data_encoded.head()

### Scale the continuous variables

Warning: this does not take into account the multiple outliers.

In [None]:
# Copy so that scaling does not modify `data_encoded` (plain assignment would
# only alias it).
data_scaled = data_encoded.copy()

# Scale the columns that are numeric in `data_imputed` AND still present after
# encoding. Albuminuria is numeric in `data_imputed` but is replaced by dummy
# columns in `data_encoded`, so looking it up by name would raise a KeyError.
continuous_columns = [
    col
    for col in data_imputed.select_dtypes(include="number").columns
    if col in data_scaled.columns
]

if continuous_columns:
    # fit_transform returns a new array and does not modify its input, so the
    # result has to be assigned back for the scaling to take effect.
    scaler = StandardScaler()
    data_scaled[continuous_columns] = scaler.fit_transform(
        data_scaled[continuous_columns]
    )

data_scaled.head()

## Data Saving

Save the dataset to a CSV file.

In [None]:
# Write the final frame to disk; index=True also writes the row index.
data_scaled.to_csv(
    path_or_buf="metabo_encoded.csv",
    index=True,
)