# Chloé's Work 

In [2]:
import pandas as pd
import plotly as plt
import plotly.express as px
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

Importation des modules nécessaires

## Exploratory Data Analysis

Exoloratory Data Analysis is an important step in Data Analysis.

### Data Summarization

In [3]:
data = pd.read_csv("Metabolic Syndrome.csv", index_col="seqn")
data["MetabolicSyndrome"] = data["MetabolicSyndrome"].astype("bool")
data_info = data.info()
data_describe = data.describe(include='all')
data_shape = data.shape
data_is_nan = data.isnull().sum()
categorical_variables = ["Sex", "Marital", "Race"]
print(f'info : \n{data_info}\n describe: \n{data_describe}\n NaN : \n{data_is_nan}\n')

<class 'pandas.core.frame.DataFrame'>
Index: 2401 entries, 62161 to 71915
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                2401 non-null   int64  
 1   Sex                2401 non-null   object 
 2   Marital            2193 non-null   object 
 3   Income             2284 non-null   float64
 4   Race               2401 non-null   object 
 5   WaistCirc          2316 non-null   float64
 6   BMI                2375 non-null   float64
 7   Albuminuria        2401 non-null   int64  
 8   UrAlbCr            2401 non-null   float64
 9   UricAcid           2401 non-null   float64
 10  BloodGlucose       2401 non-null   int64  
 11  HDL                2401 non-null   int64  
 12  Triglycerides      2401 non-null   int64  
 13  MetabolicSyndrome  2401 non-null   bool   
dtypes: bool(1), float64(5), int64(5), object(3)
memory usage: 265.0+ KB
info : 
None
 describe: 
                Age     Sex

Our dataset has 14 variables, 3 categorical and ordinal (Sex, Marital Status and Race), and 11 numerical (Age, Income, Waist Circumference, BMI, Albuminuria, UrAlbCr, Uric Acid, Blood Glucose Concentration, HDL, Tryglycerides and MetabolicSyndrome) variables.

- Nominal variables : Marital Status, Sex, Race
- Numerical discrete variables : Age, Albuminuria, Blood Glucose, HDL, Triglycerides 
- Numerical continuous variables : Income, Waist circumference, BMI, Urinary albumin-to-creatinine ratio and Uric acid levels in the blood
#### Data Visualisation

##### Distribution of the age regarding the race and sex

In [4]:
fig = px.histogram(data_frame=data, x = "Age", color = "Race" ,  barmode="group", title="Distribution of the age regarding the race and sex")

fig.show()

##### Histograms

In [5]:
fig = px.histogram(data, "UrAlbCr",  color="Race", facet_col= "Sex")
fig.show()

##### Boxplots

In [6]:
fig = px.violin(data_frame=data, x = "Age", y = "Race" )
fig.show()

### Replace the missing values

In [17]:
data_imputed = data
numeric_cols = data.select_dtypes(include='number').columns

num_imputer = SimpleImputer(strategy='median')
data_imputed[numeric_cols] = num_imputer.fit_transform(data_imputed[numeric_cols])
print(data_imputed.head)

<bound method NDFrame.head of         Age     Sex  Marital  Income         Race  WaistCirc   BMI  \
seqn                                                                 
62161  22.0    Male   Single  8200.0        White       81.0  23.3   
62164  44.0  Female  Married  4500.0        White       80.1  23.2   
62169  21.0    Male   Single   800.0        Asian       69.6  20.1   
62172  43.0  Female   Single  2000.0        Black      120.4  33.3   
62177  51.0    Male  Married  2500.0        Asian       81.1  20.1   
...     ...     ...      ...     ...          ...        ...   ...   
71901  48.0  Female  Married  1000.0        Other       97.0  59.7   
71904  30.0  Female   Single  2000.0        Asian       97.0  18.0   
71909  28.0    Male   Single   800.0  MexAmerican      100.8  29.4   
71911  27.0    Male  Married  8200.0  MexAmerican      106.6  31.3   
71915  60.0    Male   Single  6200.0        White      106.6  27.5   

       Albuminuria  UrAlbCr  UricAcid  BloodGlucose   HDL  

### Scale the continuous variables

In [18]:
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data_imputed[numeric_cols]), columns=numeric_cols)

print(data_scaled.head())

        Age    Income  WaistCirc       BMI  Albuminuria   UrAlbCr  UricAcid  \
0 -1.514069  1.472417  -1.081465 -0.813727    -0.364574 -0.153924 -0.409328   
1 -0.266138  0.195983  -1.137853 -0.828820    -0.364574 -0.135839 -0.687287   
2 -1.570793 -1.080450  -1.795718 -1.296710    -0.364574 -0.149316 -0.061878   
3 -0.322862 -0.666472   1.387093  0.695596    -0.364574 -0.148735 -0.339838   
4  0.130931 -0.493981  -1.075199 -1.296710    -0.364574 -0.137465 -0.339838   

   BloodGlucose       HDL  Triglycerides  
0     -0.466711 -0.814723      -0.463003  
1     -0.753957 -1.670979      -0.756804  
2     -0.035843 -0.682991      -0.525960  
3     -0.122017  1.292984       0.135092  
4     -0.380538 -0.682991      -0.022301  


### Transform continuous variables into categorical variable and create an ”hot one encoding” structure

In [23]:
data_encoded = pd.DataFrame(data.select_dtypes(include="object"))
print(data_encoded.head)

for column in data_encoded.columns : 
    print(data_encoded[column].unique(), list(range(len(data_encoded[column].unique()))))
    data_encoded[column] = data_encoded[column].replace(to_replace=data_encoded[column].unique(), value=list(range(len(data_encoded[column].unique()))), inplace=True)

print(data_encoded.head)

<bound method NDFrame.head of           Sex  Marital         Race
seqn                               
62161    Male   Single        White
62164  Female  Married        White
62169    Male   Single        Asian
62172  Female   Single        Black
62177    Male  Married        Asian
...       ...      ...          ...
71901  Female  Married        Other
71904  Female   Single        Asian
71909    Male   Single  MexAmerican
71911    Male  Married  MexAmerican
71915    Male   Single        White

[2401 rows x 3 columns]>
['Male' 'Female'] [0, 1]
['Single' 'Married' 'Widowed' 'Divorced' nan 'Separated'] [0, 1, 2, 3, 4, 5]
['White' 'Asian' 'Black' 'MexAmerican' 'Hispanic' 'Other'] [0, 1, 2, 3, 4, 5]
<bound method NDFrame.head of         Sex Marital  Race
seqn                     
62161  None    None  None
62164  None    None  None
62169  None    None  None
62172  None    None  None
62177  None    None  None
...     ...     ...   ...
71901  None    None  None
71904  None    None  None
71909 

In [11]:
df_encoded = pd.get_dummies(data_scaled)
df_encoded.dropna()
df_encoded.describe(include="all")



ValueError: Data must be 1-dimensional, got ndarray of shape (2401, 10) instead

In [None]:

pca = PCA(n_components=5)
df_pca = pd.DataFrame(pca.fit_transform(df_encoded))

fig = px.scatter_matrix(df_pca, dimensions=df_pca.columns.tolist())
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
features = data.columns.tolist()
useless_feat = ["Race",  "MetabolicSyndrome"]
for i in useless_feat:
    features.remove(i)

print(features)

fig = px.scatter_matrix(
    data,
    dimensions=features,
    color="Race"
)
fig.update_traces(diagonal_visible=False)
fig.show()

['Age', 'Sex', 'Marital', 'Income', 'WaistCirc', 'BMI', 'Albuminuria', 'UrAlbCr', 'UricAcid', 'BloodGlucose', 'HDL', 'Triglycerides']


In [None]:


pca = PCA()
components = pca.fit_transform(df[features])
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(4),
    color=df["Race"]
)
fig.update_traces(diagonal_visible=False)
fig.show()

ValueError: could not convert string to float: 'Male'