# User Story 4 : Analyse des clusters

## Moyennes des caractéristiques par cluster

In [1]:
import pandas as pd

# Project-relative path, in the same style as the '../models/...' paths used further
# down, so the notebook does not depend on one machine's drive layout.
# NOTE(review): assumes a `data/` folder sits next to `models/` — confirm and adjust if not.
file = "../data/Clustered_Data.csv"
content = pd.read_csv(file)

# Mean of every feature column within each cluster.
content.groupby("Cluster").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.413157,0.681864,0.461416,0.517877,0.636883,0.52206,0.132541,0.535884
1,-0.340536,-0.562011,-0.380312,-0.426849,-0.524937,-0.430296,-0.109244,-0.441691


## Nombre d’observations par cluster

In [2]:
# Count how many rows carry each cluster label (value_counts orders by descending count).
cluster_labels = content["Cluster"]
cluster_labels.value_counts()

Cluster
1    421
0    347
Name: count, dtype: int64

## Identification des clusters à haut risque de diabète

In [3]:
import joblib
import numpy as np

# Cut-offs applied to each back-transformed centroid. A cluster is reported as
# high risk only when it is strictly above ALL of them.
RISK_THRESHOLDS = {"Glucose": 126, "BMI": 30, "DiabetesPedigreeFunction": 0.45}

# Columns mapped back with expm1 once the scaling has been undone.
LOG1P_COLS = ['DiabetesPedigreeFunction']

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model artefacts that come from a trusted source.
scaler = joblib.load('../models/scaler.pkl')
kmeans = joblib.load('../models/kmeans.pkl')

# Undo the scaler so the centroids can be compared with the thresholds above.
original_centers = scaler.inverse_transform(kmeans.cluster_centers_)

cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 
        'BMI', 'DiabetesPedigreeFunction', 'Age']

df_centers = pd.DataFrame(original_centers, columns=cols)
df_centers[LOG1P_COLS] = np.expm1(df_centers[LOG1P_COLS])
# NOTE(review): in the displayed table the BloodPressure (~4.3) and Insulin (~4.5-5.2)
# centroids look like they are still on a log scale. Confirm against the preprocessing
# step and, if so, add those columns to LOG1P_COLS. They are not used by the risk rule.

# One boolean per cluster: True when every thresholded feature exceeds its cut-off.
# Vectorized equivalent of checking the three conditions row by row.
threshold_cols = list(RISK_THRESHOLDS)
is_high_risk = (
    df_centers[threshold_cols]
    .gt(pd.Series(RISK_THRESHOLDS), axis="columns")
    .all(axis=1)
)

for cluster_id, flagged in is_high_risk.items():
    if flagged:
        print(f"Le cluster {cluster_id} est à haut risque de diabète.")
    else:
        print(f"Le cluster {cluster_id} n'est pas à haut risque.")
df_centers.head()

Le cluster 0 est à haut risque de diabète.
Le cluster 1 n'est pas à haut risque.


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,5.236311,142.507205,4.360919,33.797118,5.225389,36.055331,0.479362,39.538905
1,2.698337,104.638955,4.214998,24.84038,4.524152,29.503705,0.410079,28.049881


## Ajout d’une colonne de catégorie de risque par cluster

In [4]:
# Derive the high-risk cluster labels from the back-transformed centroids rather than
# hard-coding a label number: K-Means label ids are arbitrary and can swap when the
# model is refitted. The rule is the one used for the printed diagnosis
# (Glucose > 126, BMI > 30, DiabetesPedigreeFunction > 0.45).
high_risk_clusters = df_centers.index[
    (df_centers["Glucose"] > 126)
    & (df_centers["BMI"] > 30)
    & (df_centers["DiabetesPedigreeFunction"] > 0.45)
]

# Row i of df_centers is the centroid of cluster label i, so its index values can be
# matched directly against the "Cluster" column.
content["risk_category"] = np.where(
    content["Cluster"].isin(high_risk_clusters), "Risque élevé", "Risque faible"
)
content.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Cluster,risk_category
0,0.639947,0.862288,0.054979,0.644753,0.838548,0.165148,0.612059,1.425995,0,Risque élevé
1,-0.844885,-1.207101,-0.439758,0.011894,-0.88958,-0.852384,-0.324994,-0.190672,1,Risque faible
2,1.23388,2.011949,-0.614571,0.054084,1.839357,-1.332078,0.749586,-0.105584,0,Risque élevé
3,-0.844885,-1.075711,-0.439758,-0.620966,-0.475688,-0.634341,-1.063014,-1.041549,1,Risque faible
4,-1.141852,0.500966,-3.272736,0.644753,0.478673,1.546085,4.158488,-0.020496,0,Risque élevé
