# Analyse des données d'assurance maladie

In [1]:
import pandas as pd
import matplotlib.pyplot as mpl
import seaborn as sbn

In [None]:
# Load the dataset into a DataFrame.
# NOTE(review): the path is absolute and tied to one machine's drive layout;
# the notebook only runs where this exact location exists.
path = r"B:\Machine Learning\YC1_SmartSante\data\assurance-maladie-68d92978e362f464596651.csv"
content = pd.read_csv(filepath_or_buffer=path)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\assurance-maladie-68d92978e362f464596651.csv'

In [4]:
# Overview of the columns and general information.
# Only the last expression of a cell is echoed, so the column index and the
# first rows are printed explicitly; info() writes its own report.
print(content.columns)
print(content.head())
content.info()

NameError: name 'content' is not defined

### Conversion des colonnes catégorielles en numériques

In [None]:
# Encode the two binary categorical columns as 0/1 integers.
# NOTE: Series.map yields NaN for any value absent from the mapping, so
# running this cell a second time on already-encoded data blanks both columns.
content['sex'] = content['sex'].map({'male': 1, 'female': 0})
content['smoker'] = content['smoker'].map({'yes': 1, 'no': 0})

# One-hot encode 'region' into region_<name> columns. dtype=int produces the
# 0/1 integer columns directly, without a per-column cast that would raise
# KeyError whenever one of the expected regions is missing from the data.
content = pd.get_dummies(content, columns=['region'], dtype=int)

In [None]:
# Check the column dtypes and preview the data.
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line after the report).
content.info()
print(content.head())
print(content.tail())

In [None]:
# Descriptive statistics for age, bmi, children and charges,
# rounded to two decimals for display.
numeric_summary = content[['age', 'bmi', 'children', 'charges']].describe()
print(numeric_summary.round(2))

In [None]:
# Frequency of each value in the 'sex' and 'smoker' columns.
for column in ('sex', 'smoker'):
    print(content[column].value_counts())

# Column-wise sum of the four region indicator columns.
region_columns = [
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
]
print(content[region_columns].sum())

In [None]:
# Missing-value check: build the null mask once, then report the number of
# missing entries per column and whether each column has any at all.
missing_mask = content.isnull()
print(missing_mask.sum())
print(missing_mask.any())

In [None]:
# Report how many rows are exact duplicates of an earlier row,
# then rebind `content` to the de-duplicated frame.
duplicate_count = content.duplicated().sum()
print(duplicate_count)
content = content.drop_duplicates()

### Visualisations des distributions

In [None]:
# 2x2 grid of histograms with a KDE overlay, one panel per variable.
mpl.figure(figsize=(6, 4))

# (column, panel title) pairs, listed in subplot order 1..4.
panels = [
    ('age', "Distribution de l'âge"),
    ('children', "Distribution du nombre d'enfants"),
    ('charges', "Distribution des charges"),
    ('bmi', "Distribution du BMI"),
]
for position, (column, title) in enumerate(panels, start=1):
    mpl.subplot(2, 2, position)
    sbn.histplot(content[column], kde=True)
    mpl.title(title)

mpl.tight_layout()
mpl.show()

In [None]:
# Pairwise relationship grid for age, bmi, children and charges.
pair_columns = ['age', 'bmi', 'children', 'charges']
sbn.pairplot(content[pair_columns])
mpl.show()

In [None]:
# Annotated heatmap of the pairwise correlations between
# age, bmi, children and charges.
correlations = content[['age', 'bmi', 'children', 'charges']].corr()
sbn.heatmap(correlations, annot=True, cmap="coolwarm")
mpl.title("Corrélations entre variables")
mpl.show()