# 🩺 Data Audit & Exploratory Data Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation defaults.
# Seaborn theme with the "whitegrid" style for every plot below.
sns.set(style="whitegrid")
# None removes the cap on how many DataFrame columns pandas displays.
pd.options.display.max_columns = None


## 🔹 Load Datasets

In [None]:
# Relative locations of the two CSV inputs; point these at your own copies.
DATASET_1_PATH = '../data/health_dataset_1.csv'
DATASET_2_PATH = '../data/health_dataset_2.csv'

# Read both files, in order, into the frames used by the rest of the notebook.
df1, df2 = (pd.read_csv(path) for path in (DATASET_1_PATH, DATASET_2_PATH))

# Preview the first rows of dataset 1.
df1.head()


## 🔹 Dataset 1 Summary

In [None]:
# Summary statistics for dataset 1; include='all' asks describe() to cover
# every column rather than only the numeric ones.
df1.describe(include='all')

In [None]:
# Print a concise structural summary of dataset 1 (columns, non-null counts,
# dtypes, memory usage).
df1.info()

## 🔹 Check for Missing Values

In [None]:
# Count the missing entries in each column of dataset 1.
missing_counts = df1.isna().sum()
missing_counts

## 🔹 Distribution of Key Health Metrics

In [None]:
# Columns of dataset 1 to plot, paired with their subplot titles.
# Order matters: panels are filled row by row across the 2x2 grid.
metric_titles = [
    ('BMI', 'BMI Distribution'),
    ('Level_of_Hemoglobin', 'Hemoglobin Level'),
    ('Genetic_Pedigree_Coefficient', 'Genetic Pedigree Coefficient'),
    ('salt_content_in_the_diet', 'Salt Content in Diet'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One 30-bin histogram with a KDE overlay per metric.
for ax, (column, title) in zip(axes.flat, metric_titles):
    sns.histplot(df1[column], bins=30, ax=ax, kde=True)
    ax.set_title(title)

plt.tight_layout()
plt.show()


## 🔹 Dataset 2 Aggregation (Physical Activity)

In [None]:
# Collapse dataset 2 to one row per Patient_Number, holding the mean and the
# standard deviation of that patient's Physical_activity readings.
# Named aggregation assigns the output column names directly.
activity_by_patient = df2.groupby('Patient_Number')['Physical_activity']
df2_agg = activity_by_patient.agg(avg_steps='mean', std_steps='std').reset_index()

# Preview the aggregated frame.
df2_agg.head()


## 🔹 Merge Datasets Temporarily for Analysis

In [None]:
# Left join on Patient_Number: the left join keeps the rows of df1, and
# patients with no match in df2_agg get NaN in the columns that come from it.
merged_df = df1.merge(df2_agg, how='left', on='Patient_Number')

# Preview the merged frame.
merged_preview = merged_df.head()
merged_preview


## 🔹 Correlation Heatmap

In [None]:
plt.figure(figsize=(12, 8))

# Pairwise correlations over the numeric columns of the merged frame.
# NOTE(review): if Patient_Number is numeric it is included here as well;
# confirm whether an identifier column belongs in the matrix.
corr_matrix = merged_df.corr(numeric_only=True)

# Annotate each cell with its coefficient, rounded to two decimals.
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
heatmap_ax.set_title("Correlation Matrix")
plt.show()
