In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: For a nice auto report (install with `pip install ydata-profiling`; the package was formerly named pandas-profiling)
# from ydata_profiling import ProfileReport  

# 📌 2. Load Raw Data
# Path of the raw CSV, relative to the notebook's working directory.
# Point this at the actual dataset file before running.
raw_data_path = "../data/raw/your_dataset.csv"

# Parse the CSV into the DataFrame that every later section works on.
df = pd.read_csv(filepath_or_buffer=raw_data_path)

# 📌 3. Basic Exploration
# Quick structural overview: size, a sample of rows, dtypes, and data-quality
# counts (missing cells per column, fully duplicated rows).
dataset_shape = df.shape
print("Shape of dataset:", dataset_shape)

print("\nFirst 5 rows:")
preview = df.head()
display(preview)

print("\nColumn types:")
column_types = df.dtypes
print(column_types)

print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

duplicate_count = df.duplicated().sum()
print("\nDuplicate rows:", duplicate_count)

# 📌 4. Descriptive Stats
# include='all' summarises every column, not only the numeric ones.
summary_table = df.describe(include='all')
print("\nDescriptive statistics:")
display(summary_table)

# 📌 5. Data Distribution
# Select every numeric dtype (e.g. int32, float32, unsigned ints), not just
# int64/float64, so that down-cast or non-default numeric columns are not
# silently left out of the plots below.
numerical_cols = df.select_dtypes(include='number').columns

# DataFrame.hist raises when there is nothing numeric to plot, so skip the
# figure with a message instead of aborting the notebook run.
if numerical_cols.empty:
    print("No numerical columns found; skipping histograms.")
else:
    # One histogram per numeric column, drawn on a shared figure.
    df[numerical_cols].hist(bins=30, figsize=(15, 10), color='skyblue')
    plt.tight_layout()
    plt.show()

# 📌 6. Correlation Matrix
# Pairwise correlations are computed over the numeric columns selected in
# `numerical_cols`. When that selection is empty the correlation matrix is
# empty and there is nothing for the heatmap to draw, so skip the plot (before
# creating a blank figure) instead of failing.
if numerical_cols.empty:
    print("No numerical columns found; skipping correlation heatmap.")
else:
    plt.figure(figsize=(10, 8))
    # annot/fmt print each coefficient in its cell with two decimals.
    sns.heatmap(df[numerical_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.show()

# 📌 7. Optional: Auto EDA report
# profile = ProfileReport(df, title="EDA Report", explorative=True)
# profile.to_notebook_iframe()  # or profile.to_file("eda_report.html")
