# Intro to Exploratory Data Analysis (EDA) in Python
This notebook was auto-generated by **AI Data Agent**.

It follows a Kaggle-style EDA workflow:
- Dataset overview
- Cleaning (duplicates, missing values, dropping columns)
- Visual EDA (distributions, correlation heatmap, scatter plot)
- Feature Engineering
- Modeling starter code


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from agents.cleaning import clean_data
from agents.feature_engineering import engineer_features
from agents.eda import generate_eda, target_eda
from agents.feature_importance import feature_importance


## 1. Load Dataset

In [None]:
# Load the raw data into a DataFrame. 'your_dataset.csv' is a placeholder
# path — replace it with the location of the CSV file to analyse.
df_raw = pd.read_csv('your_dataset.csv')
# Preview the first rows; as the cell's last expression it is rendered as output.
df_raw.head()

## 2. Dataset Overview

In [None]:
# (number of rows, number of columns) of the raw dataset.
df_raw.shape

In [None]:
# Print per-column dtype and non-null count, plus memory usage.
df_raw.info()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric
# columns too); transposed so each dataset column becomes one row.
df_raw.describe(include='all').T

## 3. Data Cleaning

In [None]:
# Run the project's cleaning step on the raw frame. clean_data returns three
# values, unpacked here as the cleaned frame, a stats object and a text summary.
# NOTE(review): the exact contents of cleaning_stats / cleaning_text are
# defined in agents.cleaning — confirm there.
df_cleaned, cleaning_stats, cleaning_text = clean_data(df_raw)
# Display the cleaning summary as the cell output.
cleaning_text

In [None]:
# Preview the first rows of the cleaned dataset.
df_cleaned.head()

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Build the EDA report from the cleaned frame.
# NOTE(review): the report's type and contents are defined in agents.eda —
# confirm there.
eda_report = generate_eda(df_cleaned)
# Display the report as the cell output.
eda_report

### 4.1 Correlation Heatmap

In [None]:
# Names of all numeric-dtype columns; the distribution and scatter cells
# further down reuse this list.
numeric_cols = df_cleaned.select_dtypes(include='number').columns.tolist()
# A correlation matrix needs at least two numeric columns to be meaningful.
if len(numeric_cols) >= 2:
    plt.figure(figsize=(7,5))
    corr = df_cleaned[numeric_cols].corr()
    # Pin the colour scale to the full correlation range [-1, 1]. Without
    # this, seaborn scales the diverging 'coolwarm' palette to the data's own
    # min/max, so the neutral colour would not mark zero correlation and
    # colours would not be comparable between datasets.
    sns.heatmap(corr, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation Heatmap')
    plt.show()

### 4.2 Numeric Distributions

In [None]:
# Draw a histogram with a KDE overlay for at most the first two numeric
# columns (relies on numeric_cols from the heatmap cell).
for col in numeric_cols[0:2]:
    col_values = df_cleaned[col]
    plt.figure(figsize=(6, 4))
    sns.histplot(col_values, kde=True)
    plt.title(f'Distribution: {col}')
    plt.show()

### 4.3 Numeric Relationship

In [None]:
# Scatter the first numeric column against the second; skipped when fewer
# than two numeric columns exist (relies on numeric_cols from the heatmap cell).
if len(numeric_cols) > 1:
    x_name, y_name = numeric_cols[0], numeric_cols[1]
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=df_cleaned[x_name], y=df_cleaned[y_name])
    plt.title(f'{x_name} vs {y_name}')
    plt.show()

## 5. Feature Engineering

In [None]:
# Run the project's feature-engineering step on the cleaned frame.
# engineer_features returns two values, unpacked here as the feature frame
# and a report.
# NOTE(review): what the report contains is defined in
# agents.feature_engineering — confirm there.
df_features, feature_report = engineer_features(df_cleaned)
# Display the feature-engineering report as the cell output.
feature_report

In [None]:
# Preview the first rows of the engineered feature set.
df_features.head()