# Exploratory Data Analysis (EDA)
This notebook performs **statistical + visualization-based EDA** on the cleaned dataset exported from `data_cleaning.ipynb`.

✅ Input: `data/cleaned_students.csv`

✅ Outputs (optional):
- `reports/eda_summary.md`
- `reports/target_distribution.csv`
- `reports/numeric_target_summary.csv`
- `reports/categorical_target_summary.csv`

EDA Focus:
- Target distribution (`exam_score_class`)
- Univariate + Bivariate analysis
- Target-wise patterns for numeric & categorical features
- Correlations + interaction insights


## 0) Setup

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Widen pandas' rendering limits so wide frames are shown in full.
_DISPLAY_OPTIONS = {
    "display.max_columns": 200,
    "display.width": 140,
    "display.max_rows": 80,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

# Fixed seed passed to `DataFrame.sample` in the interaction plots.
RANDOM_STATE = 42

print("✅ Setup complete")

## 1) Load Cleaned Data

In [None]:
# Location of the cleaned dataset consumed by this notebook.
DATA_PATH = Path("data/cleaned_students.csv")

# Stop early with an actionable message when the input file is absent.
if not DATA_PATH.exists():
    message = f"❌ Could not find {DATA_PATH}. Run data_cleaning.ipynb first."
    raise FileNotFoundError(message)

df = pd.read_csv(DATA_PATH)
print(f"✅ Loaded: {DATA_PATH}")
print(f"Shape: {df.shape}")

# Preview the first rows as the cell output.
df.head()

## 2) Quick Overview

In [None]:
# Column dtypes and non-null counts. `DataFrame.info()` prints its report and
# returns None, so it is called directly rather than wrapped in `display`
# (which would render a stray "None" beneath the report).
df.info()

# Missing values check: per-column NaN count and share of rows, worst first
missing = df.isna().sum().sort_values(ascending=False)
missing_pct = (missing / len(df) * 100).round(3)
display(pd.DataFrame({"missing": missing, "missing_%": missing_pct}).head(20))

# Basic stats for every column (numeric and non-numeric), one row per column
display(df.describe(include="all").T)

## 3) Target Distribution

In [None]:
# Column names shared by the rest of the notebook.
TARGET_COL = "exam_score_class"
SCORE_COL = "exam_score"

# The whole analysis is keyed on the target column, so fail fast without it.
if TARGET_COL not in df.columns:
    raise ValueError(f"❌ Target column `{TARGET_COL}` not found. Check cleaning step.")

# One row per class: absolute count plus its share of all rows (in percent).
class_frequencies = df[TARGET_COL].value_counts()
target_counts = class_frequencies.reset_index()
target_counts.columns = [TARGET_COL, "count"]
target_counts["percent"] = target_counts["count"].div(len(df)).mul(100).round(2)
display(target_counts)

# Bar chart of counts, labelled with the percentage of each class.
fig = px.bar(
    target_counts,
    x=TARGET_COL,
    y="count",
    text="percent",
    title="Target Class Distribution (Counts)",
    template="plotly_white",
)
fig.update_traces(texttemplate='%{text}%', textposition='outside')
fig.show()

# Same distribution as a pie chart.
fig2 = px.pie(
    target_counts,
    names=TARGET_COL,
    values="count",
    title="Target Class Distribution (Pie)",
    template="plotly_white",
)
fig2.show()

## 4) Numeric Feature Exploration (Univariate)

In [None]:
# Numeric features, minus the identifier column; the raw score is kept here
# so it gets a univariate plot too.
_numeric_frame = df.select_dtypes(include=["int64", "float64", "int32", "float32"])
numeric_cols = [name for name in _numeric_frame.columns if name != "student_id"]

print("Numeric columns:", numeric_cols)

# Histograms: one 40-bin distribution plot per numeric column
for col in numeric_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=40,
        title=f"Distribution: {col}",
        template="plotly_white",
    )
    fig.show()

# Boxplots (overall): one per numeric column, not split by class
for col in numeric_cols:
    fig = px.box(
        df,
        y=col,
        title=f"Boxplot: {col}",
        template="plotly_white",
    )
    fig.show()

## 5) Categorical Feature Exploration (Univariate)

In [None]:
# Object-typed columns are treated as categorical; the target is analysed separately.
_object_frame = df.select_dtypes(include=["object"])
cat_cols = [name for name in _object_frame.columns if name != TARGET_COL]

print("Categorical columns:", cat_cols)

n_rows = len(df)
for col in cat_cols:
    # Frequency table: category, count, and share of all rows (in percent).
    vc = df[col].value_counts().reset_index().set_axis([col, "count"], axis=1)
    vc["percent"] = vc["count"].div(n_rows).mul(100).round(2)
    display(vc)

    # Bar chart of the same table, labelled with the percentages.
    fig = px.bar(
        vc,
        x=col,
        y="count",
        text="percent",
        title=f"Counts: {col}",
        template="plotly_white",
    )
    fig.update_traces(texttemplate='%{text}%', textposition='outside')
    fig.update_layout(xaxis_tickangle=-25)
    fig.show()

## 6) Target-wise Numeric Analysis (Bivariate)
How do numeric features behave across **Low / Medium / High** classes?

In [None]:
# Every numeric feature except the raw score column.
num_for_target = [name for name in numeric_cols if name != SCORE_COL]

# Summary table: descriptive statistics of each feature within each target class
_STATS = ["count", "mean", "median", "std", "min", "max"]
_by_class = df.groupby(TARGET_COL)

summary_tables = []
for col in num_for_target:
    per_class = _by_class[col].agg(_STATS).reset_index()
    # Tag the rows with the feature name, right after the class column.
    per_class.insert(1, "feature", col)
    summary_tables.append(per_class)

numeric_target_summary = pd.concat(summary_tables, ignore_index=True)
display(numeric_target_summary.head(25))

# Visuals: for each feature a box plot, then a violin plot, split by class
for col in num_for_target:
    fig = px.box(
        df,
        x=TARGET_COL,
        y=col,
        points="all",
        title=f"{col} vs {TARGET_COL} (Box + Points)",
        template="plotly_white",
    )
    fig.show()

    fig2 = px.violin(
        df,
        x=TARGET_COL,
        y=col,
        box=True,
        points="all",
        title=f"{col} vs {TARGET_COL} (Violin)",
        template="plotly_white",
    )
    fig2.show()

## 7) Target-wise Categorical Analysis (Bivariate)
Stacked distributions: how categories split across target classes.

In [None]:
# Row-normalised crosstab per categorical feature: for every category, the
# share of each target class (each row sums to 1). Computed once and reused
# for both the summary table and the stacked bar charts.
class_ratios = {
    col: pd.crosstab(df[col], df[TARGET_COL], normalize="index")
    for col in cat_cols
}

cat_target_tables = []
for col, ratio in class_ratios.items():
    ct = (ratio * 100).round(2).reset_index()
    # After reset_index the first column carries the feature's own name. Rename
    # it to a shared label so that pd.concat stacks the tables into aligned
    # columns instead of creating one mostly-NaN column per feature.
    ct = ct.rename(columns={col: "category"})
    ct.insert(0, "feature", col)
    ct.columns.name = None  # drop the leftover crosstab axis label
    cat_target_tables.append(ct)

# Long table: feature, category, then one percentage column per target class
cat_target_summary = pd.concat(cat_target_tables, ignore_index=True)
display(cat_target_summary.head(30))

# Visual stacked bar for each categorical feature
for col, ratio in class_ratios.items():
    tmp_melt = ratio.reset_index().melt(
        id_vars=[col], var_name=TARGET_COL, value_name="ratio"
    )

    fig = px.bar(tmp_melt, x=col, y="ratio", color=TARGET_COL,
                 title=f"{col} → Target Class Composition (Stacked)",
                 template="plotly_white")
    fig.update_layout(barmode="stack", xaxis_tickangle=-25)
    fig.show()

## 8) Correlation & Multicollinearity Checks

In [None]:
# Pairwise correlation matrix over the numeric features (identifier excluded).
num_corr_cols = [name for name in numeric_cols if name != "student_id"]
corr = df[num_corr_cols].corr(numeric_only=True)

fig = px.imshow(
    corr,
    text_auto=True,
    title="Correlation Heatmap (Numeric Features)",
    template="plotly_white",
)
fig.show()

# Top absolute correlation pairs: walk the upper triangle of the matrix so
# each unordered pair appears once and self-correlations are skipped.
cols = corr.columns.tolist()
pairs = [
    (left, right, abs(corr.iat[row, column]))
    for row, left in enumerate(cols)
    for column, right in enumerate(cols[row + 1:], start=row + 1)
]

corr_pairs = pd.DataFrame(pairs, columns=["feature_1", "feature_2", "abs_corr"])
corr_pairs = corr_pairs.sort_values("abs_corr", ascending=False)
display(corr_pairs.head(20))

## 9) Interaction Visuals (Competition-style)

In [None]:
# Each plot is drawn only when the columns it needs are present in the frame.
_available = set(df.columns)

# Scatter plots use at most 5000 rows, drawn with the notebook's fixed seed.
_sample_size = min(5000, len(df))
_scatter_style = dict(template="plotly_white", opacity=0.7)

# Study hours vs Attendance vs Target
if {"study_hours", "class_attendance"} <= _available:
    fig = px.scatter(
        df.sample(_sample_size, random_state=RANDOM_STATE),
        x="study_hours",
        y="class_attendance",
        color=TARGET_COL,
        title="Study Hours vs Attendance (colored by Target Class)",
        **_scatter_style,
    )
    fig.show()

# Sleep hours vs Exam score, colored by sleep_quality when that column exists
# and by the target class otherwise
if {"sleep_hours", SCORE_COL} <= _available:
    if "sleep_quality" in _available:
        color_col = "sleep_quality"
    else:
        color_col = TARGET_COL
    fig = px.scatter(
        df.sample(_sample_size, random_state=RANDOM_STATE),
        x="sleep_hours",
        y=SCORE_COL,
        color=color_col,
        title="Sleep Hours vs Exam Score",
        **_scatter_style,
    )
    fig.show()

# Course vs Exam score (box), drawn on the full frame rather than a sample
if {"course", SCORE_COL} <= _available:
    fig = px.box(
        df,
        x="course",
        y=SCORE_COL,
        points="all",
        title="Exam Score by Course (Box + Points)",
        template="plotly_white",
    )
    fig.update_layout(xaxis_tickangle=-25)
    fig.show()

## 10) Save EDA Report (Optional)

In [None]:
# All report artefacts are written under ./reports (created on demand).
REPORT_DIR = Path("reports")
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Tabular summaries built in the earlier sections.
target_counts.to_csv(REPORT_DIR / "target_distribution.csv", index=False)
numeric_target_summary.to_csv(REPORT_DIR / "numeric_target_summary.csv", index=False)
cat_target_summary.to_csv(REPORT_DIR / "categorical_target_summary.csv", index=False)

# `DataFrame.to_markdown` needs the optional `tabulate` package. Without it the
# call raises ImportError, which would abort the cell after the CSVs are saved
# and leave no markdown report. Fall back to a fenced plain-text table instead.
try:
    target_table_md = target_counts.to_markdown(index=False)
except ImportError:
    target_table_md = "```\n" + target_counts.to_string(index=False) + "\n```"

summary_md = [
    "# EDA Summary\n",
    f"- Dataset shape: {df.shape}\n",
    "## Target Distribution\n",
    target_table_md,
    "\n\n## Notes / Observations\n",
    "- Check target imbalance; consider `class_weight='balanced'` if needed.\n",
    "- Study hours, attendance, and sleep-related features often show separation between performance classes.\n",
    "- Use correlation checks to avoid redundant features.\n",
]

(REPORT_DIR / "eda_summary.md").write_text("\n".join(summary_md), encoding="utf-8")

print("✅ Reports saved under:", REPORT_DIR.resolve())

## 11) Next Step
Next notebook: **feature_engineering.ipynb**

It will:
- Create engineered features (efficiency, sleep deficit, interactions)
- Encode ordinal vs nominal categories correctly
- Export `data/featured_students.csv` (optional) for faster training iterations
