# EDA and Preprocessing: this notebook downloads (if needed) and preprocesses the UCI glioma dataset, then performs exploratory data analysis (EDA).

In [None]:
import os
from pathlib import Path
import subprocess
import sys

# Path of the preprocessed dataset, relative to the current working directory.
DATA_CSV = Path('../data/glioma_grading.csv')
if not DATA_CSV.exists():
    print('Preprocessed CSV not found; running download_and_preprocess.py')
    # Run the script with the interpreter that is executing this code: a bare
    # `python` resolved through PATH may belong to a different environment.
    # check=True raises CalledProcessError on a non-zero exit status, so a
    # failed download stops here instead of being silently ignored.
    subprocess.run(
        [sys.executable, '../scripts/download_and_preprocess.py'],
        check=True,
    )

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the preprocessed dataset and show its dimensions and first rows.
df = pd.read_csv(DATA_CSV)
print('Loaded preprocessed data:', df.shape)
print(df.head())

# Basic EDA
print('\nInfo:')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print('\nTarget distribution:')
# Number of rows per distinct value of the 'Grade' column.
print(df['Grade'].value_counts())

# Plot a 10-bin histogram of the IDH1 column; skipped when the column is absent.
if 'IDH1' in df.columns:
    _, hist_ax = plt.subplots(figsize=(6, 4))
    hist_ax.hist(df['IDH1'], bins=10)
    hist_ax.set_title('IDH1 distribution')
    plt.show()

# Annotated heatmap of pairwise correlations between the numeric columns;
# skipped when the numeric selection is empty.
numeric_df = df.select_dtypes(include='number')
if not numeric_df.empty:
    corr_matrix = numeric_df.corr()
    _, heatmap_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', ax=heatmap_ax)
    heatmap_ax.set_title('Correlation heatmap (numeric)')
    plt.show()
