# Data Setup & Exploration

## Imports

In [2]:
import kagglehub
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from src.__00__paths import raw_data_dir, processed_data_dir
from pathlib import Path
import shutil

✔️ Created Directory: data/raw
✔️ Created Directory: data/processed
✔️ Created Directory: outputs/models


In [3]:
# Ensure the raw brain-tumor MRI dataset is present under `raw_data_dir`,
# laid out as <raw_data_dir>/<split>/<class>/. If any expected directory is
# missing, the dataset is downloaded via kagglehub and copied into place.
# Raises FileNotFoundError when the download is absent or incomplete.
splits = ["Testing", "Training"]
classes = ["glioma", "meningioma", "notumor", "pituitary"]

# Construct a list of all expected directory paths.
raw_dataset = [raw_data_dir / split / cls for split in splits for cls in classes]

# Check if the dataset is already downloaded and organized.
if all(path.exists() for path in raw_dataset):
    print("✔️ Dataset is already downloaded and ready.")
else:
    print("Downloading Datasets...")

    dataset_path = Path(kagglehub.dataset_download("masoudnickparvar/brain-tumor-mri-dataset"))
    if not dataset_path.exists():
        raise FileNotFoundError("❌ Dataset not found after download.")

    # Account for a potential "Data" wrapper directory in the downloaded archive.
    data_root = dataset_path / "Data" if (dataset_path / "Data").exists() else dataset_path

    # Validate the download BEFORE touching raw_data_dir: a split folder that is
    # missing from the archive must be an error, not a silent skip, and failing
    # here means an existing local copy is never deleted for a bad download.
    missing_splits = [split for split in splits if not (data_root / split).exists()]
    if missing_splits:
        raise FileNotFoundError(
            f"❌ Downloaded dataset is missing split folder(s) {missing_splits} in: {data_root}"
        )

    raw_data_dir.mkdir(parents=True, exist_ok=True)

    # Copy data into the target directory, removing any old copy of each split
    # first so stale files from a previous run cannot linger.
    for split in splits:
        src_split = data_root / split
        dst_split = raw_data_dir / split
        if dst_split.exists():
            shutil.rmtree(dst_split)
        shutil.copytree(src_split, dst_split)

    # Only report success once every expected <split>/<class> directory exists.
    missing_dirs = [str(path) for path in raw_dataset if not path.exists()]
    if missing_dirs:
        raise FileNotFoundError(
            f"❌ Dataset copied but expected directories are missing: {missing_dirs}"
        )

    print("✔️ Dataset successfully downloaded and placed in:", raw_data_dir)

✔️ Dataset is already downloaded and ready.
