# Data Exploration - Setup - Splitting

## Imports

In [4]:
# Data Handling
import pandas as pd
import numpy as np
import random

# Visualization
import matplotlib.pyplot as plt
from PIL import Image

# Paths
from src.__00__paths import raw_data_dir, processed_spectrogram_dir, spectrogram_train_dir, spectrogram_test_dir, \
    spectrogram_validation_dir
import shutil
from pathlib import Path
from PIL import Image

# Datasets Source
import kagglehub

## Download GTZAN Dataset - Music Genre Classification

In [5]:
# Items that must exist in raw_data_dir for the dataset to count as downloaded
data_items = [
    raw_data_dir / "features_3_sec.csv",
    raw_data_dir / "features_30_sec.csv",
    raw_data_dir / "genres_original",
    raw_data_dir / "images_original",
]

# Skip the download when every expected item is already in place
if all(item.exists() for item in data_items):
    print("✔️ Dataset is already downloaded.")
else:
    # The value returned by dataset_download is treated as the local dataset path
    dataset_path = Path(kagglehub.dataset_download("andradaolteanu/gtzan-dataset-music-genre-classification"))

    if not dataset_path.exists():
        raise FileNotFoundError("⚠ Dataset not found.")

    # Check for an extra "Data" folder
    data_root = dataset_path / "Data" if (dataset_path / "Data").exists() else dataset_path

    # shutil.copy2 does not create missing parent directories, so make sure
    # the destination exists before copying single files into it
    raw_data_dir.mkdir(parents=True, exist_ok=True)

    # Copy files/folders to raw_data_dir
    for item in data_root.iterdir():
        target = raw_data_dir / item.name
        if item.is_file():
            shutil.copy2(item, target)
        elif item.is_dir():
            shutil.copytree(item, target, dirs_exist_ok=True)

    # Only report success when everything the later cells rely on is really there
    missing_items = [item.name for item in data_items if not item.exists()]
    if missing_items:
        raise FileNotFoundError(
            f"⚠ Dataset copied, but expected items are missing: {', '.join(missing_items)}"
        )

    print("✔️ Dataset successfully downloaded.")

✔️ Dataset successfully downloaded.


## Image Border Crop & Resize

In [8]:
def crop_white_borders(image, threshold=250):
    """Crop away the near-white margins surrounding the image content.

    A row or column counts as margin when its mean intensity (averaged over
    all channels, if any) is at or above ``threshold``. The crop box spans
    from the first to the last non-margin row and column, inclusive.

    Args:
        image: Object convertible with ``np.asarray`` that provides
            ``crop((left, upper, right, lower))``, e.g. a ``PIL.Image.Image``.
            Pixel arrays with a channel axis (3-D) and without one (2-D,
            single-channel images) are both supported.
        threshold: Mean intensity at or above which a row/column is
            considered white.

    Returns:
        The result of ``image.crop(...)``, or ``None`` when no row or no
        column falls below ``threshold`` (a message is printed in that case).
    """
    image_np = np.asarray(image)

    # Single-channel images yield a 2-D array with no channel axis to average over
    gray = image_np if image_np.ndim == 2 else image_np.mean(axis=2)

    non_white_rows = np.flatnonzero(gray.mean(axis=1) < threshold)
    non_white_cols = np.flatnonzero(gray.mean(axis=0) < threshold)

    if non_white_rows.size == 0 or non_white_cols.size == 0:
        print("Image appears to be fully white.")
        return None

    # Plain ints keep the crop box independent of NumPy scalar types
    top, bottom = int(non_white_rows[0]), int(non_white_rows[-1])
    left, right = int(non_white_cols[0]), int(non_white_cols[-1])

    # The crop box is end-exclusive, hence the +1 on the right/bottom edges
    return image.crop((left, top, right + 1, bottom + 1))


# Target size passed to Image.resize for every processed spectrogram
# (PIL's resize takes the size as (width, height) in pixels)
IMG_SIZE = (128, 128)

## Image Processing

In [9]:
# Crop and resize every raw spectrogram, mirroring the per-genre folder layout.
# Sorted iteration keeps the processing order independent of the filesystem.
for genre_dir in sorted((raw_data_dir / "images_original").iterdir()):
    if not genre_dir.is_dir():
        continue

    out_dir = processed_spectrogram_dir / genre_dir.name
    # parents=True so the cell also works when processed_spectrogram_dir does not exist yet
    out_dir.mkdir(parents=True, exist_ok=True)

    for img_file in sorted(genre_dir.glob("*.png")):
        # The context manager releases the file handle once the image is processed
        with Image.open(img_file) as raw_img:
            cropped_img = crop_white_borders(raw_img)

            # crop_white_borders returns None for fully white images; skip those
            if cropped_img is not None:
                cropped_img.resize(IMG_SIZE).save(out_dir / img_file.name)

## Split Data

In [10]:
# Fixed seed so the 70/15/15 split is identical on every Restart & Run All
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# NOTE(review): the split directories may be nested inside processed_spectrogram_dir
# (their definitions are not visible from this notebook); resolving them lets the
# loop skip them instead of treating them as genres on a re-run.
split_roots = {
    spectrogram_train_dir.resolve(),
    spectrogram_validation_dir.resolve(),
    spectrogram_test_dir.resolve(),
}

for genre_dir in sorted(processed_spectrogram_dir.iterdir()):
    if not genre_dir.is_dir() or genre_dir.resolve() in split_roots:
        continue

    # Sort before shuffling: glob order is filesystem-dependent, so the seed
    # alone would not make the split reproducible
    images = sorted(genre_dir.glob("*.png"))
    if not images:
        continue
    split_rng.shuffle(images)

    train_split = int(0.7 * len(images))
    val_split = int(0.85 * len(images))

    assignments = [
        (spectrogram_train_dir, images[:train_split]),
        (spectrogram_validation_dir, images[train_split:val_split]),
        (spectrogram_test_dir, images[val_split:]),
    ]

    for split_root, split_images in assignments:
        dest_dir = split_root / genre_dir.name
        dest_dir.mkdir(parents=True, exist_ok=True)

        # Copy
        for img in split_images:
            shutil.copy(img, dest_dir / img.name)

        # Remove leftovers from earlier runs that belong to a different split now;
        # otherwise the same image could end up in both train and test
        assigned_names = {img.name for img in split_images}
        for stale_file in dest_dir.glob("*.png"):
            if stale_file.name not in assigned_names:
                stale_file.unlink()

print(f"✔️ Train/Validation/Test split complete in {'/'.join(processed_spectrogram_dir.parts[-3:])}")

✔️ Train/Validation/Test split complete in data/processed/spectrogram
