# Generate a Small Subset of the Kaggle Knee Osteoarthritis Dataset

In [1]:
import kagglehub
# Fetch the latest version of the dataset; `path` is the local directory
# that kagglehub reports for the downloaded files.
dataset_handle = "shashwatwork/knee-osteoarthritis-dataset-with-severity"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shashwatwork/knee-osteoarthritis-dataset-with-severity?dataset_version_number=1...


100%|██████████| 204M/204M [00:03<00:00, 56.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shashwatwork/knee-osteoarthritis-dataset-with-severity/versions/1


## Process data into a small subset

In [2]:
import shutil
import os
from PIL import Image

# Source path from the download cell
source_root = path
# Destination for the reduced dataset
dest_root = '/content/small_dataset'

# Configuration for size reduction
target_size = (128, 128)          # Thumbnail size passed to Image.resize
quality = 50                      # JPEG quality
max_images_per_class = 40         # Limit images per class to ensure we fit in 5MB
max_total_size = 5 * 1024 * 1024  # 5 MB limit


def build_small_dataset(source_root, dest_root, target_size, quality,
                        max_images_per_class, max_total_size,
                        splits=('train', 'val', 'test')):
    """Write a downsized JPEG copy of an image-folder dataset.

    The source is expected to be laid out as ``<source_root>/<split>/<class>/<image>``.
    For each split and class, the first ``max_images_per_class`` entries (in
    sorted filename order) are converted to RGB, resized to ``target_size`` and
    saved as JPEG under ``<dest_root>/<split>/<class>/`` with a ``.jpg``
    extension so the filename matches the encoded format.

    Splits are processed in the order given, and processing stops as soon as
    the next image would push the total past ``max_total_size``; if the cap is
    reached early, later classes/splits receive fewer or no images.

    Args:
        source_root: Directory containing the split sub-directories.
        dest_root: Directory the reduced dataset is written to.
        target_size: Output size handed to ``Image.resize``.
        quality: JPEG quality setting handed to ``Image.save``.
        max_images_per_class: Maximum number of files taken from each class folder.
        max_total_size: Upper bound, in bytes, on the sum of written file sizes.
        splits: Names of the split sub-directories to process.

    Returns:
        Total size in bytes of the files kept in ``dest_root``.
    """
    total_size = 0

    for split in splits:
        source_split = os.path.join(source_root, split)
        if not os.path.exists(source_split):
            print(f"Skipping {split}, not found in source.")
            continue

        # Sorted so the selection is reproducible regardless of directory order.
        for class_name in sorted(os.listdir(source_split)):
            source_class = os.path.join(source_split, class_name)
            if not os.path.isdir(source_class):
                continue

            dest_class = os.path.join(dest_root, split, class_name)
            os.makedirs(dest_class, exist_ok=True)

            # Take a subset of files to ensure we don't exceed size.
            # (Not named `files`, which would shadow `google.colab.files`.)
            image_names = sorted(os.listdir(source_class))[:max_images_per_class]

            for fname in image_names:
                src_file = os.path.join(source_class, fname)
                # The output is always JPEG, so give it a matching extension.
                dst_file = os.path.join(dest_class, os.path.splitext(fname)[0] + '.jpg')
                if os.path.exists(dst_file):
                    # Two sources share a stem (e.g. a.png / a.jpg): keep both.
                    dst_file = os.path.join(dest_class, fname + '.jpg')

                try:
                    with Image.open(src_file) as img:
                        thumb = img.convert('RGB').resize(target_size, Image.Resampling.LANCZOS)
                        thumb.save(dst_file, 'JPEG', quality=quality)
                    file_size = os.path.getsize(dst_file)
                except Exception as e:
                    # Best effort: report and skip unreadable/non-image entries,
                    # without leaving a partially written file behind.
                    print(f"Error processing {fname}: {e}")
                    if os.path.exists(dst_file):
                        os.remove(dst_file)
                    continue

                if total_size + file_size > max_total_size:
                    # Keeping this file would break the size cap; drop it and stop.
                    os.remove(dst_file)
                    return total_size

                total_size += file_size

    return total_size


# Clean up previous runs if any
if os.path.exists(dest_root):
    shutil.rmtree(dest_root)

print(f"Creating reduced dataset at {dest_root} (Target < 5MB)...")

current_total_size = build_small_dataset(
    source_root,
    dest_root,
    target_size=target_size,
    quality=quality,
    max_images_per_class=max_images_per_class,
    max_total_size=max_total_size,
)

print(f"Finished. New dataset size: {current_total_size/1024/1024:.2f} MB")

# Update directory variables to point to the new reduced dataset
train_dir = os.path.join(dest_root, 'train')
val_dir = os.path.join(dest_root, 'val')
test_dir = os.path.join(dest_root, 'test')



Creating reduced dataset at /content/small_dataset (Target < 5MB)...
Finished. New dataset size: 0.91 MB


## Download Zip File

In [3]:
from google.colab import files

# Directory to archive, and the archive's base name (make_archive appends
# the '.zip' extension itself)
dataset_dir = '/content/small_dataset'
output_filename = '/content/small_dataset'
zip_path = f"{output_filename}.zip"

# Pack the dataset directory into a zip archive
print(f"Zipping {dataset_dir}...")
shutil.make_archive(output_filename, 'zip', dataset_dir)

# Confirm the archive exists and report its size in MB
zip_size_bytes = os.path.getsize(zip_path)
zip_size = zip_size_bytes / (1024 * 1024)
print(f"Created {zip_path} ({zip_size:.2f} MB)")

# Trigger the download of the archive
files.download(zip_path)

Zipping /content/small_dataset...
Created /content/small_dataset.zip (0.89 MB)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>