# Generate a Small Subset of the Kaggle Knee Osteoarthritis Dataset

In [3]:
import kagglehub
# Kaggle handle ("owner/dataset-slug") of the dataset to fetch.
dataset_handle = "shashwatwork/knee-osteoarthritis-dataset-with-severity"

# Fetch the latest version; `path` is the local directory holding the files
# and is reused by the subset-building cell below.
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'knee-osteoarthritis-dataset-with-severity' dataset.
Path to dataset files: /kaggle/input/knee-osteoarthritis-dataset-with-severity


## Process data into a small subset

In [10]:
import shutil
import os
import random

# Source path from the download cell
source_root = path
# Destination for the reduced dataset
dest_root = '/content/small_dataset'

# Configuration for size reduction
splits = ['test']                  # splits to sample from the source dataset
max_total_size = 50 * 1024 * 1024  # hard cap on the total bytes copied (50 MiB)
subset_fraction = 0.14 # Take 14% of the files from each class


def build_small_dataset(source_root, dest_root, splits, subset_fraction, max_total_size):
    """Copy a deterministic, class-proportional subset of a dataset.

    The source is expected to be laid out as ``<source_root>/<split>/<class>/<file>``.
    For every class directory the first ``subset_fraction`` of its files (in
    sorted order, at least one) is copied to the mirrored location under
    ``dest_root``. Copying stops as soon as the next file would push the total
    past ``max_total_size``, so the cap is never exceeded.

    Args:
        source_root: Root directory of the full dataset.
        dest_root: Root directory that receives the reduced dataset.
        splits: Iterable of split directory names to process (e.g. ``['test']``).
        subset_fraction: Fraction of each class's files to keep.
        max_total_size: Maximum number of bytes to copy in total.

    Returns:
        Total number of bytes copied.
    """
    total_size = 0

    for split in splits:
        source_split = os.path.join(source_root, split)
        if not os.path.isdir(source_split):
            print(f"Skipping {split}, not found in source.")
            continue

        # Sorted so the class order (and therefore what fits under the size
        # cap) does not depend on filesystem enumeration order.
        for class_name in sorted(os.listdir(source_split)):
            source_class = os.path.join(source_split, class_name)
            if not os.path.isdir(source_class):
                continue

            dest_class = os.path.join(dest_root, split, class_name)
            os.makedirs(dest_class, exist_ok=True)

            # Only regular files count toward the fraction; nested directories
            # cannot be copied with copy2 and would skew the per-class limit.
            candidates = sorted(
                entry for entry in os.listdir(source_class)
                if os.path.isfile(os.path.join(source_class, entry))
            )

            # Calculate limit based on proportion (preserving natural imbalance)
            class_limit = max(1, int(len(candidates) * subset_fraction))

            for fname in candidates[:class_limit]:
                src_file = os.path.join(source_class, fname)
                dst_file = os.path.join(dest_class, fname)

                try:
                    file_size = os.path.getsize(src_file)
                    # Check the budget *before* copying so the cap is a true
                    # upper bound rather than being overshot by one file.
                    if total_size + file_size > max_total_size:
                        return total_size
                    # Copy original file to preserve quality
                    shutil.copy2(src_file, dst_file)
                except OSError as e:
                    # Best effort: report the unreadable/uncopyable file and go on.
                    print(f"Error processing {fname}: {e}")
                    continue

                total_size += file_size

    return total_size


# Clean up previous runs if any
if os.path.exists(dest_root):
    shutil.rmtree(dest_root)

current_total_size = build_small_dataset(
    source_root, dest_root, splits, subset_fraction, max_total_size
)

print(f"Finished. New dataset size: {current_total_size/1024/1024:.2f} MB")

# Update directory variables to point to the new reduced dataset.
# NOTE: only the splits listed in `splits` are copied, so with the default
# ['test'] the train/val directories below are never created. Check for
# existence (or add the splits above) before using them downstream.
train_dir = os.path.join(dest_root, 'train')
val_dir = os.path.join(dest_root, 'val')
test_dir = os.path.join(dest_root, 'test')

Finished. New dataset size: 4.71 MB


## Download Zip File

In [11]:
from google.colab import files

# Directory to archive, and the archive's base name (make_archive appends ".zip").
dataset_dir = '/content/small_dataset'
output_filename = '/content/small_dataset'

# Pack the contents of dataset_dir into <output_filename>.zip
print(f"Zipping {dataset_dir}...")
shutil.make_archive(base_name=output_filename, format='zip', root_dir=dataset_dir)

# Report where the archive landed and how large it is
zip_path = f"{output_filename}.zip"
bytes_per_mb = 1024 * 1024
zip_size = os.path.getsize(zip_path) / bytes_per_mb
print(f"Created {zip_path} ({zip_size:.2f} MB)")

# Hand the archive to the browser for download (Colab only)
files.download(zip_path)

Zipping /content/small_dataset...
Created /content/small_dataset.zip (4.74 MB)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>