In [6]:
!wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000325/data/data.tar.gz
!tar -xzvf data.tar.gz && rm data.tar.gz

--2024-10-01 08:34:10--  https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000325/data/data.tar.gz
Resolving aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)... 52.219.202.39, 52.219.58.122, 52.219.60.46, ...
Connecting to aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)|52.219.202.39|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2161679447 (2.0G) [binary/octet-stream]
Saving to: ‘data.tar.gz’


2024-10-01 08:35:06 (37.4 MB/s) - ‘data.tar.gz’ saved [2161679447/2161679447]



In [69]:
import json
import numpy as np
import os
import shutil
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedGroupKFold

def copy_images(image_ids, src_dir, dest_dir):
    """
    Copies images from the source directory to the destination directory based on image_ids.

    The destination directory is created if it does not exist yet. Images whose
    source file cannot be found are skipped (best effort), but their IDs are
    returned so that an incomplete copy does not pass unnoticed.

    Parameters:
    - image_ids (iterable): Image IDs to copy; each one is looked up as "<image_id>.jpg".
    - src_dir (str): Directory where the images are stored.
    - dest_dir (str): Directory to copy images to (created on demand).

    Returns:
    - list: IDs whose source file was not found, in input order (empty when everything was copied).
    """
    # shutil.copy does not create parent directories, so make sure the target exists.
    os.makedirs(dest_dir, exist_ok=True)

    missing_ids = []
    for image_id in image_ids:
        # NOTE(review): assumes files are named exactly "<image_id>.jpg" (no zero padding,
        # no sub-folders) - confirm against the dataset layout.
        image_filename = f"{image_id}.jpg"
        source_image_path = os.path.join(src_dir, image_filename)

        # Only regular files are copied; anything else is reported as missing.
        if not os.path.isfile(source_image_path):
            missing_ids.append(image_id)
            continue

        shutil.copy(source_image_path, os.path.join(dest_dir, image_filename))

    return missing_ids

def get_distribution(y, num_classes=None):
    """
    Calculates the class distribution as a percentage for the given labels.

    Parameters:
    - y (np.array): Array of non-negative integer class labels.
    - num_classes (int, optional): Number of classes to report (labels 0..num_classes-1).
      When omitted it is inferred as max(y) + 1. Pass it explicitly when comparing
      subsets of one dataset, otherwise a subset that lacks the highest class yields
      a shorter list than the others. Labels >= num_classes are not reported.

    Returns:
    - list: List of class distribution percentages as strings, one entry per class.
      Classes absent from y are reported as '0.00%'. An empty y gives all-zero
      entries (or an empty list when num_classes is not given).
    """
    y = np.asarray(y)
    total = y.size

    if num_classes is None:
        # np.max raises on an empty array, so only infer the width when there is data.
        num_classes = int(np.max(y)) + 1 if total else 0

    if total == 0:
        # Avoid a division by zero: nothing was observed for any class.
        return ['0.00%'] * num_classes

    # tolist() yields plain Python ints, so lookups by range index hit the same keys.
    y_counter = Counter(y.tolist())
    return [f'{y_counter[i] / total:.2%}' for i in range(num_classes)]


def split_data_and_copy_images2(annotation_path, image_dir='dataset/train', n_splits=5, random_state=42):
    """
    Splits dataset annotations into training and validation folds using StratifiedGroupKFold
    and saves one pair of JSON annotation files per fold.

    Every image is its own group, so all annotations of an image land on the same side
    of a split. The stratification label of an image is the category_id of its first
    annotation. Files are written to
    ``data/{n_splits}splits_{random_state}/fold{k}-train.json`` and ``fold{k}-val.json``.
    Besides 'annotations', each file carries the 'categories' list and, when the source
    file has an 'images' list, the image records belonging to that fold.

    Image files themselves are NOT copied by this function; use copy_images() on the
    fold's image ids if physical per-fold copies are needed.

    Parameters:
    - annotation_path (str): Path to the JSON annotation file.
    - image_dir (str): Directory where the images are stored. Currently unused; kept so
      existing calls keep working.
    - n_splits (int): Number of splits for StratifiedGroupKFold.
    - random_state (int): Seed for the shuffled split; also part of the output directory name.

    Returns:
    - pd.DataFrame: Dataframe containing class distributions for original, train, and validation sets.

    Raises:
    - ValueError: If the annotation file contains no annotations.
    """

    # Load the JSON annotation file
    with open(annotation_path) as f:
        data = json.load(f)

    # Read the categories up front so a malformed file fails before anything is written.
    categories = data['categories']

    # Group annotations by image_id (dict insertion order keeps the file's image order)
    image_annotations = {}
    for ann in data['annotations']:
        image_annotations.setdefault(ann['image_id'], []).append(ann)

    if not image_annotations:
        raise ValueError(f"No annotations found in {annotation_path}")

    # Prepare data for StratifiedGroupKFold
    groups = np.array(list(image_annotations))  # Array of image_ids
    # An image may hold several categories; its first annotation decides the stratum.
    y = np.array([anns[0]['category_id'] for anns in image_annotations.values()])
    X = np.ones((len(groups), 1))  # Dummy variable for input

    # All distribution rows must have the same width, even when a fold happens to
    # contain no image of the highest category.
    num_classes = int(np.max(y)) + 1

    def _padded_distribution(labels):
        """Class distribution of `labels`, right-padded with '0.00%' to num_classes entries."""
        row = get_distribution(labels)
        return row + ['0.00%'] * (num_classes - len(row))

    def _fold_payload(image_ids):
        """Build the JSON content for one side of a fold from its image ids."""
        payload = {
            'annotations': [ann for image_id in image_ids for ann in image_annotations[image_id]]
        }
        if isinstance(data.get('images'), list):
            # NOTE(review): assumes COCO-style image records keyed by 'id' - confirm.
            wanted = set(image_ids.tolist())
            payload['images'] = [
                img for img in data['images']
                if isinstance(img, dict) and img.get('id') in wanted
            ]
        payload['categories'] = categories
        return payload

    # Set up StratifiedGroupKFold for splitting the dataset
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Directory to save the split data
    output_dir = f'data/{n_splits}splits_{random_state}'
    os.makedirs(output_dir, exist_ok=True)

    # Class distributions: the original dataset first, then train/val of each fold
    distributions = [_padded_distribution(y)]
    index = ['Original dataset']

    # Perform the split and save annotations for each fold
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
        train_file = os.path.join(output_dir, f'fold{fold + 1}-train.json')
        val_file = os.path.join(output_dir, f'fold{fold + 1}-val.json')

        with open(train_file, 'w') as f:
            json.dump(_fold_payload(groups[train_idx]), f, indent=4)
        with open(val_file, 'w') as f:
            json.dump(_fold_payload(groups[val_idx]), f, indent=4)

        # Collect class distribution information for the current fold
        distributions.append(_padded_distribution(y[train_idx]))
        distributions.append(_padded_distribution(y[val_idx]))
        index.append(f'train - fold {fold + 1}')
        index.append(f'val - fold {fold + 1}')

        print(f"Fold {fold + 1} saved: Train -> {train_file}, Val -> {val_file}")

    # Name the columns by category id rather than by list position, so the labels stay
    # correct if the 'categories' list is not sorted by id. Entries without an 'id' fall
    # back to their position; ids without a category entry are shown as the bare id.
    names_by_id = {cat.get('id', pos): cat['name'] for pos, cat in enumerate(categories)}
    columns = [names_by_id.get(i, str(i)) for i in range(num_classes)]

    distribution_df = pd.DataFrame(distributions, index=index, columns=columns)
    return distribution_df


In [70]:
# Fold counts and seeds to sweep; one output directory is written per (fold count, seed) pair.
n_splits = [5, 8, 10]
random_states = [42, 1005, 7, 333, 1999]

# Keep the class-distribution table of every configuration instead of discarding it,
# so the balance of any split can be inspected afterwards, e.g. split_distributions[(5, 42)].
split_distributions = {}
for split in n_splits:
    for rd_state in random_states:
        split_distributions[(split, rd_state)] = split_data_and_copy_images2(
            'dataset/train.json', n_splits=split, random_state=rd_state
        )

Fold 1 saved: Train -> data/5splits_42/fold1-train.json, Val -> data/5splits_42/fold1-val.json
Fold 2 saved: Train -> data/5splits_42/fold2-train.json, Val -> data/5splits_42/fold2-val.json
Fold 3 saved: Train -> data/5splits_42/fold3-train.json, Val -> data/5splits_42/fold3-val.json
Fold 4 saved: Train -> data/5splits_42/fold4-train.json, Val -> data/5splits_42/fold4-val.json
Fold 5 saved: Train -> data/5splits_42/fold5-train.json, Val -> data/5splits_42/fold5-val.json
Fold 1 saved: Train -> data/5splits_1005/fold1-train.json, Val -> data/5splits_1005/fold1-val.json
Fold 2 saved: Train -> data/5splits_1005/fold2-train.json, Val -> data/5splits_1005/fold2-val.json
Fold 3 saved: Train -> data/5splits_1005/fold3-train.json, Val -> data/5splits_1005/fold3-val.json
Fold 4 saved: Train -> data/5splits_1005/fold4-train.json, Val -> data/5splits_1005/fold4-val.json
Fold 5 saved: Train -> data/5splits_1005/fold5-train.json, Val -> data/5splits_1005/fold5-val.json
Fold 1 saved: Train -> data/5s