## **ILSVRC Data Extraction from 155GB zipped File**

This script extracts a random sample of training images (100 classes, up to 500 images per class) from the 155GB zipped ILSVRC archive.

In [None]:
import zipfile
import pandas as pd
import os
import random
import shutil
from pathlib import Path

# Fix the seed of Python's random module so repeated runs are reproducible
random.seed(42)

# Archive prefix of the training images; it is used to filter the archive
# listing and to split each member path into class name and image name
target_path = "ILSVRC/Data/CLS-LOC/train/"

# Location of the source ZIP archive
zip_path = "D:/DeepLearning_CS-898BD/Task1_dataset/imagenet-object-localization-challenge.zip"

# Local directory that receives the extracted images; create it (together
# with any missing parent directories) before anything is written
local_folder = 'my_dataset/train'
Path(local_folder).mkdir(parents=True, exist_ok=True)

# Read the archive listing. namelist() returns the member names only, so no
# image data is extracted at this point.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    all_paths = zip_ref.namelist()

# Collect one (class name, image file name, full archive path) tuple for
# every training image found in the listing
data = []
for path in all_paths:
    # Keep only .jpeg members (any letter case) whose path contains target_path
    if target_path not in path or not path.lower().endswith('.jpeg'):
        continue

    # The text that follows target_path has the form "<class>/<image name>"
    remainder = path.split(target_path)[1]
    class_name, slash, _ = remainder.partition('/')
    if slash:  # skip entries that have no class directory component
        data.append((class_name, os.path.basename(path), path))

# Store the collected tuples in a DataFrame, one row per image
df = pd.DataFrame(data, columns=['Class', 'ImageFile', 'FullPath'])
print(f"Found {len(df)} images in {len(df['Class'].unique())} classes")

# Stop early with an actionable message when the scan matched no images.
# Without this check the script fails further down in pd.concat() with the
# much less helpful "No objects to concatenate".
if df.empty:
    raise ValueError(
        f"No .jpeg images found under '{target_path}' in {zip_path}; nothing to sample"
    )

# Randomly select up to 100 classes. The fixed random_state keeps the
# selection reproducible between runs.
unique_classes = df['Class'].unique()
selected_classes = pd.Series(unique_classes).sample(n=min(100, len(unique_classes)), random_state=42).tolist()

# For each selected class, draw a random sample of up to 500 images
# (classes with fewer than 500 images contribute all of their images)
sampled_data = []
for class_name in selected_classes:
    class_images = df[df['Class'] == class_name]
    sampled_images = class_images.sample(n=min(500, len(class_images)), random_state=42)
    sampled_data.append(sampled_images)

# Combine the per-class samples into a single DataFrame holding
# up to 100 x 500 rows
sampled_df = pd.concat(sampled_data)
print(f"Selected {len(sampled_df)} images from {len(selected_classes)} classes")

# Final step: write every sampled image to <local_folder>/<class>/<image file>.
#
# Each archive member is streamed straight from the ZIP into its destination
# file. Nothing is extracted to an intermediate location, so every image is
# written to disk once (instead of being extracted, copied and deleted again)
# and no temporary directory tree is left behind if the run is interrupted.
output_root = Path(local_folder)

# Create each class directory once up front instead of once per image
for class_name in sampled_df['Class'].unique():
    (output_root / class_name).mkdir(parents=True, exist_ok=True)

failed_count = 0
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    rows = zip(sampled_df['FullPath'], sampled_df['Class'], sampled_df['ImageFile'])
    for full_path, class_name, image_file in rows:
        target_dir = output_root / class_name
        dest_file = target_dir / image_file

        try:
            # The archive member is opened first, so a missing member raises
            # KeyError before the destination file is created.
            with zip_ref.open(full_path) as src, open(dest_file, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        except KeyError:
            failed_count += 1
            print(f"File not found in ZIP: {full_path}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next image
            failed_count += 1
            print(f"Error processing {full_path}: {e}")
            # A failed copy can leave a truncated image behind; remove it so
            # the dataset never contains a partially written file.
            try:
                dest_file.unlink()
            except OSError:
                pass  # nothing was written, or the file cannot be removed
        else:
            print(f"Extracted: {image_file} to {target_dir}")

if failed_count:
    print(f"Warning: {failed_count} of {len(sampled_df)} images could not be extracted")

print("Extraction complete!")

Found 1281167 images in 1000 classes
Selected 50000 images from 100 classes
Extracted: n03133878_2015.JPEG to my_dataset\train\n03133878
Extracted: n03133878_7055.JPEG to my_dataset\train\n03133878
Extracted: n03133878_8572.JPEG to my_dataset\train\n03133878
Extracted: n03133878_11508.JPEG to my_dataset\train\n03133878
Extracted: n03133878_7347.JPEG to my_dataset\train\n03133878
Extracted: n03133878_3060.JPEG to my_dataset\train\n03133878
Extracted: n03133878_7856.JPEG to my_dataset\train\n03133878
Extracted: n03133878_6966.JPEG to my_dataset\train\n03133878
Extracted: n03133878_204.JPEG to my_dataset\train\n03133878
Extracted: n03133878_2425.JPEG to my_dataset\train\n03133878
Extracted: n03133878_13546.JPEG to my_dataset\train\n03133878
Extracted: n03133878_7993.JPEG to my_dataset\train\n03133878
Extracted: n03133878_931.JPEG to my_dataset\train\n03133878
Extracted: n03133878_4854.JPEG to my_dataset\train\n03133878
Extracted: n03133878_13915.JPEG to my_dataset\train\n03133878
Extracte