In [5]:
import pandas as pd

# Load the partition file
partition_file = r"C:\Users\adamf\Desktop\EWB\Raw Data\list_eval_partition.csv"
partitions = pd.read_csv(partition_file)

# Count images in each partition
print(partitions["partition"].value_counts())


partition
0    162770
2     19962
1     19867
Name: count, dtype: int64


In [6]:
# Filter for training images (partition == 0) and sample 20,000
training_partitions = partitions[partitions["partition"] == 0]
training_subset = training_partitions.sample(n=20000, random_state=42)

# Verify
print(training_subset.head())
print(f"Training images sampled: {len(training_subset)}")


          image_id  partition
56353   056354.jpg          0
130367  130368.jpg          0
98886   098887.jpg          0
39402   039403.jpg          0
80964   080965.jpg          0
Training images sampled: 20000


In [7]:
# Filter for validation images (partition == 1) and sample 2,500
validation_partitions = partitions[partitions["partition"] == 1]
validation_subset = validation_partitions.sample(n=2500, random_state=42)

# Verify
print(validation_subset.head())
print(f"Validation images sampled: {len(validation_subset)}")


          image_id  partition
171444  171445.jpg          1
176301  176302.jpg          1
166355  166356.jpg          1
169233  169234.jpg          1
165315  165316.jpg          1
Validation images sampled: 2500


In [None]:
# Filter for test images (partition == 2) and sample 2,500
test_partitions = partitions[partitions["partition"] == 2]
test_subset = test_partitions.sample(n=2500, random_state=42)

# Verify
print(test_subset.head())
print(f"Test images sampled: {len(test_subset)}")


In [8]:
import os

# Define paths for the zip file and output folder
zip_path = r"C:\Users\adamf\Desktop\EWB\Raw Data\archive.zip"
output_folder = r"C:\Users\adamf\Desktop\EWB\Partitioned Data"

# Define subfolder paths
train_folder = os.path.join(output_folder, "train")
validation_folder = os.path.join(output_folder, "validation")
test_folder = os.path.join(output_folder, "test")

# Create subfolders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(validation_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)


In [9]:
import zipfile

def extract_images(subset, destination_folder):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for image_id in subset["image_id"]:
            # Path inside the zip file
            zip_internal_path = f"img_align_celeba/img_align_celeba/{image_id}"
            try:
                # Read and save the file
                file_data = zip_ref.read(zip_internal_path)
                with open(os.path.join(destination_folder, image_id), "wb") as output_file:
                    output_file.write(file_data)
            except KeyError:
                print(f"Image {image_id} not found in the zip file!")


In [10]:
extract_images(training_subset, train_folder)
print(f"Training images extracted to: {train_folder}")


Training images extracted to: C:\Users\adamf\Desktop\EWB\Partitioned Data\train


In [None]:
extract_images(training_subset, train_folder)
print(f"Training images extracted to: {train_folder}")


In [None]:
extract_images(test_subset, test_folder)
print(f"Test images extracted to: {test_folder}")


In [11]:
# Count images in each folder
print(f"Training images: {len(os.listdir(train_folder))}")
print(f"Validation images: {len(os.listdir(validation_folder))}")
print(f"Test images: {len(os.listdir(test_folder))}")


Training images: 22520
Validation images: 2500
Test images: 2500
