Write a function that takes just one argument — the name of the CSV file – and generates a
split of the images into train (70%), validation (15%) and test sets (15%). Each split should
have approximately the same proportion of giraffe images and zebra images (and images
containing both), but the images should be randomly assigned to the splits otherwise. Create
output that shows the split.

In [39]:
%cd /content/drive/MyDrive/hw3-csci4946/

/content/drive/MyDrive/hw3-csci4946


In [40]:
import pandas as pd
import numpy as np

def generate_splits(csv_filename):
    df = pd.read_csv(csv_filename)

    # splits data based on giraffe and zebra samples, splits into 70% training, 15% validation, 15% test
    train, validate, test = np.split(df.sample(frac=1, random_state=42),
                                     [int(.7*len(df)), int(.85*len(df))])

    # prints the splits
    print(f"Train Set: {len(train)} images")
    print(f"Validation Set: {len(validate)} images")
    print(f"Test Set: {len(test)} images")

    return train, validate, test

generate_splits('metadata.csv')

Train Set: 3463 images
Validation Set: 742 images
Test Set: 743 images


(              filename  giraffe  zebra
 151   000000000152.jpg        0      1
 807   000000000808.jpg        0      1
 621   000000000622.jpg        0      1
 3978  000000003979.jpg        1      0
 3998  000000003999.jpg        1      0
 ...                ...      ...    ...
 37    000000000038.jpg        0      1
 974   000000000975.jpg        0      1
 3195  000000003196.jpg        0      1
 983   000000000984.jpg        0      1
 3838  000000003839.jpg        0      1
 
 [3463 rows x 3 columns],
               filename  giraffe  zebra
 4001  000000004002.jpg        0      1
 1418  000000001419.jpg        0      1
 3332  000000003333.jpg        0      1
 1560  000000001561.jpg        0      1
 39    000000000040.jpg        0      1
 ...                ...      ...    ...
 2762  000000002763.jpg        1      0
 846   000000000847.jpg        0      1
 2551  000000002552.jpg        0      1
 2928  000000002929.jpg        0      1
 117   000000000118.jpg        0      1
 
 [742 rows

In [41]:
!ls


images	metadata.csv


Write a subclass of the PyTorch Dataset class that implements the functionality to creates
datasets for your train, validation and test splits. The class should include transformations
to map your image into the appropriate format and to resize to the correct input size for your
network.



In [42]:
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

class CustomDataset(Dataset): # inherits pytorch DataSet class
    def __init__(self, dataframe, root_dir, transform=None): # initializes dataset object using pandas data frame, image directory, and image transformations
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self): # size of dataset
        return len(self.dataframe)


    def __getitem__(self, idx): # retrieves sample for given index, gets and loads image at given index
      img_name = self.dataframe.iloc[idx, 0]
      img_path = os.path.join(self.root_dir, img_name)
      image = Image.open(img_path)


      # retrieves labels for image, presence of zebra or giraffe
      label_giraffe = int(self.dataframe.iloc[idx, 1])
      label_zebra = int(self.dataframe.iloc[idx, 2])
      labels = torch.tensor([label_giraffe, label_zebra])

      if self.transform:
          image = self.transform(image)

      return image, labels, img_name  # Return filename here

    # def __getitem__(self, idx):
    #     img_name = self.dataframe.iloc[idx, 0]
    #     img_path = os.path.join(self.root_dir, img_name)
    #     image = Image.open(img_path)

    #     label_giraffe = int(self.dataframe.iloc[idx, 1])
    #     label_zebra = int(self.dataframe.iloc[idx, 2])
    #     labels = torch.tensor([label_giraffe, label_zebra]) # converts labels to pytorch tensor

    #     if self.transform: # performs transformations on image
    #         image = self.transform(image)

    #     return image, labels

transform = transforms.Compose([ # transformation sequence, resizes images to 224x224
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])


Write code that tests your Dataset class by iterating through the three instances you created
to show they are disjoint and include all images from the provided directory.

In [47]:
import os


def test_datasets(root_dir, train, validate, test):
    # Create dataset instances
    train_dataset = CustomDataset(dataframe=train, root_dir=root_dir, transform=transform)
    val_dataset = CustomDataset(dataframe=validate, root_dir=root_dir, transform=transform)
    test_dataset = CustomDataset(dataframe=test, root_dir=root_dir, transform=transform)

    # Create sets to collect filenames
    train_files = set()
    val_files = set()
    test_files = set()

    # iterate over all instances
    for _, _, img_name in train_dataset:
        train_files.add(img_name)

    for _, _, img_name in val_dataset:
        val_files.add(img_name)

    for _, _, img_name in test_dataset:
        test_files.add(img_name)

    # Check disjoint sets
    assert len(train_files & val_files) == 0, "Overlap between train and validation sets"
    assert len(train_files & test_files) == 0, "Overlap between train and test sets"
    assert len(val_files & test_files) == 0, "Overlap between validation and test sets"

    # Check that all files in the directory are accounted for
    all_files = set(os.listdir(root_dir))
    all_dataset_files = train_files | val_files | test_files
    assert all_files == all_dataset_files, "Not all files from the directory are included in the datasets"

    print("All tests passed!")

train, validate, test = generate_splits('metadata.csv')

test_datasets('/content/drive/MyDrive/hw3-csci4946/images', train, validate, test)


Train Set: 3463 images
Validation Set: 742 images
Test Set: 743 images
All tests passed!


In [54]:
import matplotlib.pyplot as plt

def explore_dataset(dataset, title):
    # Caclulates image counts for zebras, giraffes, both, and neither
    giraffe_count = sum(dataset.dataframe.iloc[:, 1])
    zebra_count = sum(dataset.dataframe.iloc[:, 2])
    both_count = sum((dataset.dataframe.iloc[:, 1] & dataset.dataframe.iloc[:, 2]))
    neither_count = len(dataset) - giraffe_count - zebra_count + both_count

    # Prints statistics
    print(f"{title} - Number of Giraffes: {giraffe_count}")
    print(f"{title} - Number of Zebras: {zebra_count}")
    print(f"{title} - Number of Both: {both_count}")
    print(f"{title} - Number of Neither: {neither_count}")

    # Displays 10 random images, creates 2x5 subplots using random choice
    fig, axs = plt.subplots(2, 5, figsize=(15, 6))
    fig.suptitle(title)
    indices = np.random.choice(len(dataset), 10, replace=False)
    for i, idx in enumerate(indices):
        image, _, _ = dataset[idx]
        ax = axs[i//5, i%5]
        ax.imshow(transforms.ToPILImage()(image))
        ax.axis('off')
    plt.show()

# creates instances again
train_dataset = CustomDataset(dataframe=train, root_dir='/content/drive/MyDrive/hw3-csci4946/images', transform=transform)
val_dataset = CustomDataset(dataframe=validate, root_dir='/content/drive/MyDrive/hw3-csci4946/images', transform=transform)
test_dataset = CustomDataset(dataframe=test, root_dir='/content/drive/MyDrive/hw3-csci4946/images', transform=transform)
for dataset, title in [(train_dataset, "Train Dataset"), (val_dataset, "Validation Dataset"), (test_dataset, "Test Dataset")]: # outputs formatted data
    explore_dataset(dataset, title)


Output hidden; open in https://colab.research.google.com to view.

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
