<a href="https://colab.research.google.com/github/badbloody/diploma2023/blob/main/creating_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the needed libraries

In [None]:
import requests
import os
import shutil
import zipfile
import random

import torchvision
import torchvision.transforms as transforms
import torchvision.models as models

Connecting to Google Drive so that the downloaded images are stored safely and do not disappear when the Colab session ends.

In [None]:
# Mount Google Drive under /content/gdrive so files written there outlive the Colab session.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


# Defining parameters

In [None]:
""" PARAMETERS """
# Requested number of COCO images to extract as content images
# (passed to download_and_extract_coco as the extraction limit).
number_of_images = 10000
# Folder where the content images are saved (on the mounted Google Drive).
dataset_dir = "/content/gdrive/MyDrive/content_dataset"
# Fraction of the content images used for training; the rest go to validation.
train_ratio_content = 0.99
# Fraction of the style images used for training; the rest go to validation.
train_ratio_style = 0.8

In [None]:
""" DON'T CHANGE THESE """

# Per-channel mean and standard deviation used to normalize image tensors.
# NOTE(review): these look like the ImageNet statistics expected by torchvision's
# pretrained models - confirm against the model used in training.
cnn_normalization_mean = [0.485, 0.456, 0.406]
cnn_normalization_std = [0.229, 0.224, 0.225]

# Preprocessing applied to every image when a dataset is built:
# resize to 256x256, convert to a tensor, then normalize each channel.
data_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=cnn_normalization_mean, std=cnn_normalization_std)
])


# Helper functions

A function that will download the desired number of content images from the COCO dataset and store them in the provided folder:

In [None]:
def download_and_extract_coco(dataset_dir, number_of_images):
    """Download the COCO 2017 annotations and a subset of the training images.

    The annotations archive is extracted in full. From the train2017 image
    archive only the first ``number_of_images`` image files are extracted.
    Both zip archives are deleted once extraction has finished.

    Args:
        dataset_dir: Folder that receives the extracted data; created if missing.
        number_of_images: How many image files to extract from the image archive.

    Raises:
        requests.HTTPError: If either download returns an HTTP error status.
    """
    annotations_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    images_url = "http://images.cocodataset.org/zips/train2017.zip"
    annotations_path = os.path.join(dataset_dir, "annotations.zip")
    images_path = os.path.join(dataset_dir, "images.zip")

    # create the directory passed as the first argument
    os.makedirs(dataset_dir, exist_ok=True)

    print("Downloading annotations...")
    _download_file(annotations_url, annotations_path)

    print("Extracting annotations...")
    with zipfile.ZipFile(annotations_path, "r") as zip_ref:
        zip_ref.extractall(dataset_dir)

    print("Downloading images...")
    _download_file(images_url, images_path)

    print("Extracting images...")
    with zipfile.ZipFile(images_path, "r") as zip_ref:
        # The archive lists its folder as an entry of its own; skip directory
        # entries so they do not use up one of the requested image slots.
        image_members = [member for member in zip_ref.infolist() if not member.is_dir()]
        zip_ref.extractall(dataset_dir, members=image_members[:number_of_images])

    # the archives are no longer needed once their contents are extracted
    os.remove(annotations_path)
    os.remove(images_path)

    print("Done!")


def _download_file(url, destination_path):
    """Stream ``url`` to ``destination_path``, raising on HTTP error statuses."""
    # The context manager releases the connection even if writing fails;
    # the timeout keeps a stalled connection from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=60) as response:
        # Without this check an error page would be saved as a broken ".zip".
        response.raise_for_status()
        with open(destination_path, "wb") as file:
            shutil.copyfileobj(response.raw, file)

A helper function that splits the dataset into training and validation subsets


In [None]:
from IPython.utils.path import target_update
def split_dataset(dataset_dir, train_ratio):
    """Randomly move the entries of ``dataset_dir`` into train and validation folders.

    The target folders sit next to the dataset: ``<dir>_train`` and
    ``<dir>_val``. If the path contains ``/train2017``, that part is replaced
    by the suffix instead (``.../x/train2017`` -> ``.../x_train``).

    Args:
        dataset_dir: Folder whose entries are split; it is left empty afterwards.
        train_ratio: Fraction (0..1) of the entries that go to the train folder.
    """
    # create directories for train and validation sets
    if "/train2017" in dataset_dir:
        train_dir = dataset_dir.replace("/train2017", "_train")
        val_dir = dataset_dir.replace("/train2017", "_val")
    else:
        train_dir = dataset_dir + "_train"
        val_dir = dataset_dir + "_val"

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # get the list of files in the dataset directory, in random order
    files = os.listdir(dataset_dir)
    random.shuffle(files)

    # Round away floating-point noise before truncating, so that e.g.
    # 0.29 * 100 (= 28.999999999999996) yields 29 training files, not 28.
    train_count = int(round(train_ratio * len(files), 6))

    # the first train_count shuffled entries go to train, the rest to validation
    for i, file in enumerate(files):
        target_dir = train_dir if i < train_count else val_dir
        shutil.move(os.path.join(dataset_dir, file), os.path.join(target_dir, file))

    print("Dataset has been split into train and validation sets!")

A function that moves the files of a folder into a subfolder of that folder. We use it to create a `train` folder inside our content folder, because `torchvision.datasets.ImageFolder` expects the images to be organized in class subfolders in order to create a dataset.

In [None]:
def move_files_within_folder(source_folder, destination_folder):
    """Move every regular file of ``source_folder`` into ``destination_folder``.

    ``destination_folder`` is resolved relative to the absolute source path
    (an absolute destination is used as given) and is created if it does not
    exist. Subdirectories of the source folder are left where they are.
    """
    src_root = os.path.abspath(source_folder)
    target_root = os.path.join(src_root, destination_folder)

    # make sure the target exists before anything is moved
    if not os.path.exists(target_root):
        os.makedirs(target_root)

    for entry in os.listdir(src_root):
        entry_path = os.path.join(src_root, entry)

        # directories (including the target itself) are skipped
        if not os.path.isfile(entry_path):
            continue

        shutil.move(entry_path, os.path.join(target_root, entry))

# Creating the content dataset

In [None]:
# Download the COCO annotations and images into dataset_dir, using number_of_images as the extraction limit.
download_and_extract_coco(dataset_dir, number_of_images)

Downloading annotations...
Extracting annotations...
Downloading images...
Extracting images...
Done!


In [None]:
# Split the images in <dataset_dir>/train2017; split_dataset turns the "/train2017"
# suffix into "_train" / "_val", so the results land in <dataset_dir>_train and <dataset_dir>_val.
split_dataset(dataset_dir+"/train2017", train_ratio_content)

Dataset has been split into train and validation sets!


In [None]:
# Move the training images into a "train" subfolder so they sit in a class subfolder.
source_folder = dataset_dir + "_train"
# This is an absolute path; the os.path.join inside move_files_within_folder keeps it as is.
destination_folder = dataset_dir + "_train/train"

move_files_within_folder(source_folder, destination_folder)

In [None]:
# Sanity check: number of entries now in the "train" subfolder.
print(len(os.listdir(destination_folder)))

9899


In [None]:
# Build the content dataset from <dataset_dir>_train, applying data_transform to each image.
content_dataset = torchvision.datasets.ImageFolder(root= dataset_dir + "_train", transform= data_transform)

# Creating the style dataset

First we clone the repository from GitHub:

In [None]:
!git clone https://github.com/badbloody/diplomskiSlike

fatal: destination path 'diplomskiSlike' already exists and is not an empty directory.


Here we define the paths of our folders. The `source_folder` is the one we cloned from GitHub; `train_folder` and `val_folder` are the paths where the datasets will be stored after the split. These two folders don't have to exist beforehand - the function `split_style_dataset` will create them.

In [None]:
# Folder cloned from GitHub that holds the style images.
source_folder = '/content/diplomskiSlike'
# Output folders for the train / validation split of the style images.
train_folder = '/content/style_train'
val_folder = '/content/style_val'

A function that splits our style images into train and validation sets. It differs from the function we used to split the content images because our style dataset contains multiple classes, each in its own subfolder.

In [None]:
def split_style_dataset(source_folder, train_folder, val_folder, validation_split=0.2, random_seed=None):
    """Copy a class-per-folder image dataset into train and validation folders.

    Every non-hidden subfolder of ``source_folder`` is treated as one class.
    For each class, a random ``validation_split`` fraction of its ``.jpg`` /
    ``.png`` images is copied to ``val_folder/<class>`` and the rest to
    ``train_folder/<class>``. The source images are left in place.

    Args:
        source_folder: Folder containing one subfolder per class.
        train_folder: Destination root of the training split (created if missing).
        val_folder: Destination root of the validation split (created if missing).
        validation_split: Fraction (0..1) of each class used for validation.
        random_seed: If not None, seeds ``random`` so the split is reproducible.

    Returns:
        None. If ``source_folder`` does not exist, a message is printed and
        nothing is copied.
    """
    if not os.path.exists(source_folder):
        print("Source folder does not exist.")
        return

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)

    # Hidden folders such as ".git" are repository metadata, not image classes;
    # copying them over leaves empty class folders in the output datasets.
    # Sorting makes a seeded split independent of the file system's listing order.
    class_folders = sorted(
        folder for folder in os.listdir(source_folder)
        if not folder.startswith('.') and os.path.isdir(os.path.join(source_folder, folder))
    )

    if random_seed is not None:
        random.seed(random_seed)

    for class_folder in class_folders:
        class_source_path = os.path.join(source_folder, class_folder)
        class_train_path = os.path.join(train_folder, class_folder)
        class_val_path = os.path.join(val_folder, class_folder)

        os.makedirs(class_train_path, exist_ok=True)
        os.makedirs(class_val_path, exist_ok=True)

        images = sorted(img for img in os.listdir(class_source_path) if img.endswith(('.jpg', '.png')))

        # Round away floating-point noise before truncating: a split given as
        # 1 - 0.8 is 0.19999999999999996, which would turn 10 images into
        # 1 validation sample instead of 2.
        num_val_samples = int(round(len(images) * validation_split, 6))
        # a set gives O(1) membership tests in the copy loop below
        val_samples = set(random.sample(images, num_val_samples))

        for img in images:
            target_path = class_val_path if img in val_samples else class_train_path
            shutil.copy(os.path.join(class_source_path, img), os.path.join(target_path, img))

    print("Split completed!")

In [None]:
# The function takes the validation fraction, so convert the training ratio;
# a fixed seed is passed so the split can be repeated.
split_style_dataset(source_folder, train_folder, val_folder, validation_split=1-train_ratio_style, random_seed=42)

Split completed!


Calling `torchvision.datasets.ImageFolder` raises an error when there is a `.git` folder inside our dataset folders (it contains no images, yet it is picked up as a class folder), so here is a helper function that removes such folders:

In [None]:
def remove_git_folders(directory):
    """Delete every ``.git`` folder found anywhere below ``directory``.

    The path of each removed folder is printed.
    """
    for current_dir, subdirs, _files in os.walk(directory):
        if '.git' not in subdirs:
            continue

        git_folder_path = os.path.join(current_dir, '.git')
        shutil.rmtree(git_folder_path)
        print(f"Removed .git folder at: {git_folder_path}")
        # drop it from the walk list so os.walk does not try to enter the deleted folder
        subdirs.remove('.git')

# Clean the training split; only this folder is cleaned here (the validation folder is not).
directory_to_clean = '/content/style_train'
remove_git_folders(directory_to_clean)

Removed .git folder at: /content/style_train/.git


In [None]:
# Build the style dataset from train_folder, applying data_transform to each image.
style_dataset = torchvision.datasets.ImageFolder(train_folder, transform= data_transform)

Now the content and style datasets are ready for the training stage.