<a href="https://colab.research.google.com/github/badbloody/diploma2023/blob/main/creating_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the needed libraries

In [1]:
import requests
import os
import shutil
import zipfile
import random

import torchvision
import torchvision.transforms as transforms
import torchvision.models as models

Mount Google Drive so that the downloaded images are stored persistently and do not disappear when the Colab session ends.

In [3]:
# Mount Google Drive under /content/gdrive; the dataset paths used further down
# in this notebook all live below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Defining parameters

In [17]:
""" PARAMETERS """
# Number of entries to extract from the COCO train2017 archive; the extracted
# images are used as content images.
number_of_images = 10
# Folder on the mounted Drive where the content images are saved.
dataset_dir = "/content/gdrive/MyDrive/content_dataset"
# Fraction of the content images that goes into the training split
# (the remainder goes into the validation split).
train_ratio_content = 0.95
# Fraction of the style images intended for training.
# NOTE(review): not referenced by any of the cells shown here - confirm it is used elsewhere.
train_ratio_style = 0.8

In [5]:
# Per-channel (RGB) mean and standard deviation used to normalise the image
# tensors; these are the values conventionally used with torchvision's
# ImageNet-pretrained models.
cnn_normalization_mean = [0.485, 0.456, 0.406]
cnn_normalization_std = [0.229, 0.224, 0.225]
# Side length (in pixels) of the square images fed to the network.
width = 256

# Preprocessing applied to every content and style image:
# resize to width x width, convert to a tensor, then normalise per channel.
data_transform = transforms.Compose([
    transforms.Resize((width, width)),  # driven by `width` instead of a repeated literal
    transforms.ToTensor(),
    transforms.Normalize(mean=cnn_normalization_mean, std=cnn_normalization_std)
])


The function below downloads the COCO 2017 annotations and the full `train2017` image archive, extracts only the desired number of content images into the provided folder, and then deletes the downloaded archives:

In [11]:
def _download_file(url, destination_path):
    """Stream ``url`` to ``destination_path``, failing loudly on HTTP errors."""
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as response:
        # Without this check an error page would be silently saved as a ".zip".
        response.raise_for_status()
        with open(destination_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)


def download_and_extract_coco(dataset_dir, number_of_images):
    """Download the COCO 2017 annotations and a subset of the train2017 images.

    Both archives are downloaded in full into ``dataset_dir``. The annotations
    archive is extracted completely; from the image archive only the first
    ``number_of_images`` file entries are extracted. Both archives are deleted
    once extraction has finished.

    Args:
        dataset_dir: Folder that receives the extracted files (created if missing).
        number_of_images: How many image files to extract from the image archive.

    Raises:
        requests.HTTPError: If either download answers with an error status.
        zipfile.BadZipFile: If a downloaded archive is not a valid zip file.
    """
    annotations_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    images_url = "http://images.cocodataset.org/zips/train2017.zip"
    annotations_path = os.path.join(dataset_dir, "annotations.zip")
    images_path = os.path.join(dataset_dir, "images.zip")

    # create the directory passed as the first argument
    os.makedirs(dataset_dir, exist_ok=True)

    # download annotations zip file
    print("Downloading annotations...")
    _download_file(annotations_url, annotations_path)

    # extract the annotations
    print("Extracting annotations...")
    with zipfile.ZipFile(annotations_path, "r") as zip_ref:
        zip_ref.extractall(dataset_dir)

    # download images zip file
    print("Downloading images...")
    _download_file(images_url, images_path)

    # extracting images
    print("Extracting images...")
    with zipfile.ZipFile(images_path, "r") as zip_ref:
        # Skip directory entries so that they do not use up one of the
        # requested slots: exactly `number_of_images` files are extracted
        # (or all of them, if the archive holds fewer).
        image_members = [info for info in zip_ref.infolist() if not info.is_dir()]
        selected_images = image_members[:number_of_images]
        zip_ref.extractall(dataset_dir, members=selected_images)

    # removing the archives afterwards
    os.remove(annotations_path)
    os.remove(images_path)

    print("Done!")

A helper function that splits the dataset into training and validation subsets


In [36]:
def split_dataset(dataset_dir, train_ratio):
    """Randomly split the entries of ``dataset_dir`` into train and validation folders.

    The output folders are siblings named ``<base>_train`` and ``<base>_val``.
    ``<base>`` is ``dataset_dir`` without a trailing ``/train2017`` component
    (the folder the COCO archive extracts into); for any other path it is
    ``dataset_dir`` itself. Every entry of ``dataset_dir`` is *moved*, so the
    source folder is empty afterwards.

    Args:
        dataset_dir: Folder whose entries are to be split.
        train_ratio: Fraction of the entries moved to the train folder; the
            count is ``int(train_ratio * total)``, the rest go to validation.
    """
    coco_suffix = "/train2017"

    # Ignore trailing slashes so "x/train2017/" is treated like "x/train2017"
    # and the "_train"/"_val" suffix never ends up inside the source folder.
    base_dir = dataset_dir.rstrip("/") or dataset_dir
    # Strip only a *trailing* "/train2017"; a blanket str.replace would also
    # rewrite parent folders that happen to contain the same text.
    if base_dir.endswith(coco_suffix):
        base_dir = base_dir[:-len(coco_suffix)]

    # create directories for train and validation sets
    train_dir = base_dir + "_train"
    val_dir = base_dir + "_val"
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # get the entries of the dataset directory in random order
    files = os.listdir(dataset_dir)
    random.shuffle(files)

    # the first `train_count` shuffled entries go to train, the rest to validation
    train_count = int(train_ratio * len(files))

    for i, file in enumerate(files):
        target_dir = train_dir if i < train_count else val_dir
        shutil.move(os.path.join(dataset_dir, file), os.path.join(target_dir, file))

    print("Dataset has been split into train and validation sets!")

A function that moves the files of a folder into a sub-folder of that folder. We need it because `torchvision.datasets.ImageFolder` expects the images to be organised in class sub-folders inside the root folder, so the content images are placed in a single `train` sub-folder.

In [43]:
def move_files_within_folder(source_folder, destination_folder):
    """Move the regular files directly inside ``source_folder`` into a sub-folder.

    ``destination_folder`` is joined onto the absolute path of
    ``source_folder`` and created when it does not exist yet. Only plain files
    are moved; sub-directories of the source folder stay where they are.
    """
    src_root = os.path.abspath(source_folder)
    dst_root = os.path.join(src_root, destination_folder)

    # Make sure there is a folder to move into
    if not os.path.exists(dst_root):
        os.makedirs(dst_root)

    for entry_name in os.listdir(src_root):
        entry_path = os.path.join(src_root, entry_name)
        # Leave directories (including the destination itself) untouched
        if not os.path.isfile(entry_path):
            continue
        shutil.move(entry_path, os.path.join(dst_root, entry_name))

In [44]:
# Move the training images from <dataset_dir>_train into its "train" sub-folder.
# The destination is passed as an absolute path; move_files_within_folder joins
# it onto the source folder with os.path.join, which keeps an absolute second
# argument as it is.
# NOTE: there is only something to move once split_dataset has filled
# <dataset_dir>_train, so this cell has to be executed after the download/split
# cell further down in the notebook.
source_folder = dataset_dir + "_train"
destination_folder = dataset_dir + "_train/train"

move_files_within_folder(source_folder, destination_folder)

# Creating the content dataset

In [37]:
# Download the COCO archives, extract the requested number of images and split
# them into <dataset_dir>_train and <dataset_dir>_val.
download_and_extract_coco(dataset_dir, number_of_images)
split_dataset(dataset_dir+"/train2017", train_ratio_content)

# ImageFolder needs the images inside a class sub-folder of its root, so the
# training images are moved into <dataset_dir>_train/train right after the
# split. Doing it here keeps the notebook correct when it is run from top to
# bottom: at this point the split has actually produced the files to move.
move_files_within_folder(dataset_dir + "_train", "train")

Downloading annotations...
Extracting annotations...
Downloading images...
Extracting images...
Done!


In [45]:
# Build the content dataset from <dataset_dir>_train. ImageFolder looks for the
# images in class sub-folders of the root (here the "train" sub-folder) and
# applies data_transform to every image it loads.
content_dataset = torchvision.datasets.ImageFolder(root= dataset_dir + "_train", transform= data_transform)

# Creating the style dataset

In [46]:
# Fetch the repository that holds the style images.
# NOTE(review): the clone is created as 'diplomskiSlike' relative to the current
# working directory, while ImageFolder below reads from the Drive path
# '/content/gdrive/MyDrive/diplomskiSlike' - confirm that both refer to the
# same folder (or that the Drive folder is populated separately).
!git clone https://github.com/badbloody/diplomskiSlike

# Build the style dataset with the same preprocessing as the content images.
style_dataset = torchvision.datasets.ImageFolder('/content/gdrive/MyDrive/diplomskiSlike', transform= data_transform)

Cloning into 'diplomskiSlike'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 48 (delta 1), reused 2 (delta 0), pack-reused 42[K
Receiving objects: 100% (48/48), 89.31 MiB | 42.86 MiB/s, done.
Resolving deltas: 100% (1/1), done.


Now the content and style datasets are ready for the training stage.