<a href="https://colab.research.google.com/github/aliroshandel98/Pytorch/blob/main/Custom_Data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import torch
import torchvision

import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Setup data directory
import pathlib
data_dir = pathlib.Path("../data_2")

In [16]:
# Get training data
train_data = datasets.Food101(root=data_dir,
                              split="train",
                              # transform=transforms.ToTensor(),
                              download=True)

# Get testing data
test_data = datasets.Food101(root=data_dir,
                             split="test",
                             # transform=transforms.ToTensor(),
                             download=True)

Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to ../data_2/food-101.tar.gz


100%|██████████| 4996278331/4996278331 [03:03<00:00, 27218475.77it/s]


Extracting ../data_2/food-101.tar.gz to ../data_2


In [17]:
class_names = train_data.classes
class_names[:10]

['apple_pie',
 'baby_back_ribs',
 'baklava',
 'beef_carpaccio',
 'beef_tartare',
 'beet_salad',
 'beignets',
 'bibimbap',
 'bread_pudding',
 'breakfast_burrito']

In [18]:
# Get random 10% of training images
import random

# Setup data paths
data_path = data_dir / "food-101" / "images"
target_classes = ["pizza", "steak", "sushi"]

# Change amount of data to get (e.g. 0.1 = random 10%, 0.2 = random 20%)
amount_to_get = 0.5

# Create function to separate a random amount of data
def get_subset(image_path=data_path,
               data_splits=["train", "test"],
               target_classes=["pizza", "steak", "sushi"],
               amount=0.1,
               seed=42):
    random.seed(42)
    label_splits = {}

    # Get labels
    for data_split in data_splits:
        print(f"[INFO] Creating image split for: {data_split}...")
        label_path = data_dir / "food-101" / "meta" / f"{data_split}.txt"
        with open(label_path, "r") as f:
            labels = [line.strip("\n") for line in f.readlines() if line.split("/")[0] in target_classes]

        # Get random subset of target classes image ID's
        number_to_sample = round(amount * len(labels))
        print(f"[INFO] Getting random subset of {number_to_sample} images for {data_split}...")
        sampled_images = random.sample(labels, k=number_to_sample)

        # Apply full paths
        image_paths = [pathlib.Path(str(image_path / sample_image) + ".jpg") for sample_image in sampled_images]
        label_splits[data_split] = image_paths
    return label_splits

label_splits = get_subset(amount=amount_to_get)
label_splits["train"][:10]

[INFO] Creating image split for: train...
[INFO] Getting random subset of 1125 images for train...
[INFO] Creating image split for: test...
[INFO] Getting random subset of 375 images for test...


[PosixPath('../data_2/food-101/images/pizza/3269634.jpg'),
 PosixPath('../data_2/food-101/images/pizza/1524655.jpg'),
 PosixPath('../data_2/food-101/images/steak/2825100.jpg'),
 PosixPath('../data_2/food-101/images/steak/225990.jpg'),
 PosixPath('../data_2/food-101/images/steak/1839481.jpg'),
 PosixPath('../data_2/food-101/images/pizza/38349.jpg'),
 PosixPath('../data_2/food-101/images/pizza/3018077.jpg'),
 PosixPath('../data_2/food-101/images/sushi/93139.jpg'),
 PosixPath('../data_2/food-101/images/pizza/2702825.jpg'),
 PosixPath('../data_2/food-101/images/sushi/200025.jpg')]

In [19]:
# Create target directory path
target_dir_name = f"../data_2/pizza_steak_sushi_{str(int(amount_to_get*100))}_percent"
print(f"Creating directory: '{target_dir_name}'")

# Setup the directories
target_dir = pathlib.Path(target_dir_name)

# Make the directories
target_dir.mkdir(parents=True, exist_ok=True)

Creating directory: '../data_2/pizza_steak_sushi_50_percent'


In [20]:
import shutil

for image_split in label_splits.keys():
    for image_path in label_splits[str(image_split)]:
        dest_dir = target_dir / image_split / image_path.parent.stem / image_path.name
        if not dest_dir.parent.is_dir():
            dest_dir.parent.mkdir(parents=True, exist_ok=True)
        print(f"[INFO] Copying {image_path} to {dest_dir}...")
        shutil.copy2(image_path, dest_dir)

[INFO] Copying ../data_2/food-101/images/pizza/3269634.jpg to ../data_2/pizza_steak_sushi_50_percent/train/pizza/3269634.jpg...
[INFO] Copying ../data_2/food-101/images/pizza/1524655.jpg to ../data_2/pizza_steak_sushi_50_percent/train/pizza/1524655.jpg...
[INFO] Copying ../data_2/food-101/images/steak/2825100.jpg to ../data_2/pizza_steak_sushi_50_percent/train/steak/2825100.jpg...
[INFO] Copying ../data_2/food-101/images/steak/225990.jpg to ../data_2/pizza_steak_sushi_50_percent/train/steak/225990.jpg...
[INFO] Copying ../data_2/food-101/images/steak/1839481.jpg to ../data_2/pizza_steak_sushi_50_percent/train/steak/1839481.jpg...
[INFO] Copying ../data_2/food-101/images/pizza/38349.jpg to ../data_2/pizza_steak_sushi_50_percent/train/pizza/38349.jpg...
[INFO] Copying ../data_2/food-101/images/pizza/3018077.jpg to ../data_2/pizza_steak_sushi_50_percent/train/pizza/3018077.jpg...
[INFO] Copying ../data_2/food-101/images/sushi/93139.jpg to ../data_2/pizza_steak_sushi_50_percent/train/sushi

In [21]:
# Check lengths of directories
def walk_through_dir(dir_path):
  """
  Walks through dir_path returning its contents.
  Args:
    dir_path (str): target directory

  Returns:
    A print out of:
      number of subdiretories in dir_path
      number of images (files) in each subdirectory
      name of each subdirectory
  """
  import os
  for dirpath, dirnames, filenames in os.walk(dir_path):
    print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")

walk_through_dir(target_dir)

There are 2 directories and 0 images in '../data_2/pizza_steak_sushi_50_percent'.
There are 3 directories and 0 images in '../data_2/pizza_steak_sushi_50_percent/test'.
There are 0 directories and 134 images in '../data_2/pizza_steak_sushi_50_percent/test/steak'.
There are 0 directories and 121 images in '../data_2/pizza_steak_sushi_50_percent/test/sushi'.
There are 0 directories and 120 images in '../data_2/pizza_steak_sushi_50_percent/test/pizza'.
There are 3 directories and 0 images in '../data_2/pizza_steak_sushi_50_percent/train'.
There are 0 directories and 380 images in '../data_2/pizza_steak_sushi_50_percent/train/steak'.
There are 0 directories and 359 images in '../data_2/pizza_steak_sushi_50_percent/train/sushi'.
There are 0 directories and 386 images in '../data_2/pizza_steak_sushi_50_percent/train/pizza'.


In [22]:
# Zip pizza_steak_sushi images
zip_file_name = data_dir / f"pizza_steak_sushi_{str(int(amount_to_get*100))}_percent"
shutil.make_archive(zip_file_name,
                    format="zip",
                    root_dir=target_dir)

'/data_2/pizza_steak_sushi_50_percent.zip'

In [23]:
!ls -la ../data/

total 4956220
drwxr-xr-x 4 root root       4096 Jul  2 13:45 .
drwxr-xr-x 1 root root       4096 Jul  2 15:03 ..
drwxr-xr-x 4 3156  320       4096 Jul  9  2014 food-101
-rw-r--r-- 1 root root 4996278331 Jul  2 13:43 food-101.tar.gz
drwxr-xr-x 4 root root       4096 Jul  2 13:45 pizza_steak_sushi_50_percent
-rw-r--r-- 1 root root   78864463 Jul  2 13:45 pizza_steak_sushi_50_percent.zip


In [24]:
!mkdir -p pizza_steak_sushi
!unzip ../data/pizza_steak_sushi_50_percent.zip -d pizza_steak_sushi

Archive:  ../data/pizza_steak_sushi_50_percent.zip
replace pizza_steak_sushi/test/steak/2989645.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: pizza_steak_sushi/test/steak/2989645.jpg  
replace pizza_steak_sushi/test/steak/3553838.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace pizza_steak_sushi/test/steak/3553838.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace pizza_steak_sushi/test/steak/3553838.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace pizza_steak_sushi/test/steak/3553838.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: pizza_steak_sushi/test/steak/3553838.jpg  
  inflating: pizza_steak_sushi/test/steak/289822.jpg  
  inflating: pizza_steak_sushi/test/steak/2921355.jpg  
  inflating: pizza_steak_sushi/test/steak/1792128.jpg  
  inflating: pizza_steak_sushi/test/steak/2649745.jpg  
  inflating: pizza_steak_sushi/test/steak/562855.jpg  
  infla

In [25]:
!ls ../data

food-101  food-101.tar.gz  pizza_steak_sushi_50_percent  pizza_steak_sushi_50_percent.zip


In [26]:
walk_through_dir("pizza_steak_sushi")

There are 2 directories and 0 images in 'pizza_steak_sushi'.
There are 3 directories and 0 images in 'pizza_steak_sushi/test'.
There are 0 directories and 134 images in 'pizza_steak_sushi/test/steak'.
There are 0 directories and 121 images in 'pizza_steak_sushi/test/sushi'.
There are 0 directories and 120 images in 'pizza_steak_sushi/test/pizza'.
There are 3 directories and 0 images in 'pizza_steak_sushi/train'.
There are 0 directories and 380 images in 'pizza_steak_sushi/train/steak'.
There are 0 directories and 359 images in 'pizza_steak_sushi/train/sushi'.
There are 0 directories and 386 images in 'pizza_steak_sushi/train/pizza'.
