In [10]:
# Import dependencies
import pathlib
import random
import shutil

import matplotlib.pyplot as plt
import torch
import torchvision
from torchvision import transforms, datasets

In [2]:
# Setup path for the directory where data is to be stored.
# The path is relative, so it resolves against the notebook's working directory.
data_dir = pathlib.Path("data")
data_dir

PosixPath('data')

In [3]:
# Build the Food101 training and testing datasets (in that order) with
# identical settings; no image or target transforms are applied.
train_data, test_data = (
    datasets.Food101(root=data_dir,
                     split=split_name,
                     transform=None,
                     target_transform=None,
                     download=True)
    for split_name in ("train", "test")
)

Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to data/food-101.tar.gz


100%|██████████| 4996278331/4996278331 [03:14<00:00, 25733144.92it/s]


Extracting data/food-101.tar.gz to data


In [6]:
# Get the class names exposed by the training dataset and show how many there are
class_names = train_data.classes
class_names, len(class_names)

(['apple_pie',
  'baby_back_ribs',
  'baklava',
  'beef_carpaccio',
  'beef_tartare',
  'beet_salad',
  'beignets',
  'bibimbap',
  'bread_pudding',
  'breakfast_burrito',
  'bruschetta',
  'caesar_salad',
  'cannoli',
  'caprese_salad',
  'carrot_cake',
  'ceviche',
  'cheese_plate',
  'cheesecake',
  'chicken_curry',
  'chicken_quesadilla',
  'chicken_wings',
  'chocolate_cake',
  'chocolate_mousse',
  'churros',
  'clam_chowder',
  'club_sandwich',
  'crab_cakes',
  'creme_brulee',
  'croque_madame',
  'cup_cakes',
  'deviled_eggs',
  'donuts',
  'dumplings',
  'edamame',
  'eggs_benedict',
  'escargots',
  'falafel',
  'filet_mignon',
  'fish_and_chips',
  'foie_gras',
  'french_fries',
  'french_onion_soup',
  'french_toast',
  'fried_calamari',
  'fried_rice',
  'frozen_yogurt',
  'garlic_bread',
  'gnocchi',
  'greek_salad',
  'grilled_cheese_sandwich',
  'grilled_salmon',
  'guacamole',
  'gyoza',
  'hamburger',
  'hot_and_sour_soup',
  'hot_dog',
  'huevos_rancheros',
  'hummu

In [5]:
# Get the length of train_data and test_data
len(train_data), len(test_data)

(75750, 25250)

In [7]:
len(train_data) / (len(train_data) + len(test_data))

0.75

In [9]:
# Build the path to the dataset's image directory
image_dir = data_dir.joinpath("food-101", "images")
image_dir

PosixPath('data/food-101/images')

In [12]:
# Randomly select 3 of the dataset's classes to build a smaller subset from

# Seed Python's `random` module first so the same classes are drawn on a fresh run
random.seed(1)

target_classes = random.sample(class_names, k=3)
target_classes

['cheesecake', 'pancakes', 'takoyaki']

In [22]:
# Collect the training labels (formatted "<class>/<image_id>") whose class,
# i.e. the text before the first "/", is one of the target classes.
with open(data_dir / "food-101" / "meta" / "train.txt", "r") as f:
  # Iterating the file object yields the same lines as readlines() without
  # materialising the whole file in an intermediate list.
  labels = [line.strip("\n") for line in f if line.split("/")[0] in target_classes]
labels

['cheesecake/1001446',
 'cheesecake/1004515',
 'cheesecake/1004807',
 'cheesecake/1017408',
 'cheesecake/1021942',
 'cheesecake/1034351',
 'cheesecake/1035453',
 'cheesecake/1037609',
 'cheesecake/1047482',
 'cheesecake/1055684',
 'cheesecake/1057054',
 'cheesecake/1061232',
 'cheesecake/1061989',
 'cheesecake/106769',
 'cheesecake/1077491',
 'cheesecake/1083139',
 'cheesecake/1085860',
 'cheesecake/1086179',
 'cheesecake/1093939',
 'cheesecake/1096029',
 'cheesecake/109823',
 'cheesecake/1102384',
 'cheesecake/1122445',
 'cheesecake/1123723',
 'cheesecake/1127783',
 'cheesecake/1132368',
 'cheesecake/1135010',
 'cheesecake/1142319',
 'cheesecake/1142851',
 'cheesecake/1153138',
 'cheesecake/1159749',
 'cheesecake/1159762',
 'cheesecake/1179302',
 'cheesecake/1197481',
 'cheesecake/1207977',
 'cheesecake/1210888',
 'cheesecake/1211501',
 'cheesecake/1211627',
 'cheesecake/1217686',
 'cheesecake/1221451',
 'cheesecake/124545',
 'cheesecake/1248580',
 'cheesecake/125965',
 'cheesecake/12

In [23]:
# Number of images that make up 20% of the selected labels
number_to_sample = round(0.2 * len(labels))
number_to_sample

450

In [27]:
target_classes

['cheesecake', 'pancakes', 'takoyaki']

In [28]:
# Draw one flat random sample across all target classes, then tally how many
# images each class received.
sampled_images = random.sample(labels,
                               k=number_to_sample)

# Start every target class at zero so a class that received no image still shows up
class_dict = {class_name: 0 for class_name in target_classes}

for label in sampled_images:
  # Labels look like "<class>/<image_id>"; key the tally on the class itself
  # rather than on hard-coded names, so it stays correct if target_classes changes.
  class_dict[label.split("/")[0]] += 1

class_dict

{'cheesecake': 152, 'pancakes': 161, 'takoyaki': 137}

In [29]:
152 + 161 + 137

450

In [30]:
# The flat random draw left the classes unbalanced (152/161/137); sample exactly 150 images (20%) from each class instead
data_dir

PosixPath('data')

In [44]:
# Group the training labels (formatted "<class>/<image_id>") by target class.
# The metadata file is read once and reused for every class; the `with` block
# closes it, so no explicit close() is needed.
with open(data_dir / "food-101" / "meta" / "train.txt", "r") as f:
  lines = f.readlines()

class_dict = {
    class_name: [line.strip("\n") for line in lines if line.split("/")[0] == class_name]
    for class_name in target_classes
}

# Show how many training images each class has
for key in class_dict.keys():
  print(len(class_dict[key]))

750
750
750


In [45]:
# Keep a random 20% of every class's image list.
# NOTE: class_dict is replaced by its own sample, so re-running this cell
# samples again from the already-reduced lists.
class_dict = {
    key: random.sample(image_list, k=round(0.2*len(image_list)))
    for key, image_list in class_dict.items()
}

# Report the per-class sample sizes
for sampled in class_dict.values():
  print(len(sampled))

150
150
150


In [48]:
# Flatten the per-class samples into a single list, class by class
sampled_images = [label for class_labels in class_dict.values() for label in class_labels]
len(sampled_images)

450

In [78]:
# Tally the sampled images per class; the class is the text before the first '/'
count = dict.fromkeys(target_classes, 0)

for image in sampled_images:
  class_name = image.partition('/')[0]
  count[class_name] = count[class_name] + 1

count

{'cheesecake': 150, 'pancakes': 150, 'takoyaki': 150}

In [49]:
image_dir

PosixPath('data/food-101/images')

In [52]:
f"{image_dir / sampled_images[0]}.jpg"

'data/food-101/images/cheesecake/2026630.jpg'

In [56]:
# Turn every sampled label into the string path of its .jpg file
image_paths = [f"{image_dir / sample_image}.jpg" for sample_image in sampled_images]
image_paths[:5], len(image_paths)

(['data/food-101/images/cheesecake/2026630.jpg',
  'data/food-101/images/cheesecake/615088.jpg',
  'data/food-101/images/cheesecake/912931.jpg',
  'data/food-101/images/cheesecake/3001984.jpg',
  'data/food-101/images/cheesecake/2805331.jpg'],
 450)

In [58]:
image_dir, data_dir

(PosixPath('data/food-101/images'), PosixPath('data'))

In [102]:
def get_subset(image_dir,
               target_classes=['cheesecake', 'pancakes', 'takoyaki'],
               data_splits=['train', 'test'],
               amount=0.2):
  """Randomly sample a fraction of the images of selected classes for each split.

  For every split, the labels listed in `<image_dir>/../meta/<split>.txt`
  (one "<class>/<image_id>" entry per line) are filtered to the target
  classes, and `round(amount * n)` labels are drawn per class with
  `random.sample`, where `n` is the number of labels of that class.

  The metadata folder is located relative to `image_dir` rather than through a
  notebook-level global, so the function only depends on its arguments.

  Args:
    image_dir: Path (or string) of the image directory; its sibling folder
      `meta` must contain one `<split>.txt` file per requested split.
    target_classes: Class names to keep.
    data_splits: Names of the splits to sample.
    amount: Fraction of each class's images to keep in every split.

  Returns:
    Dict mapping each split name to a list of image path strings of the form
    `<image_dir>/<class>/<image_id>.jpg`, grouped by class in the order given
    by `target_classes`.

  Raises:
    FileNotFoundError: If a split's metadata file does not exist.
    ValueError: Propagated from `random.sample` when the requested sample
      size is negative or larger than the number of available labels.
  """
  image_dir = pathlib.Path(image_dir)
  meta_dir = image_dir.parent / 'meta'
  label_splits = {}

  for data_split in data_splits:
    # Fresh per-split grouping so nothing carries over from a previous split
    labels_by_class = {class_name: [] for class_name in target_classes}

    # Single pass over the file; the `with` block takes care of closing it
    with open(meta_dir / f'{data_split}.txt', 'r') as f:
      for line in f:
        class_name = line.split("/")[0]
        if class_name in labels_by_class:
          labels_by_class[class_name].append(line.strip("\n"))

    # Sample class by class (dict order follows target_classes) so every class
    # contributes the same fraction and the RNG is consumed in a fixed order
    image_paths = []
    for image_list in labels_by_class.values():
      sampled = random.sample(image_list, k=round(amount * len(image_list)))
      image_paths.extend(str(image_dir / label) + '.jpg' for label in sampled)

    label_splits[data_split] = image_paths

  return label_splits

In [103]:
label_splits = get_subset(image_dir)

In [104]:
len(label_splits['train']), len(label_splits['test'])

(450, 150)

In [105]:
# Verify the class balance of the sampled training paths
class_dict = {class_name: 0 for class_name in target_classes}

for image_path in label_splits['train']:
  # The class is the name of the image's parent folder; reading it through
  # pathlib avoids relying on a fixed number of "/"-separated path components.
  class_dict[pathlib.Path(image_path).parent.name] += 1

class_dict

{'cheesecake': 150, 'pancakes': 150, 'takoyaki': 150}

## Copy training and testing images to dedicated folders

In [111]:
# Destination folder for the sampled subset
target_dir = data_dir.joinpath('cheesecake_pancakes_takoyaki_20_percent')

# Make sure it exists (missing parents are created, an existing folder is fine)
target_dir.mkdir(parents=True, exist_ok=True)
target_dir

PosixPath('data/cheesecake_pancakes_takoyaki_20_percent')

In [135]:
# Copy every sampled image to <target_dir>/<split>/<class>/<file name>
for data_split, split_paths in label_splits.items():
  for image_path in split_paths:
    image_path = pathlib.Path(image_path)
    print(image_path)
    # The class folder is the image's parent directory; .name keeps the full
    # folder name, whereas .stem would drop anything after a dot in it.
    dest_dir = target_dir / data_split / image_path.parent.name
    # exist_ok=True already tolerates an existing folder, so no is_dir() check is needed
    dest_dir.mkdir(parents=True, exist_ok=True)
    print(f'[INFO] Copying {image_path} to {dest_dir}')
    shutil.copy2(image_path, dest_dir / image_path.name)

data/food-101/images/cheesecake/2470410.jpg
[INFO] Copying data/food-101/images/cheesecake/2470410.jpg to data/cheesecake_pancakes_takoyaki_20_percent/train/cheesecake
data/food-101/images/cheesecake/3892959.jpg
[INFO] Copying data/food-101/images/cheesecake/3892959.jpg to data/cheesecake_pancakes_takoyaki_20_percent/train/cheesecake
data/food-101/images/cheesecake/2047998.jpg
[INFO] Copying data/food-101/images/cheesecake/2047998.jpg to data/cheesecake_pancakes_takoyaki_20_percent/train/cheesecake
data/food-101/images/cheesecake/1656986.jpg
[INFO] Copying data/food-101/images/cheesecake/1656986.jpg to data/cheesecake_pancakes_takoyaki_20_percent/train/cheesecake
data/food-101/images/cheesecake/403610.jpg
[INFO] Copying data/food-101/images/cheesecake/403610.jpg to data/cheesecake_pancakes_takoyaki_20_percent/train/cheesecake
data/food-101/images/cheesecake/2129889.jpg
[INFO] Copying data/food-101/images/cheesecake/2129889.jpg to data/cheesecake_pancakes_takoyaki_20_percent/train/chees

## Zip up the images so that they can be transported easily

In [None]:
# Archive only the subset folder. The archive is named after that folder and
# written next to it in data_dir; make_archive appends the ".zip" extension.
zip_file_name = data_dir / target_dir.name
shutil.make_archive(str(zip_file_name),
                    format="zip",
                    root_dir=target_dir)

In [136]:
target_dir

PosixPath('data/cheesecake_pancakes_takoyaki_20_percent')