# Train-Test-Split

This notebook splits the dataset into a train set and a test set.

### Input Data Format: 
* DATA_PATH/<CLASS_Name>/<Image>.jpg

### Output Data Format: 
* DATA_PATH/train/<CLASS_Name>/<Image>.jpg
* DATA_PATH/test/<CLASS_Name>/<Image>.jpg

In [None]:
import numpy as np
import pandas as pd

import os
import shutil
import pathlib
import sys

## Variables

In [2]:
# go to parent directory
# NOTE(review): presumably done so that the project-local `paths` module can be
# imported from there — confirm where `paths.py` lives
os.chdir("..")
import paths

# locations used throughout the notebook, all derived from the `paths` helpers
REPO_DIR = paths.get_repo_path()
ROOT_DIR = REPO_DIR / "Stanford Dogs"
DATA_BASE_PATH = paths.get_data_path() / "stanford-dogs-dataset"
# input: one sub-folder per class, each holding that class's images
DATA_PATH_IN = DATA_BASE_PATH / "images/Images"
# output: the train/ and test/ folders are created below this directory
DATA_PATH_OUT = DATA_BASE_PATH / "splited-data"

# seed passed to np.random.seed before the split so the selection is reproducible
RANDOM_SEED = 42

# set path to repo_dir
os.chdir(REPO_DIR)

In [3]:
# fraction of each class's images that goes into the train set;
# the remaining images of the class form the test set
train_test_split = 0.75

## Splitting

### Naming Conventions 

In [5]:
# Collect the class folder names in sorted order, skipping the macOS
# ".DS_Store" metadata entry if it is present.
classes_dir = sorted(
    entry for entry in os.listdir(DATA_PATH_IN) if entry != ".DS_Store"
)
len(classes_dir), classes_dir[:10]

(120,
 ['n02085620-Chihuahua',
  'n02085782-Japanese_spaniel',
  'n02085936-Maltese_dog',
  'n02086079-Pekinese',
  'n02086240-Shih-Tzu',
  'n02086646-Blenheim_spaniel',
  'n02086910-papillon',
  'n02087046-toy_terrier',
  'n02087394-Rhodesian_ridgeback',
  'n02088094-Afghan_hound'])

In [6]:
# Build a human-readable name for every class folder: treat "_" and "-" alike
# as word separators, drop the leading ID token (e.g. "n02085620"), join the
# remaining words with spaces and title-case the result.
new_classes_names = [
    " ".join(folder_name.replace("_", "-").split("-")[1:]).title()
    for folder_name in classes_dir
]

len(new_classes_names), new_classes_names[:10]

(120,
 ['Chihuahua',
  'Japanese Spaniel',
  'Maltese Dog',
  'Pekinese',
  'Shih Tzu',
  'Blenheim Spaniel',
  'Papillon',
  'Toy Terrier',
  'Rhodesian Ridgeback',
  'Afghan Hound'])

### Copy images to new folders

In [7]:
# Split every class folder into a train and a test subset and copy the images
# into DATA_PATH_OUT/train/<class name>/ and DATA_PATH_OUT/test/<class name>/.
# Seeding right before the loop makes the per-class random draw reproducible.
np.random.seed(RANDOM_SEED)

train_images_num = 0
test_images_num = 0

# create folders for train, test
train_set_location = DATA_PATH_OUT / "train"
os.makedirs(train_set_location, exist_ok=True)
test_set_location = DATA_PATH_OUT / "test"
os.makedirs(test_set_location, exist_ok=True)

for i, image_class in enumerate(classes_dir):
    image_class_path = DATA_PATH_IN / image_class
    # Sorted so the random draw does not depend on the OS listing order.
    # The macOS ".DS_Store" metadata file is filtered out explicitly instead of
    # hiding every possible error behind a bare `except:`.
    images = sorted(
        name for name in os.listdir(image_class_path) if name != ".DS_Store"
    )

    # draw the train images without replacement; everything else is test
    train_set_size = int(len(images) * train_test_split)
    train_set_images = np.random.choice(images, train_set_size, replace=False)
    # set lookup instead of scanning the numpy array once per image
    train_set_lookup = set(train_set_images.tolist())
    test_set_images = [image for image in images if image not in train_set_lookup]

    train_images_num += train_set_size
    test_images_num += len(test_set_images)

    train_set_class_path = train_set_location / new_classes_names[i]
    test_set_class_path = test_set_location / new_classes_names[i]

    # if folders exist, delete them so a re-run does not mix old and new splits
    if os.path.exists(train_set_class_path):
        shutil.rmtree(train_set_class_path)
    if os.path.exists(test_set_class_path):
        shutil.rmtree(test_set_class_path)

    # create folders for class in train and test
    os.makedirs(train_set_class_path, exist_ok=True)
    os.makedirs(test_set_class_path, exist_ok=True)

    # copy images to train and test
    for image in train_set_images:
        image_path = image_class_path / image
        shutil.copy(image_path, train_set_class_path)
    for image in test_set_images:
        image_path = image_class_path / image
        shutil.copy(image_path, test_set_class_path)

print("train set size:", train_images_num)
print("test set size:", test_images_num)

train set size: 15394
test set size: 5186


In [8]:
from json import dump

# Persist the number of images in each subset next to the split data.
test_train_info = dict(
    total_train_images=train_images_num,
    total_test_images=test_images_num,
)

info_file_path = DATA_PATH_OUT / "test_train_info.json"
with open(info_file_path, "w") as info_file:
    dump(test_train_info, info_file)

In [15]:
# show the first 15 file names (sorted) in the class folder that
# `image_class_path` currently points to, i.e. the one the split loop set last
sorted(os.listdir(image_class_path))[:15]

['n02116738_10024.jpg',
 'n02116738_10038.jpg',
 'n02116738_10081.jpg',
 'n02116738_10169.jpg',
 'n02116738_10215.jpg',
 'n02116738_10469.jpg',
 'n02116738_10476.jpg',
 'n02116738_10493.jpg',
 'n02116738_10575.jpg',
 'n02116738_10614.jpg',
 'n02116738_10640.jpg',
 'n02116738_10872.jpg',
 'n02116738_10895.jpg',
 'n02116738_1097.jpg',
 'n02116738_1105.jpg']