# Train-Test-Split

This notebooks splits the dataset into train and test dataset.

### Input Data Format: 
* DATA_PATH/<CLASS_Name>/<Image>.jpg

### Output Data Format: 
* DATA_PATH/train/<CLASS_Name>/<Image>.jpg
* DATA_PATH/test/<CLASS_Name>/<Image>.jpg

In [92]:
import numpy as np
import pandas as pd

import os
import shutil
import pathlib
import sys

## Variables

In [93]:
# go to parent directory
os.chdir("..")
import paths

REPO_DIR = paths.get_repo_path()
ROOT_DIR = REPO_DIR / "Stanford Dogs"
DATA_BASE_PATH = paths.get_data_path() / "stanford-dogs-dataset"
DATA_PATH_IN = DATA_BASE_PATH / "images/Images"
DATA_PATH_OUT = DATA_BASE_PATH / "splited-data"

RANDOM_SEED = 42

# set path to repo_dir
os.chdir(REPO_DIR)

In [94]:
train_ratio, val_ratio, test_ratio = 0.6, 0.15, 0.25

## Splitting

### Naming Conventions 

In [107]:
classes_dir = os.listdir(DATA_PATH_IN)
classes_dir[:10]
try:
    classes_dir.remove(".DS_Store")
except ValueError:
    pass
len(classes_dir)

120

In [109]:
new_classes_names = []
for image_class in classes_dir:
    cls = image_class.replace("_", "-").split("-")
    cls = " ".join(cls[1:])
    new_classes_names.append(cls.title())
    
len(new_classes_names), new_classes_names[:10]

(120,
 ['Silky Terrier',
  'Scottish Deerhound',
  'Chesapeake Bay Retriever',
  'Ibizan Hound',
  'Wire Haired Fox Terrier',
  'Saluki',
  'Cocker Spaniel',
  'Schipperke',
  'Borzoi',
  'Pembroke'])

### Copy images to new folders

In [110]:
np.random.seed(RANDOM_SEED)

In [111]:
# create folders for train, test
train_set_location = DATA_PATH_OUT / "train"
os.makedirs(train_set_location, exist_ok=True)
test_set_location = DATA_PATH_OUT / "test"
os.makedirs(test_set_location, exist_ok=True)

for i, image_class in enumerate(classes_dir):
    image_class_path = DATA_PATH_IN / image_class
    images = os.listdir(image_class_path)
    
    try :
        images.remove(".DS_Store")
    except:
        pass
    
    train_set_size = int(len(images) * train_ratio)
    train_set_images = np.random.choice(images, train_set_size, replace=False)
    test_set_images = [image for image in images if image not in train_set_images]
    
    train_set_class_path = train_set_location / new_classes_names[i]
    test_set_class_path = test_set_location / new_classes_names[i]
    
    # if folders exist, delete them
    if os.path.exists(train_set_class_path):
        shutil.rmtree(train_set_class_path)
    if os.path.exists(test_set_class_path):
        shutil.rmtree(test_set_class_path)
    
    # create folders for class in train and test
    os.makedirs(train_set_class_path, exist_ok=True)
    os.makedirs(test_set_class_path, exist_ok=True)
    
    # copy images to train and test
    for image in train_set_images:
        image_path = image_class_path / image
        shutil.copy(image_path, train_set_class_path)
    for image in test_set_images:
        image_path = image_class_path / image
        shutil.copy(image_path, test_set_class_path)