# Modeling 1 - Train-Test-Split

In [38]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import json
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torchvision import transforms

In [11]:
# Select the compute device: prefer an NVIDIA GPU (CUDA), then Apple-silicon
# MPS, and fall back to the CPU when no accelerator is available.
# Probing CUDA as well keeps the notebook usable on non-Apple GPU machines;
# on an MPS-only machine the result is still device(type='mps').

if torch.cuda.is_available():
  device = torch.device('cuda')
elif torch.backends.mps.is_available():
  device = torch.device('mps')
else:
  device = torch.device('cpu')
device

device(type='mps')

In [7]:
# load the image-path / label table from JSON into a DataFrame

# JSON text is UTF-8; passing the encoding explicitly keeps decoding from
# depending on the platform's locale default.
with open('data/df_kohaku_sanke_showa.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

df = pd.DataFrame(data)

# preview the first rows
df.head()

Unnamed: 0,image_path,label
0,champkoi_data/18f755_A_32_Female_Kohaku_[Nogam...,kohaku
1,gckoi_data/d9a7dbc_HIGH_QUALITY_SAKAI_BLOODLIN...,kohaku
2,gckoi_data/88ec869_ISA_KOHAKU_GUARANTEE_FEMALE...,kohaku
3,gckoi_data/3a7df4e_DAINICHI_3SAI_FEMALE_KOHAKU...,kohaku
4,gckoi_data/cb26670_DAINICHI_BLOODLINE_KOHAKU_2...,kohaku


In [39]:
# integer-encode the class names with a LabelEncoder

le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])
df['label_encoded'] = encoded_labels
# show how many rows fall into each encoded class
df['label_encoded'].value_counts()

label_encoded
0    264
1    237
2    185
Name: count, dtype: int64

In [47]:
# get the mapping from encoded integer back to class name

# inverse_transform(i) is a lookup into le.classes_ at position i, so
# enumerating classes_ yields the same {code: name} pairs without one
# inverse_transform call per class and without leaking a loop variable.
label_map = dict(enumerate(le.classes_))

label_map

{0: 'kohaku', 1: 'sanke', 2: 'showa'}

In [48]:
# double check the data: print the column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   image_path     686 non-null    object
 1   label          686 non-null    object
 2   label_encoded  686 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 16.2+ KB


In [52]:
# train / validation / test split, stratified by encoded class

SPLIT_SEED = 42          # fixed seed so both splits are reproducible
HOLDOUT_FRACTION = 0.3   # share of all rows held out of training (val + test)
TEST_FRACTION = 0.5      # share of the held-out rows that goes to the test set

# first cut: training rows vs. everything held out
train_df, testval_df = train_test_split(
  df[['image_path', 'label_encoded']],
  test_size=HOLDOUT_FRACTION,
  shuffle=True,
  random_state=SPLIT_SEED,
  stratify=df['label_encoded'],
)
# second cut: divide the held-out rows into validation and test
val_df, test_df = train_test_split(
  testval_df,
  test_size=TEST_FRACTION,
  shuffle=True,
  random_state=SPLIT_SEED,
  stratify=testval_df['label_encoded'],
)

# the three splits together must account for every row of df
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# check the shape

train_df.shape, val_df.shape, test_df.shape

((480, 2), (103, 2), (103, 2))

In [60]:
# create a custom class for the image dataset

class CustomImageDataset(Dataset):
  """Map-style dataset that yields ``(image, label)`` pairs listed in a DataFrame.

  Each row supplies an image path relative to ``img_dir`` (column
  ``image_path``) and an integer class id (column ``label_encoded``).
  """

  def __init__(self, dataframe, img_dir, transform=None):
    """Store the table, the image root and the optional transform.

    Args:
      dataframe: table with ``image_path`` and ``label_encoded`` columns.
      img_dir: directory that each ``image_path`` value is joined onto.
      transform: optional callable applied to the RGB PIL image.
    """
    # Drop the incoming index so rows are numbered 0..n-1 regardless of how
    # the frame was sliced or shuffled before being passed in.
    self.df = dataframe.reset_index(drop=True)
    self.img_dir = img_dir
    self.transform = transform

  def __len__(self):
    """Return the number of rows (samples) in the dataset."""
    return len(self.df)

  def __getitem__(self, idx):
    """Load one sample by position.

    Args:
      idx: integer position of the sample; negative values count from the end.

    Returns:
      A tuple ``(image, label)`` where ``image`` is the RGB image after the
      optional transform and ``label`` is a 0-d ``torch.long`` tensor.

    Raises:
      IndexError: if ``idx`` is outside the valid range of positions.
    """
    # Positional lookup raises IndexError past the end, which is what Python's
    # sequence-iteration protocol relies on to stop `for sample in dataset`.
    # A label-based .loc lookup raises KeyError there and crashes the loop.
    rel_path = self.df['image_path'].iloc[idx]
    label = self.df['label_encoded'].iloc[idx]

    img_path = os.path.join(self.img_dir, rel_path)
    # convert() returns an independent in-memory copy, so the source file can
    # be closed as soon as the conversion is done.
    with Image.open(img_path) as source:
      image = source.convert('RGB')

    # Compare against None explicitly: a callable that happens to be falsy
    # (e.g. an empty container-like module) must still be applied.
    if self.transform is not None:
      image = self.transform(image)
    return image, torch.tensor(int(label), dtype=torch.long)

In [61]:
# build the preprocessing pipeline, one dataset per split, and their loaders

# every image is resized to 224x224 and then converted to a tensor
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])

dir_path = '../pictures/'

# all three datasets share the same image root and transform
train_ds, val_ds, test_ds = (
  CustomImageDataset(split_df, img_dir=dir_path, transform=transform)
  for split_df in (train_df, val_df, test_df)
)

# only the training loader shuffles; validation and test keep row order
BATCH_SIZE = 32
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [64]:
# pull a single batch and show the Python type and shape of both tensors
for images, labels in train_loader:
  print(f"{type(images)} {images.shape}")
  print(f"{type(labels)} {labels.shape}")
  break

<class 'torch.Tensor'> torch.Size([32, 3, 224, 224])
<class 'torch.Tensor'> torch.Size([32])


In [65]:
# check tensors: print one batch's shapes and verify them against the pipeline

for images, labels in train_loader:
  print(images.shape, labels.shape)
  # Resize((224, 224)) followed by ToTensor() should give floating-point
  # batches laid out as (batch, 3, 224, 224)
  assert images.ndim == 4 and tuple(images.shape[1:]) == (3, 224, 224)
  assert images.is_floating_point()
  # the dataset builds each label with dtype=torch.long; expect one per image
  assert labels.dtype == torch.long
  assert tuple(labels.shape) == (images.shape[0],)
  break

torch.Size([32, 3, 224, 224]) torch.Size([32])
