We expect the following folder structure for the data:
```bash
Data
├─ Folder1
│  ├─ image1
│  ├─ image2
│  └─ etc
├─ Folder2
│  ├─ image1
│  ├─ image2
│  └─ etc
├─ metadata1.csv
└─ metadata2.csv
```



In [1]:
import pandas as pd
import ast
import os
from pathlib import Path
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount("/content/drive")
# Change the working directory to the project folder on Drive so that the
# relative paths used by the following cells (e.g. 'Data') resolve against it.
%cd 'drive/Othercomputers/My Computer (1)/EmotionTeller-github'

Mounted at /content/drive
/content/drive/Othercomputers/My Computer (1)/EmotionTeller-github


In [40]:
data_root       = 'Data' # Root folder; the metadata .csv files and the image folder are read from here
data_folders    = 'ImageData' # Name of the single sub-folder of data_root that holds the images (a string, not a list)
#data_folders    = 'mosaic_images' # Alternative image folder; swap with the line above to use it
data_meta       = ['emotic-relabelled.csv',
                 'hgel-relabelled.csv'] # Metadata .csv files, loaded from data_root. In our case we relabelled some of the data, so these differ from the original metadata.
#data_meta       = ['mosaic_labels.csv'] # Alternative metadata list; swap with the list above to use it
meta_root       = 'Metadata' # Folder the train/test split .csv files are written to

In [41]:
import pandas as pd
import ast
import os
from sklearn.model_selection import train_test_split

# Load every metadata file listed in `data_meta`, normalise its columns and
# merge the results into a single `meta` DataFrame.

# Intermediate relabelling columns that must not reach the merged metadata.
# Not every file has all of them, so they are dropped with errors='ignore'.
obsolete_cols = ['objects_old', 'objects_new', 'objects_final',
                 'n_old', 'n_final', 'o_old']

meta_list = []

for meta_file in data_meta:
    df = pd.read_csv(os.path.join(data_root, meta_file))
    # The final annotations are stored in the .csv as the text of a Python
    # literal; parse them back into Python objects.
    # NOTE(review): for metadata without an 'objects_final' column, the
    # 'objects' column would have to be parsed instead:
    #df['objects']           = df['objects'].apply(ast.literal_eval)
    df['objects'] = df['objects_final'].apply(ast.literal_eval)
    # Placeholder sizes; the real dimensions are filled in later from the
    # image files themselves.
    df['original_height'] = 1
    df['original_width'] = 1
    df.drop(columns=obsolete_cols, errors='ignore', inplace=True)
    meta_list.append(df)

if not meta_list:
    raise ValueError('No metadata files listed in data_meta.')

# All files must expose exactly the same columns; otherwise pd.concat would
# silently pad the missing ones with NaN.
colset = set(meta_list[0].columns)
if all(set(df.columns) == colset for df in meta_list):
    meta = pd.concat(meta_list, ignore_index=True)
else:
    # Fail loudly: continuing would leave `meta` undefined (or stale from a
    # previous run of the notebook) for the cells that follow.
    raise ValueError('Mismatched columns in metadata.')

In [27]:
from PIL import Image

def get_image_dimensions(image_path):
    """
    Read the pixel dimensions of an image file.

    The file is opened with PIL inside a context manager, so it is closed
    again before the function returns.

    Args:
        image_path: Location of the image file on disk.

    Returns:
        A ``(width, height)`` tuple, as reported by the image's ``size``.
    """
    with Image.open(image_path) as picture:
        img_w, img_h = picture.size
        return img_w, img_h


If `iterative-stratification` (or Pillow) isn't installed, install it first:

In [28]:
# Install the third-party packages this notebook imports:
# iterative-stratification provides the `iterstrat` module, Pillow provides `PIL`.
!pip install iterative-stratification
!pip install Pillow



The widths and heights in the original metadata are incorrect, so we replace them with the correct values taken directly from each image.

In [42]:
# Replace the stored sizes with the real dimensions of every image.
# Files that cannot be read are reported and keep their existing values.
image_dir = os.path.join(data_root, data_folders)
for index, file_name in meta['file_name'].items():
    image_path = os.path.join(image_dir, file_name)
    try:
        img_w, img_h = get_image_dimensions(image_path)
        meta.loc[index, 'original_width'] = img_w
        meta.loc[index, 'original_height'] = img_h
    except Exception as err:
        print(f"Could not process file {file_name}: {err}")


To handle the multiple labels associated with every picture, we use a multilabel stratifier.

In [44]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Build a multilabel-stratified 80/20 train/test split of `meta` and write
# both parts to .csv files under `meta_root`.

# One collection of emotion labels per image.
# NOTE(review): assumes every `objects` entry is a dict whose 'categories'
# value is an iterable of labels — confirm against the metadata files.
meta['emotions'] = meta['objects'].apply(lambda x: x['categories'])

# Binary indicator matrix (one column per distinct label) used as the
# stratification target.
mlb = MultiLabelBinarizer()

X = meta[['file_name']]  # not used by the split below; kept in case other cells read it
y = mlb.fit_transform(meta['emotions'])

# A single split with a fixed seed so the partition is reproducible.
mss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Only row positions are needed as the "features"; the stratifier works on y.
idx = np.arange(len(meta))

# n_splits=1 yields exactly one (train, test) pair; unpack it directly.
(train_idx, test_idx), = mss.split(idx, y)

train_df = meta.iloc[train_idx].reset_index(drop=True)
test_df  = meta.iloc[test_idx].reset_index(drop=True)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(meta_root, exist_ok=True)
train_df.to_csv(os.path.join(meta_root,'train_meta.csv'),index=False)
test_df.to_csv(os.path.join(meta_root,'test_meta.csv'),index=False)
#train_df.to_csv(os.path.join(meta_root,'train_meta_mosaic.csv'),index=False)
#test_df.to_csv(os.path.join(meta_root,'test_meta_mosaic.csv'),index=False)