# Create a Dataset Based Solely on the Boredom Label

## Working with a multi-label dataset is too complicated because the data is very unbalanced. So for now, we will pick only one label from the dataset. I picked boredom because it is the least unbalanced of the four labels.

Create a new dataset by keeping only the 'Boredom' label column and balancing the classes: every 'Boredom' class is downsampled to the size of the smallest class.

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import torch
from torchvision import transforms
from PIL import Image
import random
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from collections import Counter


In [64]:
# Load the label file and keep only the clip identifier and the 'Boredom' label.
label_set = pd.read_csv(r"C:\Users\ahmad\Desktop\EngagementML\DAiSEE\Labels\AllLabels.csv")

# `columns=` already selects the axis, so no separate `axis=1` is needed.
new_label_set = label_set.drop(columns=['Engagement', 'Confusion', 'Frustration'])

print("Unbalanced boredom dataset class distribution:")
print(new_label_set.shape)
print(new_label_set['Boredom'].value_counts())
print('-------------------------------------------s')


# Balance the dataset by sampling an equal number of videos for each class in 'Boredom'.
# The size is limited to the smallest class size, derived from the data instead of
# being hard-coded (it is 330 for the label file this notebook was run on).
min_class_size = int(new_label_set['Boredom'].value_counts().min())

# Sample every class with the same seed. Iterating over the groups (sorted by
# class value) and concatenating selects the same rows as
# `groupby(...).apply(lambda x: x.sample(...))`, but does not rely on `apply`
# operating on the grouping column, which pandas deprecated in 2.2.
balanced_data = pd.concat(
    [
        class_rows.sample(min_class_size, random_state=42)
        for _, class_rows in new_label_set.groupby('Boredom')
    ],
    ignore_index=True,
)

# Check the new balanced distribution
print("Balanced dataset class distribution:")
print(balanced_data['Boredom'].value_counts())

print('-------------------------------------------')

# Further balance: ensure all persons in the dataset have an equal number of videos.
# The person id is taken to be the first 6 characters of 'ClipID'.
clips_per_person = 10
balanced_data['person_id'] = balanced_data['ClipID'].str[:6]

# Persons with fewer than `clips_per_person` clips are excluded entirely; everyone
# else contributes exactly `clips_per_person` randomly chosen clips. The explicit
# length filter replaces the "return None from apply, then dropna()" idiom, which
# would also have discarded any sampled row that happened to contain a NaN.
# NOTE: this step samples per person without regard to 'Boredom', so the class
# distribution printed above may no longer hold exactly for the final dataset.
balanced_data = pd.concat(
    [
        person_rows.sample(clips_per_person, random_state=42)
        for _, person_rows in balanced_data.groupby('person_id')
        if len(person_rows) >= clips_per_person
    ],
    ignore_index=True,
)


# Check the updated counts
person_counts = Counter(balanced_data['person_id'])
print(dict(person_counts))

print("\nLength of the updated dataset:")
print(len(balanced_data))


# Define the split ratios
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Split the data into training and temp (for validation and testing).
# NOTE(review): the split is stratified on 'Boredom' only, so clips of the same
# person_id can presumably end up in more than one split - confirm whether a
# person-independent evaluation is required.
train_data, temp_data = train_test_split(
    balanced_data, test_size=(1 - train_ratio), random_state=42, stratify=balanced_data['Boredom']
)

# Further split the temp data into validation and testing; the test fraction is
# expressed relative to the temp set, not the full dataset.
val_data, test_data = train_test_split(
    temp_data, test_size=(test_ratio / (test_ratio + val_ratio)), random_state=42, stratify=temp_data['Boredom']
)

# Print the sizes of the splits
print("Train set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))

# Save each split as a CSV file (written to the current working directory)
train_data.to_csv("train_labels.csv", index=False)
val_data.to_csv("val_labels.csv", index=False)
test_data.to_csv("test_labels.csv", index=False)

print("Train, validation, and test splits saved as CSV files.")





Unbalanced boredom dataset class distribution:
(8925, 2)
Boredom
0    3822
1    2850
2    1923
3     330
Name: count, dtype: int64
-------------------------------------------s
Balanced dataset class distribution:
Boredom
0    330
1    330
2    330
3    330
Name: count, dtype: int64
-------------------------------------------
{'110001': 10, '110005': 10, '110006': 10, '110007': 10, '110012': 10, '110014': 10, '110015': 10, '110017': 10, '111003': 10, '181374': 10, '200050': 10, '202614': 10, '205601': 10, '210052': 10, '210053': 10, '210055': 10, '210057': 10, '210058': 10, '210059': 10, '210060': 10, '210061': 10, '226051': 10, '240846': 10, '310062': 10, '310070': 10, '310072': 10, '310074': 10, '310075': 10, '310076': 10, '310077': 10, '310078': 10, '310079': 10, '310082': 10, '334463': 10, '342227': 10, '350361': 10, '400018': 10, '400022': 10, '400030': 10, '400033': 10, '410019': 10, '410020': 10, '410024': 10, '410025': 10, '410026': 10, '410028': 10, '410029': 10, '410030': 10, 

#### To simplify training without a GPU, we’ll use a smaller subset of the dataset. This reduces computational load, speeds up experimentation, and lets us validate the pipeline before scaling to the full dataset. In short, we will build a new, smaller dataset by sampling from the balanced dataset created above.


In [48]:
# Target size per class
videos_per_class = 25

# Downsample each 'Boredom' class to `videos_per_class` clips, using the same seed
# for every class. Iterating over the groups (sorted by class value) and
# concatenating selects the same rows as `groupby(...).apply(lambda x: x.sample(...))`
# without relying on `apply` operating on the grouping column, which pandas
# deprecated in 2.2. `sample` raises ValueError if a class has fewer than
# `videos_per_class` rows.
small_dataset = pd.concat(
    [
        class_rows.sample(videos_per_class, random_state=42)
        for _, class_rows in balanced_data.groupby('Boredom')
    ],
    ignore_index=True,
)

# Check the reduced dataset
print("Reduced dataset class distribution:")
print(small_dataset['Boredom'].value_counts())

print("\nLength of the reduced dataset:")
print(len(small_dataset))

# Number of clips each person contributes, followed by the number of distinct persons
person_counts = Counter(small_dataset['person_id'])
print(dict(person_counts))
print(len(person_counts))

Reduced dataset class distribution:
Boredom
0    25
1    25
2    25
3    25
Name: count, dtype: int64

Length of the reduced dataset:
100
{'500044': 2, '110006': 1, '210061': 3, '310072': 1, '500095': 1, '110001': 1, '310074': 4, '350361': 2, '110007': 2, '410020': 2, '510038': 1, '110015': 4, '410030': 3, '882654': 2, '310079': 2, '567496': 4, '500067': 3, '510046': 2, '310062': 2, '510035': 2, '210059': 1, '459999': 1, '310070': 1, '410026': 4, '210052': 1, '410032': 2, '202614': 3, '181374': 2, '310077': 1, '205601': 1, '210053': 2, '414081': 1, '410029': 2, '200050': 4, '334463': 1, '210058': 2, '826412': 1, '410024': 2, '410028': 2, '510040': 1, '110014': 1, '310082': 3, '110017': 1, '411021': 2, '310076': 1, '556463': 1, '110012': 1, '310075': 2, '400018': 1, '411031': 1, '410019': 1, '240846': 1, '210060': 1, '111003': 1, '210055': 1, '510034': 1, '410025': 1}
57


In [155]:
# Target size per person
videos_per_person = 3

# Downsample for diversity: every person with at least `videos_per_person` clips
# contributes exactly that many randomly chosen clips; persons with fewer clips
# are excluded. The explicit length filter replaces the "return None from apply,
# then dropna()" idiom, which would also have discarded any sampled row that
# happened to contain a NaN, and it avoids `groupby.apply` operating on the
# grouping column (deprecated since pandas 2.2).
small_dataset = pd.concat(
    [
        person_rows.sample(videos_per_person, random_state=42)
        for _, person_rows in balanced_data.groupby('person_id')
        if len(person_rows) >= videos_per_person
    ],
    ignore_index=True,
)

# Ensure class balance: equal number of videos for each boredom level, limited by
# the rarest class remaining after the per-person step. Because this second
# sampling ignores person_id, a person may end up with fewer than
# `videos_per_person` clips in the final dataset.
min_samples_per_class = small_dataset['Boredom'].value_counts().min()
# The spelling `smalL_balanced_dataset` is kept as-is because the following
# cell reads the dataset under this exact name.
smalL_balanced_dataset = pd.concat(
    [
        class_rows.sample(min_samples_per_class, random_state=42)
        for _, class_rows in small_dataset.groupby('Boredom')
    ],
    ignore_index=True,
)

# Check distribution
print("Reduced dataset class distribution:")
print(smalL_balanced_dataset['Boredom'].value_counts())

print("\nLength of the reduced dataset:")
print(len(smalL_balanced_dataset))


# Number of clips each person contributes, followed by the number of distinct persons
person_counts = Counter(smalL_balanced_dataset['person_id'])
print(dict(person_counts))
print(len(person_counts))




Reduced dataset class distribution:
Boredom
0    32
1    32
2    32
3    32
Name: count, dtype: int64

Length of the reduced dataset:
128
{'240846': 2, '210060': 3, '210059': 2, '310079': 3, '110012': 3, '210055': 2, '826382': 3, '882654': 2, '110017': 2, '310078': 3, '181374': 2, '210057': 3, '410020': 2, '510035': 3, '110001': 3, '400018': 3, '310072': 1, '510046': 3, '310076': 3, '500044': 3, '500095': 2, '334463': 3, '410028': 2, '342227': 3, '310074': 2, '110007': 3, '205601': 3, '210052': 3, '410026': 3, '110015': 3, '410032': 3, '410024': 3, '350361': 2, '556463': 3, '400033': 3, '410029': 2, '400030': 2, '111003': 2, '310075': 3, '210058': 3, '500039': 3, '110014': 3, '500067': 1, '411031': 3, '200050': 3, '826412': 2, '510034': 2, '410030': 3, '410019': 2, '411021': 2}
50


In [156]:
# Fractions of the small dataset assigned to each split
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Step 1: separate the training portion from a holdout pool that will later be
# divided into validation and test data. Both steps stratify on 'Boredom' so
# each split keeps the class proportions of its parent set.
# NOTE(review): stratification is on 'Boredom' only, so clips of the same
# person_id can presumably appear in more than one split - confirm whether a
# person-independent evaluation is required.
holdout_fraction = 1 - train_ratio
train_data, temp_data = train_test_split(
    smalL_balanced_dataset,
    test_size=holdout_fraction,
    random_state=42,
    stratify=smalL_balanced_dataset['Boredom'],
)

# Step 2: divide the holdout pool; the test fraction is expressed relative to
# the pool, not to the full dataset.
test_share_of_holdout = test_ratio / (test_ratio + val_ratio)
val_data, test_data = train_test_split(
    temp_data,
    test_size=test_share_of_holdout,
    random_state=42,
    stratify=temp_data['Boredom'],
)

# Report how many rows ended up in each split
for caption, split_frame in (
    ("Train set size:", train_data),
    ("Validation set size:", val_data),
    ("Test set size:", test_data),
):
    print(caption, len(split_frame))

# Write each split to its own CSV file in the current working directory
for csv_name, split_frame in (
    ("train_labels_small.csv", train_data),
    ("val_labels_small.csv", val_data),
    ("test_labels_small.csv", test_data),
):
    split_frame.to_csv(csv_name, index=False)

print("Train, validation, and test splits saved as CSV files.")


Train set size: 89
Validation set size: 26
Test set size: 13
Train, validation, and test splits saved as CSV files.
