In [1]:
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel

class MyDataset(Dataset):
    def __init__(self, 
                ids: List[str], 
                speakers: List[str], 
                sexes: List[str], 
                texts: List[str], 
                texts_en: List[str], 
                labels: List[bool],
                device: torch.device = torch.device('cpu'),
                model_name: str = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                max_length: int = 512
        ):
        assert len(ids) == len(speakers) == len(sexes) == len(texts) == len(texts_en) == len(labels)
        self.ids = []
        self.speakers = []
        self.sexes = []
        self.texts = []
        self.texts_en = []
        self.embeddings = []
        self.attention_masks = []
        self.labels = []
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        for i in range(len(ids)):
            inputs = self.tokenizer(texts[i], add_special_tokens=True, return_tensors='pt', padding='max_length',max_length=max_length)
            if inputs['input_ids'].shape[1] <= max_length:
                inputs = self.tokenizer(texts_en[i], add_special_tokens=True, return_tensors='pt', padding='max_length',max_length=max_length)
                self.ids.append(ids[i])
                self.speakers.append(speakers[i])
                self.sexes.append(sexes[i])
                self.texts.append(texts[i])
                self.texts_en.append(texts_en[i])
                self.embeddings.append(inputs['input_ids'][0])
                self.attention_masks.append(inputs['attention_mask'])
                self.labels.append(torch.tensor((labels[i]), dtype=torch.long))
                
        print(f'Loaded {len(self.ids)}/{len(ids)} samples.')

    def __getitem__(self, index):
        return self.ids[index], self.speakers[index], self.sexes[index], self.texts[index], \
                self.texts_en[index], self.embeddings[index][:512].to(self.device), self.attention_masks[index][0][:512].to(self.device), self.labels[index]
            
    def __len__(self):
        return len(self.ids)

    def set_device(self, device: torch.device):
        '''
        Sets the device to the given device.
        '''
        self.device = device

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
try:
    from dataset import MyDataset
except ImportError:
    pass

from typing import List
import os
import torch
import pandas as pd
from torch.utils.data import random_split

english_speaking_countries = ['gb']

DATA_DIR = "data/orientation"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "roberta-base"
OUTPUT_DIR = "data/torch/orientation"

def load_data(file_path: str):
    '''
    Loads specified dataset and returns lists of columns
    '''
    country_code = file_path.split('-')[1]  # Extract country code from filename
    try:
        df = pd.read_csv(file_path, delimiter='\t', quoting=3, on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return pd.DataFrame()
    
    if country_code in english_speaking_countries:
        df['text_combined'] = df['text']
    else:
        df['text_combined'] = df['text_en']
    # Drop rows where 'text_combined' is NaN or empty
    df = df.dropna(subset=['text_combined'])
    df = df[df['text_combined'] != '']
    df['file_path'] = file_path  # Add file path information
    return list(df['id']), list(df['speaker']), list(df['sex']), list(df['text']), list(df['text_combined']), list(df['label'])

def train_val_test_split_country(data: MyDataset, val_size:float = 0.1, test_size:float = 0.1, random_state:int = 42):
    train_size = 1 - test_size - val_size
    train_data, val_data, test_data = random_split(data, [train_size, val_size, test_size], \
                                                   generator=torch.Generator().manual_seed(random_state))
    return train_data, val_data, test_data


train_dataset, val_dataset, test_dataset = [], [], []
for filename in os.listdir(DATA_DIR):
    if filename.endswith(".tsv"):
        file_path = os.path.join(DATA_DIR, filename)
        ids, speakers, sexes, texts, texts_en, labels = load_data(file_path)
        df = MyDataset(
            ids=ids,
            speakers=speakers,
            sexes=sexes,
            texts=texts,
            texts_en=texts_en,
            labels=labels,
            device=DEVICE,
            model_name=MODEL_NAME
        )
        train_df, val_df, test_df = train_val_test_split_country(df)
        train_dataset.append(train_df)
        val_dataset.append(val_df)
        test_dataset.append(test_df)
        torch.save(train_df, os.path.join(OUTPUT_DIR, f"train_dataset_{filename.replace('-train.tsv', '.pt')}"))
        torch.save(val_df, os.path.join(OUTPUT_DIR, f"val_dataset_{filename.replace('-train.tsv', '.pt')}"))
        torch.save(test_df, os.path.join(OUTPUT_DIR, f"test_dataset_{filename.replace('-train.tsv', '.pt')}"))
        breakpoint()
        print(f"Processed {filename}, created train, val, and test datasets of size {len(train_df)}, {len(val_df)}, and {len(test_df)} respectively.")

train_dataset = torch.utils.data.ConcatDataset(train_dataset)
val_dataset = torch.utils.data.ConcatDataset(val_dataset)
test_dataset = torch.utils.data.ConcatDataset(test_dataset)

torch.save(train_dataset, os.path.join(OUTPUT_DIR, "train_dataset_all.pt"))
torch.save(val_dataset, os.path.join(OUTPUT_DIR, "val_dataset_all.pt"))
torch.save(test_dataset, os.path.join(OUTPUT_DIR, "test_dataset_all.pt"))

print(f"Processed all files, created train, val, and test datasets of size {len(train_dataset)}, {len(val_dataset)}, and {len(test_dataset)} respectively.")

  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 538/3438 samples.
Processed orientation-at-train.tsv, created train, val, and test datasets of size 431, 54, and 53 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 398/1249 samples.
Processed orientation-ba-train.tsv, created train, val, and test datasets of size 319, 40, and 39 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 759/2220 samples.
Processed orientation-be-train.tsv, created train, val, and test datasets of size 608, 76, and 75 respectively.
Loaded 0/3907 samples.
Processed orientation-bg-train.tsv, created train, val, and test datasets of size 0, 0, and 0 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 961/4136 samples.
Processed orientation-cz-train.tsv, created train, val, and test datasets of size 769, 96, and 96 respectively.
Loaded 840/3069 samples.
Processed orientation-dk-train.tsv, created train, val, and test datasets of size 672, 84, and 84 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 1514/2580 samples.
Processed orientation-ee-train.tsv, created train, val, and test datasets of size 1212, 151, and 151 respectively.
Loaded 98/2077 samples.
Processed orientation-es-ct-train.tsv, created train, val, and test datasets of size 79, 10, and 9 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 17/850 samples.
Processed orientation-es-ga-train.tsv, created train, val, and test datasets of size 14, 2, and 1 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 120/4767 samples.
Processed orientation-es-train.tsv, created train, val, and test datasets of size 96, 12, and 12 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 320/740 samples.
Processed orientation-fi-train.tsv, created train, val, and test datasets of size 256, 32, and 32 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 1561/2293 samples.
Processed orientation-fr-train.tsv, created train, val, and test datasets of size 1249, 156, and 156 respectively.
Loaded 15196/24239 samples.
Processed orientation-gb-train.tsv, created train, val, and test datasets of size 12157, 1520, and 1519 respectively.
Loaded 0/5639 samples.
Processed orientation-gr-train.tsv, created train, val, and test datasets of size 0, 0, and 0 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 2238/6507 samples.
Processed orientation-hr-train.tsv, created train, val, and test datasets of size 1791, 224, and 223 respectively.
Loaded 94/2935 samples.
Processed orientation-hu-train.tsv, created train, val, and test datasets of size 76, 9, and 9 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 113/533 samples.
Processed orientation-is-train.tsv, created train, val, and test datasets of size 91, 11, and 11 respectively.
Loaded 656/3367 samples.
Processed orientation-it-train.tsv, created train, val, and test datasets of size 525, 66, and 65 respectively.
Loaded 185/798 samples.
Processed orientation-lv-train.tsv, created train, val, and test datasets of size 149, 18, and 18 respectively.
Loaded 2319/5657 samples.
Processed orientation-nl-train.tsv, created train, val, and test datasets of size 1856, 232, and 231 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 2507/10861 samples.
Processed orientation-no-train.tsv, created train, val, and test datasets of size 2006, 251, and 250 respectively.
Loaded 1502/5489 samples.
Processed orientation-pl-train.tsv, created train, val, and test datasets of size 1202, 150, and 150 respectively.
Loaded 641/3464 samples.
Processed orientation-pt-train.tsv, created train, val, and test datasets of size 513, 64, and 64 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 1173/9789 samples.
Processed orientation-rs-train.tsv, created train, val, and test datasets of size 939, 117, and 117 respectively.
Loaded 1215/8425 samples.
Processed orientation-se-train.tsv, created train, val, and test datasets of size 973, 121, and 121 respectively.


  self.labels.append(torch.tensor((labels[i]), dtype=torch.long))


Loaded 365/2518 samples.
Processed orientation-si-train.tsv, created train, val, and test datasets of size 293, 36, and 36 respectively.
Loaded 5085/16138 samples.
Processed orientation-tr-train.tsv, created train, val, and test datasets of size 4069, 508, and 508 respectively.
Loaded 0/2545 samples.
Processed orientation-ua-train.tsv, created train, val, and test datasets of size 0, 0, and 0 respectively.




Processed all files, created train, val, and test datasets of size 32345, 4040, and 4030 respectively.
