# In [83]:
import pandas as pd
import os

import torch
from torch.utils.data import Dataset
import torchtext

import numpy as np

import string

from torch.nn.utils.rnn import pad_sequence

from sklearn.preprocessing import OneHotEncoder

# Root of the data directory, resolved relative to the current working
# directory: <cwd>/../data.
DATA_FOLDER_PTH = os.path.join(os.getcwd(), os.pardir, 'data')

# MELD train split: audio folder and text (CSV) file.
_MELD_TRAIN_DIR = os.path.join(DATA_FOLDER_PTH, 'raw/MELD/train')
TRAIN_AUDIO_FOLDER_PTH = os.path.join(_MELD_TRAIN_DIR, 'train_splits')
TRAIN_TEXT_FILE_PTH = os.path.join(_MELD_TRAIN_DIR, 'train_sent_emo.csv')

# MELD dev split: audio folder and text (CSV) file.
_MELD_DEV_DIR = os.path.join(DATA_FOLDER_PTH, 'raw/MELD/dev')
DEV_AUDIO_FOLDER_PTH = os.path.join(_MELD_DEV_DIR, 'dev_splits_complete')
DEV_TEXT_FILE_PTH = os.path.join(_MELD_DEV_DIR, 'dev_sent_emo.csv')

# MELD test split: audio folder and text (CSV) file.
_MELD_TEST_DIR = os.path.join(DATA_FOLDER_PTH, 'raw/MELD/test')
TEST_AUDIO_FOLDER_PTH = os.path.join(_MELD_TEST_DIR, 'output_repeated_splits_test')
TEST_TEXT_FILE_PTH = os.path.join(_MELD_TEST_DIR, 'test_sent_emo.csv')

# In [87]:
class TextDataset(Dataset):
    """Text dataset yielding padded GloVe embeddings and a label id.

    Each item is a ``(sent_embeddings, label_id)`` pair:

    * ``sent_embeddings`` -- one GloVe vector per whitespace-separated token
      of the utterance, zero-padded along the first axis up to the longest
      utterance (in tokens) of the loaded split.
    * ``label_id`` -- scalar ``torch.long`` tensor holding the index of the
      row's label in ``labels_idx``.

    Args:
        dataset_type: Which split to load: ``'train'``, ``'validation'`` or
            ``'test'``.
        label_target: CSV column used as the label: ``'Emotion'`` or
            ``'Sentiment'``.
        embedding_dim: Dimension of the GloVe "6B" vectors (50/100/200/300).

    Raises:
        ValueError: If ``dataset_type`` or ``label_target`` is not one of the
            supported values.
    """

    # Label-to-index tables, one per supported ``label_target`` column.
    _LABELS_BY_TARGET = {
        'Emotion': {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3,
                    'joy': 4, 'disgust': 5, 'anger': 6},
        # NOTE(review): assumes the Sentiment column holds exactly these three
        # values -- confirm against the CSV files.
        'Sentiment': {'neutral': 0, 'positive': 1, 'negative': 2},
    }

    # Translation table deleting every ASCII punctuation character; built once
    # instead of once per word.
    _STRIP_PUNCT = str.maketrans('', '', string.punctuation)

    def __init__(self, dataset_type = 'train', label_target = 'Emotion', embedding_dim = 50):
        # Resolved here (not at class level) so the path constants are only
        # needed when an instance is actually created.
        csv_by_type = {
            'train': TRAIN_TEXT_FILE_PTH,
            'validation': DEV_TEXT_FILE_PTH,
            'test': TEST_TEXT_FILE_PTH,
        }

        # Validate cheap arguments before loading the GloVe vectors.
        if dataset_type not in csv_by_type:
            raise ValueError(
                f"dataset_type must be one of {sorted(csv_by_type)}, got {dataset_type!r}"
            )
        if label_target not in self._LABELS_BY_TARGET:
            raise ValueError(
                f"label_target must be one of {sorted(self._LABELS_BY_TARGET)}, got {label_target!r}"
            )

        self.dataset_type = dataset_type
        self.label_target = label_target
        self.embedding_dim = embedding_dim

        self.glove = torchtext.vocab.GloVe(name="6B", dim=self.embedding_dim)

        self.data = pd.read_csv(csv_by_type[dataset_type], encoding='utf-8')

        self.datasetSize = len(self.data)
        self.maxSentLength = int(self.calcMaxSentLength())
        # All-zero tensor of the target length; used as the padding reference
        # in __getitem__.
        self.padTensor = torch.zeros(size=(self.maxSentLength, self.embedding_dim))

        self.labels_idx = dict(self._LABELS_BY_TARGET[label_target])

    def __getitem__(self, idx):
        """Return ``(padded sentence embeddings, label id)`` for row ``idx``.

        Rows are addressed by position, so an out-of-range ``idx`` raises
        ``IndexError``.
        """
        sentence = self.data['Utterance'].iloc[idx]
        label = self.data[self.label_target].iloc[idx]

        sent_embeddings = self.get_embeddings(sentence)
        # Padding the sentence together with the full-length zero tensor
        # stretches it to maxSentLength rows; element 0 is the padded sentence.
        sent_embeddings = pad_sequence([sent_embeddings, self.padTensor], batch_first=True)[0]

        label_id = torch.tensor(self.labels_idx[label], dtype=torch.long)

        return sent_embeddings, label_id

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return self.datasetSize

    def get_embeddings(self, sentence):
        """Return the stacked GloVe vectors of the tokens in ``sentence``.

        Tokens are split on whitespace, stripped of ASCII punctuation and
        lowercased before lookup. A sentence without tokens yields an empty
        ``(0, embedding_dim)`` tensor instead of failing.
        """
        # str() mirrors calcMaxSentLength so non-string cells are tokenised
        # the same way in both places.
        words = [
            token.translate(self._STRIP_PUNCT).lower()
            for token in str(sentence).split()
        ]
        if not words:
            return torch.zeros((0, self.embedding_dim))

        return torch.stack([self.glove[word] for word in words])

    def calcMaxSentLength(self):
        """Return the largest whitespace-token count over all utterances.

        Returns 0 when the split contains no rows.
        """
        return max(
            (len(str(utterance).split()) for utterance in self.data['Utterance']),
            default=0,
        )