In [1]:
# first let's understand our data

import os
import zipfile

HOME = os.getcwd()
zip_file_name = 'sentiment-analysis-on-movie-reviews.zip'

# Extract, in order, the outer archive and then the train/test archives.
# All three are looked up in HOME and unpacked into HOME; the two inner
# archives are presumably produced by the first extraction -- TODO confirm.
for archive_name in (zip_file_name, 'train.tsv.zip', 'test.tsv.zip'):
    with zipfile.ZipFile(os.path.join(HOME, archive_name), 'r') as zip_ref:
        zip_ref.extractall(HOME)



In [2]:
from pathlib import Path
from typing import Union
import pandas as pd

from torch.utils.data import Dataset

from transformers import AutoTokenizer

# Checkpoint name handed to AutoTokenizer.from_pretrained by ReviewDataset
# when the caller does not supply a tokenizer of their own.
BERT_CHECKPOINT = 'bert-base-cased'

import torch

class ReviewDataset(Dataset):
    """Map-style dataset over a tab-separated file of review phrases.

    The file is read with pandas and must contain a 'Phrase' column; when
    ``train`` is True it must also contain a 'Sentiment' column. Each item is
    the tokenizer's ``input_ids`` for one phrase, padded to a fixed length.

    Args:
        file_path: path to the tab-separated data file.
        train: when True, ``__getitem__`` returns ``(input_ids, label)``;
            when False it returns ``input_ids`` only.
        tokenizer: callable used to tokenize a phrase. It is called with
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments and must return a mapping with an 'input_ids'
            tensor. Defaults to ``AutoTokenizer.from_pretrained(BERT_CHECKPOINT)``.
        embedding_dimension: sequence length before the 2 extra positions are
            added. When omitted (or 0), the longest phrase in the file,
            measured in characters, is used instead.
    """

    def __init__(self, file_path: Union[Path, str], train: bool = True, tokenizer=None, embedding_dimension: Union[int, None] = None):
        # need to call the super class constructor
        super().__init__()
        self.file_path = file_path
        # read the data
        self.data = pd.read_csv(file_path, sep='\t')

        # Honour a caller-supplied tokenizer. Previously self.tokenizer was only
        # assigned when `tokenizer` was None, so passing one led to an
        # AttributeError on the first __getitem__ call.
        self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

        # If no dimension was specifically set, use the dataset to determine it.
        # The character length of the longest phrase is used as the base length;
        # `default=0` keeps an empty file from raising ValueError.
        if embedding_dimension:
            base_length = embedding_dimension
        else:
            base_length = max((len(str(phrase)) for phrase in self.data['Phrase']), default=0)

        # The value 2 is added to account for the special tokens the tokenizer
        # inserts (presumably [CLS] and [SEP] for a BERT-style tokenizer).
        self.embedding_dimension = base_length + 2

        # a boolean flag to determine whether labels will be returned or not
        self.train = train

    def __len__(self):
        """Return the number of rows in the underlying data file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the padded ``input_ids`` for row ``index`` (plus its label in train mode).

        Returns:
            ``input_ids`` tensor of shape ``(1, self.embedding_dimension)``; in
            train mode a tuple of that tensor and a 1-element float tensor
            holding the 'Sentiment' value.

        Raises:
            AssertionError: if the phrase is not a ``str`` or the tokenizer
                output does not have the expected type/shape.
        """
        # fetch the row once and read both columns from it
        row = self.data.iloc[index]
        phrase_raw = row['Phrase']

        assert isinstance(phrase_raw, str), f"expected a str phrase at index {index}, got {type(phrase_raw).__name__}"

        # Tokenize the data. `truncation=True` guarantees the output never
        # exceeds max_length, so an explicit embedding_dimension shorter than a
        # phrase's token count no longer trips the shape check below.
        tokens = self.tokenizer(
            phrase_raw,
            padding='max_length',
            truncation=True,
            max_length=self.embedding_dimension,
            return_tensors='pt',
        )
        embedding = tokens['input_ids']

        # make sure the types and shapes are as expected
        assert torch.is_tensor(embedding), "tokenizer did not return a tensor for 'input_ids'"
        assert embedding.size()[0] == 1 and embedding.size()[1] == self.embedding_dimension, (
            f"unexpected input_ids shape {tuple(embedding.size())}, expected (1, {self.embedding_dimension})"
        )

        if self.train:
            # float() keeps the label a plain Python number regardless of the
            # scalar type pandas hands back; the tensor stays float as before
            return embedding, torch.Tensor([float(row['Sentiment'])])
        return embedding

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# let's create the training dataset

from torch.utils.data import DataLoader

train_dir = os.path.join(HOME, 'train.tsv')
test_dir = os.path.join(HOME, 'test.tsv')

# train.tsv is loaded with labels, test.tsv without them (train=False)
TRAIN_DATASET = ReviewDataset(train_dir)
TEST_DATASET = ReviewDataset(test_dir, train=False)

# data iterator
train_dataloader = DataLoader(TRAIN_DATASET, batch_size=32, shuffle=True, num_workers=0)

# sanity check: print the label tensors of the first two batches, then stop
for i, (X, y) in enumerate(train_dataloader, start=1):
    print(y)
    if i == 2:
        break

tensor([[3.],
        [2.],
        [2.],
        [2.],
        [2.],
        [0.],
        [1.],
        [2.],
        [1.],
        [2.],
        [1.],
        [2.],
        [3.],
        [1.],
        [4.],
        [3.],
        [3.],
        [3.],
        [3.],
        [1.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [1.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [3.]])
tensor([[2.],
        [3.],
        [2.],
        [2.],
        [3.],
        [3.],
        [1.],
        [2.],
        [1.],
        [4.],
        [3.],
        [0.],
        [3.],
        [2.],
        [2.],
        [1.],
        [1.],
        [3.],
        [2.],
        [4.],
        [4.],
        [2.],
        [0.],
        [2.],
        [3.],
        [2.],
        [2.],
        [3.],
        [2.],
        [2.],
        [3.],
        [3.]])


In [None]:
# let's see the performance of a simple RNN on this task
