# Collate Function and Dataloader Class

In this notebook, we develop the appropriate collate function and data classes for this project. A **collate function** is how the dataloader processes the examples it draws from the dataset, combining them into a single batch.

`Dataset` and `DataLoader` are PyTorch classes that provide utilities for iterating through and sampling from a dataset.

In [7]:
import random

import numpy as np
import pandas as pd
import torch

In [3]:
SEED = 1234  # Specify a seed for reproducibility

# Seed every random number generator the notebook can touch so that a
# Restart & Run All produces the same results.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and turn off its autotuner, which can
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## 0. Download Data

In [26]:
# Read the raw articles and the per-article VADER sentiment summaries.
articles = pd.read_csv('data/articles.csv')
scores = pd.read_csv('data/articles_w_scores.csv')

# Keep only the article id and the VADER statistics from `scores`, then attach
# the article columns. No `on=` is passed, so pandas joins on the columns the
# two frames have in common (presumably just 'articleID' -- verify).
data = pd.merge(
    scores[['articleID', 'vaderMean', 'vaderStd', 'vaderIQR']],
    articles,
)

In [43]:
# Build the list of [label, text] samples: the label is the article's VADER
# standard deviation, and the text is the headline unless the headline is the
# placeholder 'Unknown', in which case the snippet is used instead.
# The selection is done column-wise rather than with iterrows(), which
# materialises a Series for every row.
article_texts = data['headline'].where(data['headline'] != 'Unknown', data['snippet'])
data_vader_std = [
    [std, text]
    for std, text in zip(data['vaderStd'].tolist(), article_texts.tolist())
]

In [44]:
# Peek at the first [vaderStd, text] sample to sanity-check the structure.
data_vader_std[0]

[0.5119524453713237,
 'Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell']

## 1. Create Custom Dataset Object (PyTorch)

In [4]:
from torch.utils.data import Dataset

In [5]:
class ProjectDataset(Dataset):
    """Minimal ``Dataset`` backed by an in-memory list of samples.

    Samples are handed back exactly as they were stored; no transformation is
    applied on access.
    """

    def __init__(self, data: list) -> None:
        """Keep a reference to ``data``, which must be a ``list``."""
        assert isinstance(data, list)
        self.samples = data

    def __len__(self) -> int:
        """Return how many samples are stored."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.samples[idx]

In [45]:
# Wrap the list of [vaderStd, text] samples in the custom Dataset.
data_object = ProjectDataset(data_vader_std)

## 2. Collate Function

### 2.1 Bag of Words

In [1]:
#Bag of Words
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Tokenizer used by get_vocab and by both collate functions in this notebook
# (torchtext's 'basic_english' preset).
tokenizer = get_tokenizer('basic_english')

def get_vocab(training_data, min_freq=1000):
    """Build a vocabulary from the tokens that occur in ``training_data``.

    Relies on the module-level ``tokenizer``.

    Args:
        training_data: Iterable of ``(label, line)`` pairs, where ``line`` is
            the raw text. The label is ignored.
        min_freq: Minimum number of occurrences a token needs in order to be
            kept. Defaults to 1000, the value that used to be hard-coded.

    Returns:
        A ``Vocab`` built from the token counts.
    """
    counter = Counter()
    # Iterate over the argument itself: the previous body looped over an
    # undefined global (`train_iter`) and never used `training_data`.
    for _label, line in training_data:
        counter.update(tokenizer(line))
    return Vocab(counter, min_freq=min_freq)


def collate_into_bow(batch):
    """Collate ``(label, line)`` pairs into normalised bag-of-words rows.

    Relies on the module-level ``tokenizer`` and on a module-level ``vocab``
    (for example the result of ``get_vocab``) being defined before the first
    call.

    Args:
        batch: Sequence of ``(label, line)`` pairs.

    Returns:
        A tuple ``(labels, bag_vector)``. ``labels`` is a tensor holding
        ``label - 1`` for every example. ``bag_vector`` has shape
        ``(len(batch), len(vocab))``; each row holds the token counts of one
        line divided by that line's total count. A line that yields no tokens
        produces an all-zero row.
    """
    # NOTE(review): the `- 1` shift looks like it was written for 1-indexed
    # class labels; confirm it is intended for the labels used in this project.
    labels = [label - 1 for label, _line in batch]

    bag_vector = torch.zeros((len(batch), len(vocab)))
    for i, (_label, line) in enumerate(batch):
        for w in tokenizer(line):
            bag_vector[i, vocab[w]] += 1

    # Counts are whole numbers, so a row total is either 0 or >= 1. Clamping at
    # 1 leaves non-empty rows unchanged and avoids 0/0 = NaN for empty lines.
    totals = bag_vector.sum(dim=1, keepdim=True).clamp(min=1)
    return torch.tensor(labels), bag_vector / totals

### 2.2 Continuous Bag of Words using GloVe

In [None]:
from torchtext.vocab import GloVe
# Pretrained GloVe vectors ('6B' set) used by collate_into_cbow below.
# NOTE(review): torchtext presumably downloads and caches these on first use --
# confirm the cache location and size before running on a fresh machine.
glove = GloVe(name='6B')

def collate_into_cbow(batch):
    """Collate ``(label, line)`` pairs into averaged GloVe embeddings.

    Relies on the module-level ``tokenizer`` and ``glove`` objects.

    Args:
        batch: Sequence of ``(label, line)`` pairs.

    Returns:
        A tuple ``(labels, cbag_vector)``. ``labels`` is a tensor holding
        ``label - 1`` for every example. ``cbag_vector`` has one row per
        example containing the mean of the GloVe vectors of the line's tokens.
        A line that yields no tokens gets an all-zero row. An empty batch
        returns an empty tensor, as before.
    """
    labels = []
    rows = []
    for label, line in batch:
        # NOTE(review): the `- 1` shift looks like it was written for
        # 1-indexed class labels; confirm it is intended for this project.
        labels.append(label - 1)
        words = tokenizer(line)
        if words:
            vecs = glove.get_vecs_by_tokens(words)
            rows.append(vecs.sum(dim=0) / vecs.shape[0])
        else:
            # No tokens to average: use a zero vector instead of failing on an
            # empty lookup / division by zero.
            rows.append(torch.zeros(glove.dim))

    # Stack once at the end instead of growing a tensor with torch.cat on
    # every iteration.
    cbag_vector = torch.stack(rows) if rows else torch.tensor([])
    return torch.tensor(labels), cbag_vector