# Using Self Attention

Here we use a simple dataset to train a sentiment model using self-attention.

In [1]:
# !pip install datasets

## Importing the Dataset

In [2]:
from datasets import load_dataset_builder, load_dataset, get_dataset_split_names

In [3]:
# Dataset identifier handed to the Hugging Face `datasets` helpers in the cells below.
ds_name = "sentiment140"

In [4]:
# Create the dataset builder so its metadata (`.info`) can be inspected in the next cells.
tdata_builder = load_dataset_builder(ds_name)

In [5]:
# Display the dataset's description string.
tdata_builder.info.description

'Sentiment140 consists of Twitter messages with emoticons, which are used as noisy labels for\nsentiment classification. For more detailed information please refer to the paper.\n'

In [6]:
# Display the feature schema: field names and their value dtypes.
tdata_builder.info.features

{'text': Value(dtype='string', id=None),
 'date': Value(dtype='string', id=None),
 'user': Value(dtype='string', id=None),
 'sentiment': Value(dtype='int32', id=None),
 'query': Value(dtype='string', id=None)}

In [7]:
# List the split names available for this dataset.
get_dataset_split_names(ds_name)

['train', 'test']

In [8]:
# Load the "train" split with torch formatting applied. Reuse `ds_name` so the
# dataset identifier is defined in exactly one place and cannot drift from the
# builder / split-name calls above.
source_dataset = load_dataset(ds_name, split="train").with_format("torch")

Found cached dataset sentiment140 (/home/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997)


In [9]:
# Display the dataset summary (feature names and number of rows).
source_dataset

Dataset({
    features: ['text', 'date', 'user', 'sentiment', 'query'],
    num_rows: 1600000
})

In [10]:
# Inspect one example row (index 22313) to see what a raw item looks like.
source_dataset[22313]

{'text': "@lauraJaynebaker @robertcjenkin I'm fine, I've just got so many medicines to take   Trying to convince my parents I'm ok to go back to Ed!",
 'date': 'Sun Apr 19 04:14:40 PDT 2009',
 'user': 'Gabi_Jones',
 'sentiment': tensor(0),
 'query': 'NO_QUERY'}

In [11]:
## Load the Word Embeddings
import pickle
import torch
import torch.nn as nn
from pathlib import Path
from lib.glove import GloveEmbeddings

In [12]:
# Location of the pickled embeddings object under the user's home directory.
glove_pkl_path = Path.home()/"data"/"glove.pkl"

# NOTE: unpickling can execute arbitrary code -- only load a pickle file you
# created yourself or otherwise trust.
# NOTE(review): presumably this is a pickled `GloveEmbeddings` instance, which is
# why that class is imported above -- confirm.
# The `with` block makes sure the file handle is closed once loading is done.
with open(glove_pkl_path, "rb") as glove_file:
    glove = pickle.load(glove_file)

In [13]:
# Embed the example row's text and check the shape of the resulting tensor
# (presumably (tokens, embedding_dim) -- TODO confirm against `GloveEmbeddings.make`).
glove.make(source_dataset[22313]['text']).shape

torch.Size([24, 50])

In [14]:
class TwitterDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding fixed-size embedded texts with their labels.

    Each item's ``'text'`` is embedded with ``embedder.make(text)``, truncated to
    at most ``max_len`` rows and copied into a zero tensor of shape
    ``(max_len, embedding_dim)``, so every sample has the same shape and can be
    batched by a ``DataLoader``.

    Args:
        source: Indexable collection with ``len()`` whose items are mappings
            containing ``'text'`` and ``'sentiment'`` entries.
        max_len: Number of rows in every returned tensor; longer embeddings are
            truncated, shorter ones are zero-padded at the end.
        embedding_dim: Number of columns in every returned tensor. Must match
            the width of the tensors produced by the embedder.
        embedder: Object exposing ``make(text)`` that returns a 2-D tensor.
            When ``None`` (the default) the notebook-level ``glove`` object is
            looked up at item-access time.
    """

    def __init__(self, source, max_len=240, embedding_dim=50, embedder=None):
        super().__init__()
        self.source = source
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.embedder = embedder

    def __len__(self):
        """Return the number of items in the wrapped source."""
        return len(self.source)

    def __getitem__(self, index):
        """Return ``(padded_embeddings, sentiment)`` for the item at ``index``.

        ``padded_embeddings`` has shape ``(max_len, embedding_dim)``; the
        sentiment value is passed through exactly as stored in the source.
        """
        raw_item = self.source[index]

        # Fall back to the notebook global only when no embedder was injected.
        embedder = self.embedder if self.embedder is not None else glove

        # Truncate so the copy below never exceeds the padded buffer.
        item = embedder.make(raw_item['text'])[0:self.max_len, :]

        # Rows past len(item) stay zero and act as padding.
        padded = torch.zeros(self.max_len, self.embedding_dim)
        padded[0:len(item)] = item

        return padded, raw_item['sentiment']

In [15]:
# Wrap the loaded split so each item becomes a (padded embeddings, sentiment) pair.
train_dataset = TwitterDataset(source_dataset)

In [16]:
# Pull one small shuffled batch to use as a shape-checking fixture for the models
# below. The sampler is seeded so that Restart & Run All yields the same batch.
_preview_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=3,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
# Each batch is (inputs, labels); keep only the inputs.
test_inputs = next(iter(_preview_loader))[0]
test_inputs.shape

torch.Size([3, 240, 50])

## Training with a Linear Model (Baseline)

In [17]:
class BaseLineModel(nn.Module):
    """Baseline: a per-position linear layer + ReLU, averaged over dimension 1.

    Args:
        embedding_dim: Size of the last dimension of the input (default 50).
        num_outputs: Number of values produced per example (default 5).

    For an input of shape ``(batch, seq_len, embedding_dim)`` the output has
    shape ``(batch, num_outputs)``.
    """

    def __init__(self, embedding_dim=50, num_outputs=5):
        super().__init__()
        # Kept as `self.net` = Sequential(Linear, ReLU) so parameter names
        # (`net.0.weight`, `net.0.bias`) are unchanged.
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, num_outputs),
            nn.ReLU()
        )

    def forward(self, inputs):
        """Apply the layer to every position, then mean-pool over dimension 1."""
        result = self.net(inputs)
        # NOTE(review): the mean runs over every position on dim 1, including any
        # zero-padded rows in the input; those contribute ReLU(bias) to the
        # average. Consider a masked mean if padding dominates.
        result = torch.mean(result, dim=1)
        return result
    
# Smoke test: run an untrained model on the preview batch and check the output shape.
_m = BaseLineModel()
_m(test_inputs).shape

torch.Size([3, 5])