<a href="https://colab.research.google.com/github/archyyu/encoder-related/blob/main/encoder_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import math
import pandas as pd

In [2]:
# Download the chat sentiment CSV; later cells read its `message` and `sentiment` columns.
csv_url = 'https://raw.githubusercontent.com/archyyu/publicResource/main/chat_dataset.csv'
df = pd.read_csv(csv_url)

In [3]:
# Hyperparameters shared by the model-definition and training cells.
hidden_size = 100  # not used by any active code in the visible cells
embedding_dim = 100  # passed to Encoder as its embedding_size
seq_length = 25  # not used by any active code in the visible cells
learning_rate = 1e-1  # step size handed to optim.Adagrad
batch_size = 20  # number of consecutive samples get_batch() returns per call
dropout = 0.1  # probability given to the nn.Dropout layers in the attention and feed-forward modules
eval_iters = 200  # not used by any active code in the visible cells
head_size = 20  # output width of each attention head; head count is embedding_size // head_size

In [4]:
# Turn the raw dataframe into index sequences (X) and label indices (Y).
pad = '<pad>'

# Pull whole columns at once instead of walking the frame row by row.
data = list(df['message'])
targets = list(df['sentiment'])

# Label <-> index maps, ordered by the sorted label values.
targetset = sorted(set(targets))
sentiment_to_index = {s: i for i, s in enumerate(targetset)}
index_to_sentiment = {i: s for i, s in enumerate(targetset)}

# Vocabulary: every distinct space-separated token, with the padding token
# appended last so it receives the highest index.
dataset = sorted(set(' '.join(data).split(' ')))
dataset.append(pad)
vocab_size = len(dataset)
word_to_index = {w: i for i, w in enumerate(dataset)}
index_to_word = {i: w for i, w in enumerate(dataset)}

# Tokenise each message on single spaces (same rule as the vocabulary above).
lines = [item.split(' ') for item in data]

max_line = max(len(line) for line in lines)

# Right-pad every message in place so all of them have max_line tokens.
for line in lines:
  line.extend([pad] * (max_line - len(line)))

X = [[word_to_index[word] for word in line] for line in lines]

Y = [sentiment_to_index[target] for target in targets]


In [42]:
class AttentionHead(nn.Module):
  """A single scaled dot-product self-attention head.

  The input is projected to queries, keys and values of width ``head_size``
  (no bias). Attention scores are scaled by ``head_size ** -0.5``; positions
  where ``mask == 0`` are set to -1e9 before the softmax.
  """

  def __init__(self, embedding_size, head_size):
    super().__init__()
    self.head_size = head_size
    self.C = embedding_size

    # Created in the order q, v, k so parameter initialisation under a fixed
    # seed stays the same.
    self.q = nn.Linear(embedding_size, head_size, bias=False)
    self.v = nn.Linear(embedding_size, head_size, bias=False)
    self.k = nn.Linear(embedding_size, head_size, bias=False)

  def forward(self, x, mask):
    """Return the attention output for a rank-3 input ``x``.

    ``mask`` must broadcast against the square score matrix built from the
    last two dimensions of the projected input.
    """
    # Unpacking doubles as a check that x has exactly three dimensions.
    _batch, _steps, _channels = x.shape

    queries = self.q(x)
    keys = self.k(x)
    values = self.v(x)

    scale = self.head_size ** -0.5
    scores = queries @ keys.transpose(-2, -1) * scale
    scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)

    return weights @ values

class EncoderMultiHeadAttention(nn.Module):
  """Runs several attention heads side by side and mixes their outputs.

  The number of heads is ``embedding_size // head_size``. Head outputs are
  concatenated along the last dimension, passed through a linear layer back
  to ``embedding_size`` and then through dropout.
  """

  def __init__(self, embedding_size, head_size):
    super().__init__()
    self.num_heads = embedding_size // head_size

    head_modules = []
    for _ in range(self.num_heads):
      head_modules.append(AttentionHead(embedding_size, head_size))
    self.heads = nn.ModuleList(head_modules)

    concat_width = self.num_heads * head_size
    self.final_linear = nn.Linear(concat_width, embedding_size)
    # `dropout` is the module-level hyperparameter.
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask):
    """Apply every head to ``x`` with the same ``mask`` and merge the results."""
    per_head = tuple(head(x, mask) for head in self.heads)
    merged = torch.cat(per_head, dim=-1)
    projected = self.final_linear(merged)
    return self.dropout(projected)

class FeedForward(nn.Module):
  """Two-layer feed-forward network: Linear -> ReLU -> Linear -> Dropout.

  The hidden layer is four times as wide as ``embedding_size``; the output
  has width ``embedding_size`` again.
  """

  def __init__(self, embedding_size):
    super().__init__()
    inner_size = 4 * embedding_size
    layers = [
        nn.Linear(embedding_size, inner_size),
        nn.ReLU(),
        nn.Linear(inner_size, embedding_size),
        # `dropout` is the module-level hyperparameter.
        nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    """Run ``x`` through the feed-forward stack."""
    out = self.net(x)
    return out

class AddAndNormLayer(nn.Module):
  """Residual wrapper that layer-normalises the input before a sub-layer.

  Computes ``x + subLayer(norm(x))``; the sub-layer additionally receives
  ``mask`` when one is supplied.
  """

  def __init__(self, embedding_size):
    super(AddAndNormLayer, self).__init__()
    self.norm = nn.LayerNorm(embedding_size)

  def forward(self, x, subLayer: nn.Module, mask=None):
    """Apply ``subLayer`` to the normalised input and add the residual.

    Args:
      x: input tensor whose last dimension is ``embedding_size``.
      subLayer: callable taking the normalised input (and ``mask`` when given).
      mask: optional mask forwarded unchanged to ``subLayer``.
    """
    normed = self.norm(x)
    # Identity check: `mask == None` would compare a tensor to None with `==`
    # and only works by accident of how that comparison falls back.
    if mask is None:
      return x + subLayer(normed)
    return x + subLayer(normed, mask)

class EncoderBlockAttention(nn.Module):
  """One encoder block: masked multi-head attention, then a feed-forward net.

  Each of the two sub-layers is wrapped in its own AddAndNormLayer.
  """

  def __init__(self, embedding_size, head_size):
    super().__init__()
    self.multiheads = EncoderMultiHeadAttention(embedding_size, head_size)
    self.fw = FeedForward(embedding_size)
    self.addnormlayers = nn.ModuleList(
        AddAndNormLayer(embedding_size) for _ in range(2)
    )

  def forward(self, x, mask):
    """Run attention (with ``mask``) and then the feed-forward net on ``x``."""
    attention_wrap, feedforward_wrap = self.addnormlayers
    attended = attention_wrap(x, self.multiheads, mask)
    return feedforward_wrap(attended, self.fw)


class Encoder(nn.Module):
  """Transformer-style encoder that classifies a token-index sequence.

  Token embeddings plus fixed sinusoidal position encodings are passed
  through a stack of EncoderBlockAttention blocks, a final LayerNorm and a
  bias-free linear layer; the logits of the last sequence position are
  returned.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_size: width of the token embeddings and of every block.
    output_size: number of logits produced per sequence.
    head_size: width of each attention head.
    num_blocks: number of stacked encoder blocks (default 4).
  """

  def __init__(self, vocab_size, embedding_size, output_size, head_size, num_blocks=4):
    super(Encoder, self).__init__()
    self.embedding_size = embedding_size
    self.em = nn.Embedding(vocab_size, embedding_size)
    # Positions use the fixed sinusoidal table from positionEncoding(), not a
    # learned position embedding.
    self.blocks = nn.ModuleList([EncoderBlockAttention(embedding_size, head_size) for _ in range(num_blocks)])
    self.f_norm = nn.LayerNorm(embedding_size)
    self.fw = nn.Linear(embedding_size, output_size, bias=False)

  def positionEncoding(self, x, embedding_size):
    """Return the (T, embedding_size) sinusoidal position table for ``x``.

    ``x`` is a 2-D tensor whose second dimension is the sequence length T;
    only its shape and device are used. Column ``2*i`` holds
    ``sin(k / 10000**(2*i/C))`` and column ``2*i + 1`` the matching cosine
    for position ``k``. With an odd ``embedding_size`` the last column stays
    zero.
    """
    _, T = x.shape
    C = embedding_size
    half = C // 2
    n = 10000.0

    # Vectorised: one tensor expression instead of a T * C/2 Python loop of
    # scalar tensor operations on every forward pass.
    positions = torch.arange(T, device=x.device).unsqueeze(1)   # (T, 1)
    exponents = 2 * torch.arange(half, device=x.device) / C      # (half,)
    angles = positions / torch.pow(n, exponents)                 # (T, half)

    # Built on the input's device so it can be added to the embeddings there.
    encoding = torch.zeros((T, C), device=x.device)
    encoding[:, 0:2 * half:2] = torch.sin(angles)
    encoding[:, 1:2 * half:2] = torch.cos(angles)
    return encoding

  def forward(self, x, mask):
    """Return one row of ``output_size`` logits per sequence in the batch.

    Args:
      x: 2-D tensor of token indices (batch, sequence length).
      mask: attention mask forwarded to every block.
    """
    _, T = x.shape
    x_em = self.em(x)
    x_pm = self.positionEncoding(x, self.embedding_size)
    x = x_em + x_pm  # the (T, C) table broadcasts over the batch dimension
    for block in self.blocks:
      x = block(x, mask)
    x = self.f_norm(x)
    x = self.fw(x)
    # NOTE(review): logits are read from the final position; if sequences are
    # right-padded that position is usually a pad token - confirm this is the
    # intended pooling.
    return x[:, T - 1, :].clone().squeeze(1)

# Loss, model and optimizer for the sentiment classifier.
criterion = nn.CrossEntropyLoss()

# One output logit per distinct sentiment label, so the model keeps matching
# the label indices if the dataset's label set changes.
model = Encoder(vocab_size, embedding_dim, len(targetset), head_size)
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [33]:
def get_batch():
  """Return one random batch of consecutive samples from X / Y.

  Returns:
    A tuple ``(inputs, masks, targets)``: the token-index rows of the batch,
    a boolean tensor of the same shape that is True where the token is not
    the padding token, and the label index of each row.

  Raises:
    ValueError: if there are fewer than ``batch_size`` samples.
  """
  if len(X) < batch_size:
    raise ValueError(
        f'need at least batch_size={batch_size} samples, got {len(X)}')

  pad_index = word_to_index[pad]

  # torch.randint's upper bound is exclusive; the +1 lets the final window
  # (and therefore the last sample) be drawn too.
  start = torch.randint(len(X) - batch_size + 1, [1]).item()
  stop = start + batch_size

  inputs = torch.tensor(X[start:stop])
  targets = torch.tensor(Y[start:stop])
  masks = inputs != pad_index

  return inputs, masks, targets

In [43]:
# Training loop: one optimizer step per random batch.
n_iters = 10000
log_every = 200

# Make sure dropout is active even if the model was switched to eval mode.
model.train()

for i in range(n_iters):
  # NOTE: `targets` rebinds the module-level name of the same spelling.
  inputs, masks, targets = get_batch()

  # Add a singleton dimension so the mask broadcasts over the query
  # positions of each attention score matrix.
  masks = masks.unsqueeze(1)

  # Clear old gradients before the forward/backward pass.
  optimizer.zero_grad(set_to_none=True)

  predicts = model(inputs, masks)
  loss = criterion(predicts, targets)

  loss.backward()
  optimizer.step()

  if i % log_every == 0:
    print(f'i {i}, loss:{loss.item()}')

i 0, loss:1.2257578372955322
i 200, loss:0.6134036779403687
i 400, loss:0.3400978446006775
i 600, loss:0.00831008143723011
i 800, loss:0.03650485724210739
i 1000, loss:0.0056799426674842834
i 1200, loss:0.00044404398067854345
i 1400, loss:0.0002671200199984014


KeyboardInterrupt: 

In [49]:
# Classify a single hand-written sentence with the trained model.
hhh = "it is a wonderful of time"

# Every word must already be in the vocabulary; an unseen word raises KeyError.
hgg = [word_to_index[word] for word in hhh.split(' ')]
hgg.extend([word_to_index[pad]] * (max_line - len(hgg)))

hgg = torch.tensor(hgg).unsqueeze(0)

# Same (batch, 1, sequence) mask layout the training loop feeds to the model.
mask = (hgg != word_to_index[pad]).unsqueeze(1)

# Predict with dropout disabled and without autograd tracking, then restore
# whatever mode the model was in so later training is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
  pred = model(hgg, mask)
model.train(was_training)

print(index_to_sentiment[torch.argmax(pred).item()])

positive


I watched a very interesting YouTube video about transformers, and now I think I could improve this model in several ways:
1: mask the padding tokens, because padding tokens should not be attended to.
2: optimize the multi-head attention

I have optimized the multi-head attention.

Now I think I should optimize the attention part: if a token is a pad token, the other tokens should not pay attention to it.

The tricky part is that the number of pad tokens varies from line to line,
so for each line I should also pass the start index of the pad tokens; the attention head can then use that index to decide which tokens to mask.

let's do that

OK, I optimized the code in several respects:

1: encapsulated the Add & Norm layer

2: added the positional encoding. I am not sure why Andrej did not do that; he uses an embedding to learn the position encoding, but I think the fixed positional encoding is much better

3: added masking of the pad tokens, so that pad tokens cannot draw attention from the actual tokens.

I hope that will be helpful.

I will also do the following things:

1: split the data into training and validation sets


All of these things are preparation for the translation model, because my translation model is still not doing well.