<a href="https://colab.research.google.com/github/archyyu/encoder-related/blob/main/attention_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import math
import pandas as pd

In [2]:
# Download the chat dataset; later cells read its 'message' and 'sentiment' columns.
df = pd.read_csv('https://raw.githubusercontent.com/archyyu/publicResource/main/chat_dataset.csv')

In [3]:
# Hyperparameters
hidden_size = 100  # not referenced in the visible cells
embedding_dim = 80  # token embedding width, passed to Encoder as embedding_size
seq_length = 25  # not referenced in the visible cells (sentences are padded to max_line instead)
learning_rate = 1e-1  # step size given to the Adagrad optimizer
batch_size = 20  # number of sentences returned by get_batch()
dropout = 0.1  # dropout probability, read as a global by the attention and feed-forward layers
eval_iters = 200  # not referenced in the visible cells
num_heads = 4  # attention heads per encoder block
head_size = 20  # per-head projection width; num_heads * head_size equals embedding_dim here

In [23]:
# Special tokens: the padding filler and the placeholder that hides a word.
pad = '[pad]'
mask = '[MASK]'

# Sentences and their sentiment labels, in dataset row order.
data = list(df['message'])
targets = list(df['sentiment'])

# Number of space-separated tokens in each sentence (before padding).
datalen = [len(sentence.split(' ')) for sentence in data]

# Bidirectional lookup between sentiment labels and integer ids.
targetset = sorted(set(targets))
sentiment_to_index = {label: idx for idx, label in enumerate(targetset)}
index_to_sentiment = dict(enumerate(targetset))

In [5]:
# Vocabulary: every distinct space-separated token in sorted order,
# followed by the two special tokens.
all_tokens = ' '.join(data).split(' ')
dataset = sorted(set(all_tokens)) + [pad, mask]
vocab_size = len(dataset)

# Bidirectional lookup between tokens and integer ids.
word_to_index = {word: idx for idx, word in enumerate(dataset)}
index_to_word = dict(enumerate(dataset))

pad_index = word_to_index[pad]

# 90/10 split of the raw sentences.
n = int(len(data) * 0.9)
training_data, val_data = data[:n], data[n:]

In [6]:
# Tokenise every sentence on single spaces.
lines = [sentence.split(' ') for sentence in data]

# Right-pad each token list in place up to the longest sentence.
max_line = max(len(tokens) for tokens in lines)
for tokens in lines:
  tokens.extend([pad] * (max_line - len(tokens)))

# X: padded sentences as lists of token ids; Y: sentiment ids, row-aligned with X.
X = [[word_to_index[word] for word in tokens] for tokens in lines]
Y = [sentiment_to_index[label] for label in targets]

In [12]:
class AttentionHead(nn.Module):
  """One scaled dot-product self-attention head.

  Projects the input to queries, keys and values of width ``head_size``,
  scores every query against every key, and returns the softmax-weighted
  sum of the values.
  """

  def __init__(self, embedding_size, head_size):
    super().__init__()
    self.head_size = head_size
    self.C = embedding_size

    # Bias-free projections, created in the order q, v, k.
    self.q = nn.Linear(embedding_size, head_size, bias=False)
    self.v = nn.Linear(embedding_size, head_size, bias=False)
    self.k = nn.Linear(embedding_size, head_size, bias=False)

  def forward(self, x, mask):
    """Attend over ``x``; score entries where ``mask == 0`` are set to -1e9.

    ``x`` must be 3-D (the shape unpacking below fails otherwise) and
    ``mask`` must broadcast against the square score matrix.
    """
    _batch, _steps, _width = x.shape
    queries, keys, values = self.q(x), self.k(x), self.v(x)

    scale = self.head_size ** -0.5
    scores = queries @ keys.transpose(-2, -1) * scale
    # A large negative score becomes (numerically) zero weight after softmax.
    scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)

    return weights @ values

class EncoderMultiHeadAttention(nn.Module):
  """Runs several AttentionHead modules side by side and merges their outputs.

  Head outputs are concatenated along the last dimension, mapped back to
  ``embedding_size`` by a linear layer, and passed through dropout (the
  rate comes from the module-level ``dropout`` value).
  """

  def __init__(self, num_heads, embedding_size, head_size):
    super().__init__()
    self.num_heads = num_heads

    head_modules = [AttentionHead(embedding_size, head_size) for _ in range(num_heads)]
    self.heads = nn.ModuleList(head_modules)

    self.final_linear = nn.Linear(num_heads * head_size, embedding_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask):
    """Apply every head to ``x`` with the same ``mask`` and merge the results."""
    per_head = [head(x, mask) for head in self.heads]
    merged = torch.cat(per_head, dim=-1)
    return self.dropout(self.final_linear(merged))

class FeedForward(nn.Module):
  """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout.

  The dropout rate comes from the module-level ``dropout`` value.
  """

  def __init__(self, embedding_size):
    super().__init__()
    inner_size = 4 * embedding_size
    stages = [
        nn.Linear(embedding_size, inner_size),
        nn.ReLU(),
        nn.Linear(inner_size, embedding_size),
        nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*stages)

  def forward(self, x):
    """Run ``x`` through the stacked layers."""
    return self.net(x)

class AddAndNormLayer(nn.Module):
  """Residual wrapper that layer-normalises the input before a sub-layer.

  Computes ``x + subLayer(norm(x))``, i.e. the normalisation is applied to
  the sub-layer's input and the un-normalised ``x`` is added back.
  """

  def __init__(self, embedding_size):
    super().__init__()
    self.norm = nn.LayerNorm(embedding_size)

  def forward(self, x, subLayer: nn.Module, mask=None):
    """Apply ``subLayer`` to the normalised input and add the residual.

    Args:
      x: input tensor whose last dimension is ``embedding_size``.
      subLayer: callable invoked as ``subLayer(h)`` when ``mask`` is None,
        otherwise as ``subLayer(h, mask)``.
      mask: optional mask forwarded untouched to ``subLayer``.
    """
    normed = self.norm(x)
    # Identity test: `mask == None` would go through Tensor.__eq__ when a
    # tensor mask is supplied, which is not a reliable None check.
    if mask is None:
      return x + subLayer(normed)
    return x + subLayer(normed, mask)

class EncoderBlockAttention(nn.Module):
  """One encoder block: multi-head self-attention followed by a feed-forward net.

  Each of the two sub-layers is wrapped in its own AddAndNormLayer.
  """

  def __init__(self, num_heads, embedding_size, head_size):
    super().__init__()
    self.multiheads = EncoderMultiHeadAttention(num_heads, embedding_size, head_size)
    self.fw = FeedForward(embedding_size)
    wrappers = [AddAndNormLayer(embedding_size) for _ in range(2)]
    self.addLayers = nn.ModuleList(wrappers)

  def forward(self, x, mask):
    """Run attention (with ``mask``) and then the feed-forward sub-layer."""
    attention_wrapper, feed_forward_wrapper = self.addLayers
    attended = attention_wrapper(x, self.multiheads, mask)
    return feed_forward_wrapper(attended, self.fw)

class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position signals to a sequence of embeddings.

  For position ``k`` and pair index ``i`` the table holds
  ``sin(k / 10000**(2i/d_model))`` in column ``2i`` and the matching cosine
  in column ``2i + 1``. When ``d_model`` is odd the last column stays zero.
  The table is stored as the non-trainable buffer ``pe``.

  Args:
    d_model: embedding width.
    seq_len: number of positions in the table (longest supported sequence).
  """

  def __init__(self, d_model, seq_len):
    super().__init__()
    self.C = d_model
    n = 10000.0
    half = d_model // 2

    # Build the whole table with broadcasting instead of a per-element loop.
    positions = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
    exponents = 2 * torch.arange(half, dtype=torch.float32) / d_model
    denominators = torch.pow(n, exponents)
    angles = positions / denominators

    pe = torch.zeros((seq_len, d_model))
    pe[:, 0:2 * half:2] = torch.sin(angles)
    pe[:, 1:2 * half:2] = torch.cos(angles)

    # A buffer follows the module across devices and is saved in its
    # state_dict without being treated as a trainable parameter.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return ``x`` plus the position signal for its first ``T`` positions.

    ``x`` is expected to end in dimensions ``(T, d_model)`` with
    ``T <= seq_len``; only the first ``T`` rows of the table are used, so
    sequences shorter than ``seq_len`` are accepted.
    """
    return x + self.pe[:x.size(-2)]

class Encoder(nn.Module):
  """Transformer encoder that outputs vocabulary logits for every position.

  Pipeline: token embedding -> positional encoding -> ``num_layers`` encoder
  blocks -> final LayerNorm -> bias-free linear projection to ``vocab_size``.

  Args:
    num_heads: attention heads per block.
    vocab_size: number of token ids (embedding rows and output logits).
    embedding_size: model width.
    head_size: projection width of each attention head.
    seq_len: number of positions in the positional-encoding table.
    num_layers: number of stacked encoder blocks; defaults to 4, the depth
      that was previously hard-coded.
  """

  def __init__(self, num_heads, vocab_size, embedding_size, head_size, seq_len, num_layers=4):
    super().__init__()
    self.em = nn.Embedding(vocab_size, embedding_size)
    self.pe = PositionalEncoding(embedding_size, seq_len)
    self.embedding_size = embedding_size
    self.blocks = nn.ModuleList([
        EncoderBlockAttention(num_heads, embedding_size, head_size)
        for _ in range(num_layers)
    ])
    self.f_norm = nn.LayerNorm(embedding_size)
    self.fw = nn.Linear(embedding_size, vocab_size, bias=False)

  def forward(self, x, mask):
    """Map token ids to per-position vocabulary logits.

    Args:
      x: 2-D tensor of token ids (the shape unpacking below rejects any
        other rank).
      mask: attention mask handed unchanged to every encoder block.

    Returns:
      Tensor with one ``vocab_size``-wide logit vector per input position.
    """
    B, T = x.shape
    hidden = self.pe(self.em(x))
    for block in self.blocks:
      hidden = block(hidden, mask)
    return self.fw(self.f_norm(hidden))

# Cross-entropy between the logits predicted at a hidden position and the id of the hidden word.
criterion = nn.CrossEntropyLoss()

# max_line (token count of the longest sentence) is passed as seq_len, which sizes the positional-encoding table.
model = Encoder(num_heads, vocab_size, embedding_dim, head_size, max_line)
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [24]:
def get_batch():
  """Build one masked-word training batch from consecutive rows of ``X``.

  A random window of ``batch_size`` consecutive sentences is taken. In each
  sentence one real (non-padding) token is replaced by the mask token and
  its original id is kept as the prediction target.

  Returns:
    inputs:  (batch_size, T) token ids with one position masked per row.
    targets: (batch_size, 1) original ids of the masked tokens.
    indices: (batch_size, 2) ``[row_in_batch, masked_position]`` pairs.
    masks:   (batch_size, 1, T) booleans, False where the input is padding.
  """
  inputs = []
  targets = []
  mask_index_list = []
  masks = []
  mask_id = word_to_index[mask]

  # randint's upper bound is exclusive, so +1 keeps the last window reachable.
  start = torch.randint(len(X) - batch_size + 1, [1]).item()
  for i in range(batch_size):
    row = start + i
    tokens = torch.tensor(X[row])

    # Draw the position from THIS sentence's unpadded length so that a real
    # word is hidden, never a padding slot.
    position = torch.randint(datalen[row], [1]).item()

    # Read the target from the Python list: it must not alias `tokens`,
    # which is overwritten just below.
    targets.append(torch.tensor([X[row][position]]))
    mask_index_list.append(torch.tensor([i, position]))

    tokens[position] = mask_id
    inputs.append(tokens)
    masks.append(tokens != pad_index)

  return torch.stack(inputs), torch.stack(targets), torch.stack(mask_index_list), torch.stack(masks).unsqueeze(1)

In [25]:
# Masked-word training: each step hides one word per sentence and trains the
# model to recover it from the logits at the hidden position.
n_iters = 10000
model.train()  # keep dropout active even if the model was switched to eval mode earlier
for i in range(n_iters):
  # batch_* names keep the module-level `targets` list (sentiment labels)
  # from being overwritten by a tensor, so the data-encoding cell can be re-run.
  batch_inputs, batch_targets, batch_indices, batch_masks = get_batch()

  predicts = model(batch_inputs, batch_masks)

  # Pick, for every row of the batch, the logits at its masked position.
  outputs = predicts[batch_indices[:, 0], batch_indices[:, 1]]

  optimizer.zero_grad(set_to_none=True)

  loss = criterion(outputs, batch_targets.view(-1))

  loss.backward()

  optimizer.step()

  if i % 500 == 0:
    print(f'i {i}, loss:{loss.item()}')

i 0, loss:0.9897297024726868
i 500, loss:0.5745040774345398
i 1000, loss:0.5397576689720154
i 1500, loss:0.6856590509414673
i 2000, loss:0.5614961385726929
i 2500, loss:0.5877989530563354
i 3000, loss:0.349053293466568
i 3500, loss:0.8374935388565063
i 4000, loss:1.155908226966858
i 4500, loss:0.08496621251106262
i 5000, loss:0.43738698959350586
i 5500, loss:1.1929086446762085
i 6000, loss:0.7024846076965332
i 6500, loss:0.46525293588638306
i 7000, loss:0.8066798448562622
i 7500, loss:0.7971165776252747
i 8000, loss:0.7153050899505615
i 8500, loss:0.6752622723579407
i 9000, loss:0.701088547706604
i 9500, loss:1.0237014293670654


In [14]:
# Qualitative check: hide one random word in each of 15 consecutive sentences
# and print the model's three most likely replacements.
n = torch.randint(len(X) - 20, [1]).item()
was_training = model.training
model.eval()  # disable dropout so the predictions are deterministic
try:
  with torch.no_grad():  # inference only, no gradients needed
    # Forward slice: a window built from negative offsets comes out empty
    # or misplaced whenever n is small.
    for line in data[n:n + 15]:
      hgg = [word_to_index[word] for word in line.split(' ')]
      linelen = len(hgg)
      hgg.extend([pad_index] * (max_line - linelen))
      maskindex = torch.randint(linelen, [1]).item()
      hgg[maskindex] = word_to_index['[MASK]']
      print(' '.join([index_to_word[item] for item in hgg]))
      hgg = torch.tensor(hgg).unsqueeze(0)
      # Named attention_mask so the module-level `mask` token string that
      # get_batch() relies on is not overwritten; shape (1, 1, T) as in training.
      attention_mask = (hgg != pad_index).unsqueeze(1)
      predict = model(hgg, attention_mask)
      values, indexes = torch.topk(predict[0][maskindex], 3)
      for item in indexes:
        print('  ', index_to_word[item.item()])
finally:
  model.train(was_training)  # restore whichever mode the model was in


I'm feeling a bit stuck right [MASK] 🚶‍♀️ [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   now
   today
   [pad]
I'm not [MASK] what to do with my free time 🤷‍♂️ [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   sure
   what
   happy
I'm just trying to figure things out [MASK] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   my
   in
   🔍
I'm feeling a bit overwhelmed with [MASK] going on 😩 [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   out
   my
   everything
I'm not sure if I'm [MASK] the right decision 🤔 [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   making
   ready
   for
I'm just [MASK] things one step at a time 🚶‍♂️ [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   taking
   on
   impressed
I'm feeling a bit frustrated [MASK] 😤 [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]
   today
   😐
   now
I'm not sur

That is amazing.
I masked one word in each sentence, then trained the model to predict the masked word.

After some epochs, it works: even though it may not predict the exact word, it predicts a very similar word.

will continue

The same as encoder_sentiment,
updated with three things:
1: use a class to encapsulate the add-and-norm layer
2: mask the pad tokens, so the pad tokens do not receive attention
3: add positional encoding

overall, the performance is better now.

Here the transformer encoder is used as a BERT-style model.
Next I will build a BERT-style model with a bidirectional RNN
to check which model performs better.