# LSTM-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

In [1]:
# ! pip install seaborn
# ! pip install -U scikit-learn

import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv(os.path.join('arithmetic_train.csv'))
df_eval = pd.read_csv(os.path.join('arithmetic_eval.csv'))
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [3]:
# transform the input data to string
df_train['tgt'] = df_train['tgt'].apply(lambda x: str(x))
df_train['src'] = df_train['src'].add(df_train['tgt'])
df_train['len'] = df_train['src'].apply(lambda x: len(x))

df_eval['tgt'] = df_eval['tgt'].apply(lambda x: str(x))
df_eval['src'] = df_eval['src'].add(df_eval['tgt'])
df_eval['len'] = df_eval['src'].apply(lambda x: len(x))

# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.

In [4]:
char_to_id = {
    "0"  : 0,
    "1"  : 1,
    "2"  : 2,
    "3"  : 3,
    "4"  : 4,
    "5"  : 5,
    "6"  : 6,
    "7"  : 7,
    "8"  : 8,
    "9"  : 9,
    "<pad>": 10,
    "<eos>": 11,
    "+"  : 12,
    "-"  : 13,
    "*"  : 14,
    "("  : 15,
    ")"  : 16,
    "="  : 17,
    }
id_to_char = {
    0  : "0",
    1  : "1",
    2  : "2",
    3  : "3",
    4  : "4",
    5  : "5",
    6  : "6",
    7  : "7",
    8  : "8",
    9  : "9",
    10  : "<pad>",
    11  : "<eos>",
    12  : "+",
    13  : "-",
    14  : "*",
    15  : "(",
    16  : ")",
    17  : "=",
}

# write your code here
# Build a dictionary and give every token in the train dataset an id
# The dictionary should contain <eos> and <pad>
# char_to_id is to conver charactors to ids, while id_to_char is the opposite

vocab_size = len(char_to_id)
print('Vocab size{}'.format(vocab_size))

Vocab size18


# Data Preprocessing
 - The data is processed into the format required for the model's input and output.
 - Example: 1+2-3=0
     - Model input: 1 + 2 - 3 = 0
     - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
     - The key for the model's output is that the model does not need to predict the next character of the previous part. What matters is that once the model sees '=', it should start generating the answer, which is '0'. After generating the answer, it should also generate&lt;eos&gt;


In [5]:
def convert_to_id(equation):
    return [char_to_id[char] for char in equation]+ [char_to_id["<eos>"]]

def convert_to_char(equation):
    return [id_to_char[char] for char in equation]+ [char_to_id[11]]

def label_id_list(equation):
    parts = equation.split('=')
    rhs = parts[1]
    pad_list = [char_to_id['<pad>']] * (len(parts[0])+1)
    rhs_ids = [char_to_id[char] for char in rhs]
    label_ids = pad_list + rhs_ids
    return label_ids + [char_to_id["<eos>"]]

In [6]:
def label_id_list_shift(equation):
    equation.pop(0)
    equation.append(char_to_id['<pad>'])
    return equation

In [7]:
df_train['len'] = df_train['src'].apply(len)
df_train['char_id_list'] = df_train['src'].apply(convert_to_id)
df_train['label_id_list'] = df_train['src'].apply(label_id_list)
df_train.head()


Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2285313,14*(43+20)=882,882,14,"[1, 4, 14, 15, 4, 3, 12, 2, 0, 16, 17, 8, 8, 2...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8..."
1,317061,(6+1)*5=35,35,10,"[15, 6, 12, 1, 16, 14, 5, 17, 3, 5, 11]","[10, 10, 10, 10, 10, 10, 10, 10, 3, 5, 11]"
2,718770,13+32+29=74,74,11,"[1, 3, 12, 3, 2, 12, 2, 9, 17, 7, 4, 11]","[10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 4, 11]"
3,170195,31*(3-11)=-248,-248,14,"[3, 1, 14, 15, 3, 13, 1, 1, 16, 17, 13, 2, 4, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 13, 2..."
4,2581417,24*49+1=1177,1177,12,"[2, 4, 14, 4, 9, 12, 1, 17, 1, 1, 7, 7, 11]","[10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 7, 7, 11]"


In [8]:
df_eval['len'] = df_eval['src'].apply(len)
df_eval['char_id_list'] = df_eval['src'].apply(convert_to_id)
df_eval['label_id_list'] = df_eval['src'].apply(label_id_list)
df_eval.head()

Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2573208,48+43+34=125,125,12,"[4, 8, 12, 4, 3, 12, 3, 4, 17, 1, 2, 5, 11]","[10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 2, 5, 11]"
1,1630340,30-(48+13)=-31,-31,14,"[3, 0, 13, 15, 4, 8, 12, 1, 3, 16, 17, 13, 3, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,549277,(21*31)+10=661,661,14,"[15, 2, 1, 14, 3, 1, 16, 12, 1, 0, 17, 6, 6, 1...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 6..."
3,133957,2-27-10=-35,-35,11,"[2, 13, 2, 7, 13, 1, 0, 17, 13, 3, 5, 11]","[10, 10, 10, 10, 10, 10, 10, 10, 13, 3, 5, 11]"
4,1279828,(15*20)+24=324,324,14,"[15, 1, 5, 14, 2, 0, 16, 12, 2, 4, 17, 3, 2, 4...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 3..."


In [9]:
df_train['label_id_list'] = df_train['label_id_list'].apply(label_id_list_shift)
df_eval['label_id_list'] = df_eval['label_id_list'].apply(label_id_list_shift)
df_eval.head()

Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2573208,48+43+34=125,125,12,"[4, 8, 12, 4, 3, 12, 3, 4, 17, 1, 2, 5, 11]","[10, 10, 10, 10, 10, 10, 10, 10, 1, 2, 5, 11, 10]"
1,1630340,30-(48+13)=-31,-31,14,"[3, 0, 13, 15, 4, 8, 12, 1, 3, 16, 17, 13, 3, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 13, 3..."
2,549277,(21*31)+10=661,661,14,"[15, 2, 1, 14, 3, 1, 16, 12, 1, 0, 17, 6, 6, 1...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 6, 6,..."
3,133957,2-27-10=-35,-35,11,"[2, 13, 2, 7, 13, 1, 0, 17, 13, 3, 5, 11]","[10, 10, 10, 10, 10, 10, 10, 13, 3, 5, 11, 10]"
4,1279828,(15*20)+24=324,324,14,"[15, 1, 5, 14, 2, 0, 16, 12, 2, 4, 17, 3, 2, 4...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 3, 2,..."


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|64|
|`epochs`|Total number of epochs to train|10|
|`embed_dim`|Dimension of the word embeddings|256|
|`hidden_dim`|Dimension of the hidden state in each timestep of the LSTM|256|
|`lr`|Learning Rate|0.001|
|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|

In [10]:
batch_size = 1024
epochs = 8
embed_dim = 256
hidden_dim = 256
lr = 0.001
grad_clip = 1

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called  `dataset`.
- The, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.

In [11]:
import torch
class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        # Extract the input data x and the ground truth y from the data
        sequence = self.sequences.iloc[index]

        x = sequence['char_id_list']
        y = sequence['label_id_list']

        return x, y

# collate function, used to build dataloader
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch]
    batch_y = [torch.tensor(data[1]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])

    # Pad the input sequence
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])

    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [12]:
ds_train = Dataset(df_train[['char_id_list', 'label_id_list']])
ds_eval = Dataset(df_eval[['char_id_list', 'label_id_list']])

In [13]:
from torch.utils.data import DataLoader
# Build dataloader of train set and eval set, collate_fn is the collate function
dl_train = DataLoader(dataset=ds_train, batch_size= batch_size, shuffle= True, collate_fn = collate_fn)
dl_eval = DataLoader(dataset=ds_eval, batch_size= batch_size, shuffle= False, collate_fn=collate_fn)

In [14]:
first_batch = next(iter(dl_train))
batch_x, batch_y, batch_x_lens, batch_y_lens = first_batch

# 印出資料
print("Inputs (batch_x):", batch_x[1])
print("Labels (batch_y):", batch_y[1])
print("Input lengths (batch_x_lens):", batch_x_lens[1])
print("Label lengths (batch_y_lens):", batch_y_lens[1])

Inputs (batch_x): tensor([ 2,  3, 12,  2,  3, 12,  4,  5, 17,  9,  1, 11, 10, 10, 10, 10, 10])
Labels (batch_y): tensor([10, 10, 10, 10, 10, 10, 10, 10,  9,  1, 11, 10, 10, 10, 10, 10, 10])
Input lengths (batch_x_lens): tensor(12)
Label lengths (batch_y_lens): tensor(12)


# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an LSTM sequentially.
3. The output of the LSTM is passed into another LSTM, and additional layers can be added.
4. The output from all time steps of the final LSTM is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [15]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])

        self.rnn_layer1 = torch.nn.GRU(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.rnn_layer2 = torch.nn.GRU(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)

        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))

    def forward(self, batch_x, batch_x_lens):
        return self.encoder(batch_x, batch_x_lens)

    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        batch_x = self.embedding(batch_x)

        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,batch_x_lens,batch_first=True,enforce_sorted=False)

        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)

        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,batch_first=True)

        batch_x = self.linear(batch_x)

        return batch_x

    def generator(self, start_char, max_len=200):
        # Initialize char_list with the token ids for the start_char sequence
        char_list = [char_to_id[c] for c in start_char]
        next_char = None
        # We will iterate to predict the next characters
        while len(char_list) < max_len:
            # Write your code here
            # Pack the char_list to tensor
            # Input the tensor to the embedding layer, LSTM layers, linear respectively

            current_seq = torch.tensor(char_list).unsqueeze(0).to(next(self.parameters()).device)
            current_seq_emb = self.embedding(current_seq)
            output, _ = self.rnn_layer1(current_seq_emb)
            output, _ = self.rnn_layer2(output)
            logits = self.linear(output)
            y = logits[:, -1, :]
            next_char = torch.argmax(y, dim=-1).item() # Obtain the next token prediction y
            print(next_char)
            if next_char == char_to_id['<eos>']:
                break
            char_list.append(next_char)
        return [id_to_char[ch_id] for ch_id in char_list]

In [16]:
torch.manual_seed(2)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = CharRNN(vocab_size,
                embed_dim,
                hidden_dim)

In [17]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to limit the gradient values between `-grad_clip` &lt; and &lt; `grad_clip`.
        5. Use `optimizer.step()` to update the model (backpropagation).
2.  After every `1000` batches, output the current loss to monitor whether it is converging.

In [18]:
from tqdm import tqdm
from copy import deepcopy
model = model.to(device)
i = 0
for epoch in range(1, epochs+1):
    model.train()
    # The process bar
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        optimizer.zero_grad()
        # Write your code here
        # Clear the gradient

        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        loss = criterion(batch_pred_y.view(-1, vocab_size), batch_y.to(device).view(-1))
        loss.backward()

        # Write your code here
        # Input the prediction and ground truths to loss function
        # Back propagation

        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()

        # Write your code here
        # Optimize parameters in the model
        i+=1
        if i%50==0:
            bar.set_postfix(loss = loss.item())

    # Evaluate your model
    model.eval()
    bar = tqdm(dl_eval, desc=f"Validation epoch {epoch}")
    matched = 0
    total = 0
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        batch_y = batch_y.to(device)
        predictions = model(batch_x.to(device), batch_x_lens)# Write your code here. Input the batch_x to the model and generate the predictions
        pred_tokens = predictions.argmax(dim=-1)
        for i in range(pred_tokens.size(0)):
            mask = (batch_y[i] != char_to_id['<pad>']) & (batch_y[i] != char_to_id['<eos>'])
            filtered_preds = pred_tokens[i][mask]
            filtered_labels = batch_y[i][mask]
            matched += (filtered_preds == filtered_labels).sum().item()
            total += filtered_labels.numel()  # Only count valid tokens for total

        # for i in range(pred_tokens.size(0)):
        #   print(pred_tokens[0],"ccccc\n",batch_y[0])
        #   if torch.equal(pred_tokens[i], batch_y[i]):
        #     matched += 1
        #   total += 1


        # for i in range(pred_tokens.size(0)):
        #     for j in range(len(pred_tokens[i])):
        #         if pred_tokens[i][j] != char_to_id['<eos>'] or pred_tokens[i][j] != char_to_id['<pad>']:
        #             total += 1
        #             if pred_tokens[i][j] == batch_y[i][j]:
        #                 matched += 1
    print(f"Exact Match: {matched / total}")
        # Write your code here.
        # Check whether the prediction match the ground truths
        # Compute exact match (EM) on the eval dataset
        # EM = correct/total

Train epoch 1: 100%|██████████| 2314/2314 [03:19<00:00, 11.57it/s, loss=0.532]
Validation epoch 1: 100%|██████████| 258/258 [02:26<00:00,  1.76it/s]


Exact Match: 0.722707131620346


Train epoch 2: 100%|██████████| 2314/2314 [03:19<00:00, 11.62it/s, loss=0.323]
Validation epoch 2: 100%|██████████| 258/258 [02:51<00:00,  1.51it/s]


Exact Match: 0.7782223387519664


Train epoch 3: 100%|██████████| 2314/2314 [03:27<00:00, 11.14it/s, loss=0.32] 
Validation epoch 3: 100%|██████████| 258/258 [02:50<00:00,  1.52it/s]


Exact Match: 0.8132275825904562


Train epoch 4: 100%|██████████| 2314/2314 [03:25<00:00, 11.27it/s, loss=0.332]
Validation epoch 4: 100%|██████████| 258/258 [02:38<00:00,  1.62it/s]


Exact Match: 0.7976127425275301


Train epoch 5: 100%|██████████| 2314/2314 [03:29<00:00, 11.02it/s, loss=0.247]
Validation epoch 5: 100%|██████████| 258/258 [02:56<00:00,  1.46it/s]


Exact Match: 0.8594480859989513


Train epoch 6: 100%|██████████| 2314/2314 [03:34<00:00, 10.80it/s, loss=0.218]
Validation epoch 6: 100%|██████████| 258/258 [02:55<00:00,  1.47it/s]


Exact Match: 0.8789604090194022


Train epoch 7: 100%|██████████| 2314/2314 [03:23<00:00, 11.38it/s, loss=0.164]
Validation epoch 7: 100%|██████████| 258/258 [02:49<00:00,  1.53it/s]


Exact Match: 0.8797810697430519


Train epoch 8: 100%|██████████| 2314/2314 [03:33<00:00, 10.84it/s, loss=0.157]
Validation epoch 8: 100%|██████████| 258/258 [02:53<00:00,  1.49it/s]

Exact Match: 0.9170896696381752





In [19]:
torch.save(model, 'LSTM.pt')

# Generation
Use `model.generator` and provide an initial character to automatically generate a sequence.

In [20]:
model = model.to("cpu")
print("".join(model.generator('22+12=')))

3
4
11
22+12=34
