In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import torch.utils.data
import torch.nn
import numpy as np
import torch.nn.functional as F

In [38]:
data = pd.read_csv('arithmetic.csv')

In [39]:
data['tgt'] = data['tgt'].apply(str)
data = data[data['src'].apply(lambda x:'*' not in x)]
data = data[data['src'].apply(lambda x:'/' not in x)]
#data = data[data['src'].apply(lambda x:'(' not in x)]

In [40]:
def rshift(row):
    return ''.join(['.' for _ in range(len(row['src']))]) + row['tgt']
data['tgt'] = data.apply(rshift, axis=1)
data['tgt_eos'] = data['tgt'].apply(len)
# eos recorded ###############################################################################################################################
data['tgt'] = data['tgt'].apply(lambda x : x + '_')
data['src'] = data['src'].apply(lambda x : x + '_')
data

Unnamed: 0,src,tgt,tgt_eos
0,0+0=_,....0_,5
1,0-0=_,....0_,5
15,0+0+0=_,......0_,7
16,0-0-0=_,......0_,7
18,0+0-0=_,......0_,7
...,...,...,...
2632495,(49+49)-49=_,...........49_,13
2632496,49+(49-49)=_,...........49_,13
2632497,49-49+49=_,.........49_,11
2632498,(49-49)+49=_,...........49_,13


In [41]:
data['tgt'] = data['tgt'].str.ljust(width=20 , fillchar='.')
data['src'] = data['src'].str.ljust(width=20 , fillchar = '.')

In [42]:
data

Unnamed: 0,src,tgt,tgt_eos
0,0+0=_...............,....0_..............,5
1,0-0=_...............,....0_..............,5
15,0+0+0=_.............,......0_............,7
16,0-0-0=_.............,......0_............,7
18,0+0-0=_.............,......0_............,7
...,...,...,...
2632495,(49+49)-49=_........,...........49_......,13
2632496,49+(49-49)=_........,...........49_......,13
2632497,49-49+49=_..........,.........49_........,11
2632498,(49-49)+49=_........,...........49_......,13


In [43]:
class Tokenizer():
    def __init__(self,tokens , pad , eos , sos):
        self.tokens = [pad , eos , sos] + list(tokens)
        self.stoi = {ch:i for i,ch in enumerate(self.tokens)}
        self.itos = {i:ch for i,ch in enumerate(self.tokens)}
    def encoder(self , string):
        return [self.stoi[s] for s in string]
    def decoder(self, idx):
        return ''.join([self.itos[i] for i in idx])
    def token_len(self):
        return len(self.tokens)

In [44]:
tokenizer = Tokenizer('0123456789-+=()' , pad='.' , eos='_' , sos=':')
tokenizer.decoder(tokenizer.encoder(':0123456'))

':0123456'

In [45]:
data['src'] = data['src'].apply(tokenizer.encoder)
data['tgt'] = data['tgt'].apply(tokenizer.encoder)
train , test = train_test_split(data , train_size=0.8 , random_state=444)

In [46]:
batch_size = 128
epochs = 5
embed_dim = 256
hidden_dim = 256
lr = 0.001
grad_clip = 1
input_dim = tokenizer.token_len()

In [47]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self , data):
        self.data = data

    def __getitem__(self, index):
        # Get input and output from the dataframe
        input_data = self.data.iloc[index,0]
        output_data = self.data.iloc[index,1]
        eos_idx = self.data.iloc[index , 2]
        # Convert input and output to PyTorch tensors
        input_tensor = torch.tensor(input_data)
        output_tensor = torch.tensor(output_data)

        return input_tensor, output_tensor , eos_idx
    def __len__(self):
        return len(self.data)
    
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
    batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
    batch_eos = [data[2] for data in batch]
    # eos returned ###########################################################################################################################
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])
    
    # torch.tensor
    # [[1968, 1891, 3580, ... , 0, 0, 0],
    #  [1014, 2242, 2247, ... , 0, 0, 0],
    #  [3032,  522, 1485, ... , 0, 0, 0]]
    #                       padding↑
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=tokenizer.stoi['.'])
    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=tokenizer.stoi['.'])

    return pad_batch_x, pad_batch_y,batch_x_lens, batch_y_lens , batch_eos # eos returned ##########################################################

In [48]:
train_dataset = Dataset(train)
test_dataset = Dataset(test)

In [49]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        collate_fn=collate_fn)
test_data_loader = torch.utils.data.DataLoader(test_dataset,
                                        batch_size=batch_size,
                                        shuffle= False,
                                        collate_fn= collate_fn)

In [50]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()
        
        # Embedding層
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=tokenizer.stoi['.'])
        
        # RNN層
        self.rnn_layer1 = torch.nn.RNN(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        self.rnn_layer2 = torch.nn.RNN(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        # output層
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))
    def forward(self, batch_x):
        batch_x = self.encoder(batch_x)
        batch_x = self.linear(batch_x)
        return batch_x
    
    def encoder(self, batch_x):
        batch_x = self.embedding(batch_x)
        batch_x , _ = self.rnn_layer1(batch_x)
        batch_x , _ = self.rnn_layer2(batch_x)
        return batch_x
    

In [51]:
torch.manual_seed(2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = CharRNN(tokenizer.token_len(),
                embed_dim,
                hidden_dim)

In [52]:
criterion = torch.nn.CrossEntropyLoss( reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [53]:
from tqdm import tqdm
model = model.to(device)
model.train()
i = 0
for epoch in range(1, epochs+1):
    process_bar = tqdm(train_data_loader, desc=f"Training epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens , batch_eos in process_bar:
        
        # 標準DL訓練幾板斧
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device))
        for idx in batch_eos:
            #print(idx)
            batch_y = batch_y[: , :idx]
            batch_pred_y = batch_pred_y[: , :idx , :]
        #modify length of output and target based of index of eos###########################################################
        batch_y = batch_y.to(device)
        #print(batch_x.shape , batch_y.shape , batch_pred_y.shape)
        ##print(batch_pred_y.shape , batch_y.shape)
        batch_pred_y = batch_pred_y.reshape(-1 , *batch_pred_y.shape[2:])
        batch_y = batch_y.view(-1)
        
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        optimizer.step()
        
        i+=1
        if i%10==0:
            process_bar.set_postfix(loss=loss.item())

    # 麻煩各位同學加上 validation 的部分
    # validation_process_bar = tqdm(...)
    # for ... in validation_process_bar:
    #     pred = model...
    model.eval()
    validation_process_bar = tqdm(test_data_loader , desc=f"Testing epoch {epoch}")
    for batch_x , batch_y , batch_x_lens , batch_y_lens , batch_eos in validation_process_bar:
        batch_pred_y = model(batch_x.to(device))
        for idx in batch_eos:
            #print(idx)
            batch_y = batch_y[: , :idx]
            batch_pred_y = batch_pred_y[: , :idx , :]
        batch_y = batch_y.to(device)
        #print(batch_x.shape , batch_y.shape , batch_pred_y.shape)
        ##print(batch_pred_y.shape , batch_y.shape)
        batch_pred_y = batch_pred_y.reshape(-1 , *batch_pred_y.shape[2:])
        batch_y = batch_y.view(-1)
        
        loss = criterion(batch_pred_y, batch_y)
        i+=1
        if i%10==0:
            validation_process_bar.set_postfix(loss=loss.item())
    model.train()

  batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
  batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
Training epoch 1: 100%|██████████| 6282/6282 [03:25<00:00, 30.52it/s, loss=0.00505]
Testing epoch 1: 100%|██████████| 1571/1571 [00:33<00:00, 46.43it/s, loss=0.00617]
Training epoch 2: 100%|██████████| 6282/6282 [04:21<00:00, 24.02it/s, loss=0.00409]
Testing epoch 2: 100%|██████████| 1571/1571 [00:30<00:00, 50.92it/s, loss=0.0205] 
Training epoch 3: 100%|██████████| 6282/6282 [04:27<00:00, 23.48it/s, loss=0.00402]
Testing epoch 3: 100%|██████████| 1571/1571 [00:30<00:00, 51.09it/s, loss=0.00159]
Training epoch 4: 100%|██████████| 6282/6282 [04:18<00:00, 24.27it/s, loss=0.0201] 
Testing epoch 4: 100%|██████████| 1571/1571 [00:35<00:00, 44.64it/s, loss=0.00263]
Training epoch 5: 100%|██████████| 6282/6282 [04:24<00:00, 23.78it/s, loss=0.0255] 
Testing epoch 5: 100%|██████████| 1571/1571 [00:31<00:00, 49.75it/s, loss=0.00261]


In [54]:
model.eval()
model.cpu()
count = 0.0
for i in range(1000):
    input = torch.tensor(test.iloc[i,0])
    input_en = torch.tensor(test.iloc[i,1])
    input_text = tokenizer.decoder(test.iloc[i,0])
    
    experssion = ''.join(filter(lambda c: c != '.' and c !='_' , input_text))
    out = model(input)
    out = out.topk(1)
    out_exp = tokenizer.decoder([i.item() for i in out.indices])
    ans = tokenizer.decoder(test.iloc[i,1])
    eos_idx = ans.find('_')
    ans = ans[:eos_idx]
    out_exp = out_exp[:eos_idx]
    result = out_exp == ans
    if result == True :
        count = count + 1
    print(experssion , out_exp , ans , result)
print(count/1000)

38+39-7= ........78 ........70 False
19-41+49= .........26 .........27 False
44+30-26= .........36 .........48 False
48-30-15= .........1 .........3 False
0-(5+12)= .........-10 .........-17 False
22+(22-22)= ...........60 ...........22 False
(44-16)+42= ...........22 ...........70 False
(46+48)-9= ..........16 ..........85 False
1+29+13= ........46 ........43 False
39-(20+16)= ...........1 ...........3 False
(49-8)+20= ..........-1 ..........61 False
(3-42)+10= ..........-41 ..........-29 False
27+11+40= .........88 .........78 False
26-(6+49)= ..........627 ..........-29 False
37-29+26= .........36 .........34 False
42-(8+31)= ..........5 ..........3 False
(21-9)+35= ..........-2 ..........47 False
(8+9)-11= .........3 .........6 False
6-41-2= .......-40 .......-37 False
31-30+16= .........18 .........17 False
6-31+2= .......-25 .......-23 False
(40+25)-6= ..........-1 ..........59 False
34-10-34= .........-10 .........-10 True
46-(45+33)= ...........46. ...........-32 False
31+(7-2)