In [320]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import torch.utils.data
import torch.nn
import numpy as np
import torch.nn.functional as F

In [321]:
data = pd.read_csv('arithmetic.csv')

In [322]:
max(data['tgt'])

117649

In [323]:
data['tgt'] = data['tgt'].apply(str)
data = data[data['src'].apply(lambda x:'*' not in x)]
data = data[data['src'].apply(lambda x:'/' not in x)]
#data = data[data['src'].apply(lambda x:'(' not in x)]

In [324]:
data['tgt'] = data['tgt'].apply(lambda x : x + '_')
data['en_src'] = data['src'].apply(lambda x : ':' + x)

In [325]:
def rshift(row):
    return ''.join(['.' for _ in range(len(row['src']))]) + row['tgt']
data['tgt'] = data.apply(rshift, axis=1)
data['src'] = data['src'].apply(lambda x : x + '_')
data

Unnamed: 0,src,tgt,en_src
0,0+0=_,....0_,:0+0=
1,0-0=_,....0_,:0-0=
15,0+0+0=_,......0_,:0+0+0=
16,0-0-0=_,......0_,:0-0-0=
18,0+0-0=_,......0_,:0+0-0=
...,...,...,...
2632495,(49+49)-49=_,...........49_,:(49+49)-49=
2632496,49+(49-49)=_,...........49_,:49+(49-49)=
2632497,49-49+49=_,.........49_,:49-49+49=
2632498,(49-49)+49=_,...........49_,:(49-49)+49=


In [326]:
data['tgt'] = data['tgt'].str.ljust(width=20 , fillchar='.')
data['en_src'] = data['en_src'].str.ljust(width=20 , fillchar='.')
data['src'] = data['src'].str.ljust(width=20 , fillchar = '.')

In [327]:
data

Unnamed: 0,src,tgt,en_src
0,0+0=_...............,....0_..............,:0+0=...............
1,0-0=_...............,....0_..............,:0-0=...............
15,0+0+0=_.............,......0_............,:0+0+0=.............
16,0-0-0=_.............,......0_............,:0-0-0=.............
18,0+0-0=_.............,......0_............,:0+0-0=.............
...,...,...,...
2632495,(49+49)-49=_........,...........49_......,:(49+49)-49=........
2632496,49+(49-49)=_........,...........49_......,:49+(49-49)=........
2632497,49-49+49=_..........,.........49_........,:49-49+49=..........
2632498,(49-49)+49=_........,...........49_......,:(49-49)+49=........


In [328]:
class Tokenizer():
    def __init__(self,tokens , pad , eos , sos):
        self.tokens = [pad , eos , sos] + list(tokens)
        self.stoi = {ch:i for i,ch in enumerate(self.tokens)}
        self.itos = {i:ch for i,ch in enumerate(self.tokens)}
    def encoder(self , string):
        return [self.stoi[s] for s in string]
    def decoder(self, idx):
        return ''.join([self.itos[i] for i in idx])
    def token_len(self):
        return len(self.tokens)

In [329]:
tokenizer = Tokenizer('0123456789-+=()' , pad='.' , eos='_' , sos=':')
tokenizer.decoder(tokenizer.encoder(':0123456'))

':0123456'

In [330]:
data['src'] = data['src'].apply(tokenizer.encoder)
data['tgt'] = data['tgt'].apply(tokenizer.encoder)
data['en_src'] = data['en_src'].apply(tokenizer.encoder)
train , test = train_test_split(data , train_size=0.8 , random_state=444)

In [331]:
batch_size = 128
epochs = 5
embed_dim = 256
hidden_dim = 256
lr = 0.001
grad_clip = 1
input_dim = tokenizer.token_len()

In [332]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self , data):
        self.data = data

    def __getitem__(self, index):
        # Get input and output from the dataframe
        input_data = self.data.iloc[index,0]
        en_input_data = self.data.iloc[index,2]
        output_data = self.data.iloc[index,1]
        # Convert input and output to PyTorch tensors
        input_tensor = torch.tensor(input_data)
        output_tensor = torch.tensor(output_data)
        en_input_tensor = torch.tensor(en_input_data)

        return input_tensor, output_tensor , en_input_tensor
    def __len__(self):
        return len(self.data)
    
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
    batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
    batch_en = [torch.tensor(data[2]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])
    batch_en_lens = torch.LongTensor([len(x) for x in batch_en])
    
    # torch.tensor
    # [[1968, 1891, 3580, ... , 0, 0, 0],
    #  [1014, 2242, 2247, ... , 0, 0, 0],
    #  [3032,  522, 1485, ... , 0, 0, 0]]
    #                       padding↑
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=tokenizer.stoi['.'])
    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=tokenizer.stoi['.'])
    pad_batch_en = torch.nn.utils.rnn.pad_sequence(batch_en,
                                                   batch_first=True,
                                                   padding_value=tokenizer.stoi['.']
                                                )

    return pad_batch_x, pad_batch_y,pad_batch_en, batch_x_lens, batch_y_lens , batch_en_lens

In [333]:
train_dataset = Dataset(train)
test_dataset = Dataset(test)

In [334]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        collate_fn=collate_fn)
test_data_loader = torch.utils.data.DataLoader(test_dataset,
                                        batch_size=batch_size,
                                        shuffle= False,
                                        collate_fn= collate_fn)

In [335]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()
        
        # Embedding層
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=tokenizer.stoi['.'])
        
        # RNN層
        self.rnn_layer1 = torch.nn.RNN(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        self.rnn_layer2 = torch.nn.RNN(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        # output層
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))
        self.embedding2 = torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim=embed_dim , padding_idx=tokenizer.stoi['.'])
        self.gru = torch.nn.GRU(input_size=embed_dim,hidden_size=embed_dim,batch_first=True)

    def forward(self, batch_x , target):
        hidden = self.encoder(batch_x)
        output , _ = self.decoder(prev_hidden=hidden , target=target)
        output = self.linear(output)
        return output
    
    def encoder(self, batch_x):
        batch_x = self.embedding(batch_x)
        batch_x , ht = self.rnn_layer1(batch_x)
        batch_x , ht = self.rnn_layer2(batch_x)
        return ht
    def decoder(self , prev_hidden , target):
        decoder_hidden = prev_hidden
        decoder_outputs ,_ = self.forward_step(target , decoder_hidden)
        #decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden
    
    def forward_step(self,input , hidden):
        output = self.embedding2(input)
        output , hidden = self.gru(output , hidden)
        return output , hidden

    

In [336]:
torch.manual_seed(2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = CharRNN(tokenizer.token_len(),
                embed_dim,
                hidden_dim)


In [337]:
criterion = torch.nn.CrossEntropyLoss( reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [338]:
from tqdm import tqdm
model = model.to(device)
model.train()
i = 0
for epoch in range(1, epochs+1):
    process_bar = tqdm(train_data_loader, desc=f"Training epoch {epoch}")
    for batch_x, batch_y,batch_en, batch_x_lens, batch_y_lens,batch_en_lens in process_bar:
        
        # 標準DL訓練幾板斧
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device) , batch_en.to(device))
        batch_y = batch_y.to(device)
        #print(batch_pred_y.shape , batch_y.shape , batch_en.shape)
        ##print(batch_x.shape , batch_y.shape , batch_pred_y.shape)
        batch_pred_y = batch_pred_y.view(-1 , *batch_pred_y.shape[2:])
        batch_y = batch_y.view(-1)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        optimizer.step()

        i+=1
        if i%10==0:
            process_bar.set_postfix(loss=loss.item())

    # 麻煩各位同學加上 validation 的部分
    # validation_process_bar = tqdm(...)
    # for ... in validation_process_bar:
    #     pred = model...
    model.eval()
    validation_process_bar = tqdm(test_data_loader , desc=f"Testing epoch {epoch}")
    for batch_x , batch_y,batch_en , batch_x_lens , batch_y_lens,batch_en_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device), batch_en.to(device))
        batch_pred_y = batch_pred_y.view(-1 , *batch_pred_y.shape[2:])
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        i+=1
        if i%10==0:
            validation_process_bar.set_postfix(loss=loss.item())
    model.train()

  batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
  batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
  batch_en = [torch.tensor(data[2]) for data in batch]


Training epoch 1: 100%|██████████| 6282/6282 [02:10<00:00, 48.06it/s, loss=0.107]
Testing epoch 1: 100%|██████████| 1571/1571 [00:31<00:00, 49.24it/s, loss=0.113] 
Training epoch 2: 100%|██████████| 6282/6282 [02:10<00:00, 48.04it/s, loss=0.0815]
Testing epoch 2: 100%|██████████| 1571/1571 [00:32<00:00, 48.69it/s, loss=0.0747]
Training epoch 3: 100%|██████████| 6282/6282 [02:10<00:00, 48.07it/s, loss=0.0604]
Testing epoch 3: 100%|██████████| 1571/1571 [00:32<00:00, 48.77it/s, loss=0.0495]
Training epoch 4: 100%|██████████| 6282/6282 [02:10<00:00, 48.06it/s, loss=0.0632]
Testing epoch 4: 100%|██████████| 1571/1571 [00:31<00:00, 49.33it/s, loss=0.0449]
Training epoch 5: 100%|██████████| 6282/6282 [02:10<00:00, 48.26it/s, loss=0.035] 
Testing epoch 5: 100%|██████████| 1571/1571 [00:32<00:00, 48.85it/s, loss=0.0326]


In [342]:
model.eval()
model.cpu()
count = 0.0
for i in range(1000):
    input = torch.tensor(test.iloc[i,0])
    input_en = torch.tensor(test.iloc[i,2])
    input_text = tokenizer.decoder(test.iloc[i,0])
    experssion = ''.join(filter(lambda c: c != '.' and c !='_' , input_text))
    out = model(input , input_en)
    out = out.topk(1)
    out_exp = tokenizer.decoder([i.item() for i in out.indices])
    ans = tokenizer.decoder(test.iloc[i,1])
    result = out_exp == ans
    if result == True :
        count = count + 1
    print(experssion , out_exp , ans , result)
print(count/1000)

38+39-7= ........70_......... ........70_......... True
19-41+49= .........27_........ .........27_........ True
44+30-26= .........48_........ .........48_........ True
48-30-15= .........4_......... .........3_......... False
0-(5+12)= .........-18_....... .........-17_....... False
22+(22-22)= ...........22_...... ...........22_...... True
(44-16)+42= ...........79_...... ...........70_...... False
(46+48)-9= ..........87_....... ..........85_....... False
1+29+13= ........43_......... ........43_......... True
39-(20+16)= ...........3_....... ...........3_....... True
(49-8)+20= ..........69_....... ..........61_....... False
(3-42)+10= ..........-39_...... ..........-29_...... False
27+11+40= .........79_........ .........78_........ False
26-(6+49)= ..........-39_...... ..........-29_...... False
37-29+26= .........35_........ .........34_........ False
42-(8+31)= ..........2_........ ..........3_........ False
(21-9)+35= ..........47_....... ..........47_....... True
(8+9)-11= .