In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import torch.utils.data
import torch.nn
import numpy as np
import torch.nn.functional as F

In [8]:
data = pd.read_csv('arithmetic.csv')

In [9]:
data['tgt'] = data['tgt'].apply(str)
data = data[data['src'].apply(lambda x:'*' not in x)]
data = data[data['src'].apply(lambda x:'/' not in x)]
#data = data[data['src'].apply(lambda x:'(' not in x)]

In [10]:
data['tgt'] = data['tgt'].apply(lambda x : x + '_')

In [11]:
def rshift(row):
    return ''.join(['.' for _ in range(len(row['src']))]) + row['tgt']
data['tgt'] = data.apply(rshift, axis=1)
data['src'] = data['src'].apply(lambda x : x + '_')
data

Unnamed: 0,src,tgt
0,0+0=_,....0_
1,0-0=_,....0_
15,0+0+0=_,......0_
16,0-0-0=_,......0_
18,0+0-0=_,......0_
...,...,...
2632495,(49+49)-49=_,...........49_
2632496,49+(49-49)=_,...........49_
2632497,49-49+49=_,.........49_
2632498,(49-49)+49=_,...........49_


In [12]:
data['tgt'] = data['tgt'].str.ljust(width=20 , fillchar='.')
data['src'] = data['src'].str.ljust(width=20 , fillchar = '.')

In [13]:
data

Unnamed: 0,src,tgt
0,0+0=_...............,....0_..............
1,0-0=_...............,....0_..............
15,0+0+0=_.............,......0_............
16,0-0-0=_.............,......0_............
18,0+0-0=_.............,......0_............
...,...,...
2632495,(49+49)-49=_........,...........49_......
2632496,49+(49-49)=_........,...........49_......
2632497,49-49+49=_..........,.........49_........
2632498,(49-49)+49=_........,...........49_......


In [14]:
class Tokenizer():
    def __init__(self,tokens , pad , eos , sos):
        self.tokens = [pad , eos , sos] + list(tokens)
        self.stoi = {ch:i for i,ch in enumerate(self.tokens)}
        self.itos = {i:ch for i,ch in enumerate(self.tokens)}
    def encoder(self , string):
        return [self.stoi[s] for s in string]
    def decoder(self, idx):
        return ''.join([self.itos[i] for i in idx])
    def token_len(self):
        return len(self.tokens)

In [15]:
tokenizer = Tokenizer('0123456789-+=()' , pad='.' , eos='_' , sos=':')
tokenizer.decoder(tokenizer.encoder(':0123456'))

':0123456'

In [16]:
data['src'] = data['src'].apply(tokenizer.encoder)
data['tgt'] = data['tgt'].apply(tokenizer.encoder)
train , test = train_test_split(data , train_size=0.8 , random_state=444)

In [17]:
batch_size = 128
epochs = 5
embed_dim = 256
hidden_dim = 256
lr = 0.001
grad_clip = 1
input_dim = tokenizer.token_len()

In [18]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self , data):
        self.data = data

    def __getitem__(self, index):
        # Get input and output from the dataframe
        input_data = self.data.iloc[index,0]
        output_data = self.data.iloc[index,1]
        # Convert input and output to PyTorch tensors
        input_tensor = torch.tensor(input_data)
        output_tensor = torch.tensor(output_data)

        return input_tensor, output_tensor
    def __len__(self):
        return len(self.data)
    
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
    batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])
    
    # torch.tensor
    # [[1968, 1891, 3580, ... , 0, 0, 0],
    #  [1014, 2242, 2247, ... , 0, 0, 0],
    #  [3032,  522, 1485, ... , 0, 0, 0]]
    #                       padding↑
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=tokenizer.stoi['.'])
    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True, # shape=(batch_size, seq_len)
                                                  padding_value=tokenizer.stoi['.'])

    return pad_batch_x, pad_batch_y,batch_x_lens, batch_y_lens

In [19]:
train_dataset = Dataset(train)
test_dataset = Dataset(test)

In [20]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        collate_fn=collate_fn)
test_data_loader = torch.utils.data.DataLoader(test_dataset,
                                        batch_size=batch_size,
                                        shuffle= False,
                                        collate_fn= collate_fn)

In [21]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()
        
        # Embedding層
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=tokenizer.stoi['.'])
        
        # RNN層
        self.rnn_layer1 = torch.nn.RNN(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        self.rnn_layer2 = torch.nn.RNN(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        # output層
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))
    def forward(self, batch_x):
        batch_x = self.encoder(batch_x)
        batch_x = self.linear(batch_x)
        return batch_x
    
    def encoder(self, batch_x):
        batch_x = self.embedding(batch_x)
        batch_x , _ = self.rnn_layer1(batch_x)
        batch_x , _ = self.rnn_layer2(batch_x)
        return batch_x
    

In [22]:
torch.manual_seed(2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = CharRNN(tokenizer.token_len(),
                embed_dim,
                hidden_dim)

In [23]:
criterion = torch.nn.CrossEntropyLoss( reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
from tqdm import tqdm
model = model.to(device)
model.train()
i = 0
for epoch in range(1, epochs+1):
    process_bar = tqdm(train_data_loader, desc=f"Training epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in process_bar:
        
        # 標準DL訓練幾板斧
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device))
        batch_y = batch_y.to(device)
        ##print(batch_pred_y.shape , batch_y.shape)
        ##print(batch_x.shape , batch_y.shape , batch_pred_y.shape)
        batch_pred_y = batch_pred_y.view(-1 , *batch_pred_y.shape[2:])
        batch_y = batch_y.view(-1)
        loss = criterion(batch_pred_y, batch_y)
        loss.backward()
        optimizer.step()
        
        i+=1
        if i%10==0:
            process_bar.set_postfix(loss=loss.item())

    # 麻煩各位同學加上 validation 的部分
    # validation_process_bar = tqdm(...)
    # for ... in validation_process_bar:
    #     pred = model...
    model.eval()
    validation_process_bar = tqdm(test_data_loader , desc=f"Testing epoch {epoch}")
    for batch_x , batch_y , batch_x_lens , batch_y_lens in validation_process_bar:
        batch_pred_y = model(batch_x.to(device))
        batch_pred_y = batch_pred_y.view(-1 , *batch_pred_y.shape[2:])
        batch_y = batch_y.view(-1).to(device)
        loss = criterion(batch_pred_y, batch_y)
        i+=1
        if i%10==0:
            validation_process_bar.set_postfix(loss=loss.item())
    model.train()

  batch_x = [torch.tensor(data[0]) for data in batch] # list[torch.tensor]
  batch_y = [torch.tensor(data[1]) for data in batch] # list[torch.tensor]
Training epoch 1: 100%|██████████| 6282/6282 [01:28<00:00, 70.62it/s, loss=0.127]
Testing epoch 1: 100%|██████████| 1571/1571 [00:22<00:00, 68.42it/s, loss=0.139]
Training epoch 2: 100%|██████████| 6282/6282 [01:28<00:00, 70.75it/s, loss=0.104] 
Testing epoch 2: 100%|██████████| 1571/1571 [00:22<00:00, 70.63it/s, loss=0.103] 
Training epoch 3: 100%|██████████| 6282/6282 [01:28<00:00, 71.10it/s, loss=0.0939]
Testing epoch 3: 100%|██████████| 1571/1571 [00:22<00:00, 70.55it/s, loss=0.0936]
Training epoch 4: 100%|██████████| 6282/6282 [01:27<00:00, 71.56it/s, loss=0.0838]
Testing epoch 4: 100%|██████████| 1571/1571 [00:21<00:00, 71.60it/s, loss=0.0743]
Training epoch 5: 100%|██████████| 6282/6282 [01:28<00:00, 71.06it/s, loss=0.0765]
Testing epoch 5: 100%|██████████| 1571/1571 [00:22<00:00, 71.05it/s, loss=0.0798]


In [25]:
model.eval()
model.cpu()
count = 0.0
for i in range(1000):
    input = torch.tensor(test.iloc[i,0])
    input_en = torch.tensor(test.iloc[i,1])
    input_text = tokenizer.decoder(test.iloc[i,0])
    experssion = ''.join(filter(lambda c: c != '.' and c !='_' , input_text))
    out = model(input)
    out = out.topk(1)
    out_exp = tokenizer.decoder([i.item() for i in out.indices])
    ans = tokenizer.decoder(test.iloc[i,1])
    result = out_exp == ans
    if result == True :
        count = count + 1
    print(experssion , out_exp , ans , result)
print(count/1000)

38+39-7= ........68_......... ........70_......... False
19-41+49= .........28_........ .........27_........ False
44+30-26= .........48_........ .........48_........ True
48-30-15= .........4_......... .........3_......... False
0-(5+12)= .........-17_....... .........-17_....... True
22+(22-22)= ...........22_...... ...........22_...... True
(44-16)+42= ...........68_...... ...........70_...... False
(46+48)-9= ..........85_....... ..........85_....... True
1+29+13= ........42_......... ........43_......... False
39-(20+16)= ...........3_....... ...........3_....... True
(49-8)+20= ..........60_....... ..........61_....... False
(3-42)+10= ..........-28_...... ..........-29_...... False
27+11+40= .........78_........ .........78_........ True
26-(6+49)= ..........-28_...... ..........-29_...... False
37-29+26= .........35_........ .........34_........ False
42-(8+31)= ..........4_........ ..........3_........ False
(21-9)+35= ..........47_....... ..........47_....... True
(8+9)-11= .