In [3]:
# !pip install transformers -q

[K     |████████████████████████████████| 890kB 2.8MB/s 
[K     |████████████████████████████████| 890kB 15.6MB/s 
[K     |████████████████████████████████| 3.0MB 18.3MB/s 
[K     |████████████████████████████████| 1.1MB 40.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [1]:
# !unzip -q data.zip
# !rm data.zip

In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

from glob import glob
import random
from tqdm.notebook import tqdm
from IPython.display import display

In [2]:
SEED = 2020


def set_seed(seed: int = 2020):
    """Seed every random number generator this notebook touches.

    The same value is passed, in order, to ``random.seed``,
    ``np.random.seed``, ``torch.manual_seed``, ``torch.cuda.manual_seed``
    and ``transformers.set_seed``.

    Args:
        seed: Integer seed forwarded to each of the calls above.
    """
    for seeder in (random.seed,
                   np.random.seed,
                   torch.manual_seed,
                   torch.cuda.manual_seed,
                   transformers.set_seed):
        seeder(seed)

In [3]:
# Collect every CSV shard of number/text pairs. glob() returns paths in an
# arbitrary, filesystem-dependent order, so the list is sorted: otherwise the
# seeded shuffle below starts from a different row order on different machines
# and the later positional train/val/test split is not reproducible.
csv_paths = sorted(glob('data/num_text_pair *.csv'))
if not csv_paths:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError("No files match 'data/num_text_pair *.csv'")

raw_frames = [pd.read_csv(path) for path in csv_paths]

# Stack the shards, shuffle the rows with a fixed seed, renumber the index.
df_raw = (pd.concat(raw_frames).
          sample(frac=1.0, random_state=SEED).
          reset_index(drop=True))

# Work on strings only, and separate the characters of each number with
# spaces (e.g. '859' -> '8 5 9').
df_raw = df_raw.astype(str)
df_raw['number'] = df_raw['number'].apply(' '.join)

df_raw.shape

(12200, 2)

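Optional preprocessing for better training: left-pad every number with zeros to the length of the longest number before separating its digits. Run this *instead of* the plain `' '.join` line in the cell above, not after it: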
```python
max_digit_len = df_raw['number'].apply(lambda x: len(x)).max()
df_raw['number'] = df_raw['number'].apply(lambda x: '0' * (max_digit_len - len(x)) + x)
df_raw['number'] = df_raw['number'].apply(lambda x: ' '.join(x))
```

* The code snippet above converts numbers as follows (shown before the digits are separated by spaces):

    * `859` -> `000859`
    * `3346` -> `003346`
    * `54` -> `000054`

In [4]:
def pre_process(df: pd.DataFrame, num2text: bool) -> pd.DataFrame:
    """Build prompt/target string pairs for one translation direction.

    Args:
        df: Frame with a ``number`` column and a ``text`` column.
        num2text: If truthy, inputs are ``'number to english: <number>'`` and
            targets are the text. Otherwise inputs are
            ``'english to number: <text>'`` and targets are the number.

    Returns:
        A new frame with the columns ``inputs`` and ``targets`` (both
        formatted as strings) that keeps the index of ``df``.
    """
    # Test truthiness rather than ``is True``: an identity check sends
    # np.bool_(True) or 1 down the "english to number" branch by mistake.
    if num2text:
        task, source_col, target_col = 'number to english', 'number', 'text'
    else:
        task, source_col, target_col = 'english to number', 'text', 'number'

    return pd.DataFrame({
        'inputs': df[source_col].apply(lambda x: f'{task}: {x}'),
        'targets': df[target_col].apply(lambda x: f'{x}'),
    })


def concat_df(*args):
    """Stack the given frames and return their rows in a seeded random order.

    Args:
        *args: DataFrames to concatenate with ``pd.concat``.

    Returns:
        The concatenated frame, shuffled with ``random_state=SEED`` and
        given a fresh 0..n-1 index.
    """
    stacked = pd.concat(args)
    shuffled = stacked.sample(frac=1.0, random_state=SEED)
    return shuffled.reset_index(drop=True)


def to_tensor(df: pd.DataFrame,
              tokenizer: "transformers.PreTrainedTokenizerBase") -> pd.DataFrame:
    """Tokenize the ``inputs`` and ``targets`` string columns of ``df``.

    The tokenizer annotation is a string on purpose: it is never evaluated,
    so defining this function does not depend on a particular ``transformers``
    module layout or on a specific (slow or fast) tokenizer class.

    Args:
        df: Frame with string columns ``inputs`` and ``targets``.
        tokenizer: Object exposing ``encode(text, return_tensors='pt')``.

    Returns:
        A new frame with the same index and the columns ``inputs`` and
        ``targets``, where every cell holds whatever ``tokenizer.encode``
        returned for the corresponding string.
    """
    def encode(text):
        return tokenizer.encode(text, return_tensors='pt')

    return pd.DataFrame({
        'inputs': df['inputs'].apply(encode),
        'targets': df['targets'].apply(encode),
    })

In [31]:
def ready_to_feed_df(df, tokenizer):
    """Turn a raw number/text frame into a shuffled frame of encoded pairs.

    Rows at even positions become 'english to number' examples and rows at
    odd positions become 'number to english' examples. The two halves are
    merged and shuffled by ``concat_df`` and then encoded by ``to_tensor``.

    Args:
        df: Raw frame understood by ``pre_process``.
        tokenizer: Tokenizer forwarded to ``to_tensor``.

    Returns:
        The frame produced by ``to_tensor``.
    """
    text_to_number = pre_process(df[0::2], num2text=False)
    number_to_text = pre_process(df[1::2], num2text=True)
    mixed = concat_df(text_to_number, number_to_text)
    return to_tensor(mixed, tokenizer=tokenizer)

In [32]:
class Seq2SeqDataSet(object):
    """Minimal batch iterator over a frame of pre-tokenized examples.

    Every cell of the ``inputs`` and ``targets`` columns is expected to hold
    a tensor of token ids. Iterating yields ``(inputs, targets)`` pairs of
    2-D tensors shaped ``(rows_in_batch, longest_sequence_in_batch)``,
    right-padded with 0 and moved to the GPU when one is available.

    NOTE(review): 0 is presumably the tokenizer's pad id; padded target
    positions are passed to the model as ordinary labels - confirm that this
    is intended.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 batch_size: int = 128,
                 suffle: bool = False,
                 seed=0) -> None:
        """Store the data and batching options, then seed the global RNGs.

        Args:
            df: Frame with ``inputs`` and ``targets`` columns of tensors.
            batch_size: Number of rows per batch.
            suffle: If true, the frame is reshuffled each time an iteration
                pass is exhausted (parameter name kept for compatibility).
            seed: Value forwarded to ``set_seed``.
        """
        self.df = df
        self.bs = batch_size
        self.suffle = suffle
        self.is_gpu_available = torch.cuda.is_available()
        set_seed(seed)

    def __len__(self) -> int:
        """Return the number of batches, i.e. ceil(rows / batch_size)."""
        # Ceiling division; avoids materialising a strided copy of the frame.
        return -(-len(self.df) // self.bs)

    def __iter__(self):
        """Restart from the first batch and return self as the iterator."""
        self._idx = 0
        return self

    def __next__(self):
        """Return the next padded ``(inputs, targets)`` batch.

        Raises:
            StopIteration: When all batches were served. If ``suffle`` is
                set, the rows are reshuffled first so the next pass sees a
                new order.
        """
        if self._idx >= len(self):
            if self.suffle:
                self.df = self.df.sample(frac=1.0)
            raise StopIteration

        start = self._idx * self.bs
        rows = self.df.iloc[start:start + self.bs, :]
        inputs = Seq2SeqDataSet.add_pad(rows['inputs'].values)
        targets = Seq2SeqDataSet.add_pad(rows['targets'].values)
        self._idx += 1

        if self.is_gpu_available:
            return inputs.cuda(), targets.cuda()
        return inputs, targets

    @staticmethod
    def add_pad(tensor: torch.Tensor) -> torch.Tensor:
        """Right-pad a collection of id tensors with 0 and stack them.

        Args:
            tensor: Iterable of tensors (for example an object array taken
                from a DataFrame column); each one is flattened to 1-D.

        Returns:
            A ``(number_of_tensors, max_length)`` tensor.
        """
        # Flatten each example explicitly instead of calling a bare
        # ``.squeeze()`` on the stacked result: squeeze() also removes the
        # batch dimension when the batch holds one row, and the sequence
        # dimension when every sequence has length 1.
        flat = [x.reshape(-1) for x in tensor]
        max_len = max(x.numel() for x in flat)
        padded = [F.pad(x, pad=(0, max_len - x.numel()), mode='constant', value=0)
                  for x in flat]
        return torch.stack(padded)

In [7]:
def translate(model: nn.Module, 
              input: torch.Tensor) -> tuple((str, str)):
    """Decode one encoded prompt and the model's generated answer.

    Uses the notebook-level ``tokenizer`` for decoding.

    Args:
        model: Model exposing ``generate``.
        input: Token ids of a single example; a leading batch dimension is
            added before calling ``generate``.

    Returns:
        ``(input_text, output_text)``: the decoded prompt and the decoded
        first generated sequence.
    """
    source_text = tokenizer.decode(input)
    generated = model.generate(input.unsqueeze(0))
    return source_text, tokenizer.decode(generated[0])


def evaluate(model: nn.Module,
             dataset: Seq2SeqDataSet) -> tuple((float, str, str)):
    """Compute the mean loss over ``dataset`` and translate one sample.

    Args:
        model: Model called as ``model(input_ids=..., labels=...)`` whose
            first output element is the loss.
        dataset: Iterable of ``(inputs, targets)`` batches that supports
            ``len()`` (number of batches).

    Returns:
        ``(val_loss, input_text, output_text)`` where ``val_loss`` is the
        loss averaged over batches and the two strings come from
        ``translate`` applied to a random row of the last batch.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    val_loss = 0.0
    inputs = None

    # Validation needs no gradients; disabling autograd avoids building a
    # graph for every batch.
    with torch.no_grad():
        for inputs, targets in dataset:
            # Take the first output element instead of unpacking exactly four
            # values, so the call does not depend on how many extra outputs
            # the model returns.
            loss = model(input_ids=inputs, labels=targets)[0]
            val_loss += loss.item()

    if inputs is None:
        raise ValueError("evaluate() received a dataset without batches")

    val_loss /= len(dataset)

    # Show one randomly chosen example from the last batch.
    random_index = np.random.choice(len(inputs))
    input_text, output_text = translate(model, inputs[random_index])

    return val_loss, input_text, output_text


def train_step(model: nn.Module,
               dataset: Seq2SeqDataSet) -> float:
    """Run one training pass over ``dataset`` with the notebook ``optimizer``.

    The model is put in training mode for the pass and switched back to
    evaluation mode before returning.

    Args:
        model: Model called as ``model(input_ids=..., labels=...)`` whose
            first output element is the loss.
        dataset: Iterable of ``(inputs, targets)`` batches that supports
            ``len()`` (number of batches).

    Returns:
        The training loss averaged over batches.
    """
    model.train()
    train_loss = 0.0

    # One optimisation step per batch, with a progress bar over the batches.
    for inputs, targets in tqdm(dataset, total=len(dataset)):
        model.zero_grad()

        # Take the first output element instead of unpacking exactly four
        # values, so the call does not depend on how many extra outputs the
        # model returns.
        loss = model(input_ids=inputs, labels=targets)[0]

        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    model.eval()
    train_loss /= len(dataset)

    return train_loss


def train(model: nn.Module, 
          train_ds: Seq2SeqDataSet, 
          val_ds: Seq2SeqDataSet) -> None:
    """Train for ``n_epochs`` epochs and print a report after each one.

    Each epoch runs ``train_step`` on ``train_ds`` followed by ``evaluate``
    on ``val_ds``; the report shows both losses and one sample translation.

    Args:
        model: Model to optimise.
        train_ds: Batches used for training.
        val_ds: Batches used for validation.
    """
    for epoch in range(n_epochs):
        train_loss = train_step(model, train_ds)
        val_loss, input_text, output_text = evaluate(model, val_ds)

        report = (
            f"Epoch: {epoch+1}, |",
            f"Training loss: {train_loss:2.6f} |",
            f"Val loss: {val_loss:2.6f} |\n",
            f'Input=> "{input_text}" |',
            f'Pred output=> "{output_text}"',
        )
        print(*report)

In [8]:
# Pretrained t5-small tokenizer and sequence-to-sequence model.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Use the GPU when one is available; otherwise keep the model where it is.
t5_model = t5_model.cuda() if torch.cuda.is_available() else t5_model

In [9]:
# Positional split of the shuffled rows: the first 11000 for training,
# the next 600 for validation, everything after that for testing.
df_train = df_raw.iloc[:11000]
df_val = df_raw.iloc[11000:11600]
df_test = df_raw.iloc[11600:]

In [33]:
# Encode the three splits in the order train, validation, test.
df_train_encoded, df_val_encoded, df_test_encoded = [
    ready_to_feed_df(split_df, tokenizer)
    for split_df in (df_train, df_val, df_test)
]

tuple(encoded.shape
      for encoded in (df_train_encoded, df_val_encoded, df_test_encoded))

((11000, 2), (600, 2), (600, 2))

In [61]:
# Show the last three encoded rows of each split.
display(df_train_encoded.tail(3),
        df_val_encoded.tail(3),
        df_test_encoded.tail(3))

Unnamed: 0,inputs,targets
10997,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(505), tensor(505), tensor(431), tenso..."
10998,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(2641), tensor(6189), tensor(27757), t..."
10999,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(3), tensor(324), tensor(7863), tensor..."


Unnamed: 0,inputs,targets
597,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(209), tensor(489), tensor(505), tenso..."
598,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(662), tensor(6189), tensor(3), tensor..."
599,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(2641), tensor(63), tensor(7863), tens..."


Unnamed: 0,inputs,targets
597,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(668), tensor(505), tensor(668), tenso..."
598,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(192), tensor(6189), tensor(2391), ten..."
599,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(4169), tensor(6189), tensor(6786), te..."


In [34]:
# Training hyper-parameters.
n_epochs, batch_size, lr = 10, 128, 0.001
optimizer = transformers.AdamW(t5_model.parameters(), lr=lr)

# Batch iterators; only the training rows are reshuffled between passes.
train_ds = Seq2SeqDataSet(df_train_encoded, batch_size=batch_size, suffle=True)
val_ds = Seq2SeqDataSet(df_val_encoded, batch_size=batch_size, suffle=False)
test_ds = Seq2SeqDataSet(df_test_encoded, batch_size=batch_size, suffle=False)

In [22]:
# Fine-tune on the training batches, validating after every epoch.
train(model=t5_model, train_ds=train_ds, val_ds=val_ds)

HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 1, | Training loss: 0.591817 | Val loss: 0.037424 | 
 Input=> "number to english: 7 4 5 9 4 1" | Pred output=> "seven hundred forty-five thousand nine hundred forty-one" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 2, | Training loss: 0.061910 | Val loss: 0.023081 | 
 Input=> "number to english: 3 9 2 6 6 5" | Pred output=> "three hundred ninety-two thousand six hundred sixty-five" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 3, | Training loss: 0.034275 | Val loss: 0.008328 | 
 Input=> "number to english: 8 0 9 7 6" | Pred output=> "eight hundred ninety-six" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 4, | Training loss: 0.017890 | Val loss: 0.002936 | 
 Input=> "english to number: four hundred ninety-eight thousand one hundred forty-two" | Pred output=> "4 9 8 1 4 2" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 5, | Training loss: 0.012612 | Val loss: 0.002026 | 
 Input=> "english to number: two hundred eighty-four thousand eight hundred seventy-one" | Pred output=> "2 8 4 8 7 1" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 6, | Training loss: 0.008107 | Val loss: 0.000711 | 
 Input=> "english to number: eight hundred ninety-four thousand nine hundred sixty-five" | Pred output=> "8 9 4 9 6 5" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 7, | Training loss: 0.005911 | Val loss: 0.000431 | 
 Input=> "number to english: 9 2 9 2 3" | Pred output=> "ninety-two thousand nine hundred twenty-three" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 8, | Training loss: 0.004501 | Val loss: 0.000358 | 
 Input=> "english to number: one hundred forty-eight thousand one hundred eighty-nine" | Pred output=> "1 4 8 1 8 9" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 9, | Training loss: 0.004134 | Val loss: 0.000314 | 
 Input=> "english to number: eight hundred seventeen thousand one hundred forty-five" | Pred output=> "8 1 7 1 4 5" |


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))


Epoch: 10, | Training loss: 0.004582 | Val loss: 0.000766 | 
 Input=> "english to number: four hundred ninety-three" | Pred output=> "4 9 3" |


In [14]:
# File name of the fine-tuned weights checkpoint.
PATH = 'num_and_eng.pt'
# Saving is disabled here, so the later torch.load(PATH) cell needs this file
# to exist already (for example from an earlier run or copied in manually).
# torch.save(t5_model.state_dict(), PATH)

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')

In [54]:
# !cp  /gdrive/My\ Drive/num_and_eng.pt .

In [15]:
# Start again from the pretrained t5-small model and overwrite its weights
# with the checkpoint stored at PATH (tensors are mapped to the CPU first).
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
t5_model.load_state_dict(torch.load(PATH, map_location='cpu'))

# Use the GPU when one is available; otherwise keep the model where it is.
t5_model = t5_model.cuda() if torch.cuda.is_available() else t5_model

In [35]:
# Translate the first encoded validation example.
translate(t5_model, df_val_encoded.at[0, 'inputs'][0])

('number to english: 6 1 6 7 9 8',
 'six hundred sixteen thousand seven hundred ninety-eight')

In [36]:
# Translate the first encoded test example.
translate(t5_model, df_test_encoded.at[0, 'inputs'][0])

('number to english: 1 4 7 8 1 4',
 'one hundred forty-seven thousand eight hundred fourteen')

In [40]:
# Translate the second encoded test example.
translate(t5_model, df_test_encoded.at[1, 'inputs'][0])

('english to number: seven hundred fifty-three thousand twenty-nine',
 '7 5 3 0 2 9')