In [None]:
# !pip install transformers -q

[K     |████████████████████████████████| 778kB 6.9MB/s 
[K     |████████████████████████████████| 3.0MB 42.0MB/s 
[K     |████████████████████████████████| 890kB 40.3MB/s 
[K     |████████████████████████████████| 1.1MB 37.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd

from glob import glob
from tqdm.notebook import tqdm
from IPython.display import display

SEED = 2020

In [None]:
# Load the tokenizer and the seq2seq model from the "t5-small" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
# .cuda() moves the model's parameters to the GPU, so this cell needs a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small").cuda()

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Pre-process data

In [None]:
# !unzip -q data.zip
# !rm data.zip

In [None]:
# glob() returns matches in an arbitrary, filesystem-dependent order. Sort the
# paths so the seeded shuffle below produces the same row order (and therefore
# the same positional train/val/test split) on every machine.
csv_paths = sorted(glob('data/num_text_pair *.csv'))
if not csv_paths:
    # pd.concat([]) would fail with a far less helpful message.
    raise FileNotFoundError("no files match 'data/num_text_pair *.csv'")

# Read every CSV, stack them, shuffle all rows with the shared seed, renumber
# the index and convert every column to strings.
df_raw = (
    pd.concat([pd.read_csv(path) for path in csv_paths])
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
    .astype(str)
)
# Put a space between the characters of each number string ("5226" -> "5 2 2 6").
df_raw['number'] = df_raw['number'].apply(lambda x: ' '.join(x))

df_raw.shape

(12200, 2)

In [None]:
# Show the last rows of the shuffled frame as a sanity check of the digit spacing.
df_raw.tail()

Unnamed: 0,number,text
12195,7 8 6 6 9 9,seven hundred eighty-six thousand six hundred ...
12196,8 6 1 1 1 7,eight hundred sixty-one thousand one hundred s...
12197,5 2 2 6,five thousand two hundred twenty-six
12198,9 2 6 4 7,ninety-two thousand six hundred forty-seven
12199,2 0 5 6 0 8,two hundred five thousand six hundred eight


## Unsupervised denoising training

## Supervised training

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# Positional split of the already shuffled frame:
# rows 0-10999 -> train, 11000-11599 -> validation, 11600 onward -> test.
df_train = df_raw[:11000]
df_val = df_raw[11000:11600]
df_test = df_raw[11600:]

In [None]:
# Report the row count of the full frame and of each split.
print('Total data:', len(df_raw))
print('Total train data:', len(df_train))
print('Total validation data:', len(df_val))
print('Total test data:', len(df_test))

Total data: 12200
Total train data: 11000
Total validation data: 600
Total test data: 600


In [None]:
def pre_process(df: pd.DataFrame, num2text: bool) -> pd.DataFrame:
    """Turn number/text pairs into task-prefixed ``inputs``/``targets`` strings.

    Args:
        df: Frame with a ``number`` and a ``text`` column.
        num2text: When exactly ``True``, ``number`` is the source and ``text``
            the target, and the prompt prefix is ``number to english``. Any
            other value selects the reverse direction with the prefix
            ``english to number``.

    Returns:
        A new frame indexed like ``df`` with two columns: ``inputs``
        (``"<task>: <source> </s>"``) and ``targets`` (``"<target> </s>"``).
    """
    # Resolve the translation direction once, then build both columns the same way.
    if num2text is True:
        task, source_col, target_col = 'number to english', 'number', 'text'
    else:
        task, source_col, target_col = 'english to number', 'text', 'number'

    prompts = df[source_col].apply(lambda value: f'{task}: {value} </s>')
    answers = df[target_col].apply(lambda value: f'{value} </s>')

    return pd.DataFrame({'inputs': prompts, 'targets': answers},
                        columns=['inputs', 'targets'])

In [None]:
# Preview the english -> number formatting on the last rows of the raw frame.
pre_process(df_raw.tail(), num2text=False)

Unnamed: 0,inputs,targets
12195,english to number: seven hundred eighty-six th...,7 8 6 6 9 9 </s>
12196,english to number: eight hundred sixty-one tho...,8 6 1 1 1 7 </s>
12197,english to number: five thousand two hundred t...,5 2 2 6 </s>
12198,english to number: ninety-two thousand six hun...,9 2 6 4 7 </s>
12199,english to number: two hundred five thousand s...,2 0 5 6 0 8 </s>


### Train dataset preparation

In [None]:
# Every third training row (offset 0) becomes an english -> number example and
# offset 1 a number -> english example; the offset-2 rows are used in both
# directions.
df_train_processed_1 = pre_process(df_train.iloc[0::3], num2text=False)
df_train_processed_2 = pre_process(df_train.iloc[1::3], num2text=True)
df_train_processed_3 = pre_process(df_train.iloc[2::3], num2text=False)
df_train_processed_4 = pre_process(df_train.iloc[2::3], num2text=True)

print(len(df_train_processed_1), len(df_train_processed_2))
print(len(df_train_processed_3), len(df_train_processed_4))

3667 3667
3666 3666


In [None]:
# Number of training examples across the four processed parts.
total_train_size = sum(
    part.shape[0]
    for part in (
        df_train_processed_1,
        df_train_processed_2,
        df_train_processed_3,
        df_train_processed_4,
    )
)

total_train_size

14666

In [None]:
# Stack the four processed parts, reshuffle them with the shared seed so both
# translation directions are interleaved, and renumber the rows.
df_train_total = (
    pd.concat([
        df_train_processed_1,
        df_train_processed_2,
        df_train_processed_3,
        df_train_processed_4,
    ])
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)

# The separate parts are not needed once merged; drop their names.
del df_train_processed_1
del df_train_processed_2
del df_train_processed_3
del df_train_processed_4

print(df_train_total.shape)
df_train_total

(14666, 2)


Unnamed: 0,inputs,targets
0,english to number: fifty-three thousand one hu...,5 3 1 9 1 </s>
1,english to number: twenty-four thousand three ...,2 4 3 0 1 </s>
2,english to number: eight hundred sixty-four th...,8 6 4 7 0 2 </s>
3,number to english: 5 4 4 3 4 3 </s>,five hundred forty-four thousand three hundred...
4,number to english: 4 3 2 3 0 </s>,forty-three thousand two hundred thirty </s>
...,...,...
14661,english to number: one hundred sixty-three tho...,1 6 3 4 1 6 </s>
14662,number to english: 6 9 3 1 3 1 </s>,six hundred ninety-three thousand one hundred ...
14663,english to number: one hundred fifty-nine thou...,1 5 9 1 4 8 </s>
14664,number to english: 1 9 0 3 8 </s>,nineteen thousand thirty-eight </s>


In [None]:
# Tokenize both text columns; each cell becomes the tensor returned by
# tokenizer.encode(..., return_tensors='pt').
df_train_total_encoded = pd.DataFrame({
    column: df_train_total[column].apply(
        lambda text: tokenizer.encode(text, return_tensors='pt'))
    for column in ('inputs', 'targets')
})

df_train_total_encoded

Unnamed: 0,inputs,targets
0,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(305), tensor(220), tensor(209), tenso..."
1,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(204), tensor(314), tensor(220), tenso..."
2,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(505), tensor(431), tensor(314), tenso..."
3,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(874), tensor(6189), tensor(19662), te..."
4,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(19662), tensor(18), tensor(21182), te..."
...,...,...
14661,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(209), tensor(431), tensor(220), tenso..."
14662,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(1296), tensor(6189), tensor(4169), te..."
14663,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(209), tensor(305), tensor(668), tenso..."
14664,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(4169), tensor(6808), tensor(7863), te..."


### Validation dataset preparation

In [None]:
# Alternate validation rows between the two directions (even positions:
# english -> number, odd positions: number -> english), then shuffle with the
# shared seed and renumber the rows.
df_val_total = (
    pd.concat([
        pre_process(df_val[0::2], num2text=False),
        pre_process(df_val[1::2], num2text=True),
    ])
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)

print(df_val_total.shape)
df_val_total

(600, 2)


Unnamed: 0,inputs,targets
0,number to english: 1 3 7 9 2 6 </s>,one hundred thirty-seven thousand nine hundred...
1,english to number: two hundred eighty thousand...,2 8 0 0 8 9 </s>
2,english to number: fifty thousand three hundre...,5 0 3 0 4 </s>
3,english to number: three hundred five thousand...,3 0 5 8 8 3 </s>
4,number to english: 6 0 7 0 5 1 </s>,six hundred seven thousand fifty-one </s>
...,...,...
595,english to number: forty-eight thousand one hu...,4 8 1 1 7 </s>
596,number to english: 7 5 8 3 9 4 </s>,seven hundred fifty-eight thousand three hundr...
597,english to number: eight hundred four thousand...,8 0 4 6 3 4 </s>
598,number to english: 7 2 9 8 2 6 </s>,seven hundred twenty-nine thousand eight hundr...


In [None]:
# Tokenize the validation prompts and targets; each cell holds the tensor
# returned by tokenizer.encode(..., return_tensors='pt').
df_val_total_encoded = pd.DataFrame().assign(
    inputs=df_val_total['inputs'].apply(
        lambda text: tokenizer.encode(text, return_tensors='pt')),
    targets=df_val_total['targets'].apply(
        lambda text: tokenizer.encode(text, return_tensors='pt')),
)

df_val_total_encoded

Unnamed: 0,inputs,targets
0,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(80), tensor(6189), tensor(12010), ten..."
1,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(204), tensor(505), tensor(3), tensor(..."
2,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(305), tensor(3), tensor(632), tensor(..."
3,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(220), tensor(3), tensor(632), tensor(..."
4,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(1296), tensor(6189), tensor(2391), te..."
...,...,...
595,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(314), tensor(505), tensor(209), tenso..."
596,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(2391), tensor(6189), tensor(18358), t..."
597,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(505), tensor(3), tensor(632), tensor(..."
598,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(2391), tensor(6189), tensor(6786), te..."


### Test dataset preparation

In [None]:
# Same recipe as the validation split: even positions become english -> number
# examples, odd positions number -> english, followed by a seeded shuffle.
df_test_total = (
    pd.concat([
        pre_process(df_test[0::2], num2text=False),
        pre_process(df_test[1::2], num2text=True),
    ])
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)

print(df_test_total.shape)
df_test_total

(600, 2)


Unnamed: 0,inputs,targets
0,number to english: 7 5 4 3 6 3 </s>,seven hundred fifty-four thousand three hundre...
1,english to number: forty-one thousand eight hu...,4 1 8 8 5 </s>
2,english to number: two hundred forty-eight tho...,2 4 8 0 9 5 </s>
3,english to number: sixty-one thousand one hund...,6 1 1 8 4 </s>
4,number to english: 4 7 3 0 9 5 </s>,four hundred seventy-three thousand ninety-fiv...
...,...,...
595,english to number: eight hundred eighty-eight ...,8 8 8 2 5 1 </s>
596,number to english: 6 5 2 0 1 4 </s>,six hundred fifty-two thousand fourteen </s>
597,english to number: three hundred six thousand ...,3 0 6 3 3 0 </s>
598,number to english: 5 8 3 7 0 3 </s>,five hundred eighty-three thousand seven hundr...


In [None]:
# Tokenize the test prompts and targets column by column; each cell holds the
# tensor returned by tokenizer.encode(..., return_tensors='pt').
df_test_total_encoded = pd.DataFrame({
    name: df_test_total[name].apply(
        lambda sentence: tokenizer.encode(sentence, return_tensors='pt'))
    for name in ['inputs', 'targets']
})

df_test_total_encoded

Unnamed: 0,inputs,targets
0,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(2391), tensor(6189), tensor(18358), t..."
1,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(314), tensor(209), tensor(505), tenso..."
2,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(204), tensor(314), tensor(505), tenso..."
3,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(431), tensor(209), tensor(209), tenso..."
4,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(662), tensor(6189), tensor(2391), ten..."
...,...,...
595,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(505), tensor(505), tensor(505), tenso..."
596,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(1296), tensor(6189), tensor(18358), t..."
597,"[[tensor(22269), tensor(12), tensor(381), tens...","[[tensor(220), tensor(3), tensor(632), tensor(..."
598,"[[tensor(381), tensor(12), tensor(22269), tens...","[[tensor(874), tensor(6189), tensor(2641), ten..."


In [None]:
# df_test_total_encoded['inputs'] = df_test_total_encoded['inputs'].astype(int)
# df_test_total_encoded['targets'] = df_test_total_encoded['targets'].astype(int)

In [None]:
# Inspect column dtypes and non-null counts of the encoded test frame.
df_test_total_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   inputs   600 non-null    object
 1   targets  600 non-null    object
dtypes: object(2)
memory usage: 9.5+ KB


In [None]:
def generate_batch(df, batch_size=128):
    """Yield right-padded ``(inputs, targets)`` batches from an encoded frame.

    Args:
        df: Frame with ``inputs`` and ``targets`` columns whose cells are
            token-id tensors (one sequence per cell, e.g. shape ``(1, seq_len)``
            as produced by ``tokenizer.encode(..., return_tensors='pt')``).
        batch_size: Maximum number of consecutive rows per batch; the last
            batch may be smaller.

    Yields:
        ``(inputs, targets)`` tuples of 2-D tensors shaped
        ``(rows_in_batch, longest_sequence_in_batch)``. Shorter sequences are
        padded on the right with ``0``.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    def add_pad(tensors):
        # Flatten every cell to 1-D so that the batch axis is created by
        # torch.stack alone.
        flat = [t.reshape(-1) for t in tensors]
        max_len = max(t.numel() for t in flat)
        # value=0 is the padding id; presumably the tokenizer's pad token id.
        # TODO confirm against tokenizer.pad_token_id.
        padded = [
            F.pad(t, pad=(0, max_len - t.numel()), mode='constant', value=0)
            for t in flat
        ]
        # Deliberately no .squeeze(): it would also drop a batch axis of size 1
        # (a one-row final batch) or a sequence axis of size 1, handing the
        # model a 1-D tensor.
        return torch.stack(padded)

    for start in range(0, len(df), batch_size):
        rows = df.iloc[start:start + batch_size, :]
        yield add_pad(rows['inputs'].values), add_pad(rows['targets'].values)

### Train

In [None]:
# Number of passes over the training frame; read as a global by train().
n_epochs = 10
# Rows per batch; read as a global by both evaluate() and train().
batch_size = 128
# Learning rate handed to the optimizer below.
lr = 0.001
# NOTE(review): transformers.AdamW is deprecated in later transformers releases
# in favour of torch.optim.AdamW. Confirm against the installed version.
optimizer = transformers.AdamW(model.parameters(), lr=lr)

In [None]:
def evaluate(df=df_val_total_encoded):
    """Compute the mean per-batch loss on ``df`` and decode one sample prediction.

    Uses the notebook globals ``model``, ``tokenizer``, ``batch_size`` and
    ``generate_batch``; every batch is moved to the GPU with ``.cuda()``.

    Args:
        df: Encoded frame with ``inputs`` and ``targets`` tensor columns. The
            default is bound to ``df_val_total_encoded`` at the moment this
            cell is executed.

    Returns:
        ``(mean_loss, decoded_input, decoded_prediction)``. The sample is drawn
        uniformly at random from the rows of the last batch.

    Raises:
        ValueError: If ``df`` yields no batches.
    """
    # Remember the current mode so the model is handed back as it was received
    # (train mode inside the training loop, eval mode for a stand-alone call).
    was_training = model.training
    model.eval()

    try:
        total_loss = 0.0
        n_batches = 0
        inputs = None

        # No gradients are needed to score the model; skipping the autograd
        # graph saves memory.
        with torch.no_grad():
            for inputs, targets in generate_batch(df, batch_size=batch_size):
                inputs, targets = inputs.cuda(), targets.cuda()
                # Take the first output (the loss when labels are passed)
                # instead of unpacking a fixed number of fields.
                # NOTE(review): the number of returned fields depends on the
                # transformers version/config; confirm.
                loss = model(input_ids=inputs, labels=targets)[0]
                total_loss += loss.item()
                n_batches += 1

        if n_batches == 0:
            raise ValueError('evaluate() received an empty dataframe')

        val_loss = total_loss / n_batches

        # Pick the sample within the actual size of the last batch; a fixed
        # upper bound would raise IndexError on a short final batch.
        sample_index = np.random.choice(inputs.shape[0])
        sample = inputs[sample_index]
        prediction = model.generate(sample.unsqueeze(0))
        prediction = tokenizer.decode(prediction[0])

        return val_loss, tokenizer.decode(sample), prediction
    finally:
        model.train(was_training)
        

def train(df=df_train_total_encoded):
    """Fine-tune the global ``model`` on ``df`` for ``n_epochs`` epochs.

    Uses the notebook globals ``model``, ``optimizer``, ``n_epochs``,
    ``batch_size``, ``generate_batch`` and ``evaluate``; every batch is moved
    to the GPU with ``.cuda()``. After each epoch the mean training loss, the
    validation loss and one decoded sample prediction are printed. The model
    is left in eval mode when training finishes.

    Args:
        df: Encoded frame with ``inputs`` and ``targets`` tensor columns. The
            default is bound to ``df_train_total_encoded`` at the moment this
            cell is executed.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # Number of batches generate_batch() yields per epoch (ceil division).
    steps_per_epoch = -(-len(df) // batch_size)
    if steps_per_epoch == 0:
        raise ValueError('train() received an empty dataframe')

    model.train()

    for epoch in range(n_epochs):
        epoch_loss = 0.0

        # One optimizer step per padded (inputs, targets) batch. The generator
        # has no length, so the total is passed for the progress bar.
        for inputs, targets in tqdm(generate_batch(df, batch_size=batch_size),
                                    total=steps_per_epoch):
            inputs, targets = inputs.cuda(), targets.cuda()

            model.zero_grad()

            # Take the first output (the loss when labels are passed) instead
            # of unpacking a fixed number of fields.
            # NOTE(review): the number of returned fields depends on the
            # transformers version/config; confirm.
            loss = model(input_ids=inputs, labels=targets)[0]
            epoch_loss += loss.item()
            loss.backward()

            optimizer.step()

        # Report the average training loss and the validation metrics every epoch.
        val_loss, sample_input, sample_output = evaluate()
        # Make sure training resumes in train mode whatever evaluate() left behind.
        model.train()

        print(f"Epoch: {epoch+1}, |",
              f"Training loss: {epoch_loss/steps_per_epoch:2.6f} |",
              f"Val loss: {val_loss:2.6f} |",
              f'\n',
              f'Input=> "{sample_input}" |',
              f'Pred output=> "{sample_output}" |')

    model.eval()

In [None]:
# Run the training loop on the default training frame with the global
# n_epochs, batch_size and optimizer defined above.
train()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 1, | Training loss: 0.39759 | Val loss: 0.02946 | 
 Input=> "english to number: thirty-two thousand eight hundred forty-one" | Pred output=> "3 2 8 4 1" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 2, | Training loss: 0.04324 | Val loss: 0.01491 | 
 Input=> "number to english: 7 1 4 1 1" | Pred output=> "seven hundred fourteen thousand one hundred eleven" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 3, | Training loss: 0.02275 | Val loss: 0.00463 | 
 Input=> "number to english: 9 2 5 5 1 4" | Pred output=> "nine hundred twenty-five thousand five hundred fourteen" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 4, | Training loss: 0.01018 | Val loss: 0.00137 | 
 Input=> "english to number: thirty-two thousand eight hundred forty-one" | Pred output=> "3 2 8 4 1" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 5, | Training loss: 0.00602 | Val loss: 0.00012 | 
 Input=> "number to english: 2 0 8 0 9 8" | Pred output=> "two hundred eight thousand nine hundred ninety-eight" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 6, | Training loss: 0.00454 | Val loss: 0.00007 | 
 Input=> "number to english: 4 9 4 3 2" | Pred output=> "four hundred ninety-four thousand three hundred twenty-two" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 7, | Training loss: 0.00305 | Val loss: 0.00005 | 
 Input=> "number to english: 7 1 4 1 1" | Pred output=> "seven hundred fourteen thousand one hundred eleven" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 8, | Training loss: 0.00265 | Val loss: 0.00001 | 
 Input=> "number to english: 3 0 4 5 7 1" | Pred output=> "three hundred four thousand five hundred seventy-one" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 9, | Training loss: 0.00146 | Val loss: 0.00001 | 
 Input=> "english to number: one hundred seventy-one thousand eight hundred eighty-nine" | Pred output=> "1 7 1 8 8 9" |


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 10, | Training loss: 0.00260 | Val loss: 0.00037 | 
 Input=> "number to english: 2 0 8 6 5 4" | Pred output=> "two hundred eight thousand six hundred fifty-four" |


In [None]:
# Score the held-out test split and show one decoded sample prediction.
# The results are bound to sample_input/sample_output rather than input/output
# so the builtin input() is not shadowed for the rest of the notebook.
test_loss, sample_input, sample_output = evaluate(df_test_total_encoded)

print(f"Test loss: {test_loss:2.5f} |",
      f'Input=> "{sample_input}" |',
      f'Pred output=> "{sample_output}" |')

Test loss: 0.00004 | Input=> "number to english: 7 8 6 6 9 9" | Pred output=> "seven hundred eighty-six thousand six hundred ninety-nine" |


### Save model

In [None]:
# Checkpoint file name, relative to the current working directory; reused by
# the load cell below.
PATH = 'num_and_eng.pt'
# Persist only the model's parameters (its state_dict), not the whole module.
torch.save(model.state_dict(), PATH)

### Load model

In [None]:
# Rebuild the architecture from the "t5-small" checkpoint name, then overwrite
# its weights with the fine-tuned parameters saved above.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# The checkpoint was saved from a model living on the GPU. map_location='cpu'
# lets it deserialize on a machine without CUDA; this freshly built model is on
# the CPU anyway.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load(PATH, map_location='cpu'))

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

### Save to Google Drive

In [None]:
# Mount Google Drive at /gdrive so the checkpoint can be copied there.
# google.colab is imported here, so this cell presumably only runs inside Colab.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
!cp num_and_eng.pt /gdrive/My\ Drive