In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import torch

!pip install transformers
from transformers.models.bert.modeling_bert import BertModel,BertForMaskedLM

from transformers import BertGenerationConfig, BertGenerationEncoder,BertGenerationDecoder,EncoderDecoderModel,BertTokenizer
from torch.utils.data import DataLoader
!pip install datasets
import datasets
from datasets import load_metric
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer



Since this notebook needs no external data file, it is extremely simple to open in Colab and train your models there.

# **Task 4**

**We're going to build a network that converts dates from one format into another.**

For example, given a date string such as "14-03-2020", we want our network to read this string character by character and output "14th of March 2020".

Since our data is a sequence, each part derives its meaning from the parts that precede it.
"2" as the second month character could encode either February or December, depending on which digit preceded it. This is a problem that is well handled by recurrent neural networks.

We're going to use a **Transformer** to build this network.


### Make Dataset

In [None]:
# we will use the same dataset which we use in task 3

In [2]:
def make_short_date(dt):
    """Render a date in the compact, zero-padded ``DD-MM-YYYY`` form.

    Parameters
    ----------
    dt : datetime-like
        Any object exposing ``strftime`` (``datetime``, ``pandas.Timestamp``).

    Returns
    -------
    str
        For example ``'14-03-2020'``.
    """
    short_format = '-'.join(('%d', '%m', '%Y'))
    return dt.strftime(short_format)

def make_long_date(dt):
    """Render a date in the long form ``'<DD><suffix> of <Month> <YYYY>'``.

    The day keeps the zero padding produced by ``%d`` (e.g. ``'01st'``), and
    the month name comes from ``%B``, so it follows the active locale.

    Parameters
    ----------
    dt : datetime-like
        Any object exposing ``strftime`` (``datetime``, ``pandas.Timestamp``).

    Returns
    -------
    str
        For example ``'14th of March 2020'``.
    """
    day = dt.strftime('%d')
    # 11, 12 and 13 are irregular: they take 'th' although they end in 1, 2, 3.
    # Choosing the suffix from the last digit alone produced '11st', '12nd'
    # and '13rd'.
    if day in ('11', '12', '13'):
        suffix = 'th'
    else:
        suffix = {'1': 'st', '2': 'nd', '3': 'rd'}.get(day[-1], 'th')
    month = dt.strftime('%B')
    year = dt.strftime('%Y')

    return day + suffix + ' of ' + month + ' ' + year

def make_dataset(n):
    """Generate ``n`` parallel short/long date strings.

    The dates are the ``n`` consecutive entries of a ``pandas.date_range``
    that starts on 1990-04-14, so the result is fully deterministic.

    Parameters
    ----------
    n : int
        Number of dates to generate.

    Returns
    -------
    tuple
        ``(short_dates, long_dates)``: two arrays of equal length holding the
        ``make_short_date`` and ``make_long_date`` renderings of each date.
    """
    first_day = datetime(1990, 4, 14)
    days = pd.date_range(first_day, periods=n, normalize=True)

    short_dates = days.map(make_short_date).values
    long_dates = days.map(make_long_date).values
    return short_dates, long_dates

In [3]:
# Quick sanity check: generate a handful of pairs and display both columns.
x, y = make_dataset(n=5)
(x[:5], y[:5])

(array(['14-04-1990', '15-04-1990', '16-04-1990', '17-04-1990',
        '18-04-1990'], dtype=object),
 array(['14th of April 1990', '15th of April 1990', '16th of April 1990',
        '17th of April 1990', '18th of April 1990'], dtype=object))

In [4]:
# --- Configuration ---------------------------------------------------------
batch_size = 8  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of (short, long) date pairs to generate.
# Tab-separated dump of the dataset. This assignment used to be commented out
# even though a later cell opens `data_path`, which raised NameError when the
# notebook was run top to bottom on a fresh kernel.
data_path = "T4.txt"
dataset = make_dataset(num_samples)

In [6]:
# Rebuilds the (short_dates, long_dates) pair. make_dataset always starts from
# the same fixed date, so for the same num_samples this produces the same data
# as any earlier call.
dataset = make_dataset(num_samples)

In [5]:
# Dump the generated pairs to disk, one "<short>\t<long>" pair per line.
# Iterating over the arrays themselves (instead of a hard-coded range of
# 10000) keeps this cell correct when num_samples is changed. The explicit
# encoding matches the utf-8 used when the file is read back.
short_dates, long_dates = dataset
with open('T4.txt', 'w', encoding='utf-8') as writefile:
    for short_date, long_date in zip(short_dates, long_dates):
        writefile.write(f"{short_date}\t{long_date}\n")

In [8]:
# Read the tab-separated pairs back from 'T4.txt', the file written by the
# previous cell. The literal file name is used so that this cell does not
# depend on a separately defined path variable.
with open("T4.txt", encoding="utf-8") as f:
    # The file ends with a newline, so the final split element is empty.
    lines = f.read().split("\n")[:-1]
text_pairs = []
for line in lines:
    short_date, long_date = line.split("\t")
    # Wrap the target in '\t' ... '\n' (presumably start/end-of-sequence
    # markers for a character-level decoder; confirm against the consumer).
    long_date = '\t' + long_date + '\n'
    text_pairs.append((short_date, long_date))

### Making Predictions

Let's check how the model does with the training data.

### Create Model

**Hugging face**

In [6]:
# Collect the raw input/target strings and the character inventory of each side.
input_texts = []
target_texts = []
input_chars = set()
target_chars = set()
with open("T4.txt", 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# The last split element is the empty string after the trailing newline, so
# at most len(lines) - 1 lines are usable.
usable_lines = min(num_samples, len(lines) - 1)
for line in lines[:usable_lines]:
    input_text, target_text = line.split('\t')
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character of the string; duplicates are ignored.
    input_chars.update(input_text)
    target_chars.update(target_text)

# Freeze both inventories into sorted lists so that indices are stable.
input_chars = sorted(input_chars)
target_chars = sorted(target_chars)
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [7]:
# Character -> integer index lookup tables for both sides.
input_token_id = {char: index for index, char in enumerate(input_chars)}
target_token_id = {char: index for index, char in enumerate(target_chars)}

# Pretrained sequence-to-sequence checkpoint and its matching tokenizer.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The file has no header row: column 0 is the short date, column 1 the long date.
df = pd.read_csv("T4.txt", sep='\t', header=None)

In [8]:
# Row offsets of the split boundaries in df:
#   [0, TRAIN_END) -> train, [TRAIN_END, TEST_END) -> test, [TEST_END, 10000) -> validation
TRAIN_END = 7800
TEST_END = 9200
train_x = df[:TRAIN_END]
# Fixed typo: the lower bound was 78000, which made the test split empty.
test_x = df[TRAIN_END:TEST_END]
val_x = df[TEST_END:10000]
train_x = datasets.Dataset.from_pandas(train_x)
test_x = datasets.Dataset.from_pandas(test_x)
val_x = datasets.Dataset.from_pandas(val_x)
# These loaders wrap the raw (untokenised) rows; in this cell only
# len(train_dataloader) is used, to derive the number of optimisation steps.
train_dataloader = DataLoader(train_x, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(test_x, batch_size=8)
max_input_length = 20
max_target_length = 20
def preprocess_function(examples):
    """Tokenise a batch of rows into model inputs and labels.

    Parameters
    ----------
    examples : dict
        Batch from ``Dataset.map(batched=True)``; column ``"0"`` holds the
        short dates (inputs) and column ``"1"`` the long dates (targets).

    Returns
    -------
    dict-like
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids.

    Notes
    -----
    No padding is applied here. Padding the targets with the tokenizer would
    put pad-token ids into ``labels`` and the loss would then be computed on
    them. ``DataCollatorForSeq2Seq`` pads each batch at training time and
    fills label padding with -100, which the loss ignores.
    """
    inputs = list(examples["0"])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets.
    # NOTE(review): newer transformers releases deprecate as_target_tokenizer in
    # favour of tokenizer(..., text_target=...); confirm against the installed version.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["1"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_x = train_x.map(preprocess_function, batched=True)
val_x = val_x.map(preprocess_function, batched=True)
# The held-out test split is tokenised as well so it can be passed to the model.
test_x = test_x.map(preprocess_function, batched=True)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [10]:
# Fine-tune the checkpoint on the date pairs, evaluating after every epoch.
args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    # Reuse the epoch count that num_training_steps was derived from.
    num_train_epochs=num_epochs,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device. Enabling it unconditionally makes
    # the arguments fail to construct when the notebook runs on CPU.
    fp16=torch.cuda.is_available(),
)
# Pads inputs and labels per batch; label padding is set to -100 so the loss skips it.
data_c = DataCollatorForSeq2Seq(tokenizer, model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_x,
    eval_dataset=val_x,
    data_collator=data_c,
    tokenizer=tokenizer,
)
trainer.train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: 1, 0. If 1, 0 are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7800
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2925


Epoch,Training Loss,Validation Loss
1,1.0812,0.003647
2,0.0193,0.001319
3,0.0111,0.001065


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: 1, 0. If 1, 0 are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 8
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500

TrainOutput(global_step=2925, training_loss=0.19995493359035915, metrics={'train_runtime': 737.5325, 'train_samples_per_second': 31.727, 'train_steps_per_second': 3.966, 'total_flos': 43298802892800.0, 'train_loss': 0.19995493359035915, 'epoch': 3.0})

In [11]:
def convert_date(inputs):
    """Convert one short date string to its long form with the fine-tuned model.

    Parameters
    ----------
    inputs : str
        Short date such as ``'19-1-1991'``.

    Returns
    -------
    list of str
        The decoded generations; a single-element list because exactly one
        sequence is requested.
    """
    encoded = tokenizer([inputs], max_length=max_input_length, truncation=True, return_tensors="pt")
    # Send the tensors to the device the model is on instead of hard-coding
    # "cuda", so the function also works on a CPU-only runtime.
    target_device = model.device
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)
    # Deterministic greedy decoding. top_k / top_p were dropped because they
    # only apply when do_sample=True and had no effect here.
    g_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        num_return_sequences=1,
        do_sample=False,
    )
    out = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in g_ids]
    return out

In [12]:
# Spot-check the fine-tuned model on a few hand-written short dates.
for short_date in ('19-1-1991', '25-4-1991', '5-6-2004'):
    print(convert_date(short_date))

['19th of January 1991']
['25th of April 1991']
['5th of June 2004']
