<a href="https://colab.research.google.com/github/bllendev/nlp/blob/main/translate_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers
!pip install -q keras_preprocessing


import keras_preprocessing
import transformers
import torch
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Mount Google Drive so the corpus archives under /content/drive/MyDrive can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### methods - read in data

In [3]:
import pickle
import string
import tarfile
import gzip
from zipfile import ZipFile


def load_doc(file_name):
  """Read a text corpus from disk and return it as one string.

  The container format is chosen from the file-name suffix (case-insensitive):
    * ``.gz`` / ``.tgz`` -> opened as a gzip-compressed tar archive; the UTF-8
      text of every extractable member is concatenated in archive order
    * ``.zip``           -> only the first entry listed in the archive is read
    * anything else      -> read as a plain UTF-8 text file

  Matching on the suffix (rather than on a substring anywhere in the path)
  keeps directories or file stems that merely contain "gz" or "zip" from
  being mistaken for archives.

  Args:
    file_name: path of the file to load.

  Returns:
    The decoded text; an empty string when a tar archive has no readable members.
  """
  lowered_name = file_name.lower()

  if lowered_name.endswith((".gz", ".tgz")):
    pieces = []
    with tarfile.open(file_name, 'r:gz') as tar:
      for member in tar.getmembers():
        extracted = tar.extractfile(member)
        if extracted is None:              # directories / special members have no payload
          continue
        with extracted:
          pieces.append(extracted.read().decode("utf-8"))
    return "".join(pieces)                 # single join instead of repeated string +=

  if lowered_name.endswith(".zip"):
    print("zip exists!!!")
    with ZipFile(file_name, 'r') as archive:
      first_entry = archive.namelist()[0]
      with archive.open(first_entry) as entry:
        return entry.read().decode("utf-8")

  # plain text: the context manager closes the handle even if read() fails
  with open(file_name, mode="r", encoding="utf-8") as handle:
    return handle.read()


def get_sentences(doc):
  """Trim surrounding whitespace from `doc` and split it into a list of lines on '\\n'."""
  trimmed = doc.strip()
  return trimmed.split('\n')


def get_sentence_lengths(sentences):
  """Return (shortest, longest) sentence length, measured in whitespace-separated tokens.

  Raises ValueError (from min/max) when `sentences` is empty.
  """
  word_counts = list(map(lambda sentence: len(sentence.split()), sentences))
  shortest = min(word_counts)
  longest = max(word_counts)
  return shortest, longest

In [4]:
# Display the character set that clean_lines treats as printable (anything outside it is stripped).
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

### methods - clean text data

In [5]:
import re
import string
import unicodedata

def clean_lines(lines):
  """Normalise raw sentences into lowercase, ASCII, letters-only text.

  For every input line: decompose accents (NFD) and drop non-ASCII bytes,
  split on whitespace, lowercase each word, delete punctuation and any
  non-printable characters, discard words that are not purely alphabetic
  (e.g. numbers or mixed tokens), and re-join the survivors with single spaces.

  Args:
    lines: iterable of str.

  Returns:
    list of cleaned str, one per input line (possibly empty strings).
  """
  non_printable = re.compile(f"[^{re.escape(string.printable)}]")
  strip_punct = str.maketrans('', '', string.punctuation)   # translation table that deletes punctuation

  def _clean_word(word):
    # lowercase -> remove punctuation -> remove non-printable characters
    without_punct = word.lower().translate(strip_punct)
    return non_printable.sub('', without_punct)

  result = []
  for raw in lines:
    # NFD splits accented letters into base letter + combining mark; the ASCII
    # encode with 'ignore' then drops the marks and any other non-ASCII chars
    ascii_text = unicodedata.normalize('NFD', raw).encode('ascii', 'ignore').decode('UTF-8')
    candidates = (_clean_word(word) for word in ascii_text.split())
    result.append(' '.join(word for word in candidates if word.isalpha()))
  return result

In [6]:
# Paths of the English / Spanish sides of the corpus, stored as zip archives on the mounted Drive.
eu_corpus_english_file_path = r"/content/drive/MyDrive/data/europarl-v7.es-en.en.zip"
eu_corpus_espanol_file_path = r"/content/drive/MyDrive/data/europarl-v7.es-en.es.zip"

# Each call returns the whole document as a single string (for zips: the first archive entry).
doc_eng = load_doc(eu_corpus_english_file_path)
doc_esp = load_doc(eu_corpus_espanol_file_path)

zip exists!!!
zip exists!!!


In [7]:
# Quick load check: show the first two characters of the Spanish document.
print(doc_esp[:2])

Re


In [8]:
# Split each document into lines and record the shortest / longest line (in whitespace tokens).
sentences_eng = get_sentences(doc_eng)
minlen_eng, maxlen_eng = get_sentence_lengths(sentences_eng)

sentences_esp = get_sentences(doc_esp)
minlen_esp, maxlen_esp = get_sentence_lengths(sentences_esp)

# NOTE: these statistics describe the raw sentences -- clean_lines has not been applied to them.
print(f"data eng: sentences-{len(sentences_eng)} | min-{minlen_eng} | max-{maxlen_eng}")      # TODO: clean the data set first
print(f"data esp: sentences-{len(sentences_esp)} | min-{minlen_esp} | max-{maxlen_esp}")      # TODO: clean the data set first

data eng: sentences-1965734 | min-0 | max-668
data esp: sentences-1965734 | min-0 | max-658


In [9]:
# consider creating save / load file / pickle for clean data set, skipping for now since we are using google colab

### remove rare vocab from dataset !

In [10]:
from collections import Counter
def get_vocab_counts(lines):
  """Count how often each whitespace-separated token occurs across all `lines`.

  Returns a collections.Counter mapping token -> number of occurrences.
  """
  return Counter(token for line in lines for token in line.split())

def get_filtered_vocab(vocab, min_occurance):
  """Return the set of tokens whose count in `vocab` is at least `min_occurance`."""
  kept = set()
  for token, count in vocab.items():
    if count >= min_occurance:
      kept.add(token)
  return kept

### removing Out-Of-Vocabulary (OOV) words

In [11]:
from collections import Counter
from transformers import T5Config, T5ForConditionalGeneration, AutoTokenizer
from keras_preprocessing.sequence import pad_sequences
import torch


# Minimum corpus frequency a token needs in order to be kept; rarer tokens are replaced by 'unk'.
MIN_OCCURENCES = 5
# NOTE(review): not referenced by any visible cell -- presumably a maximum token length; confirm or remove.
TOKEN_COUNT = 512


def get_vocab_counts(lines):
  """Tally whitespace-separated tokens over every sentence in `lines`.

  A function with the same name is also defined in the preceding cell; once
  this cell has run, this definition is the one in effect.

  Returns:
    collections.Counter mapping token -> occurrence count.
  """
  counts = Counter()
  for sentence in lines:
    counts.update(sentence.split())
  return counts


def get_vocab_to_filter(lines, vocab, min_occurance):
  """Return the set of tokens to KEEP: those seen at least `min_occurance` times.

  Args:
    lines: unused; accepted only to keep the call signature.
    vocab: mapping token -> count.
    min_occurance: inclusive frequency threshold.
  """
  frequent_tokens = set()
  for token, count in vocab.items():
    if count >= min_occurance:
      frequent_tokens.add(token)
  return frequent_tokens


def get_oov_filtered_lines(lines, filtered_vocab):
  """Replace every out-of-vocabulary token with the placeholder 'unk'.

  Each line is split on whitespace, tokens missing from `filtered_vocab`
  become 'unk', and the tokens are re-joined with single spaces (so runs of
  whitespace in the input collapse to one space).

  Returns:
    list of rewritten lines, in input order.
  """
  return [
      ' '.join(token if token in filtered_vocab else 'unk' for token in sentence.split())
      for sentence in lines
  ]


def get_updated_dataset(input_lines, target_lines):
  """Mask rare words with 'unk' on both sides of a parallel corpus.

  Each side gets its own vocabulary: tokens occurring fewer than
  MIN_OCCURENCES times on that side are replaced by 'unk'.

  Returns:
    (new_input_lines, new_target_lines), each the same length as its input.
  """
  def _mask_rare_tokens(side_lines):
    # count tokens -> keep the frequent ones -> rewrite the lines
    counts = get_vocab_counts(side_lines)
    frequent = get_vocab_to_filter(side_lines, counts, MIN_OCCURENCES)
    return get_oov_filtered_lines(side_lines, frequent)

  return _mask_rare_tokens(input_lines), _mask_rare_tokens(target_lines)

In [12]:
# Replace rare tokens with 'unk' on both sides; inputs are the raw (uncleaned) sentence lists.
lines_eng, lines_esp = get_updated_dataset(sentences_eng, sentences_esp)

### geometric evaluations (BLEU score)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction


chencherry_smoother = SmoothingFunction()
def get_line_bleu_score(candidate, reference):
  """Score one candidate against a single reference with smoothed sentence-level BLEU.

  The candidate is converted with list(); `reference` is wrapped in a
  one-element list of references. Smoothing uses `method1` of the shared
  SmoothingFunction instance.

  NOTE(review): if `candidate` is a str, list() yields individual characters
  rather than words -- confirm callers pass token sequences.
  """
  hypothesis = list(candidate)
  references = [reference]
  return sentence_bleu(references, hypothesis, smoothing_function=chencherry_smoother.method1)


### giving it a try

In [None]:
# Translation direction: English is the model input, Spanish is the target.
lines_input = lines_eng
lines_target = lines_esp

In [None]:

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only the t5-base configuration and tokenizer are fetched with from_pretrained.
model_config = T5Config.from_pretrained('t5-base')
tokenizer = AutoTokenizer.from_pretrained('t5-base')
# NOTE(review): the model is built from the config alone, so no pretrained weights are loaded
# (it starts randomly initialised). Use T5ForConditionalGeneration.from_pretrained('t5-base')
# if fine-tuning was intended -- confirm.
model = T5ForConditionalGeneration(config=model_config)
model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from keras_preprocessing.sequence import pad_sequences


# NOTE(review): not referenced by the visible code (tokenize_texts defaults to max_length=512) -- confirm intent.
MAX_LEN = 124


def tokenize_texts(texts, labels, max_length=512, batch_size=32, test_size=0.2, shuffle=True, random_state=42):
    """Tokenize `texts`, hold out a test split, and yield training mini-batches.

    Relies on the module-level `tokenizer` created in an earlier cell.

    Args:
        texts: list of input strings to encode.
        labels: sequence aligned with `texts`; it is split and batched as-is
            (it is NOT tokenized here).
        max_length: every sequence is padded / truncated to this many tokens.
        batch_size: number of examples per yielded batch.
        test_size, shuffle, random_state: forwarded to train_test_split.

    Yields:
        (batch_input_ids, batch_labels) for consecutive slices of the training
        split; the final batch may be smaller than `batch_size`.

    NOTE(review): the held-out split is computed but not exposed to the caller.
    """
    # Tokenize the texts
    encoding = tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=False,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"]

    # Split the input ids and labels into train and test sets
    input_ids_train, input_ids_test, labels_train, labels_test = train_test_split(
        input_ids,
        labels,
        test_size=test_size,
        shuffle=shuffle,
        random_state=random_state
    )

    # Hand out the training split one mini-batch at a time. The original last
    # line was an over-indented bare tuple (a syntax error); `yield` restores
    # the evident intent of a batch generator.
    for start in range(0, len(input_ids_train), batch_size):
        stop = start + batch_size
        yield input_ids_train[start:stop], labels_train[start:stop]



def get_attention_masks(ids, max_length=512, batch_size=32):
  """Yield attention masks for `ids`, one mini-batch at a time.

  A position gets 1.0 when its token id is greater than 0 and 0.0 otherwise
  (assumes id 0 is the padding id -- TODO confirm against the tokenizer).

  Args:
    ids: sequence of token-id sequences.
    max_length: unused; kept for signature compatibility.
    batch_size: number of sequences covered by each yielded batch.

  Yields:
    A list with one mask (list of floats) per sequence in the batch; the
    final batch may hold fewer than `batch_size` sequences.
  """
  # The previous version iterated over the integer produced by range(), which
  # raised TypeError; slice out the actual batch of sequences instead.
  for start in range(0, len(ids), batch_size):
    batch = ids[start:start + batch_size]
    yield [[float(token_id > 0) for token_id in seq] for seq in batch]


def get_num_pair_sentences(lines, num_pair=2):
  """Group consecutive lines into chunks of four.

  Args:
    lines: sequence of sentences.
    num_pair: currently unused -- the group size is fixed at 4, as before.
      NOTE(review): confirm whether this was meant to control the group size.

  Returns:
    list of lists; every group holds 4 consecutive lines except possibly the
    last, which holds the remaining trailing lines.
  """
  group_size = 4
  # Slicing handles the short final group directly. The earlier version relied
  # on catching an IndexError and then filled the last group with lines from
  # the START of the list instead of the leftover tail.
  return [
      list(lines[start:start + group_size])
      for start in range(0, len(lines), group_size)
  ]


In [None]:
# NOTE(review): `input_ids` and `target_ids` are not defined at notebook level in the visible cells
# (hidden kernel state?). test_size=.9 sends 90% of the pairs to the TEST split -- confirm that is intended.
train_input, test_input, train_target, test_target = train_test_split(input_ids, target_ids, random_state=42, test_size=.9)

In [None]:
# Same seed and test_size as the input/target split above, applied to the attention masks.
# NOTE(review): `attention_masks` is not defined at notebook level in the visible cells.
train_masks, test_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=.9)

TypeError: ignored

In [None]:
# Convert each train/test split (input ids, target ids, attention masks) into a torch tensor.
train_input_tensor = torch.tensor(train_input)
test_input_tensor = torch.tensor(test_input)

train_target_tensor = torch.tensor(train_target)
test_target_tensor = torch.tensor(test_target)

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [None]:
# Each dataset item is the triple (input ids, attention mask, target ids); batches hold 8 items.
train_data = TensorDataset(train_input_tensor, train_masks_tensor, train_target_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

# NOTE(review): the test loader also samples in random order; SequentialSampler (already imported)
# would make evaluation order deterministic.
test_data = TensorDataset(test_input_tensor, test_masks_tensor, test_target_tensor)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

In [None]:
# {k: v for k, v in model.named_parameters()}

In [None]:
# preprocess the input and target text
prefix = "translate English to Spanish: "
optimizer_counter = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
      'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.0
    }
]

In [None]:
# Earlier alternative, kept for reference:
# optimizer = transformers.BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)
# Rebinds `optimizer` (replacing the one created in the previous cell) using the grouped parameters.
optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=2e-5)

# Number of passes over the training data.
epochs = 4

# Total number of optimizer steps = number of batches * number of epochs.
# `train_dataloader` yields batches, so `len(train_dataloader)` is the
# number of batches per epoch.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule with no warm-up phase, spanning `total_steps` steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # no warm-up steps
                                            num_training_steps = total_steps)

def flat_accuracy(preds, labels):
  """Fraction of positions where the highest-scoring class equals the label.

  The class axis is taken to be the LAST axis of `preds`, so the function
  works both for (batch, classes) scores and for (batch, seq_len, vocab)
  sequence logits; for 2-D input this matches the previous axis=1 behaviour.

  Args:
    preds: numpy array of scores with the class dimension last.
    labels: numpy array of integer labels whose shape equals preds.shape[:-1].

  Returns:
    Accuracy in [0, 1] computed over all flattened positions.
  """
  pred_flat = np.argmax(preds, axis=-1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
t = []
train_loss_set = []   # loss of every training step, across all epochs
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---- Training ----

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; passing labels makes the model return the training loss
    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs['loss']
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update the learning rate.
    scheduler.step()

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # ---- Validation ----

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass. An encoder-decoder model needs decoder inputs; passing the
      # labels lets it derive them (and also return the loss). Calling it with
      # only input ids and a mask, as before, raises an error for T5.
      logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

    # Accumulate the validation loss (the tracker was initialised but never updated)
    eval_loss += logits['loss'].item()

    # Move logits and labels to CPU
    logits = logits['logits'].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
