In [1]:
!pip install simpletransformers -q

[K     |████████████████████████████████| 231 kB 4.7 MB/s 
[K     |████████████████████████████████| 8.3 MB 20.9 MB/s 
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
[K     |████████████████████████████████| 290 kB 42.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 30.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 35.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 38.3 MB/s 
[K     |████████████████████████████████| 3.1 MB 12.3 MB/s 
[K     |████████████████████████████████| 56 kB 4.1 MB/s 
[K     |████████████████████████████████| 895 kB 32.1 MB/s 
[K     |████████████████████████████████| 596 kB 42.4 MB/s 
[K     |████████████████████████████████| 139 kB 47.7 MB/s 
[K     |████████████████████████████████| 97 kB 6.6 MB/s 
[K     |████████████████████████████████| 180 kB 49.2 MB/s 
[K     |████████████████████████████████| 63 kB 1.4 MB/s 
[K     |████████████████████████████████| 243 kB 53.8 MB/s 
[K     |████████████████████████

In [2]:
import logging
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args

In [4]:
# Mount Google drive to upload datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# The path to the data on my drive
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [8]:
# Set up logging 
logging.basicConfig(level=logging.INFO)
mt5_logger = logging.getLogger("transformers")
mt5_logger.setLevel(logging.WARNING)

In [9]:
# Load the training and evaluation datasets
# Cast each to a string since MT5 is a sequence to sequence model which expects inputs and outputs to be strings of text
train_df = pd.read_csv(D+"/train/en_bn_te.csv").astype(str)
eval_df = pd.read_csv(D+"/dev/en_bn_te.csv").astype(str)

In [10]:
# Rename the source column so that it matches the expected field name
train_df.rename(columns={'source_text': 'input_text'}, inplace=True)
train_df.drop('Unnamed: 0', axis=1, inplace=True)

In [11]:
# Rename the source column so that it matches the expected field name
eval_df.rename(columns={'source_text': 'input_text'}, inplace=True)
eval_df.drop('Unnamed: 0', axis=1, inplace=True)

In [14]:
# Set the training parameters for the MT5 model
model_args = T5Args()
model_args.max_seq_length = 196
# Pick a low batch number for memory for training and testing
model_args.train_batch_size = 8
model_args.eval_batch_size = 8
model_args.num_train_epochs = 1
model_args.evaluate_during_training = False
model_args.use_multiprocessing = False
model_args.fp16 = False
model_args.save_steps = -1
model_args.save_eval_checkpoints = False
model_args_save_model_every_epoch = False
model_args.no_cache = True
model_args_reprocess_input_data = True
model_args_overwrite_output_dir = True
model_args.num_return_sequences = 1
model_args.wandb_project = "MT5-multiple-translator"

In [15]:
# Create an instance of the model
model = T5Model("mt5", "google/mt5-small", args=model_args)

DEBUG:filelock:Attempting to acquire lock 140322426916688 on /root/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4.lock
DEBUG:filelock:Lock 140322426916688 acquired on /root/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4.lock


Downloading:   0%|          | 0.00/553 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140322426916688 on /root/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4.lock
DEBUG:filelock:Lock 140322426916688 released on /root/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4.lock
DEBUG:filelock:Attempting to acquire lock 140322428074768 on /root/.cache/huggingface/transformers/8e7b2a80ddcb5611b27d8c89e1e8e33a947e105415051402a22b9c8d7d1caeb0.e22331f3a065b885b30ae3dd1ff11ccaf7fbc444485f6eb07ef5e0138bca8b70.lock
DEBUG:filelock:Lock 140322428074768 acquired on /root/.cache/huggingface/transformers/8e7b2a80ddcb5611b27d8c89e1e8e33a947e105415051402a22b9c8d7d1caeb0.e22331f3a065b885b30ae3dd1ff11ccaf7fbc444485f6eb07ef5e0138bca8b70.lock


Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140322428074768 on /root/.cache/huggingface/transformers/8e7b2a80ddcb5611b27d8c89e1e8e33a947e105415051402a22b9c8d7d1caeb0.e22331f3a065b885b30ae3dd1ff11ccaf7fbc444485f6eb07ef5e0138bca8b70.lock
DEBUG:filelock:Lock 140322428074768 released on /root/.cache/huggingface/transformers/8e7b2a80ddcb5611b27d8c89e1e8e33a947e105415051402a22b9c8d7d1caeb0.e22331f3a065b885b30ae3dd1ff11ccaf7fbc444485f6eb07ef5e0138bca8b70.lock
DEBUG:filelock:Attempting to acquire lock 140322413014032 on /root/.cache/huggingface/transformers/37d0f67f084f8c5fc5589e0bba5ff3c6307af833bb0b7f4eb33fbfd8d4038a9d.84ea7af2df68dc8db434d3160aab65cce8ac63ce5b6f7743f8c9a4a14b4f77e2.lock
DEBUG:filelock:Lock 140322413014032 acquired on /root/.cache/huggingface/transformers/37d0f67f084f8c5fc5589e0bba5ff3c6307af833bb0b7f4eb33fbfd8d4038a9d.84ea7af2df68dc8db434d3160aab65cce8ac63ce5b6f7743f8c9a4a14b4f77e2.lock


Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140322413014032 on /root/.cache/huggingface/transformers/37d0f67f084f8c5fc5589e0bba5ff3c6307af833bb0b7f4eb33fbfd8d4038a9d.84ea7af2df68dc8db434d3160aab65cce8ac63ce5b6f7743f8c9a4a14b4f77e2.lock
DEBUG:filelock:Lock 140322413014032 released on /root/.cache/huggingface/transformers/37d0f67f084f8c5fc5589e0bba5ff3c6307af833bb0b7f4eb33fbfd8d4038a9d.84ea7af2df68dc8db434d3160aab65cce8ac63ce5b6f7743f8c9a4a14b4f77e2.lock
DEBUG:filelock:Attempting to acquire lock 140322413013072 on /root/.cache/huggingface/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276.lock
DEBUG:filelock:Lock 140322413013072 acquired on /root/.cache/huggingface/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276.lock


Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140322413013072 on /root/.cache/huggingface/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276.lock
DEBUG:filelock:Lock 140322413013072 released on /root/.cache/huggingface/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276.lock
DEBUG:filelock:Attempting to acquire lock 140322413153872 on /root/.cache/huggingface/transformers/6a9e52d6dd21568e37b65fc180ada927968e8f7124f0acd6efcaf90cd2e0f4bb.4b81e5d952ad810ca1de2b3e362b9a26a5cc77b4b75daf20caf69fb838751c32.lock
DEBUG:filelock:Lock 140322413153872 acquired on /root/.cache/huggingface/transformers/6a9e52d6dd21568e37b65fc180ada927968e8f7124f0acd6efcaf90cd2e0f4bb.4b81e5d952ad810ca1de2b3e362b9a26a5cc77b4b75daf20caf69fb838751c32.lock


Downloading:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140322413153872 on /root/.cache/huggingface/transformers/6a9e52d6dd21568e37b65fc180ada927968e8f7124f0acd6efcaf90cd2e0f4bb.4b81e5d952ad810ca1de2b3e362b9a26a5cc77b4b75daf20caf69fb838751c32.lock
DEBUG:filelock:Lock 140322413153872 released on /root/.cache/huggingface/transformers/6a9e52d6dd21568e37b65fc180ada927968e8f7124f0acd6efcaf90cd2e0f4bb.4b81e5d952ad810ca1de2b3e362b9a26a5cc77b4b75daf20caf69fb838751c32.lock


In [16]:
# Train the model
model.train_model(train_df, eval_data=eval_df)

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/56686 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

INFO:simpletransformers.t5.t5_model: Training started


Using Adafactor for T5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 1:   0%|          | 0/7086 [00:00<?, ?it/s]

INFO:simpletransformers.t5.t5_model: Training of google/mt5-small model complete. Saved to outputs/.


(7086, 3.4033266836022915)

In [17]:
# Import PyTorch to save the model
import torch

In [21]:
torch.save(model, '/content/drive/My Drive/W266_Project_Data/models/MT5/checkpoints/mycheckpoint/mt5_indic.pt')

In [22]:
# Get the evaluation data for testing
to_predict = [prefix + ": " + str(input_text) for prefix, input_text in zip(eval_df['prefix'].tolist(), eval_df['input_text'].tolist())]

# Get the actual translations and tasks
truth = eval_df["target_text"].tolist()
tasks = eval_df["prefix"].tolist()

In [24]:
# Get the model predicted values
preds = model.predict(to_predict)

Generating outputs:   0%|          | 0/250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/2000 [00:00<?, ?it/s]



In [28]:
eval_df.columns

Index(['prefix', 'input_text', 'target_text'], dtype='object')

In [33]:
# Create a file of the translation task, actual text, and predicted text
df = pd.DataFrame({'prefix': eval_df.prefix.tolist(), 
                   'target_text': eval_df.target_text.tolist(), 'predicted_text': preds})

In [36]:
# Save the translations to a csv file
df.to_csv(D+'/predicted_text/english_to_te_and_bn.csv')