# Setup

In [1]:
import logging

import pandas as pd
from simpletransformers.t5 import T5Model, T5Args

from sklearn.model_selection import train_test_split

# Keep the Hugging Face `transformers` logger quiet: warnings and above only.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Everything else reports at INFO so training progress stays visible.
logging.basicConfig(level=logging.INFO)

# Data Preparation

In [2]:
# Load the passive/active sentence pairs and shape them into the
# (prefix, input_text, target_text) layout used for T5 training below.
# NOTE(review): despite the .tsv extension the file is split on "."; the
# throw-away "empty" column presumably comes from a trailing period on each
# line -- confirm against the data file.
data_df = (
    pd.read_csv(
        "./data/PassiveDataWidderTSV.tsv",
        sep=".",
        names=["input_text", "target_text", "empty"],
    )
    .drop(columns="empty")
)

# Every row belongs to the same task, so the prefix column is one constant.
data_df.insert(0, "prefix", "en_passive_to_active")

data_df.head()

Unnamed: 0,prefix,input_text,target_text
0,en_passive_to_active,the curious penguin was eaten by the curious c...,the curious chicken near the playground ...
1,en_passive_to_active,a big penguin was complimented by a cow in the...,a cow in the stadium complimented a bi...
2,en_passive_to_active,a goofy rabbit was stalked by a goofy fish at ...,a goofy fish at a barn stalked a goofy rabbit
3,en_passive_to_active,a big fish by a field was hated by the happy pig,the happy pig hated a big fish by a field
4,en_passive_to_active,the cow was hated by a teacher,a teacher hated the cow


# Data Splitting

In [None]:
# Hold out 5% of the rows for evaluation. The fixed random_state keeps the
# split identical across kernel restarts, so evaluation numbers are comparable
# from run to run.
train_df, eval_df = train_test_split(data_df, test_size=0.05, random_state=42)

In [None]:
# Size of the training split (rows, columns) -- sanity check after splitting.
train_df.shape # training dataset

In [None]:
# Size of the held-out evaluation split (rows, columns).
eval_df.shape # evaluation dataset

# Model Training

In [None]:
# Configure the model
model_args = T5Args()

# Seed the random number generators so repeated runs of this cell give
# comparable training and evaluation results.
model_args.manual_seed = 42

# Work around the PyTorch "Bad file descriptor" issue by keeping data
# processing in a single process.
# See this for more info: https://github.com/ThilinaRajapakse/simpletransformers/issues/789
model_args.process_count = 1
model_args.use_multiprocessing = False
model_args.dataloader_num_workers = 1

# Basic model params
model_args.num_train_epochs = 200
model_args.no_save = False
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True

# Evaluate during training
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = False

# Don't save models every checkpoint
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = False

# Create T5 Model (starts from the pretrained "t5-small" checkpoint, CPU only)
model = T5Model("t5-small", args=model_args, use_cuda=False)

# Train T5 Model on the new task, evaluating on the held-out split as it goes
model.train_model(train_df, eval_data=eval_df)

# Model Evaluation

In [None]:
# Evaluate T5 Model on new task
results = model.eval_model(eval_df)

# Show the metrics as the cell output instead of leaving them only in a variable.
results

# Model Prediction

In [None]:
# Try the fine-tuned model on one hand-written passive sentence. The input
# carries the same task prefix that was attached to the training rows.
sample_inputs = ["en_passive_to_active: our turkey was eaten by the dog"]
print(model.predict(sample_inputs))