In [1]:
pip install happytransformer



In [2]:
from happytransformer import HappyTextToText

# Model family and checkpoint used throughout the notebook.
MODEL_TYPE = "T5"
MODEL_NAME = "t5-base"

happy_tt = HappyTextToText(MODEL_TYPE, MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
import pandas as pd
from datasets import load_dataset
import datasets

In [4]:
# Both splits come from the same dataset; JFLEG's "validation" split is used
# for training here and its "test" split for evaluation.
DATASET_NAME = "jfleg"

train_dataset = load_dataset(DATASET_NAME, split="validation[:]")
eval_dataset = load_dataset(DATASET_NAME, split="test[:]")

In [5]:
# Display the training split's summary (its features and row count).
train_dataset

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 755
})

In [7]:
import csv

def generate_csv(csv_path, dataset, prefix="grammar: "):
    """Write (input, target) pairs from ``dataset`` to a CSV file.

    The file starts with an ``input,target`` header row. Each case then
    contributes one row per non-empty correction, where the input is the
    case's sentence with ``prefix`` prepended. Cases whose sentence is empty
    are skipped entirely.

    Args:
        csv_path: Path of the CSV file to create (overwritten if it exists).
        dataset: Iterable of mappings, each with a ``"sentence"`` string and
            a ``"corrections"`` iterable of strings.
        prefix: Task prefix prepended to every input sentence.
    """
    # Pin the encoding so the output does not depend on the platform default.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            sentence = case["sentence"]
            # A few of the cases contain blank strings. Test the raw sentence:
            # once the prefix is prepended the input is always truthy.
            if not sentence:
                continue
            # Adding the task's prefix to input
            input_text = prefix + sentence
            for correction in case["corrections"]:
                if correction:
                    writer.writerow([input_text, correction])



# Write each split to the CSV file that the later train/eval cells read.
for csv_name, split in (("train.csv", train_dataset), ("eval.csv", eval_dataset)):
    generate_csv(csv_name, split)

In [8]:
# Baseline: evaluate the model on eval.csv before any fine-tuning.
before_result = happy_tt.eval("eval.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2988 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
# Report the baseline loss measured before fine-tuning.
print(f"Before loss: {before_result.loss}")

Before loss: 1.2803919315338135


In [10]:
from happytransformer import TTTrainArgs

# Training hyperparameters: a single pass over train.csv in batches of 8.
args = TTTrainArgs(
    num_train_epochs=1,
    batch_size=8,
)
happy_tt.train("train.csv", args=args)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2714 [00:00<?, ? examples/s]

Map:   0%|          | 0/302 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
1,1.3217,1.138514
34,0.834,0.685736
68,0.7519,0.577079
102,0.6762,0.543681
136,0.6373,0.523778
170,0.64,0.51312
204,0.6128,0.503484
238,0.6141,0.498517
272,0.5694,0.495701
306,0.5508,0.493465


In [11]:
# Re-evaluate on the same eval.csv after fine-tuning so the loss can be
# compared with the baseline. The value is an evaluation result measured
# after training, hence the name.
after_result = happy_tt.eval("eval.csv")

print("After loss: ", after_result.loss)

Map:   0%|          | 0/2988 [00:00<?, ? examples/s]

After loss:  0.47985807061195374


In [12]:
from happytransformer import TTSettings

# Generation settings used for the examples below: 5 beams, outputs
# constrained to between 1 and 20 in length.
beam_settings = TTSettings(
    max_length=20,
    min_length=1,
    num_beams=5,
)

In [22]:
# The input carries the same "grammar: " task prefix used when building the
# training CSV.
example_1 = "grammar: I has an problem, dont I"
result_1 = happy_tt.generate_text(example_1, args=beam_settings)
print(result_1.text)

I have a problem, don't I?


In [16]:
# Second example: misspelled and transposed words, again with the
# "grammar: " task prefix.
example_2 = "grammar: hi mu name si John."

result_2 = happy_tt.generate_text(example_2, args=beam_settings)
print(result_2.text)

Hi, my name is John.


In [15]:
# Persist the fine-tuned model under the relative directory "model/".
happy_tt.save("model/")