
# **Install libraries**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets tqdm pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [6]:
# Check we have a GPU and check the memory size of the GPU
!nvidia-smi

Tue Apr 25 15:08:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Import packages**

In [7]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Set a seed**

In [8]:
import random
import numpy as np
import torch
import datasets

In [9]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

set_seed(42)



```
# This is formatted as code
```

# ***CLANG-8 dataset***

In [10]:
pd.set_option('display.max_colwidth', None)

In [13]:
df = pd.read_table('/content/drive/MyDrive/c4_200m/clang8_source_target_en.spacy_tokenized.tsv', error_bad_lines=False)
df.columns = ["input", "output"]
df = df.dropna()
df = df[0:100000]



  df = pd.read_table('/content/drive/MyDrive/c4_200m/clang8_source_target_en.spacy_tokenized.tsv', error_bad_lines=False)
b'Skipping line 4958: expected 2 fields, saw 3\nSkipping line 16194: expected 2 fields, saw 3\nSkipping line 25235: expected 2 fields, saw 3\nSkipping line 25710: expected 2 fields, saw 3\nSkipping line 26283: expected 2 fields, saw 3\nSkipping line 27115: expected 2 fields, saw 3\nSkipping line 29809: expected 2 fields, saw 3\nSkipping line 47655: expected 2 fields, saw 3\nSkipping line 55291: expected 2 fields, saw 3\nSkipping line 55946: expected 2 fields, saw 3\nSkipping line 60210: expected 2 fields, saw 3\nSkipping line 64424: expected 2 fields, saw 3\nSkipping line 66795: expected 2 fields, saw 3\nSkipping line 69214: expected 2 fields, saw 3\nSkipping line 81244: expected 2 fields, saw 3\nSkipping line 89459: expected 2 fields, saw 3\nSkipping line 89813: expected 2 fields, saw 3\nSkipping line 97182: expected 2 fields, saw 3\nSkipping line 100802: expecte

In [14]:
print(df.isna().sum())
df.head()
print(df.shape)

input     0
output    0
dtype: int64
(100000, 2)


In [15]:
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer, 
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
  )

from torch.utils.data import Dataset, DataLoader

In [16]:
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [17]:
def calc_token_len(example):
    return len(tokenizer(example).input_ids)

In [18]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.10, shuffle=True)
train_df.shape, test_df.shape

((90000, 2), (10000, 2))

In [19]:
test_df['input_token_len'] = test_df['input'].apply(calc_token_len)

In [20]:
test_df.head()

Unnamed: 0,input,output,input_token_len
75796,Why Russians are difficult to understand English speech,Why Russians find it difficult to understand English speech,10
80260,I 'm going to have a dinner with my friend and I 'm late .,I 'm going to have dinner with my friend and I 'm late .,22
19880,"Finally , I decided to go a small busty salon because it opens till 11 pm .","Finally , I decided to go to a small busy salon because it opens till 11 pm .",23
76774,My letter,My letter,3
93088,This site is great for us to study foreigne languages .,This site is great for us to study foreign languages .,14


In [21]:
test_df['input_token_len'].describe()

count    10000.000000
mean        15.527900
std          8.995005
min          3.000000
25%         10.000000
50%         14.000000
75%         19.000000
max        185.000000
Name: input_token_len, dtype: float64

### We will use a token length of 64 since it will cover the vast majority of examples

In [22]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [23]:
test_dataset

Dataset({
    features: ['input', 'output', 'input_token_len', '__index_level_0__'],
    num_rows: 10000
})

### Load the Dataset

In [24]:
from torch.utils.data import Dataset, DataLoader
class GrammarDataset(Dataset):
    def __init__(self, dataset, tokenizer,print_text=False):         
        self.dataset = dataset
        self.pad_to_max_length = False
        self.tokenizer = tokenizer
        self.print_text = print_text
        self.max_len = 64
  
    def __len__(self):
        return len(self.dataset)


    def tokenize_data(self, example):
        input_, target_ = example['input'], example['output']

        # tokenize inputs
        tokenized_inputs = tokenizer(input_, pad_to_max_length=self.pad_to_max_length, 
                                            max_length=self.max_len,
                                            return_attention_mask=True)
    
        tokenized_targets = tokenizer(target_, pad_to_max_length=self.pad_to_max_length, 
                                            max_length=self.max_len,
                                            return_attention_mask=True)

        inputs={"input_ids": tokenized_inputs['input_ids'],
            "attention_mask": tokenized_inputs['attention_mask'],
            "labels": tokenized_targets['input_ids']
        }
        
        return inputs

  
    def __getitem__(self, index):
        inputs = self.tokenize_data(self.dataset[index])
        
        if self.print_text:
            for k in inputs.keys():
                print(k, len(inputs[k]))

        return inputs

In [25]:
dataset = GrammarDataset(test_dataset, tokenizer, True)
print(dataset[121])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


input_ids 13
attention_mask 13
labels 12
{'input_ids': [8, 625, 1554, 130, 1416, 600, 49, 145, 490, 589, 3, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [37, 625, 1554, 1416, 4038, 145, 8, 490, 589, 3, 5, 1]}


### Define Evaluator

In [26]:
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
from datasets import load_metric
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")


### Train Model

In [28]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

In [29]:
# defining training related arguments
batch_size = 16
args = Seq2SeqTrainingArguments(output_dir="./weights",
                        evaluation_strategy="steps",
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        learning_rate=2e-5,
                        num_train_epochs=1,
                        weight_decay=0.01,
                        save_total_limit=2,
                        predict_with_generate=True,
                        # fp16 = True,
                        gradient_accumulation_steps = 6,
                        eval_steps = 500,
                        save_steps = 500,
                        load_best_model_at_end=True,
                        logging_dir="/logs")

In [30]:
import nltk
nltk.download('punkt')
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
# defining trainer using 🤗
trainer = Seq2SeqTrainer(model=model, 
                args=args, 
                train_dataset= GrammarDataset(train_dataset, tokenizer),
                eval_dataset=GrammarDataset(test_dataset, tokenizer),
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics)

In [32]:
os.environ["WANDB_DISABLED"] = "true"

In [33]:
trainer.train()



Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,0.3457,0.232596,92.3262,86.1093,91.9855,91.9803,13.5108


TrainOutput(global_step=937, training_loss=0.3071673463477396, metrics={'train_runtime': 2201.6147, 'train_samples_per_second': 40.879, 'train_steps_per_second': 0.426, 'total_flos': 3761875428802560.0, 'train_loss': 0.3071673463477396, 'epoch': 1.0})

In [34]:
trainer.save_model('t5_gec_model_clang')

In [35]:
!zip -r 't5_gec_model_clang.zip' 't5_gec_model_clang'

  adding: t5_gec_model_clang/ (stored 0%)
  adding: t5_gec_model_clang/config.json (deflated 62%)
  adding: t5_gec_model_clang/training_args.bin (deflated 49%)
  adding: t5_gec_model_clang/pytorch_model.bin (deflated 9%)
  adding: t5_gec_model_clang/special_tokens_map.json (deflated 86%)
  adding: t5_gec_model_clang/tokenizer_config.json (deflated 83%)
  adding: t5_gec_model_clang/spiece.model (deflated 48%)
  adding: t5_gec_model_clang/generation_config.json (deflated 29%)


In [36]:
!mv t5_gec_model_clang.zip /content/drive/MyDrive/c4_200m

I have uploaded this model to HuggingFace Model Zoo and we can run inference using it

## Testing

In [39]:
# import torch
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# model_name = 'deep-learning-analytics/GrammarCorrector'
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def correct_grammar(input_text,num_return_sequences):
  batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=64, return_tensors="pt").to('cuda')
  translated = model.generate(**batch,max_length=64,num_beams=4, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [42]:
text = 'He are moving here.'
print(correct_grammar(text, num_return_sequences=3))

['He are moving here.', 'He is moving here.', "He's moving here."]


In [45]:
text = 'Cat drinked milk'
print(correct_grammar(text, num_return_sequences=4))

['Cat ate milk.', 'Cats drinked milk', 'Cat had a drink of milk', 'Cat drank milk.']
