In [2]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install pyspellchecker datasets evaluate transformers[torch] accelerate -U



In [9]:
import evaluate
import numpy as np
import torch
import spacy
import pandas as pd
from spellchecker import SpellChecker
from spacy import glossary
from copy import deepcopy
from torch import nn
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset, load_metric
from transformers import BartForConditionalGeneration, BartTokenizer, get_scheduler, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
from transformers.models.bart.modeling_bart import shift_tokens_right
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers.models.bart.modeling_bart import BartEncoder, BartDecoder


# Preprocessing

In [10]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# code taken from https://www.cl.cam.ac.uk/research/nl/bea2019st/data/corr_from_m2.py
def m2_to_df(m2, id):
    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}
    ori_sentences = []
    corrected_sentences = []
    for sent in m2:
        sent = sent.split("\n")
        ori_sent = sent[0].split()[1:] # Ignore "S "
        cor_sent = ori_sent.copy()
        edits = sent[1:]
        offset = 0
        for edit in edits:
            edit = edit.split("|||")
            if edit[1] in skip: continue # Ignore certain edits
            coder = int(edit[-1])
            if coder != id: continue # Ignore other coders
            span = edit[0].split()[1:] # Ignore "A "
            start = int(span[0])
            end = int(span[1])
            cor = edit[2].split()
            cor_sent[start+offset:end+offset] = cor
            offset = offset-(end-start)+len(cor)
        ori_sentences.append(" ".join(ori_sent))
        corrected_sentences.append(" ".join(cor_sent))
    df = pd.DataFrame(list(zip(ori_sentences, corrected_sentences)),columns =['original', 'corrected'])
    return df

In [12]:
with open("/content/drive/MyDrive/CS4248NLP/wi+locness/m2/ABC.train.gold.bea19.m2", encoding="utf-8") as f:
    m2_train = f.read().strip().split("\n\n")
    train_df = m2_to_df(m2_train, 0)

with open("/content/drive/MyDrive/CS4248NLP/wi+locness/m2/ABCN.dev.gold.bea19.m2", encoding="utf-8") as f:
    m2_train = f.read().strip().split("\n\n")
    validation_df = m2_to_df(m2_train, 0)

In [13]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
model_dir = "/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple"
batch_size = 64

In [18]:
display(train_df.head())
display(validation_df.head())

Unnamed: 0,original,corrected
0,My town is a medium size city with eighty thou...,My town is a medium - sized city with eighty t...
1,It has a high density population because its s...,It has a high - density population because of ...
2,"Despite of it is an industrial city , there ar...","Although it is an industrial city , there are ..."
3,I recommend visiting the artificial lake in th...,I recommend visiting the artificial lake in th...
4,Pasteries are very common and most of them off...,Pasteries are very common and most of them off...


Unnamed: 0,original,corrected
0,"It 's difficult answer at the question "" what ...","It 's difficult to answer the question "" what ..."
1,When I was younger I used to say that I wanted...,"When I was younger , I used to say that I want..."
2,I would like to study Psychology because one d...,"I would like to study Psychology , because one..."
3,It 's difficult because I 'll have to study ha...,It 's difficult because I 'll have to study ha...
4,"Maybe I 'll change my mind , maybe not .","Maybe I 'll change my mind , maybe not ."


In [15]:
train_data = Dataset.from_pandas(train_df).shuffle(seed=1)
val_data = Dataset.from_pandas(validation_df)

print(train_data)
print(val_data)

Dataset({
    features: ['original', 'corrected'],
    num_rows: 34308
})
Dataset({
    features: ['original', 'corrected'],
    num_rows: 4384
})


In [16]:
max_input_length=64
max_target_length=64

def preprocess_function(batch):
    model_inputs = tokenizer(batch["original"], padding='max_length', max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=batch["corrected"], padding='max_length', max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:
train_data = train_data.map(
    preprocess_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original", "corrected"]
)

val_data = val_data.map(
    preprocess_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original", "corrected"]
)

Map:   0%|          | 0/34308 [00:00<?, ? examples/s]

Map:   0%|          | 0/4384 [00:00<?, ? examples/s]

# Model

In [None]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", forced_bos_token_id=0)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
import numpy as np
google_bleu = evaluate.load("google_bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = google_bleu.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    generation_max_length=64
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Google Bleu,Gen Len
1,0.1174,0.095302,0.8339,23.6004
2,0.0948,0.089449,0.8424,23.6702
3,0.0831,0.086148,0.8434,23.6109
4,0.075,0.086525,0.8452,23.6211
5,0.067,0.085445,0.846,23.6065
6,0.0625,0.086983,0.8478,23.6186
7,0.0571,0.087248,0.8486,23.6364
8,0.0547,0.087693,0.848,23.6348
9,0.0523,0.089411,0.847,23.6115
10,0.0505,0.088753,0.8474,23.6323


TrainOutput(global_step=5370, training_loss=0.16162829079441518, metrics={'train_runtime': 5187.0034, 'train_samples_per_second': 66.142, 'train_steps_per_second': 1.035, 'total_flos': 1.30742740058112e+16, 'train_loss': 0.16162829079441518, 'epoch': 10.0})

In [None]:
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir + "/tokenizer")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple/tokenizer/vocab.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple/tokenizer/merges.txt',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_simple/tokenizer/tokenizer.json')

In [None]:
trainer.predict(val_data)

PredictionOutput(predictions=array([[    2,     0,   250, ...,     1,     1,     1],
       [    2,     0, 46805, ...,    71, 11323,     2],
       [    2,     0,   100, ...,     1,     1,     1],
       ...,
       [    2,     0,   100, ...,  -100,  -100,  -100],
       [    2,     0,   970, ...,  -100,  -100,  -100],
       [    2,     0,   100, ...,  -100,  -100,  -100]]), label_ids=array([[    0,   250,  1498, ...,     1,     1,     1],
       [    0, 46805,  2156, ..., 11323,  1949,     2],
       [    0,   100,   300, ...,     1,     1,     1],
       ...,
       [    0,   100,   524, ...,     1,     1,     1],
       [    0,   970,   128, ...,     1,     1,     1],
       [    0,   100,   679, ...,     1,     1,     1]]), metrics={'test_loss': 0.08875328302383423, 'test_google_bleu': 0.8474, 'test_gen_len': 23.6323, 'test_runtime': 319.0753, 'test_samples_per_second': 13.74, 'test_steps_per_second': 0.216})

# Inference

In [19]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir + "/tokenizer")

In [20]:
def preprocess_inference_function(batch):
    max_input_length=64
    max_target_length=64

    model_inputs = tokenizer(batch["original"], padding='max_length', max_length=max_input_length, truncation=True)

    return model_inputs

In [21]:
text = {"original": ["The boys goes to schol .", "I likes it"]}
# text = {"original": ["Thank you"]}
text_df = pd.DataFrame.from_dict(text)
display(text_df)

inference_data = Dataset.from_pandas(text_df)

inference_data = inference_data.map(
    preprocess_inference_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original"]
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    generation_max_length=64
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Unnamed: 0,original
0,The boys goes to schol .
1,I likes it


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
outputs = trainer.predict(inference_data)
print(outputs)
tokenizer.decode(outputs.predictions[0], skip_special_tokens=True)

PredictionOutput(predictions=array([[   2,    0,  133, 2786,  213,    7,  334,  479,    2,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1],
       [   2,    0,  100, 6640,   24,  479,    2,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1]]), label_ids=None, metrics={'test_runtime': 4.0027, 'test_samples_per_second': 0.5, 'test_steps_per_second': 0.25})


'The boys go to school.'

# Dev

In [23]:
outputs = trainer.predict(val_data)

In [1]:
dev_output_path = "/content/drive/MyDrive/CS4248NLP/dev_dataset_results/bart"

In [24]:
with open(dev_output_path + "/bart_simple_dev_output.txt", mode="w", encoding="utf-8") as file:
    for tokens in outputs.predictions:
        tokens = np.where(tokens != -100, tokens, tokenizer.pad_token_id)
        file.write(" ".join([i.text for i in nlp(tokenizer.decode(tokens, skip_special_tokens=True))]))
        file.write("\n")
print("done")

done


# Testing

In [None]:
with open("/content/drive/MyDrive/CS4248NLP/wi+locness/test/ABCN.test.bea19.orig", encoding="utf-8") as file:
    test_lines = file.read().strip().split("\n")
test = {"original": test_lines}
test_df = pd.DataFrame.from_dict(test)
display(test_df)

test_data = Dataset.from_pandas(test_df)

test_data = test_data.map(
    preprocess_inference_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original"]
)

test_data

Unnamed: 0,original
0,"Dear Sir ,"
1,I have seen your advertisement for a job on th...
2,I am working as a teacher in Spanish school wi...
3,I am an easy going person with a lot of empath...
4,"On the other hand , in my leisure time , I usu..."
...,...
4472,It is also believed that Russian teachers are ...
4473,I totally disagree with this opinion because R...
4474,"To sum up , I would argue that the best way to..."
4475,"Moreover , today we have lots of opportunities..."


Map:   0%|          | 0/4477 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4477
})

In [None]:
outputs =  trainer.predict(test_data)
outputs

PredictionOutput(predictions=array([[    2,     0, 23314, ...,  -100,  -100,  -100],
       [    2,     0,   100, ...,  -100,  -100,  -100],
       [    2,     0,   100, ...,  -100,  -100,  -100],
       ...,
       [    2,     0,  3972, ...,     1,     1,     1],
       [    2,     0, 42850, ...,     1,     1,     1],
       [    2,     0,   100, ...,     1,     1,     1]]), label_ids=None, metrics={'test_runtime': 294.042, 'test_samples_per_second': 15.226, 'test_steps_per_second': 0.238})

In [None]:
with open(model_dir + "/bart_simple_test_output.txt", mode="w", encoding="utf-8") as file:
    for tokens in outputs.predictions:
        tokens = np.where(tokens != -100, tokens, tokenizer.pad_token_id)
        file.write(" ".join([i.text for i in nlp(tokenizer.decode(tokens, skip_special_tokens=True))]))
        file.write("\n")
print("done")

done


# Calculate scores

In [3]:
!pip install errant



In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.translate.gleu_score import sentence_gleu

import spacy
import errant

In [5]:
ORIGINAL_PATH = '/content/drive/MyDrive/CS4248NLP/dev_dataset_results/eval_orig.txt' # path to original text - necessary for f0.5 only
PREDICTIONS_PATH = dev_output_path + "/bart_simple_dev_output.txt" # your model's predictions on the original text
GROUND_TRUTH_PATH = '/content/drive/MyDrive/CS4248NLP/dev_dataset_results/eval_corr.txt' # all corrected sentences

PREDICTIONS_M2 = PREDICTIONS_PATH.replace('.txt', '.m2')
GT_M2 = GROUND_TRUTH_PATH.replace('.txt', '.m2')

f_pred = open(PREDICTIONS_PATH, 'r')
f_gt = open(GROUND_TRUTH_PATH, 'r')

In [6]:
# calculating GLEU SCORE
gleu_scores = []

for pred, gt in zip(f_pred, f_gt):
  gleu_scores.append(sentence_gleu([gt.split()], pred.split()))

overall_gleu = np.mean(gleu_scores)
print(f'Overall gleu: {overall_gleu}')

Overall gleu: 0.8508143631270626


In [7]:
!errant_parallel -orig $ORIGINAL_PATH -cor $PREDICTIONS_PATH -out $PREDICTIONS_M2
!errant_parallel -orig $ORIGINAL_PATH -cor $GROUND_TRUTH_PATH -out $GT_M2
!errant_compare -hyp $PREDICTIONS_M2 -ref $GT_M2 -cse -cat 3

Loading resources...
Processing parallel files...
Loading resources...
Processing parallel files...

Category       TP       FP       FN       P        R        F0.5
M:ADJ          3        6        17       0.3333   0.15     0.2679
M:ADV          3        12       26       0.2      0.1034   0.1685
M:CONJ         2        7        23       0.2222   0.08     0.1639
M:CONTR        0        0        2        1.0      0.0      0.0
M:DET          194      109      168      0.6403   0.5359   0.6163
M:NOUN         2        12       43       0.1429   0.0444   0.099
M:NOUN:POSS    9        3        18       0.75     0.3333   0.6
M:OTHER        19       46       137      0.2923   0.1218   0.2284
M:PART         2        2        8        0.5      0.2      0.3846
M:PREP         47       39       110      0.5465   0.2994   0.4691
M:PRON         22       24       36       0.4783   0.3793   0.4545
M:PUNCT        601      254      503      0.7029   0.5444   0.6642
M:VERB         14       20       44  