In [1]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install errant pyspellchecker datasets evaluate transformers[torch] accelerate -U



In [3]:
import evaluate
import numpy as np
import torch
import spacy
import pandas as pd
from spellchecker import SpellChecker
from spacy import glossary
from copy import deepcopy
from torch import nn
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset, load_metric
from transformers import BartForConditionalGeneration, BartTokenizer, get_scheduler, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
from transformers.models.bart.modeling_bart import shift_tokens_right
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers.models.bart.modeling_bart import BartEncoder, BartDecoder


# Preprocessing

In [4]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# code taken from https://www.cl.cam.ac.uk/research/nl/bea2019st/data/corr_from_m2.py
def m2_to_df(m2, id):
    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}
    ori_sentences = []
    corrected_sentences = []
    for sent in m2:
        sent = sent.split("\n")
        ori_sent = sent[0].split()[1:] # Ignore "S "
        cor_sent = ori_sent.copy()
        edits = sent[1:]
        offset = 0
        for edit in edits:
            edit = edit.split("|||")
            if edit[1] in skip: continue # Ignore certain edits
            coder = int(edit[-1])
            if coder != id: continue # Ignore other coders
            span = edit[0].split()[1:] # Ignore "A "
            start = int(span[0])
            end = int(span[1])
            cor = edit[2].split()
            cor_sent[start+offset:end+offset] = cor
            offset = offset-(end-start)+len(cor)
        ori_sentences.append(" ".join(ori_sent))
        corrected_sentences.append(" ".join(cor_sent))
    df = pd.DataFrame(list(zip(ori_sentences, corrected_sentences)),columns =['original', 'corrected'])
    return df

In [6]:
with open("/content/drive/MyDrive/CS4248NLP/wi+locness/m2/ABC.train.gold.bea19.m2", encoding="utf-8") as f:
    m2_train = f.read().strip().split("\n\n")
    train_df = m2_to_df(m2_train, 0)

with open("/content/drive/MyDrive/CS4248NLP/wi+locness/m2/ABCN.dev.gold.bea19.m2", encoding="utf-8") as f:
    m2_train = f.read().strip().split("\n\n")
    validation_df = m2_to_df(m2_train, 0)

In [7]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
model_dir = "/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag"
batch_size = 64

def get_linguistic_features(df):
    pos_tokens = []
    tag_tokens = []
    dep_tokens = []
    with tqdm(total=len(df)) as progress_bar:
        for doc in nlp.pipe(df["original"].tolist()):
            pos_tokens.append(" ".join([token.pos_ for token in doc]))
            tag_tokens.append(" ".join([token.tag_ for token in doc]))
            dep_tokens.append(" ".join([token.dep_ for token in doc]))
            progress_bar.update(1)
    df["pos"] = pos_tokens
    df["tag"] = tag_tokens
    df["dep"] = dep_tokens
    return df

In [8]:
all_features = list(glossary.GLOSSARY.keys())
num_added_toks = tokenizer.add_tokens(all_features)
print("We have added", num_added_toks, "tokens")

We have added 302 tokens


In [9]:
get_linguistic_features(train_df)
get_linguistic_features(validation_df)

  0%|          | 0/34308 [00:00<?, ?it/s]

  0%|          | 0/4384 [00:00<?, ?it/s]

Unnamed: 0,original,corrected,pos,tag,dep
0,"It 's difficult answer at the question "" what ...","It 's difficult to answer the question "" what ...",PRON AUX ADJ NOUN ADP DET NOUN PUNCT PRON AUX ...,PRP VBZ JJ NN IN DT NN `` WP VBP PRP VBG TO VB...,nsubj ROOT amod attr prep det pobj punct dobj ...
1,When I was younger I used to say that I wanted...,"When I was younger , I used to say that I want...",SCONJ PRON AUX ADJ PRON VERB PART VERB SCONJ P...,WRB PRP VBD JJR PRP VBD TO VB IN PRP VBD TO VB...,advmod nsubj advcl acomp nsubj ROOT aux xcomp ...
2,I would like to study Psychology because one d...,"I would like to study Psychology , because one...",PRON AUX VERB PART VERB NOUN SCONJ NUM NOUN PR...,PRP MD VB TO VB NN IN CD NN PRP MD VB PRP$ JJ ...,nsubj aux ROOT aux xcomp dobj mark nummod npad...
3,It 's difficult because I 'll have to study ha...,It 's difficult because I 'll have to study ha...,PRON AUX ADJ SCONJ PRON AUX VERB PART VERB ADV...,"PRP VBZ JJ IN PRP MD VB TO VB RB CC DT NN , CC...",nsubj ROOT acomp mark nsubj aux advcl aux xcom...
4,"Maybe I 'll change my mind , maybe not .","Maybe I 'll change my mind , maybe not .",ADV PRON AUX VERB PRON NOUN PUNCT ADV PART PUNCT,"RB PRP MD VB PRP$ NN , RB RB .",advmod nsubj aux ROOT poss dobj punct advmod a...
...,...,...,...,...,...
4379,How much violence is there ?,How much violence is there ?,SCONJ ADJ NOUN AUX ADV PUNCT,WRB JJ NN VBZ RB .,advmod amod attr ROOT advmod punct
4380,The most violent cartoons are ' Teenage Mutant...,The most violent cartoons are ' Teenage Mutant...,DET ADV ADJ NOUN AUX PUNCT PROPN PROPN PROPN P...,"DT RBS JJ NNS VBP `` NNP NNP NNP NNPS '' , '' ...",det advmod amod nsubj ROOT punct compound comp...
4381,As for the shows containing sinister combat vi...,As for the shows containing sinister combat vi...,ADP ADP DET NOUN VERB ADJ NOUN NOUN PUNCT INTJ...,"IN IN DT NNS VBG JJ NN NN , UH DT VBZ RB IN IN...",prep prep det pobj acl amod compound dobj punc...
4382,The cause for television violence is to add te...,The reason for television violence is to add t...,DET NOUN ADP NOUN NOUN AUX PART VERB NOUN ADP ...,DT NN IN NN NN VBZ TO VB NN IN DT NN CC RB TO ...,det nsubj prep compound pobj ROOT aux xcomp do...


In [10]:
display(train_df.sample(5))
display(validation_df.sample(5))

Unnamed: 0,original,corrected,pos,tag,dep
30927,How is it ?,How is it ?,SCONJ AUX PRON PUNCT,WRB VBZ PRP .,advmod ROOT nsubj punct
29632,"With reference to his work experience , he has...","With reference to his work experience , he wor...",ADP NOUN ADP PRON NOUN NOUN PUNCT PRON AUX VER...,"IN NN IN PRP$ NN NN , PRP VBZ VBN IN DT JJ NN ...",prep pobj prep poss compound pobj punct nsubj ...
33544,"Consequentlu , both learning options have thei...","Consequently , both learning options have thei...",PROPN PUNCT PRON VERB NOUN VERB PRON ADJ CCONJ...,"NNP , DT VBG NNS VBP PRP$ JJ CC JJ NNS .",npadvmod punct preconj amod nsubj ROOT poss am...
15781,You are very lucky in choosing life partner I ...,You are very lucky in choosing a life partner ...,PRON AUX ADV ADJ ADP VERB NOUN NOUN PRON AUX V...,PRP VBP RB JJ IN VBG NN NN PRP VBP VBN PRP$ NN...,nsubj ROOT advmod acomp prep pcomp compound do...
33037,The note read and her tears rolled down her pa...,The note read and her tears rolled down her pa...,DET NOUN VERB CCONJ PRON NOUN VERB ADP PRON AD...,DT NN VBD CC PRP$ NNS VBD RP PRP$ JJ NN .,det nsubj ROOT cc poss nsubj conj prt poss amo...


Unnamed: 0,original,corrected,pos,tag,dep
2180,Michael closed the door and knew at the moment...,Michael closed the door and knew at that momen...,PROPN VERB DET NOUN CCONJ VERB ADP DET NOUN PR...,NNP VBD DT NN CC VBD IN DT NN PRP VBD DT NN .,nsubj ROOT det dobj cc conj prep det pobj nsub...
3169,The organisation was initially set up to conse...,The organisation was initially set up to conse...,DET NOUN AUX ADV VERB ADP PART VERB NOUN ADV A...,DT NN VBD RB VBN RP TO VB NNS RB IN DT NN IN P...,det nsubjpass auxpass advmod ROOT prt aux advc...
4201,This means I 'm able to store more information...,This means I 'm able to store more information...,PRON VERB PRON VERB VERB ADJ PART VERB ADJ NOU...,DT VBZ PRP VBP VBP JJ TO VB JJR NN IN DT JJ NN .,nsubj ROOT dobj ccomp ccomp acomp aux xcomp am...
3927,"It has therefore caused an unnecessary , delet...","It has therefore caused an unnecessary , delet...",PRON AUX ADV VERB DET ADJ PUNCT ADJ NOUN ADP D...,"PRP VBZ RB VBN DT JJ , JJ NN IN DT JJ NN CC NN...",nsubj aux advmod ROOT det amod punct amod dobj...
748,"Organized shares , where everyone clean up the...","Organized shares , where everyone cleans up th...",ADJ NOUN PUNCT SCONJ PRON VERB ADP DET NOUN PUNCT,"JJ NNS , WRB NN VBP RP DT NN .",amod ROOT punct advmod nsubj relcl prt det dob...


In [11]:
train_data = Dataset.from_pandas(train_df).shuffle(seed=1)
val_data = Dataset.from_pandas(validation_df)

print(train_data)
print(val_data)

Dataset({
    features: ['original', 'corrected', 'pos', 'tag', 'dep'],
    num_rows: 34308
})
Dataset({
    features: ['original', 'corrected', 'pos', 'tag', 'dep'],
    num_rows: 4384
})


In [12]:
max_input_length=64
max_target_length=64

def preprocess_function(batch):
    model_inputs = tokenizer(batch["original"], padding='max_length', max_length=max_input_length, truncation=True)
    pos_inputs = tokenizer(text_target=batch["pos"], padding='max_length', max_length=max_input_length, truncation=True)
    tag_inputs = tokenizer(text_target=batch["tag"], padding='max_length', max_length=max_input_length, truncation=True)
    dep_inputs = tokenizer(text_target=batch["dep"], padding='max_length', max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=batch["corrected"], padding='max_length', max_length=max_target_length, truncation=True)

    model_inputs["input_ids"] = np.concatenate([model_inputs["input_ids"], pos_inputs["input_ids"], tag_inputs["input_ids"], dep_inputs["input_ids"]], axis=1)
    model_inputs["attention_mask"] = np.concatenate([model_inputs.attention_mask, pos_inputs.attention_mask, tag_inputs.attention_mask, dep_inputs.attention_mask], axis=1)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
train_data = train_data.map(
    preprocess_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original", "corrected", "pos", "tag", "dep"]
)

val_data = val_data.map(
    preprocess_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original", "corrected", "pos", "tag", "dep"]
)

Map:   0%|          | 0/34308 [00:00<?, ? examples/s]

Map:   0%|          | 0/4384 [00:00<?, ? examples/s]

# Model

In [None]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", forced_bos_token_id=0)
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Embedding(50448, 768)

In [None]:
import numpy as np
google_bleu = evaluate.load("google_bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = google_bleu.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    generation_max_length=64
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Google Bleu,Gen Len
1,0.1806,0.144009,0.8067,32.3953
2,0.1436,0.131253,0.8116,32.4779
3,0.1276,0.123399,0.8131,32.4088
4,0.1161,0.120063,0.8156,32.466
5,0.1037,0.118043,0.8163,32.4911
6,0.0993,0.115977,0.8167,32.4945


Epoch,Training Loss,Validation Loss,Google Bleu,Gen Len
1,0.1806,0.144009,0.8067,32.3953
2,0.1436,0.131253,0.8116,32.4779
3,0.1276,0.123399,0.8131,32.4088
4,0.1161,0.120063,0.8156,32.466
5,0.1037,0.118043,0.8163,32.4911
6,0.0993,0.115977,0.8167,32.4945
7,0.0905,0.115483,0.8192,32.5169
8,0.088,0.114875,0.8187,32.5431
9,0.0844,0.115608,0.8188,32.5034
10,0.0818,0.114776,0.8189,32.5265


TrainOutput(global_step=5370, training_loss=0.18609608298573413, metrics={'train_runtime': 8074.6184, 'train_samples_per_second': 42.489, 'train_steps_per_second': 0.665, 'total_flos': 5.22970960232448e+16, 'train_loss': 0.18609608298573413, 'epoch': 10.0})

In [None]:
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir + "/tokenizer")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag/tokenizer/vocab.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag/tokenizer/merges.txt',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/CS4248NLP/transformer_model/bart_tag/tokenizer/tokenizer.json')

In [None]:
trainer.predict(val_data)

PredictionOutput(predictions=array([[    2,     0,   250, ...,     1,     1,     1],
       [    2,     0, 46805, ..., 12524, 25606,     2],
       [    2,     0,   100, ...,     1,     1,     1],
       ...,
       [    2,     0,   100, ...,     1,     1,     1],
       [    2,     0,   133, ...,     1,     1,     1],
       [    2,     0,   100, ...,     1,     1,     1]]), label_ids=array([[    0,   250,  1498, ...,     1,     1,     1],
       [    0, 46805,  1437, ..., 25606,     5,     2],
       [    0,   100,   300, ...,     1,     1,     1],
       ...,
       [    0,   100,   524, ...,     1,     1,     1],
       [    0,   133,   241, ...,     1,     1,     1],
       [    0,   100,   679, ...,     1,     1,     1]]), metrics={'test_loss': 0.11477609723806381, 'test_google_bleu': 0.8189, 'test_gen_len': 32.5265, 'test_runtime': 475.8261, 'test_samples_per_second': 9.213, 'test_steps_per_second': 0.145})

# Inference

In [14]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir + "/tokenizer")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
def correct_spelling(sentence):
    spell = SpellChecker(distance=1)
    tokens = sentence.split(" ")
    corrected_tokens = []
    for token in tokens:
        corrected_token = spell.correction(token)
        if corrected_token is not None:
            corrected_tokens.append(corrected_token)
        else:
            corrected_tokens.append(token)
    corrected_sentence = " ".join(corrected_tokens)
    return corrected_sentence

In [16]:
wrong_sentence = "The boys goes to schol ."
corrected = correct_spelling(wrong_sentence)
print(corrected)

The boys goes to school .


In [17]:
def preprocess_inference_function(batch):
    max_input_length=64
    max_target_length=64

    inputs = [correct_spelling(sentence) for sentence in batch["original"]]

    model_inputs = tokenizer(inputs, padding='max_length', max_length=max_input_length, truncation=True)
    pos_inputs = tokenizer(text_target=batch["pos"], padding='max_length', max_length=max_input_length, truncation=True)
    tag_inputs = tokenizer(text_target=batch["tag"], padding='max_length', max_length=max_input_length, truncation=True)
    dep_inputs = tokenizer(text_target=batch["dep"], padding='max_length', max_length=max_input_length, truncation=True)
    model_inputs["input_ids"] = np.concatenate([model_inputs["input_ids"], pos_inputs["input_ids"], tag_inputs["input_ids"], dep_inputs["input_ids"]], axis=1)
    model_inputs["attention_mask"] = np.concatenate([model_inputs.attention_mask, pos_inputs.attention_mask, tag_inputs.attention_mask, dep_inputs.attention_mask], axis=1)

    return model_inputs

In [18]:
text = {"original": ["The boys goes to schol .", "I likes it"]}
# text = {"original": ["Thank you"]}
text_df = pd.DataFrame.from_dict(text)
get_linguistic_features(text_df)
display(text_df)

inference_data = Dataset.from_pandas(text_df)

inference_data = inference_data.map(
    preprocess_inference_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original", "pos", "tag", "dep"]
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    generation_max_length=64
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,original,pos,tag,dep
0,The boys goes to schol .,DET NOUN VERB ADP NOUN PUNCT,DT NNS VBZ IN NN .,det nsubj ROOT prep pobj punct
1,I likes it,PRON VERB PRON,PRP VBZ PRP,nsubj ROOT dobj


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
outputs = trainer.predict(inference_data)
print(outputs)
tokenizer.decode(outputs.predictions[0], skip_special_tokens=True)

PredictionOutput(predictions=array([[   2,    0,  133, 2786,  213,    7,  334, 1437,    4,    2,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1],
       [   2,    0,  100,  101,   24,    2,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1]]), label_ids=None, metrics={'test_runtime': 1.3807, 'test_samples_per_second': 1.449, 'test_steps_per_second': 0.724})


'The boys go to school.'

# Dev

In [20]:
outputs = trainer.predict(val_data)

In [5]:
dev_output_path = "/content/drive/MyDrive/CS4248NLP/dev_dataset_results/bart"

In [22]:
with open(dev_output_path + "/bart_tag_dev_output.txt", mode="w", encoding="utf-8") as file:
    for tokens in outputs.predictions:
        tokens = np.where(tokens != -100, tokens, tokenizer.pad_token_id)
        file.write(" ".join([i.text for i in nlp(tokenizer.decode(tokens, skip_special_tokens=True))]))
        file.write("\n")
print("done")

done


# Testing

In [None]:
with open("/content/drive/MyDrive/CS4248NLP/wi+locness/test/ABCN.test.bea19.orig", encoding="utf-8") as file:
    test_lines = file.read().strip().split("\n")
test = {"original": test_lines}
test_df = pd.DataFrame.from_dict(test)
get_linguistic_features(test_df)
display(test_df)

test_data = Dataset.from_pandas(test_df)

test_data = test_data.map(
    preprocess_inference_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original", "pos", "tag", "dep"]
)

test_data

  0%|          | 0/4477 [00:00<?, ?it/s]

Unnamed: 0,original,pos,tag,dep
0,"Dear Sir ,",PROPN PROPN PUNCT,"NNP NNP ,",amod ROOT punct
1,I have seen your advertisement for a job on th...,PRON AUX VERB PRON NOUN ADP DET NOUN ADP DET N...,PRP VBP VBN PRP$ NN IN DT NN IN DT NN CC PRP V...,nsubj aux ROOT poss dobj prep det pobj prep de...
2,I am working as a teacher in Spanish school wi...,PRON AUX VERB ADP DET NOUN ADP ADJ NOUN ADP NO...,PRP VBP VBG IN DT NN IN JJ NN IN NNS VBN IN CD...,nsubj aux ROOT prep det pobj prep amod pobj pr...
3,I am an easy going person with a lot of empath...,PRON AUX DET ADJ VERB NOUN ADP DET NOUN ADP NO...,PRP VBP DT JJ VBG NN IN DT NN IN NN IN NNS .,nsubj ROOT det amod amod attr prep det pobj pr...
4,"On the other hand , in my leisure time , I usu...",ADP DET ADJ NOUN PUNCT ADP PRON NOUN NOUN PUNC...,"IN DT JJ NN , IN PRP$ NN NN , PRP RB VBP NNS I...",prep det amod pobj punct prep poss compound po...
...,...,...,...,...
4472,It is also believed that Russian teachers are ...,PRON AUX ADV VERB SCONJ ADJ NOUN AUX PART ADV ...,PRP VBZ RB VBN IN JJ NNS VBP RB RB JJ IN DT IN...,nsubjpass auxpass advmod ROOT mark amod nsubj ...
4473,I totally disagree with this opinion because R...,PRON ADV VERB ADP DET NOUN SCONJ ADJ NOUN AUX ...,PRP RB VBP IN DT NN IN JJ NNS MD VB CD NNS CC ...,nsubj advmod ROOT prep det pobj mark amod nsub...
4474,"To sum up , I would argue that the best way to...",PART VERB ADP PUNCT PRON AUX VERB SCONJ DET AD...,"TO VB RP , PRP MD VB IN DT JJS NN TO VB DT NN ...",aux advcl prt punct nsubj aux ROOT mark det am...
4475,"Moreover , today we have lots of opportunities...",ADV PUNCT NOUN PRON VERB NOUN ADP NOUN PART VE...,"RB , NN PRP VBP NNS IN NNS TO VB PRP$ NNS JJ I...",advmod punct npadvmod nsubj ROOT dobj prep pob...


Map:   0%|          | 0/4477 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4477
})

In [None]:
outputs =  trainer.predict(test_data)
outputs

In [None]:
with open(model_dir + "/bart_tag_test_output.txt", mode="w", encoding="utf-8") as file:
    for tokens in outputs.predictions:
        tokens = np.where(tokens != -100, tokens, tokenizer.pad_token_id)
        file.write(" ".join([i.text for i in nlp(tokenizer.decode(tokens, skip_special_tokens=True))]))
        file.write("\n")
print("done")

done


# Calculate scores

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.translate.gleu_score import sentence_gleu

import spacy
import errant

In [6]:
ORIGINAL_PATH = '/content/drive/MyDrive/CS4248NLP/dev_dataset_results/eval_orig.txt' # path to original text - necessary for f0.5 only
PREDICTIONS_PATH = dev_output_path + "/bart_tag_dev_output.txt" # your model's predictions on the original text
GROUND_TRUTH_PATH = '/content/drive/MyDrive/CS4248NLP/dev_dataset_results/eval_corr.txt' # all corrected sentences

PREDICTIONS_M2 = PREDICTIONS_PATH.replace('.txt', '.m2')
GT_M2 = GROUND_TRUTH_PATH.replace('.txt', '.m2')

f_pred = open(PREDICTIONS_PATH, 'r')
f_gt = open(GROUND_TRUTH_PATH, 'r')

In [7]:
# calculating GLEU SCORE
gleu_scores = []

for pred, gt in zip(f_pred, f_gt):
  gleu_scores.append(sentence_gleu([gt.split()], pred.split()))

overall_gleu = np.mean(gleu_scores)
print(f'Overall gleu: {overall_gleu}')

Overall gleu: 0.8200353960390034


In [8]:
!errant_parallel -orig $ORIGINAL_PATH -cor $PREDICTIONS_PATH -out $PREDICTIONS_M2
!errant_parallel -orig $ORIGINAL_PATH -cor $GROUND_TRUTH_PATH -out $GT_M2
!errant_compare -hyp $PREDICTIONS_M2 -ref $GT_M2 -cse -cat 3

Loading resources...
Processing parallel files...
Loading resources...
Processing parallel files...

Category       TP       FP       FN       P        R        F0.5
M:ADJ          2        2        18       0.5      0.1      0.2778
M:ADV          2        11       27       0.1538   0.069    0.1235
M:CONJ         2        7        23       0.2222   0.08     0.1639
M:CONTR        0        0        2        1.0      0.0      0.0
M:DET          149      114      213      0.5665   0.4116   0.5269
M:NOUN         1        7        44       0.125    0.0222   0.0649
M:NOUN:POSS    6        7        21       0.4615   0.2222   0.3797
M:OTHER        13       44       143      0.2281   0.0833   0.1693
M:PART         1        0        9        1.0      0.1      0.3571
M:PREP         31       40       126      0.4366   0.1975   0.3515
M:PRON         21       29       37       0.42     0.3621   0.407
M:PUNCT        571      301      533      0.6548   0.5172   0.6217
M:VERB         10       21       4