## Load Data

In [None]:
# Mount drive
# Makes Google Drive available under /content/drive; the dataset and output
# paths used later in this notebook live below that mount point.
# Requires a Colab runtime, since it imports google.colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import pandas as pd
import re

In [None]:
# !pip install pyspellchecker
# !pip install contextualSpellCheck
# To assess different spell checkers for our needs

In [None]:
# code taken from https://www.cl.cam.ac.uk/research/nl/bea2019st/data/corr_from_m2.py
def m2_to_df(m2, id):
    """Apply one annotator's M2 edits and return original/corrected sentence pairs.

    Args:
        m2: Iterable of strings, one per sentence block. The first line of a
            block is the tokenised sentence prefixed with "S "; every other
            line is an edit of the form
            "A start end|||type|||correction|||...|||annotator_id".
        id: Integer annotator id. Edits from any other annotator are ignored.

    Returns:
        A pandas DataFrame with the columns 'original' and 'corrected',
        holding one row per block in ``m2`` (tokens joined by single spaces).
    """
    # Edit types that must never be applied to the sentence.
    ignored_types = {"noop", "UNK", "Um"}
    pairs = []
    for block in m2:
        lines = block.split("\n")
        source_tokens = lines[0].split()[1:]  # drop the leading "S" marker
        target_tokens = list(source_tokens)
        # Running difference in length between the edited and original token
        # lists; edit spans always refer to positions in the original sentence.
        shift = 0
        for annotation in lines[1:]:
            fields = annotation.split("|||")
            if fields[1] in ignored_types:
                continue
            if int(fields[-1]) != id:
                continue
            bounds = fields[0].split()[1:]  # drop the leading "A" marker
            begin = int(bounds[0])
            finish = int(bounds[1])
            replacement = fields[2].split()
            target_tokens[begin + shift:finish + shift] = replacement
            shift += len(replacement) - (finish - begin)
        pairs.append((" ".join(source_tokens), " ".join(target_tokens)))
    return pd.DataFrame(pairs, columns=['original', 'corrected'])

In [None]:
# Change path here
# Read the file explicitly as UTF-8 so that parsing does not depend on the
# runtime's default locale encoding.
with open('/content/drive/MyDrive/CS4248NLP/wi+locness/m2/ABC.train.gold.bea19.m2', encoding='utf-8') as f:
    # Sentence blocks are separated by a blank line.
    m2_train = f.read().strip().split("\n\n")
# Parse once the file handle is closed; only annotator 0's edits are applied.
train_df = m2_to_df(m2_train, 0)

In [None]:
# Remove pandas' display limits so long sentences are printed untruncated.
# These settings are global and stay in effect for the rest of the notebook.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
# Show the first sentence pair as a quick check of the parsing step.
print(
    "First sentence",
    train_df.loc[0, "original"],
    train_df.loc[0, "corrected"],
    sep="\n",
)

First sentence
My town is a medium size city with eighty thousand inhabitants .
My town is a medium - sized city with eighty thousand inhabitants .


In [None]:
# (rows, columns) of the parsed training frame.
train_df.shape

(34308, 2)

In [None]:
# Preview the first five original/corrected pairs.
train_df.head()

Unnamed: 0,original,corrected
0,My town is a medium size city with eighty thousand inhabitants .,My town is a medium - sized city with eighty thousand inhabitants .
1,It has a high density population because its small territory .,It has a high - density population because of its small territory .
2,"Despite of it is an industrial city , there are many shops and department stores .","Although it is an industrial city , there are many shops and department stores ."
3,I recommend visiting the artificial lake in the certer of the city which is surrounded by a park .,I recommend visiting the artificial lake in the center of the city which is surrounded by a park .
4,Pasteries are very common and most of them offer the special dessert from the city .,Pasteries are very common and most of them offer the special dessert of the city .


In [None]:
# Character length of each sentence (spaces and punctuation tokens included).
train_df['correct_char_count'] = train_df['corrected'].astype('str').map(len)
train_df['original_char_count'] = train_df['original'].astype('str').map(len)

In [None]:

# Number of whitespace-separated tokens per sentence; punctuation marks are
# separate tokens in this data, so they are counted as words too.
train_df['correct_word_count'] = train_df['corrected'].astype('str').map(
    lambda sentence: len(sentence.split())
)
train_df['original_word_count'] = train_df['original'].astype('str').map(
    lambda sentence: len(sentence.split())
)

In [None]:
# Inspect five random rows; a fixed seed keeps the displayed sample the same
# on every Restart & Run All.
train_df.sample(5, random_state=42)

Unnamed: 0,original,corrected,correct_char_count,original_char_count,correct_word_count,original_word_count
21674,"I completely agree with you , maybe each word you say .","I completely agree with you , maybe each word you say .",55,55,12,12
22820,"So , you will be able to live and watch all this amazing exotic flowers , to hear Colombian music and see typical dances .","So , you will be able to experience and watch all these amazing exotic flowers , to hear Colombian music and see traditional dances .",133,122,25,25
33629,"Adults usually attend language courses , or use a variety of on - line courses .","Adults usually attend language courses , or use a variety of online courses .",77,80,14,16
16994,The building has two changing rooms for men and women .,"The building has two changing rooms , for men and women .",57,55,12,11
26694,The amount of talking done by the students was more that pleasing .,The amount of talking done by the students was more than pleasing .,67,67,13,13


Check NA

In [None]:
# prompt: get NA count for all columns
# Count the missing values in each column.

train_df.isna().sum()


original               0
corrected              0
correct_char_count     0
original_char_count    0
correct_word_count     0
original_word_count    0
dtype: int64

Count the total number of sentences where the original and corrected columns are the same

In [None]:
# prompt: count total no of sentences where original and corrected column are same

# Each True in the comparison marks a sentence whose corrected form is
# identical to the original.
same_sentence_count = int(train_df['original'].eq(train_df['corrected']).sum())
print("Number of sentences where original and corrected are same: " + str(same_sentence_count))


Number of sentences where original and corrected are same: 11571


In [None]:
# prompt: count total no of sentences where original and corrected column are not same

# Keep only the pairs in which the corrected sentence differs from the original.
initial_df = train_df.loc[train_df['original'].ne(train_df['corrected'])]
print("Number of sentences where original and corrected are different: " + str(len(initial_df)))


Number of sentences where original and corrected are different: 22737


In [None]:
# (rows, columns) after dropping the unchanged sentence pairs.
initial_df.shape

(22737, 6)

Remove duplicates

In [None]:
# prompt: Remove duplicate all columns
# This cell only counts rows that are exact duplicates across every column;
# nothing is removed here.
initial_df.duplicated().sum()
#initial_df_1 = initial_df.drop_duplicates(subset=['original', 'corrected', 'correct_char_count', 'original_char_count', 'correct_word_count', 'original_word_count'], keep='first')


27

In [None]:
# List every member of each duplicate group (keep=False), sorted so identical
# rows appear next to each other.
initial_df[initial_df.duplicated(keep=False)].sort_values('corrected')

Unnamed: 0,original,corrected,correct_char_count,original_char_count,correct_word_count,original_word_count
10055,"All in all , using renewable energy and improve our regulation is two ways to cutting down pollution for a better future .","All in all , using renewable energy and improving our regulations are two ways to cut down on pollution for a better future .",125,122,24,23
10064,"All in all , using renewable energy and improve our regulation is two ways to cutting down pollution for a better future .","All in all , using renewable energy and improving our regulations are two ways to cut down on pollution for a better future .",125,122,24,23
31713,"Dear Mrs Kate Ashby ,","Dear Mrs Ashby ,",16,21,4,5
17447,"Dear Mrs Kate Ashby ,","Dear Mrs Ashby ,",16,21,4,5
9237,"Dear Mrs Kate Ashby ,","Dear Mrs Ashby ,",16,21,4,5
31806,Dear Sir :,"Dear Sir ,",10,10,3,3
13266,Dear Sir :,"Dear Sir ,",10,10,3,3
18334,"Dear sir / madam ,","Dear Sir / Madam ,",18,18,5,5
28485,"Dear sir / madam ,","Dear Sir / Madam ,",18,18,5,5
24208,"Sincerely ,","Faithfully ,",12,11,2,2


In [None]:
# Row count before de-duplication, for comparison with the count afterwards.
initial_df.shape

(22737, 6)

In [None]:
# Drop exact duplicate rows (first occurrence kept by default) and renumber the index.
initial_df = initial_df.drop_duplicates().reset_index(drop=True)

In [None]:
# Verify that no duplicate rows remain; the result should be an empty frame.
initial_df[initial_df.duplicated(keep=False)].sort_values('corrected')

Unnamed: 0,original,corrected,correct_char_count,original_char_count,correct_word_count,original_word_count


In [None]:
# Row count after de-duplication.
initial_df.shape


(22710, 6)

Remove sentences with fewer than two words (in either the original or the corrected column)

In [None]:
# Rows whose original sentence has fewer than two tokens.
initial_df[initial_df['original_word_count']<2]

Unnamed: 0,original,corrected,correct_char_count,original_char_count,correct_word_count,original_word_count
925,kisses,Kisses,6,6,1,1
3003,Faithfully,Yours faithfully,16,10,2,1
4271,fsdjgdofg,,0,9,0,1
4497,toni,Toni,4,4,1,1
4831,lou,Lou,3,3,1,1
4884,victoria,Victoria,8,8,1,1
5156,Sincerely,Yours Faithfully,16,9,2,1
7345,hello,Hello,5,5,1,1
10519,Ou,,0,2,0,1
10747,sincerely,Yours faithfully,16,9,2,1


In [None]:
# Rows whose corrected sentence has fewer than two tokens (including empty corrections).
initial_df[initial_df['correct_word_count']<2]

Unnamed: 0,original,corrected,correct_char_count,original_char_count,correct_word_count,original_word_count
925,kisses,Kisses,6,6,1,1
1698,cell phone,cellphone,9,10,1,2
2147,good bye,Goodbye,7,8,1,2
4270,yes becouse yes,,0,15,0,3
4271,fsdjgdofg,,0,9,0,1
4497,toni,Toni,4,4,1,1
4830,dear .,Dear,4,6,1,2
4831,lou,Lou,3,3,1,1
4884,victoria,Victoria,8,8,1,1
7345,hello,Hello,5,5,1,1


In [None]:
# Number of rows in which either side has fewer than two tokens.
initial_df[(initial_df['original_word_count']<2) | (initial_df['correct_word_count']<2)].shape

(25, 6)

In [None]:
# Keep only the pairs in which both the original and the corrected sentence
# have at least two tokens, then renumber the index.
final_df = initial_df.loc[
    initial_df[['original_word_count', 'correct_word_count']].ge(2).all(axis=1)
].reset_index(drop=True)
final_df.shape

(22685, 6)

In [None]:
# Number of rows whose corrected sentence has at most 30 tokens.
final_df[final_df['correct_word_count'] <= 30].shape


(18688, 6)

In [None]:
# Token-count range of the corrected sentences.
print('Min correct word count:', final_df['correct_word_count'].min())
print('Max correct word count:', final_df['correct_word_count'].max())


Min correct word count: 2
Max correct word count: 236


In [None]:
# Token-count range of the original (uncorrected) sentences. The labels name
# the column that is actually read, original_word_count.
print('Min original word count:', final_df['original_word_count'].min())
print('Max original word count:', final_df['original_word_count'].max())

Min correct word count: 2
Max correct word count: 220


In [None]:
# Final number of training pairs passed on to tokenisation.
final_df.shape

(22685, 6)

In [None]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# SOS_TOKEN = "[<SOS>]"
# EOS_TOKEN = "[<EOS>]"

# SPECIALS = [SOS_TOKEN, EOS_TOKEN]
# tokenizer.add_special_tokens({'additional_special_tokens': SPECIALS})

# Encode every sentence to a list of vocabulary ids; the two lists stay
# index-aligned because both are built from final_df in row order.
encoded_inputs:list[list[int]] = [
  tokenizer.encode(sentence) for sentence in final_df['original']
]
encoded_outputs:list[list[int]] = [
  tokenizer.encode(sentence) for sentence in final_df['corrected']
]

# One line per pair: space-separated input ids, a comma, space-separated output ids.
with open('/content/drive/MyDrive/CS4248NLP/bert_encoded_train_dataset.csv', mode="w") as file:
  for encoded_input, encoded_output in zip(encoded_inputs, encoded_outputs):
    file.write(
      " ".join(map(str, encoded_input))
      + ","
      + " ".join(map(str, encoded_output))
      + "\n"
    )

In [None]:
# Write the tokenizer's vocabulary file to Drive.
tokenizer.save_vocabulary("/content/drive/MyDrive/CS4248NLP/vocab.txt")

('/content/drive/MyDrive/CS4248NLP/vocab.txt',)

In [None]:
# Display the tokenizer configuration (vocabulary size, special tokens, ...).
tokenizer

BertTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# tokenizer.convert_tokens_to_ids(['[<SOS>]'])

In [None]:
# tokenizer.convert_ids_to_tokens([100])

## **BERT EDA**

In [None]:
# Tokenise one hand-picked sentence pair to see how BERT's WordPiece vocabulary
# splits it.
# NOTE(review): this import and tokenizer load repeat an earlier cell.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
input_text = "To be Journalist , I must study very hard so I 'll try to learn in order to make my dream comes true ."
output_text = "To be a journalist , I must study very hard , so I 'll try to learn in order to make my dream come true ."
print("Tokens of input:", tokenizer.tokenize(input_text))
print("Tokens of output:", tokenizer.tokenize(output_text))
encoded_input = tokenizer.encode(input_text)
# NOTE(review): the label says "OHE tensor", but what is printed is the plain
# list of token ids returned by encode(), not a one-hot encoding.
print("Encoded OHE tensor:", encoded_input)
# NOTE(review): the decoded string is not identical to the input; it carries
# the [CLS]/[SEP] markers and re-joins punctuation, as the recorded output shows.
print("Decoded (Should be same as text):", tokenizer.decode(encoded_input))

Tokens of input: ['To', 'be', 'Journal', '##ist', ',', 'I', 'must', 'study', 'very', 'hard', 'so', 'I', "'", 'll', 'try', 'to', 'learn', 'in', 'order', 'to', 'make', 'my', 'dream', 'comes', 'true', '.']
Tokens of output: ['To', 'be', 'a', 'journalist', ',', 'I', 'must', 'study', 'very', 'hard', ',', 'so', 'I', "'", 'll', 'try', 'to', 'learn', 'in', 'order', 'to', 'make', 'my', 'dream', 'come', 'true', '.']
Encoded OHE tensor: [101, 1706, 1129, 3603, 1776, 117, 146, 1538, 2025, 1304, 1662, 1177, 146, 112, 1325, 2222, 1106, 3858, 1107, 1546, 1106, 1294, 1139, 4185, 2502, 2276, 119, 102]
Decoded (Should be same as text): [CLS] To be Journalist, I must study very hard so I'll try to learn in order to make my dream comes true. [SEP]


In [None]:
## OOV 1
# Encode a sentence containing a misspelling ("certer") to see how an
# out-of-vocabulary word is handled.
# NOTE(review): preprocess_df is not defined in the visible part of this
# notebook, so this cell fails on a fresh kernel. The recorded output shows a
# trailing "<EOS>", so it presumably comes from a removed preprocessing cell
# -- TODO confirm and restore or replace it.
input_text = preprocess_df['original'][3]
print(input_text)
encoded_input = tokenizer.encode(input_text)
# print("Tokens of input:", tokenizer.tokenize(input_text))
print("Encoded OHE tensor:", encoded_input)
print("Decoded (Should be same as text):", tokenizer.decode(encoded_input))

I recommend visiting the artificial lake in the certer of the city which is surrounded by a park . <EOS>
Encoded OHE tensor: [101, 146, 18029, 5807, 1103, 8246, 3521, 1107, 1103, 172, 7340, 1200, 1104, 1103, 1331, 1134, 1110, 4405, 1118, 170, 2493, 119, 133, 142, 9025, 135, 102]
Decoded (Should be same as text): [CLS] I recommend visiting the artificial lake in the certer of the city which is surrounded by a park. < EOS > [SEP]


In [None]:
## OOV 2
# Same row (index 3) as the "OOV 1" cell, but this time the WordPiece tokens
# are printed as well.
# NOTE(review): preprocess_df is not defined in the visible part of this
# notebook, so this cell fails on a fresh kernel -- TODO confirm its origin.
input_text = preprocess_df['original'][3]
print(input_text)
encoded_input = tokenizer.encode(input_text)
print("Tokens of input:", tokenizer.tokenize(input_text))
print("Encoded OHE tensor:", encoded_input)
print("Decoded (Should be same as text):", tokenizer.decode(encoded_input))

I recommend visiting the artificial lake in the certer of the city which is surrounded by a park . <EOS>
Tokens of input: ['I', 'recommend', 'visiting', 'the', 'artificial', 'lake', 'in', 'the', 'c', '##ert', '##er', 'of', 'the', 'city', 'which', 'is', 'surrounded', 'by', 'a', 'park', '.', '<', 'E', '##OS', '>']
Encoded OHE tensor: [101, 146, 18029, 5807, 1103, 8246, 3521, 1107, 1103, 172, 7340, 1200, 1104, 1103, 1331, 1134, 1110, 4405, 1118, 170, 2493, 119, 133, 142, 9025, 135, 102]
Decoded (Should be same as text): [CLS] I recommend visiting the artificial lake in the certer of the city which is surrounded by a park. < EOS > [SEP]


In [None]:
# Look up which token vocabulary id 100 maps to.
tokenizer.convert_ids_to_tokens([100])

['[UNK]']

In [None]:
# Re-encode the dataset with a tokenizer extended by two extra special tokens.
# NOTE(review): this import and tokenizer load repeat an earlier cell.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
SOS_TOKEN = "[<SOS>]"
EOS_TOKEN = "[<EOS>]"

SPECIALS = [SOS_TOKEN, EOS_TOKEN]
# Register the markers so the tokenizer treats each one as a single token.
tokenizer.add_special_tokens({'additional_special_tokens': SPECIALS})


encoded_inputs:list[list[int]] = []
encoded_outputs:list[list[int]] = []
# NOTE(review): preprocess_df is not defined in the visible part of this
# notebook, so this cell fails on a fresh kernel -- TODO confirm its origin
# and whether its sentences really contain the "[<SOS>]"/"[<EOS>]" markers.
for sentence in preprocess_df['original']:
  encoded_inputs.append(tokenizer.encode(sentence))
for sentence in preprocess_df['corrected']:
  encoded_outputs.append(tokenizer.encode(sentence))
# NOTE(review): this is the same path the earlier final_df encoding cell
# writes to, so running this cell overwrites that file.
with open('/content/drive/MyDrive/CS4248NLP/bert_encoded_train_dataset.csv', mode="w") as file:
  for encoded_input, encoded_output in zip(encoded_inputs, encoded_outputs):
    # One line per pair: input ids, a comma, output ids (ids separated by spaces).
    file.write(" ".join([str(i) for i in encoded_input]))
    file.write(",")
    file.write(" ".join([str(i) for i in encoded_output]))
    file.write("\n")

## **Save Vocab**

In [None]:
# NOTE(review): reloading the tokenizer here rebinds `tokenizer` to a fresh
# bert-base-cased instance, so the special tokens added in the previous cell
# are no longer on this object when the vocabulary is saved.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Writes to the same vocab.txt path used by the earlier save_vocabulary cell.
tokenizer.save_vocabulary("/content/drive/MyDrive/CS4248NLP/vocab.txt")


('/content/drive/MyDrive/CS4248NLP/vocab.txt',)

In [None]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size

28996