## Terminology improvement analysis through language modeling
In this notebook, we compare the two language models that performed best in terms of perplexity on the word recommendation task, using a recommender-system-style metric (accuracy@k):

* Statistical language model: **bidirectional 5-gram**
* Neural language model: **Fine-tuned BERT large uncased whole word masking**

In [88]:
# Import requirements
import random
import os
import pandas as pd
import re
import numpy as np
import json
import statistics as st
import nltk
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.util import ngrams
import math
from statistics import mean
import string
from collections import defaultdict

import collections
from pprint import pprint
from pathlib import Path
from typing import Iterator
import itertools
from tqdm import tqdm

from expects import (contain_exactly, equal, expect, have_keys)
import attr
from functools import partial
from tabulate import tabulate

from transformers import pipeline, BertTokenizer, BertForMaskedLM, AutoTokenizer

import matplotlib.pyplot as plt
%matplotlib inline

#### Load pre-processed data
The cells below load the entire testing set. To load the 'short test step sentence' set (fewer than 5 tokens) or the 'long test step sentence' set (more than 12 tokens) instead, uncomment the corresponding commented-out code below.

In [296]:
# Directory prefix (with trailing slash) prepended to the training/testing file names opened below
data_dir = 'training_testing_data/with_name_objective/'

In [None]:
# Load the pre-tokenized sentences: each line of a data file is one sentence
# whose tokens are separated by commas.
# The files are opened with `with` so that both handles are closed
# (previously the training-file handle was never closed).
training_data = []
with open(data_dir + 'training_data.txt', 'r') as read_handle:
    for line in read_handle:
        training_data.append(line.replace('\n', '').split(','))
print(len(training_data))

# NOTE(review): later cells iterate over `testing_data_stopwords`, which is not
# loaded anywhere in this notebook as shown; presumably it comes from
# data_dir + 'testing_data_stopwords.txt' (see the commented-out code below) — confirm.
testing_data = []
with open(data_dir + 'testing_data.txt', 'r') as read_handle:
    for line in read_handle:
        testing_data.append(line.replace('\n', '').split(','))
print(len(testing_data))

# Test steps with less than 5 words (short sentences)
# testing_data_short = []
# read_handle = open(data_dir + 'testing_data_stopwords.txt', 'r')
# for line in read_handle:
#     line = line.replace('\n', '').split(',')
#     if len(line) < 5:
#         testing_data_short.append(line)
# print(len(testing_data_short))
# read_handle.close()

# Test steps with more than 12 words (long sentences)
# testing_data_long = []
# read_handle = open(data_dir + 'testing_data_stopwords.txt', 'r')
# for line in read_handle:
#     line = line.replace('\n', '').split(',')
#     if len(line) > 12:
#         testing_data_long.append(line)
# print(len(testing_data_long))
# read_handle.close()

#### N-gram: bidirectional 5-gram

##### Add 'unk' token to the vocabulary of the training data to build the n-gram

In [73]:
# Shuffle the training sentences in place, then tally how often each word
# occurs, ignoring the first and last token of every sentence.
random.shuffle(training_data)
vocab_count = defaultdict(int)
for inner_word in itertools.chain.from_iterable(step[1:-1] for step in training_data):
    vocab_count[inner_word] += 1

In [74]:
# Pick the first 50 words (in vocabulary insertion order) that occur exactly once;
# per the original note this is about 2% of the vocab count. They are later replaced by '<unk>'.
singleton_words = (candidate for candidate in vocab_count if vocab_count[candidate] == 1)
rare_word_list = list(itertools.islice(singleton_words, 50))
# Number of rare words actually selected (at most 50)
count = len(rare_word_list)

In [75]:
# Build a copy of the training data in which every selected rare word is
# replaced by the '<unk>' token (all positions of each sentence are checked).
training_data_with_unk = [
    ['<unk>' if word in rare_word_list else word for word in sentence]
    for sentence in training_data
]

In [None]:
# Helper function to count ngrams
def ngram_count(tokenized_steps: list, n: int):
    """Count every k-gram of order 1..n over a tokenized corpus.

    Args:
        tokenized_steps: Sequence of sentences, each a sequence of tokens.
        n: Highest n-gram order to count; must be between 1 and 5.

    Returns:
        A list ``[all_words, vocab, unigram, bigram, ...]`` with ``n + 2``
        elements: the flat list of all tokens in corpus order, the set of
        distinct tokens, and one plain ``dict`` per order mapping a tuple of
        tokens to its frequency (unigram keys are 1-tuples). N-grams never
        span two sentences. If ``n`` is not in 1..5, an error message is
        printed and ``None`` is returned.
    """
    max_order = 5
    if n not in range(1, max_order + 1):
        print("Error! Provide a valid value for n.")
        return None
    n = int(n)

    all_words = []
    vocab = set()
    # counts[k - 1] maps each k-token tuple to its number of occurrences.
    # Plain dicts (not Counter) are used on purpose: the probability helpers
    # rely on a KeyError for unseen n-grams.
    counts = [{} for _ in range(n)]

    for line in tokenized_steps:
        tokens = list(line)
        all_words.extend(tokens)
        vocab.update(tokens)
        for order, table in enumerate(counts, start=1):
            # Zipping `order` shifted views of the sentence yields its
            # consecutive `order`-grams (nothing if the sentence is shorter).
            for gram in zip(*(tokens[shift:] for shift in range(order))):
                table[gram] = table.get(gram, 0) + 1

    return [all_words, vocab] + counts

In [207]:
# Helper functions to estimate bidirectional n-grams' probabilities
def get_prob_unigram(word, count_all_tokens):
    """Return the unigram relative frequency of ``word``.

    Reads the module-level ``unigram`` table (keys are 1-tuples). If ``word``
    is not in the table, the estimate for the ``'<unk>'`` token is returned
    instead; a ``KeyError`` propagates if ``'<unk>'`` is missing as well.

    Args:
        word: Token to look up.
        count_all_tokens: Total number of tokens used as the denominator.
    """
    try:
        return unigram[(word,)] / count_all_tokens
    except KeyError:
        # If word does not exist in vocab, return estimate for the 'unk' token
        return unigram[('<unk>',)] / count_all_tokens


def get_prob_bigram_before(words):
    """Return count(w1, w2) / count(w1) for the bigram tuple ``words``.

    Reads the module-level ``bigram`` and ``unigram`` tables. Returns 0 when
    the bigram (or its leading unigram) is not in the tables.
    """
    try:
        return bigram[words] / unigram[(words[0],)]
    except KeyError:
        return 0
    
def get_prob_bigram_after(words):
    """Return count(w1, w2) / count(w2) for the bigram tuple ``words``.

    Reads the module-level ``bigram`` and ``unigram`` tables. Returns 0 when
    the bigram (or its trailing unigram) is not in the tables.
    """
    try:
        return bigram[words] / unigram[(words[-1],)]
    except KeyError:
        return 0


def get_prob_trigram_before(words):
    """Return count(trigram) / count(its first two words) for the tuple ``words``.

    Reads the module-level ``trigram`` and ``bigram`` tables. Returns 0 when
    either n-gram is not in the tables.
    """
    try:
        return trigram[words] / bigram[words[:2]]
    except KeyError:
        return 0

def get_prob_trigram_after(words):
    """Return count(trigram) / count(its last two words) for the tuple ``words``.

    Reads the module-level ``trigram`` and ``bigram`` tables. Returns 0 when
    either n-gram is not in the tables.
    """
    try:
        return trigram[words] / bigram[words[-2:]]
    except KeyError:
        return 0


def get_prob_fourgram_before(words):
    """Return count(4-gram) / count(its first three words) for the tuple ``words``.

    Reads the module-level ``fourgram`` and ``trigram`` tables. Returns 0 when
    either n-gram is not in the tables.
    """
    try:
        return fourgram[words] / trigram[words[:3]]
    except KeyError:
        return 0

def get_prob_fourgram_after(words):
    """Return count(4-gram) / count(its last three words) for the tuple ``words``.

    Reads the module-level ``fourgram`` and ``trigram`` tables. Returns 0 when
    either n-gram is not in the tables.
    """
    try:
        return fourgram[words] / trigram[words[-3:]]
    except KeyError:
        return 0


def get_prob_fivegram_before(words):
    """Return count(5-gram) / count(its first four words) for the tuple ``words``.

    Reads the module-level ``fivegram`` and ``fourgram`` tables. Returns 0 when
    either n-gram is not in the tables.
    """
    try:
        return fivegram[words] / fourgram[words[:4]]
    except KeyError:
        return 0

def get_prob_fivegram_after(words):
    """Return count(5-gram) / count(its last four words) for the tuple ``words``.

    Reads the module-level ``fivegram`` and ``fourgram`` tables. Returns 0 when
    either n-gram is not in the tables.
    """
    try:
        return fivegram[words] / fourgram[words[-4:]]
    except KeyError:
        return 0

In [208]:
# Helper function to get the probability of a target word given the context words to the left (before) of the target word
def get_probability_word_before(words_before, target_word, len_words_before, count_all_tokens, unique_vocab_len):
    """Score ``target_word`` from the context words on its left, with back-off.

    Starts with the (len_words_before + 1)-gram ending in ``target_word`` and,
    while the estimate is zero, falls back to the next shorter n-gram. Every
    fallback n-gram estimate is multiplied by 0.4. If all n-grams give zero
    (or ``len_words_before`` is not 1..4), the unigram estimate is used.

    NOTE(review): the 0.4 discount is applied once per fallback estimate (not
    compounded per level) and the unigram fallback is not discounted — confirm
    this is intended. ``unique_vocab_len`` is accepted but not used.

    Returns:
        ``[probability, used_unigram]`` where ``used_unigram`` is True only
        when the unigram estimate was returned.
    """
    # Estimators indexed by (context size - 1): bigram ... 5-gram
    estimators_by_context = (
        get_prob_bigram_before,
        get_prob_trigram_before,
        get_prob_fourgram_before,
        get_prob_fivegram_before,
    )

    if len_words_before in (1, 2, 3, 4):
        for context_size in range(int(len_words_before), 0, -1):
            # The last `context_size` words immediately preceding the target
            context = tuple(words_before[offset] for offset in range(-context_size, 0))
            estimate = estimators_by_context[context_size - 1](context + (target_word,))
            if context_size != len_words_before:
                # Lower-order (fallback) estimates are discounted
                estimate = 0.4 * estimate
            if estimate != 0:
                return [estimate, False]

    # No n-gram evidence (or no usable context): fall back to the unigram
    return [get_prob_unigram(target_word, count_all_tokens), True]

In [1]:
# Helper function to get the probability of a target word given the context words to the right (after) of the target word
def get_probability_word_after(words_after, target_word, len_words_after, count_all_tokens, unique_vocab_len):
    """Score ``target_word`` from the context words on its right, with back-off.

    Starts with the (len_words_after + 1)-gram beginning with ``target_word``
    and, while the estimate is zero, falls back to the next shorter n-gram.
    Every fallback n-gram estimate is multiplied by 0.4. If all n-grams give
    zero (or ``len_words_after`` is not 1..4), the unigram estimate is used.

    NOTE(review): the 0.4 discount is applied once per fallback estimate (not
    compounded per level) and the unigram fallback is not discounted — confirm
    this is intended. ``unique_vocab_len`` is accepted but not used.

    Returns:
        ``[probability, used_unigram]`` where ``used_unigram`` is True only
        when the unigram estimate was returned.
    """
    # Estimators indexed by (context size - 1): bigram ... 5-gram
    estimators_by_context = (
        get_prob_bigram_after,
        get_prob_trigram_after,
        get_prob_fourgram_after,
        get_prob_fivegram_after,
    )

    if len_words_after in (1, 2, 3, 4):
        for context_size in range(int(len_words_after), 0, -1):
            # The first `context_size` words immediately following the target
            context = tuple(words_after[position] for position in range(context_size))
            estimate = estimators_by_context[context_size - 1]((target_word,) + context)
            if context_size != len_words_after:
                # Lower-order (fallback) estimates are discounted
                estimate = 0.4 * estimate
            if estimate != 0:
                return [estimate, False]

    # No n-gram evidence (or no usable context): fall back to the unigram
    return [get_prob_unigram(target_word, count_all_tokens), True]

In [210]:
# Helper function to get the probability of a target word given the context words to the left (before) and right (after) of the target word - bidirectional
def get_probability_word_bidir(words_before, words_after, target_word, count_all_tokens, unique_vocab):
    """Score ``target_word`` from both its left and right context.

    The left-context and right-context back-off estimates are computed
    independently and averaged.

    Returns:
        ``[probability, used_unigram]`` where ``used_unigram`` is True if
        either direction fell back to the unigram estimate.
    """
    vocab_size = len(unique_vocab)
    n_before, n_after = len(words_before), len(words_after)

    left_prob, left_used_unigram = get_probability_word_before(
        words_before, target_word, n_before, count_all_tokens, vocab_size)
    right_prob, right_used_unigram = get_probability_word_after(
        words_after, target_word, n_after, count_all_tokens, vocab_size)

    averaged = (left_prob + right_prob) / 2.0
    return [averaged, left_used_unigram or right_used_unigram]

In [455]:
# Helper function to go through all the words in the vocabulary and get the top-k most likely words
def get_likely_word_backoff_bidirec(words_before, words_after, count_all_tokens, unique_vocab, num_words_recommend):
    """Rank every vocabulary word for a gap and return the most likely ones.

    Args:
        words_before: Context words to the left of the gap.
        words_after: Context words to the right of the gap.
        count_all_tokens: Total number of training tokens (unigram denominator).
        unique_vocab: Collection of candidate words; '[START]' and '[END]'
            are never recommended.
        num_words_recommend: Maximum number of candidates to return.

    Returns:
        Up to ``num_words_recommend`` tuples ``(word, probability,
        used_unigram)`` sorted by decreasing probability. Ties are broken by
        the word itself so the ranking does not depend on the iteration order
        of ``unique_vocab`` (set order varies between runs for strings).
    """
    list_prob = []
    for word in unique_vocab:
        if word in ('[START]', '[END]'):
            continue
        prob, used_unigram = get_probability_word_bidir(
            words_before, words_after, word, count_all_tokens, unique_vocab)
        list_prob.append((word, prob, used_unigram))

    # Highest probability first; equal probabilities in ascending word order
    list_prob.sort(key=lambda entry: (-entry[1], entry[0]))
    return list_prob[:num_words_recommend]

#### Build n-gram models with training data

In [533]:
# Count all 1- to 5-grams of the '<unk>'-augmented training data and expose
# the resulting tables under the module-level names the probability helpers read.
ngram_models = ngram_count(training_data_with_unk, 5)
(all_words, unique_vocab,
 unigram, bigram, trigram, fourgram, fivegram) = ngram_models

# Corpus size in tokens and number of distinct tokens
count_all_tokens = len(all_words)
unique_vocab_len = len(unique_vocab)

In [None]:
# Report the total token count and the number of distinct tokens of the training data used for the n-gram tables
print("Total number of words in the training corpus: ", count_all_tokens)
print("Total number of unique words in the training corpus: ", unique_vocab_len)

#### BERT: Fine-tuned BERT large uncased whole word masking

In [106]:
# Load the fine-tuned masked-LM weights from a local directory (absolute path; adjust to your machine)
my_bert_model = BertForMaskedLM.from_pretrained('/fine_tuned_bert_models/my_bert_whole_word_stopwords')
# The tokenizer is loaded from the base checkpoint name, not from the fine-tuned directory
# NOTE(review): assumes fine-tuning left the vocabulary unchanged — confirm
bert_tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')

# Use the Huggingface pipeline 'fill-mask' with our fine-tuned model to make predictions efficiently
# https://huggingface.co/transformers/main_classes/pipelines.html
unmasker = pipeline('fill-mask', model=my_bert_model, tokenizer=bert_tokenizer)

#### Compute accuracy@k for the n-gram and BERT models separately

In [None]:
# N-gram evaluation: per-sentence accuracy@k for k = 3, 5, 10.
# accuracy_k_ngram_dict[k] holds one value per evaluated test sentence: the
# fraction of its inner tokens found among the top-k n-gram recommendations.
k_values = (3, 5, 10)
num_words_recommend = max(k_values)
accuracy_k_ngram_dict = {k: [] for k in k_values}

for sentence in testing_data:
    # The first and last token of a sentence are never predicted
    inner_indices = range(1, len(sentence) - 1)
    if not inner_indices:
        # Nothing to predict; skip instead of dividing by zero below
        continue

    hits = {k: 0 for k in k_values}
    for index in inner_indices:
        token = sentence[index]

        # Up to four context words on each side of the target position
        composed_token_before = sentence[max(index - 4, 0):index]
        composed_token_after = sentence[index + 1:index + 1 + 4]

        likely_words_score = get_likely_word_backoff_bidirec(
            composed_token_before, composed_token_after,
            count_all_tokens, unique_vocab, num_words_recommend)
        ranked_words = [entry[0] for entry in likely_words_score]

        # Compute accuracy@k
        for k in k_values:
            if token in ranked_words[:k]:
                hits[k] += 1

    for k in k_values:
        accuracy_k_ngram_dict[k].append(hits[k] / len(inner_indices))


# BERT evaluation: per-sentence accuracy@k for k = 3, 5, 10.
# accuracy_k_bert_dict[k] holds one value per evaluated test sentence: the
# fraction of its inner tokens found among the top-k filtered BERT predictions.
# NOTE(review): `testing_data_stopwords` is not defined in the cells shown in
# this notebook — make sure it is loaded before running this cell.
k_values = (3, 5, 10)
accuracy_k_bert_dict = {k: [] for k in k_values}

# Get more predictions than necessary as we will exclude some predictions
num_words_recommend = 50

for sentence in testing_data_stopwords:
    # The first and last token of a sentence are neither predicted nor shown to BERT
    inner_tokens = sentence[1:-1]
    if not inner_tokens:
        # Nothing to predict; skip instead of dividing by zero below
        continue

    hits = {k: 0 for k in k_values}
    for position, token in enumerate(inner_tokens):
        # Replace the target token by the mask and terminate the sentence with a period
        masked_tokens = list(inner_tokens)
        masked_tokens[position] = '[MASK]'
        masked_sentence = ' '.join(masked_tokens) + '.'

        bert_predictions = unmasker(masked_sentence, top_k=(num_words_recommend + 150))
        bert_predictions_words = []
        for elem in bert_predictions:
            predicted_token = elem['token_str']
            # Skip punctuation, the em dash, and sub-words (indicated by ##).
            # Stop words are NOT filtered here.
            if (predicted_token in string.punctuation) or (predicted_token == '—') or ('##' in predicted_token):
                continue
            bert_predictions_words.append(predicted_token)
            if len(bert_predictions_words) == num_words_recommend:
                break

        # Compute accuracy@k
        for k in k_values:
            if token in bert_predictions_words[:k]:
                hits[k] += 1

    for k in k_values:
        accuracy_k_bert_dict[k].append(hits[k] / len(inner_tokens))

In [None]:
# Print n-gram results: mean accuracy@3/5/10 first, then median accuracy@3/5/10
for summarize in (st.mean, st.median):
    for k in (3, 5, 10):
        print(summarize(accuracy_k_ngram_dict[k]))

In [None]:
# Print BERT results: mean accuracy@3/5/10 first, then median accuracy@3/5/10
for summarize in (st.mean, st.median):
    for k in (3, 5, 10):
        print(summarize(accuracy_k_bert_dict[k]))

In [175]:
# Save dataframes with accuracy: one single-column ('Accuracy') CSV per model
# and per k, with one row per evaluated test sentence.
output_dir = 'ngram_bert_recommendation'
# Create the output directory if needed so that to_csv does not fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

for model_name, accuracy_by_k in (('ngram', accuracy_k_ngram_dict), ('bert', accuracy_k_bert_dict)):
    for k in (3, 5, 10):
        df = pd.DataFrame(accuracy_by_k[k], columns=['Accuracy'])
        df.to_csv('{}/{}_accuracy_top{}.csv'.format(output_dir, model_name, k), index=False)

### N-gram + BERT
The n-gram and BERT models are combined based on observations drawn from a manual analysis of a sample of cases in which one model fails (accuracy=0) and the other succeeds (accuracy=1).

In [None]:
# For efficiency, compute the BERT predictions for every test sentence once and
# store them in a dict: bert_predictions_dict[tuple(sentence)][i] is the list of
# filtered predictions for the i-th inner token (0-based) of that sentence.
bert_predictions_dict = {}

# Number of raw predictions requested from BERT per masked position
num_words_recommend = 100
# Number of filtered predictions kept per masked position
num_words_keep = 10

for sentence_bert in testing_data_stopwords:
    sentence_key = tuple(sentence_bert)
    # The first and last token of a sentence are neither predicted nor shown to BERT
    inner_tokens = sentence_bert[1:-1]

    for prediction_index_sentence, token in enumerate(inner_tokens):
        # Replace the target token by the mask and terminate the sentence with a period
        masked_tokens = list(inner_tokens)
        masked_tokens[prediction_index_sentence] = '[MASK]'
        masked_sentence = ' '.join(masked_tokens) + '.'

        bert_predictions = unmasker(masked_sentence, top_k=num_words_recommend)
        bert_predictions_words = []
        for elem in bert_predictions:
            predicted_token = elem['token_str']
            # Skip punctuation, the em dash, and sub-words (indicated by ##)
            if (predicted_token in string.punctuation) or (predicted_token == '—') or ('##' in predicted_token):
                continue
            bert_predictions_words.append(predicted_token)
            if len(bert_predictions_words) == num_words_keep:
                break

        bert_predictions_dict.setdefault(sentence_key, {})[prediction_index_sentence] = bert_predictions_words

In [None]:
# N-gram + BERT prediction: per-sentence accuracy@k for k = 3, 5, 10.
# For each inner token the n-gram ranking is used unless it is unreliable
# (some candidate needed the unigram fallback AND the best score is below 0.5),
# in which case the cached BERT predictions for that position are used instead.
k_values = (3, 5, 10)
num_words_recommend = max(k_values)
accuracy_k_ngram_bert_dict = {k: [] for k in k_values}

for sentence in testing_data_stopwords:
    sentence_key = tuple(sentence)
    hits = {k: 0 for k in k_values}
    num_scored = 0

    # prediction_index_sentence is the 0-based position among the inner tokens;
    # deriving it from the loop keeps it aligned even when a token is skipped.
    for prediction_index_sentence, index in enumerate(range(1, len(sentence) - 1)):
        token = sentence[index]

        # Get predictions from n-gram (up to four context words on each side)
        composed_token_before = sentence[max(index - 4, 0):index]
        composed_token_after = sentence[index + 1:index + 1 + 4]
        likely_words_score = get_likely_word_backoff_bidirec(
            composed_token_before, composed_token_after,
            count_all_tokens, unique_vocab, num_words_recommend)

        # Check the used-unigram flag of every candidate (third tuple element).
        # The previous `likely_words_score[:][2]` looked at the third candidate
        # tuple instead, which is truthy for any non-empty word.
        backed_off_to_unigram = any(entry[2] for entry in likely_words_score)
        low_confidence = bool(likely_words_score) and likely_words_score[0][1] < 0.5

        if backed_off_to_unigram and low_confidence:
            # Get BERT predictions as the n-gram confidence is below the threshold
            try:
                ranked_words = bert_predictions_dict[sentence_key][prediction_index_sentence]
            except KeyError:
                # No cached BERT prediction for this position: leave the token unscored
                continue
        else:
            # Keep the n-gram
            ranked_words = [entry[0] for entry in likely_words_score]

        num_scored += 1
        for k in k_values:
            if token in ranked_words[:k]:
                hits[k] += 1

    if num_scored == 0:
        # No token could be scored; skip instead of dividing by zero
        continue
    for k in k_values:
        accuracy_k_ngram_bert_dict[k].append(hits[k] / num_scored)

In [None]:
# Print (n-gram + BERT) results: mean accuracy@3/5/10 first, then median accuracy@3/5/10
for summarize in (st.mean, st.median):
    for k in (3, 5, 10):
        print(summarize(accuracy_k_ngram_bert_dict[k]))