## Terminology improvement analysis through language modeling

In this notebook, we perform experiments with different language models to improve the terminology of manual test case descriptions. We use the following types of language models:

* Statistical language models:
  * Unidirectional unigram, bigram, trigram, 4-gram, 5-gram
  * Bidirectional unigram, bigram, trigram, 4-gram, 5-gram

### Statistical language models

In [2]:
# Import necessary libraries
import random
import os
import pandas as pd
import re
import numpy as np
import json
import statistics as st
import nltk
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.util import ngrams
import math
from statistics import mean
import string
from collections import defaultdict

import collections
from pprint import pprint
from pathlib import Path
from typing import Iterator
import itertools
from tqdm import tqdm

from expects import (contain_exactly, equal, expect, have_keys)
import attr
from functools import partial
from tabulate import tabulate

import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.offline as offline
import plotly.io as pio
import plotly.graph_objects as go

#### Load pre-processed data

In [3]:
data_dir = 'training_testing_data/with_name_objective/'

In [None]:
# Load the pre-processed corpora. Each line of the input files is one
# comma-separated token sequence; sequences with 3 or fewer tokens are dropped.
def _load_tokenized_lines(path):
    """Return the token lists (more than 3 tokens each) read from `path`."""
    # `with` guarantees the handle is closed even if reading fails; the
    # previous version re-bound the handle and never closed the first file.
    with open(path, 'r') as read_handle:
        rows = (line.replace('\n', '').split(',') for line in read_handle)
        return [tokens for tokens in rows if len(tokens) > 3]


training_data = _load_tokenized_lines(data_dir + 'training_data.txt')
print(len(training_data))

testing_data = _load_tokenized_lines(data_dir + 'testing_data.txt')
print(len(testing_data))

# testing_data_for_bert = []
# read_handle = open(data_dir + 'testing_data.txt', 'r')
# for line in read_handle:
#     line = line.replace('\n', '').split(',')
#     if len(line) > 3:
#         testing_data_for_bert.append(line[1:-1])
# print(len(testing_data_for_bert))

#### Add 'unk' token to the vocabulary
Replace rare words with the `<unk>` token so that the models can handle out-of-vocabulary words.

In [73]:
# Shuffle the training sentences in place before counting
random.shuffle(training_data)

# Tally how often each word occurs; the first and last entry of every
# sentence are skipped (they are not counted as vocabulary words)
vocab_count = defaultdict(int)
for tokens in training_data:
    for token in tokens[1:-1]:
        vocab_count[token] = vocab_count[token] + 1

In [74]:
# Pick up to 50 words that occur exactly once, in the order they were first
# counted (the original note puts this at about 2% of the vocab count);
# these are the words that get replaced by '<unk>'
rare_word_list = []
for word, occurrences in vocab_count.items():
    if occurrences != 1:
        continue
    rare_word_list.append(word)
    if len(rare_word_list) == 50:
        break
# Number of rare words that were selected
count = len(rare_word_list)

In [75]:
# Update training data with sentences that have the 'unk' token.
# The rare words are put in a set once so each membership test is O(1)
# instead of scanning the whole list for every token in the corpus.
rare_words = set(rare_word_list)
training_data_with_unk = [
    ['<unk>' if word in rare_words else word for word in sentence]
    for sentence in training_data
]

#### Unidirectional n-grams

In [6]:
# Helper function to count ngrams
def ngram_count(tokenized_steps: list, n: int):
    """Count all k-grams for k = 1..n over a tokenized corpus.

    Args:
        tokenized_steps: Iterable of token sequences (one per sentence).
        n: Highest n-gram order to count; any integer >= 1.

    Returns:
        A list ``[all_words, vocab, unigram, bigram, ..., n-gram]`` where
        ``all_words`` is every token in corpus order, ``vocab`` is the set of
        distinct tokens and each following entry is a plain ``dict`` mapping a
        k-tuple of tokens to its occurrence count (unigram keys are 1-tuples).
        If ``n`` is not a valid order, an error message is printed and
        ``None`` is returned.
    """
    # Accept anything that compares equal to a positive integer (int, 2.0,
    # numpy integers); everything else is reported as invalid.
    try:
        max_order = int(n)
    except (TypeError, ValueError, OverflowError):
        max_order = 0
    if max_order < 1 or max_order != n:
        print("Error! Provide a valid value for n.")
        return

    vocab = set()
    all_words = []
    # One counter per order. Plain dicts on purpose: the probability helpers
    # in this file rely on a KeyError for unseen n-grams.
    counts_by_order = [{} for _ in range(max_order)]

    for line in tokenized_steps:
        tokens = list(line)
        all_words.extend(tokens)
        vocab.update(tokens)

        for order, counts in enumerate(counts_by_order, start=1):
            # Slide a window of `order` tokens over the sentence; sentences
            # shorter than `order` contribute nothing for that order.
            for start in range(len(tokens) - order + 1):
                gram = tuple(tokens[start:start + order])
                counts[gram] = counts.get(gram, 0) + 1

    return [all_words, vocab, *counts_by_order]

In [6]:
# Helper functions to estimate n-grams' probabilities
def get_prob_unigram(word, count_all_tokens):
    """Return the relative frequency of `word` in the training corpus.

    Reads the module-level `unigram` counts. A word that is not in the
    counts is scored with the count of the '<unk>' token instead.
    """
    try:
        occurrences = unigram[(word,)]
    except KeyError:
        # If word does not exist in vocab, return estimate for the 'unk' token
        occurrences = unigram[('<unk>',)]
    return occurrences / count_all_tokens
    
def get_prob_bigram(words):
    """Return count(words) / count(first word), or 0 if either is unseen.

    Reads the module-level `bigram` and `unigram` counts; `words` is a
    2-tuple of tokens.
    """
    try:
        return bigram[words] / unigram[(words[0],)]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: signal "no estimate" so the caller backs
        # off. Other errors (e.g. the count tables not being built yet) are
        # no longer silently turned into 0.
        return 0
    
def get_prob_trigram(words):
    """Return count(words) / count(first two words), or 0 if either is unseen.

    Reads the module-level `trigram` and `bigram` counts; `words` is a
    3-tuple of tokens.
    """
    try:
        return trigram[words] / bigram[words[:2]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

def get_prob_fourgram(words):
    """Return count(words) / count(first three words), or 0 if either is unseen.

    Reads the module-level `fourgram` and `trigram` counts; `words` is a
    4-tuple of tokens.
    """
    try:
        return fourgram[words] / trigram[words[:3]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

def get_prob_fivegram(words):
    """Return count(words) / count(first four words), or 0 if either is unseen.

    Reads the module-level `fivegram` and `fourgram` counts; `words` is a
    5-tuple of tokens.
    """
    try:
        return fivegram[words] / fourgram[words[:4]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

In [194]:
# Helper function to get the probability of a target word given the context words
def get_probability_word(context_words, target_word, count_all_tokens, unique_vocab):
    """Get the probability of 'target_word' given the preceding 'context_words'.

    Uses back-off: the highest order that fits the context is tried first
    (1 context word -> bigram, ..., 4 context words -> 5-gram). Whenever an
    order returns zero probability, the next lower order is tried on the
    most recent context words, with its estimate multiplied by 0.4. If no
    order from bigram upwards yields a non-zero value, the unigram estimate
    is returned (without the 0.4 factor).

    Args:
        context_words: Sequence of 0 to 4 tokens preceding the target. Any
            other length is scored with the unigram estimate only.
        target_word: Token whose probability is estimated.
        count_all_tokens: Total number of tokens in the training corpus.
        unique_vocab: Not used by the computation; kept for call compatibility.

    Returns:
        The (possibly backed-off) probability estimate.
    """
    context_len = len(context_words)

    if 1 <= context_len <= 4:
        # Index i holds the estimator that takes i + 1 context words
        estimators = (get_prob_bigram, get_prob_trigram, get_prob_fourgram, get_prob_fivegram)

        for used in range(context_len, 0, -1):
            ngram = tuple(context_words[-used:]) + (target_word,)
            probability = estimators[used - 1](ngram)
            # Only backed-off (lower-order) estimates are discounted
            if used != context_len:
                probability = 0.4 * probability
            if probability != 0:
                return probability

    # No context, unsupported context length, or every higher order unseen
    return get_prob_unigram(target_word, count_all_tokens)

In [103]:
def calculate_perplexity(sentence: list, ngram_counts: dict, count_all_tokens: int, unique_vocab_len: int, unique_vocab: list):
    """
    Calculate perplexity for a sentence

    The score is the N-th root of the product of 1/P(word | context) over
    every position except the first and the last one, where N is the full
    sentence length. The product is accumulated in log space so that long
    sentences do not overflow to infinity.

    Args:
       sentence: List of tokens/words
       ngram_counts: Dictionary of counts of n-grams; the length of its keys
           selects the model order (1 to 5)
       count_all_tokens: Int with total count of all tokens in training corpus
       unique_vocab_len: Int with size of vocabulary of training corpus
           (not used by the computation)
       unique_vocab: List of unique words in training corpus

    Returns:
       Perplexity score. For key lengths outside 1..5 no word is scored and
       the result is 1.0.

    Raises:
       IndexError: if `ngram_counts` is empty.
       ZeroDivisionError: if the sentence is empty or a word gets probability 0.
    """
    # Length of the n-grams (e.g., when n = 5, perplexity is computed with
    # 5-grams; near the start of the sentence, where fewer than 4 preceding
    # words exist, a shorter context is used). One key is enough to read the
    # order, so the full key list is not materialised.
    try:
        first_key = next(iter(ngram_counts))
    except StopIteration:
        raise IndexError("ngram_counts is empty; cannot infer the n-gram order") from None
    n = len(first_key)

    # Cast the sentence from a list to a tuple
    sentence = tuple(sentence)

    # Length of sentence
    N = len(sentence)

    # Sum of log(1 / probability), i.e. the log of the cumulative product
    log_inverse_sum = 0.0

    if 1 <= n <= 5:
        # Iterate through each word, except first ([START] token) and last
        # ([END] token) indices
        for t in range(1, N - 1):
            # Up to n-1 preceding words; fewer at the start of the sentence
            context = sentence[max(0, t - (n - 1)):t]
            probability = get_probability_word(context, sentence[t], count_all_tokens, unique_vocab)
            log_inverse_sum += math.log(1 / probability)

    # Take the Nth root of the product to compute perplexity
    try:
        return math.exp(log_inverse_sum / N)
    except OverflowError:
        return math.inf

#### Build n-gram models with training data

In [195]:
# Count every order up to 5-grams on the '<unk>'-substituted training data
ngram_models = ngram_count(training_data_with_unk, 5)

# The result is [all tokens, vocabulary, unigram, ..., fivegram counts]
(all_words, unique_vocab,
 unigram, bigram, trigram, fourgram, fivegram) = ngram_models

# Corpus size and vocabulary size
count_all_tokens = len(all_words)
unique_vocab_len = len(unique_vocab)

In [None]:
# Report the corpus size and the vocabulary size
for label, value in (
    ("Total number of words in the training corpus: ", count_all_tokens),
    ("Total number of unique words in the training corpus: ", unique_vocab_len),
):
    print(label, value)

#### Compute perplexity score for training data with the unidirectional n-grams

In [196]:
# Next free row label in the results table
index_to_add = 0

# Results table: one row per (sentence, n-gram order) perplexity score
perplexity_df = pd.DataFrame(
    columns=['perplexity_score', 'ngram', 'group'],
)

In [197]:
# Score every training sentence with each n-gram order and record the results
for train in training_data_with_unk:
    token_total = len(train)
    shared_args = (count_all_tokens, unique_vocab_len, unique_vocab)

    # Unigram and bigram scores are computed for every sentence
    perplexity_uni = calculate_perplexity(train, unigram, *shared_args)
    perplexity_bi = calculate_perplexity(train, bigram, *shared_args)

    # Higher orders are only scored when the sentence has at least n tokens
    perplexity_tri = calculate_perplexity(train, trigram, *shared_args) if token_total >= 3 else None
    perplexity_four = calculate_perplexity(train, fourgram, *shared_args) if token_total >= 4 else None
    perplexity_five = calculate_perplexity(train, fivegram, *shared_args) if token_total >= 5 else None

    # Append one row per available score, lowest order first
    scored = (
        (perplexity_uni, 'Unigram'),
        (perplexity_bi, 'Bigram'),
        (perplexity_tri, 'Trigram'),
        (perplexity_four, '4-gram'),
        (perplexity_five, '5-gram'),
    )
    for score, label in scored:
        if score is None:
            continue
        perplexity_df.loc[index_to_add] = [score, label, 'Training Data']
        index_to_add += 1

#### Compute perplexity score for testing data with the unidirectional n-grams

In [198]:
# Score every testing sentence with each n-gram order and record the results
perplexity_uni_test_list = []
perplexity_bi_test_list = []
perplexity_tri_test_list = []
perplexity_four_test_list = []
perplexity_five_test_list = []

for test in testing_data:
    token_total = len(test)
    shared_args = (count_all_tokens, unique_vocab_len, unique_vocab)

    # Unigram and bigram scores are computed for every sentence
    perplexity_uni = calculate_perplexity(test, unigram, *shared_args)
    perplexity_bi = calculate_perplexity(test, bigram, *shared_args)

    # Higher orders are only scored when the sentence has at least n tokens
    perplexity_tri = calculate_perplexity(test, trigram, *shared_args) if token_total >= 3 else None
    perplexity_four = calculate_perplexity(test, fourgram, *shared_args) if token_total >= 4 else None
    perplexity_five = calculate_perplexity(test, fivegram, *shared_args) if token_total >= 5 else None

    # Append one row per available score, lowest order first
    scored = (
        (perplexity_uni, 'Unigram'),
        (perplexity_bi, 'Bigram'),
        (perplexity_tri, 'Trigram'),
        (perplexity_four, '4-gram'),
        (perplexity_five, '5-gram'),
    )
    for score, label in scored:
        if score is None:
            continue
        perplexity_df.loc[index_to_add] = [score, label, 'Testing Data']
        index_to_add += 1

In [200]:
# Function to remove outliers if necessary
def remove_outliers(df):
    """Drop perplexity outliers separately for every (n-gram, group) pair.

    For each combination of n-gram label ('Unigram', 'Bigram', 'Trigram',
    '4-gram', '5-gram') and group ('Training Data', 'Testing Data'), rows
    whose 'perplexity_score' lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of
    that combination are removed (IQR is the interquartile range Q3 - Q1).

    Args:
        df: DataFrame with 'perplexity_score', 'ngram' and 'group' columns.

    Returns:
        A new DataFrame with the surviving rows, keeping their original
        index labels, ordered by n-gram order and, within each order,
        training rows before testing rows. Rows with other labels are dropped.
    """
    ngram_labels = ('Unigram', 'Bigram', 'Trigram', '4-gram', '5-gram')
    group_labels = ('Training Data', 'Testing Data')

    kept = []
    for ngram_label in ngram_labels:
        for group_label in group_labels:
            subset = df.loc[(df['ngram'] == ngram_label) & (df['group'] == group_label)]
            scores = subset['perplexity_score']

            q1 = scores.quantile(0.25)
            q3 = scores.quantile(0.75)
            iqr = q3 - q1

            within_fences = (scores >= q1 - 1.5 * iqr) & (scores <= q3 + 1.5 * iqr)
            kept.append(subset.loc[within_fences])

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat(kept)

#### Plot Perplexity metric

In [None]:
# Plot results
plotly.offline.init_notebook_mode()
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'

# The box plot is drawn from the outlier-filtered scores. The figure is
# created directly by px.box (a separate empty go.Figure() was created and
# immediately overwritten before, so it has been dropped).
perplexity_outlier_df = remove_outliers(perplexity_df)
fig = px.box(perplexity_outlier_df, x="ngram", y="perplexity_score", color="group", points=False)

fig.update_layout(
    title="Unidirectional N-grams",
    title_x=0.5,
    xaxis_title="N-gram order",
    yaxis_title="Perplexity metric",
    legend_title="Dataset",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()
# fig.write_image("images/unidir_ngram_perplexity.png")

In [185]:
# Median perplexity on the testing data for each n-gram order
# (taken over all scores, not the outlier-filtered ones)
testing_rows = perplexity_df['group'] == 'Testing Data'

median_unigram = st.median(perplexity_df[(perplexity_df['ngram'] == 'Unigram') & testing_rows]['perplexity_score'].tolist())
median_bigram = st.median(perplexity_df[(perplexity_df['ngram'] == 'Bigram') & testing_rows]['perplexity_score'].tolist())
median_trigram = st.median(perplexity_df[(perplexity_df['ngram'] == 'Trigram') & testing_rows]['perplexity_score'].tolist())
median_fourgram = st.median(perplexity_df[(perplexity_df['ngram'] == '4-gram') & testing_rows]['perplexity_score'].tolist())
median_fivegram = st.median(perplexity_df[(perplexity_df['ngram'] == '5-gram') & testing_rows]['perplexity_score'].tolist())

# Print the medians, lowest order first
for median_value in (median_unigram, median_bigram, median_trigram, median_fourgram, median_fivegram):
    print(median_value)

In [202]:
# Save dataframes as CSV
# to_csv does not create missing directories, so make sure the target exists
os.makedirs('uni_ngrams', exist_ok=True)
perplexity_df.to_csv('uni_ngrams/perplexity_ngram.csv', index=False)
perplexity_outlier_df.to_csv('uni_ngrams/perplexity_ngram_outlier.csv', index=False)

#### Bidirectional n-grams

In [207]:
# Helper functions to estimate bidirectional n-grams' probabilities
def get_prob_unigram(word, count_all_tokens):
    """Return the relative frequency of `word` in the training corpus.

    Reads the module-level `unigram` counts. A word that is not in the
    counts is scored with the count of the '<unk>' token instead.
    """
    try:
        occurrences = unigram[(word,)]
    except KeyError:
        # If word does not exist in vocab, return estimate for the 'unk' token
        occurrences = unigram[('<unk>',)]
    return occurrences / count_all_tokens


def get_prob_bigram_before(words):
    """Return count(words) / count(first word), or 0 if either is unseen.

    Estimates the last word of the 2-tuple `words` from the word before it,
    using the module-level `bigram` and `unigram` counts.
    """
    try:
        return bigram[words] / unigram[(words[0],)]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0
    
def get_prob_bigram_after(words):
    """Return count(words) / count(last word), or 0 if either is unseen.

    Estimates the first word of the 2-tuple `words` from the word after it,
    using the module-level `bigram` and `unigram` counts.
    """
    try:
        return bigram[words] / unigram[(words[-1],)]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0


def get_prob_trigram_before(words):
    """Return count(words) / count(first two words), or 0 if either is unseen.

    Estimates the last word of the 3-tuple `words` from the two words before
    it, using the module-level `trigram` and `bigram` counts.
    """
    try:
        return trigram[words] / bigram[words[:2]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

def get_prob_trigram_after(words):
    """Return count(words) / count(last two words), or 0 if either is unseen.

    Estimates the first word of the 3-tuple `words` from the two words after
    it, using the module-level `trigram` and `bigram` counts.
    """
    try:
        return trigram[words] / bigram[words[-2:]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0


def get_prob_fourgram_before(words):
    """Return count(words) / count(first three words), or 0 if either is unseen.

    Estimates the last word of the 4-tuple `words` from the three words
    before it, using the module-level `fourgram` and `trigram` counts.
    """
    try:
        return fourgram[words] / trigram[words[:3]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

def get_prob_fourgram_after(words):
    """Return count(words) / count(last three words), or 0 if either is unseen.

    Estimates the first word of the 4-tuple `words` from the three words
    after it, using the module-level `fourgram` and `trigram` counts.
    """
    try:
        return fourgram[words] / trigram[words[-3:]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0


def get_prob_fivegram_before(words):
    """Return count(words) / count(first four words), or 0 if either is unseen.

    Estimates the last word of the 5-tuple `words` from the four words
    before it, using the module-level `fivegram` and `fourgram` counts.
    """
    try:
        return fivegram[words] / fourgram[words[:4]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

def get_prob_fivegram_after(words):
    """Return count(words) / count(last four words), or 0 if either is unseen.

    Estimates the first word of the 5-tuple `words` from the four words
    after it, using the module-level `fivegram` and `fourgram` counts.
    """
    try:
        return fivegram[words] / fourgram[words[-4:]]
    except (KeyError, ZeroDivisionError):
        # Unseen n-gram or context: the caller backs off to a lower order.
        # Unrelated errors are no longer swallowed.
        return 0

In [208]:
# Helper function to get the probability of a target word given the context words to the left (before) of the target word
def get_probability_word_before(words_before, target_word, len_words_before, count_all_tokens, unique_vocab_len):
    """Score ``target_word`` from its left context, backing off on misses.

    The highest-order model allowed by ``len_words_before`` (1 -> bigram,
    2 -> trigram, 3 -> 4-gram, 4 -> 5-gram) is tried first and its estimate
    is returned unscaled when non-zero. Otherwise each shorter context is
    tried in turn with its estimate multiplied by 0.4 (applied once per
    level, not compounded). If every level yields 0, the unscaled unigram
    estimate is returned.

    Args:
        words_before: Sequence of context words left of the target; only the
            last ``len_words_before`` entries are read.
        target_word: Word being scored.
        len_words_before: Number of context words to use. Any value other
            than 1, 2, 3 or 4 goes straight to the unigram estimate.
        count_all_tokens: Total token count, forwarded to ``get_prob_unigram``.
        unique_vocab_len: Not used by this function; kept in the signature
            for its callers.

    Returns:
        The first non-zero estimate found along the back-off chain.
    """
    # Context sizes outside 1..4 are scored by the unigram model directly.
    if len_words_before not in (1, 2, 3, 4):
        return get_prob_unigram(target_word, count_all_tokens)

    # Context size -> estimator of the matching (size + 1)-gram model.
    estimators = {
        1: get_prob_bigram_before,
        2: get_prob_trigram_before,
        3: get_prob_fourgram_before,
        4: get_prob_fivegram_before,
    }

    for context_size in range(len_words_before, 0, -1):
        # The `context_size` words immediately left of the target, then the target.
        ngram = tuple(words_before[-offset] for offset in range(context_size, 0, -1)) + (target_word,)
        estimate = estimators[context_size](ngram)
        if context_size != len_words_before:
            # Backed-off levels are discounted; the top level is not.
            estimate = 0.4 * estimate
        if estimate != 0:
            return estimate

    # No n-gram of any order was seen: fall back to the unigram estimate.
    return get_prob_unigram(target_word, count_all_tokens)

In [209]:
# Helper function to get the probability of a target word given the context words to the right (after) of the target word
def get_probability_word_after(words_after, target_word, len_words_after, count_all_tokens, unique_vocab_len):
    """Score ``target_word`` from its right context, backing off on misses.

    The highest-order model allowed by ``len_words_after`` (1 -> bigram,
    2 -> trigram, 3 -> 4-gram, 4 -> 5-gram) is tried first and its estimate
    is returned unscaled when non-zero. Otherwise each shorter context is
    tried in turn with its estimate multiplied by 0.4 (applied once per
    level, not compounded). If every level yields 0, the unscaled unigram
    estimate is returned.

    Args:
        words_after: Sequence of context words right of the target; only the
            first ``len_words_after`` entries are read.
        target_word: Word being scored.
        len_words_after: Number of context words to use. Any value other
            than 1, 2, 3 or 4 goes straight to the unigram estimate.
        count_all_tokens: Total token count, forwarded to ``get_prob_unigram``.
        unique_vocab_len: Not used by this function; kept in the signature
            for its callers.

    Returns:
        The first non-zero estimate found along the back-off chain.
    """
    # Context sizes outside 1..4 are scored by the unigram model directly.
    if len_words_after not in (1, 2, 3, 4):
        return get_prob_unigram(target_word, count_all_tokens)

    # Context size -> estimator of the matching (size + 1)-gram model.
    estimators = {
        1: get_prob_bigram_after,
        2: get_prob_trigram_after,
        3: get_prob_fourgram_after,
        4: get_prob_fivegram_after,
    }

    for context_size in range(len_words_after, 0, -1):
        # The target followed by the `context_size` words immediately right of it.
        ngram = (target_word,) + tuple(words_after[position] for position in range(context_size))
        estimate = estimators[context_size](ngram)
        if context_size != len_words_after:
            # Backed-off levels are discounted; the top level is not.
            estimate = 0.4 * estimate
        if estimate != 0:
            return estimate

    # No n-gram of any order was seen: fall back to the unigram estimate.
    return get_prob_unigram(target_word, count_all_tokens)

In [210]:
# Helper function to get the probability of a target word given the context words to the left (before) and right (after) of the target word - bidirectional
def get_probability_word_bidir(words_before, words_after, target_word, count_all_tokens, unique_vocab):
    """Average the left-context and right-context estimates of ``target_word``.

    Args:
        words_before: Context words to the left of the target.
        words_after: Context words to the right of the target.
        target_word: Word being scored.
        count_all_tokens: Total token count, forwarded to both estimators.
        unique_vocab: Vocabulary collection; only its length is forwarded.

    Returns:
        The arithmetic mean of ``get_probability_word_before(...)`` and
        ``get_probability_word_after(...)``. Each of those applies its own
        back-off when an n-gram has zero probability.
    """
    vocab_size = len(unique_vocab)
    left_size, right_size = len(words_before), len(words_after)

    left_estimate = get_probability_word_before(
        words_before, target_word, left_size, count_all_tokens, vocab_size)
    right_estimate = get_probability_word_after(
        words_after, target_word, right_size, count_all_tokens, vocab_size)

    return (left_estimate + right_estimate) / 2.0

In [211]:
def calculate_perplexity_bidir(sentence: list,
                         ngram_counts: dict,
                         count_all_tokens: int,
                         unique_vocab_len: int,
                         unique_vocab: list):
    """Compute the bidirectional n-gram perplexity of one sentence.

    The model order ``n`` is the length of a key of ``ngram_counts``. Every
    token except the first and last (the start/end markers) is scored with
    ``get_probability_word_bidir`` using up to ``n - 1`` tokens of context on
    each side. The result is the ``N``-th root of the product of the inverse
    probabilities, where ``N`` is the full sentence length including the two
    boundary tokens.

    The product is accumulated as a sum of logs. Multiplying ``1 / p``
    directly overflows to ``inf`` on long sentences; the log form gives the
    same value up to floating-point rounding without overflowing.

    Args:
        sentence: Token sequence including its start and end markers.
        ngram_counts: Count table keyed by n-gram tuples; only the length of
            one key is read, to determine ``n``.
        count_all_tokens: Total token count, forwarded to the estimator.
        unique_vocab_len: Not used by this function; kept for its callers.
        unique_vocab: Vocabulary, forwarded to the estimator.

    Returns:
        The perplexity as a float.

    Raises:
        IndexError: If ``ngram_counts`` is empty.
        ZeroDivisionError: If ``sentence`` is empty or a token is assigned
            probability 0.
    """
    # Only the key length is needed, so peek at one key instead of
    # materialising the whole key list on every call.
    try:
        n = len(next(iter(ngram_counts)))
    except StopIteration:
        raise IndexError("ngram_counts is empty; cannot infer the n-gram order") from None

    # Cast the sentence from a list to a tuple so slices are hashable n-gram keys
    sentence = tuple(sentence)

    # length of sentence
    N = len(sentence)

    log_inverse_prob_sum = 0.0

    # Indices from 1 to N-2 exclude the start and end tokens
    for t in range(1, N - 1):
        # Up to n-1 tokens on each side; slices are clipped at the sentence edges
        previous_ngram = sentence[max(0, t - (n - 1)):t]
        word = sentence[t]
        subsequent_ngram = sentence[t + 1:t + n]

        # Estimate the probability of the word given the n-gram before and after
        probability = get_probability_word_bidir(previous_ngram, subsequent_ngram, word, count_all_tokens, unique_vocab)

        if probability == 0:
            # Same exception type the direct 1/probability division produced.
            raise ZeroDivisionError("zero probability for token at position %d" % t)

        # log(1/p) == -log(p)
        log_inverse_prob_sum -= math.log(probability)

    # exp(mean log) is the N-th root of the product
    return math.exp(log_inverse_prob_sum / N)

#### Build n-gram models with training data

In [212]:
# Build all count tables up to order 5 from the <unk>-substituted training data
ngram_models = ngram_count(training_data_with_unk, 5)

# Slots 0-6 of the result, in order: token list, vocabulary, then the
# unigram ... 5-gram tables (meaning inferred from the names they are bound
# to here - confirm against ngram_count).
(all_words, unique_vocab,
 unigram, bigram, trigram, fourgram, fivegram) = (ngram_models[slot] for slot in range(7))

# Corpus-level sizes used by the probability estimators
unique_vocab_len = len(unique_vocab)
count_all_tokens = len(all_words)

In [None]:
# Report the two corpus-level sizes: count_all_tokens is len(all_words) and
# unique_vocab_len is len(unique_vocab), both computed when the models were built.
print("Total number of words in the training corpus: ", count_all_tokens)
print("Total number of unique words in the training corpus: ", unique_vocab_len)

#### Compute perplexity score with training data for bidirectional n-grams

In [213]:
# Next free row label of the results table; incremented after every row written
index_to_add = 0

# Empty results table: one row per (sentence, n-gram order) perplexity score
perplexity_bidir_df = pd.DataFrame(
    columns=['perplexity_score', 'ngram', 'group'],
)

In [214]:
# Score every training sentence under each bidirectional n-gram order and
# append one row per (sentence, order) to perplexity_bidir_df.
training_models = (
    # (row label, count table, minimum sentence length for this order)
    ('Unigram', unigram, 0),
    ('Bigram', bigram, 0),
    ('Trigram', trigram, 3),
    ('4-gram', fourgram, 4),
    ('5-gram', fivegram, 5),
)

for train in training_data_with_unk:
    for model_label, model_counts, min_tokens in training_models:
        # Trigram and higher orders are skipped for sentences shorter than the order
        if len(train) < min_tokens:
            continue

        score = calculate_perplexity_bidir(train, model_counts, count_all_tokens, unique_vocab_len, unique_vocab)
        perplexity_bidir_df.loc[index_to_add] = [score, model_label, 'Training Data']
        index_to_add += 1

#### Compute perplexity score with testing data for bidirectional n-grams

In [215]:
# Score every testing sentence under each bidirectional n-gram order and
# append one row per (sentence, order) to perplexity_bidir_df.
# NOTE(review): this iterates `testing_data`, whereas the training loop uses
# the <unk>-substituted `training_data_with_unk` - confirm that is intended.
testing_models = (
    # (row label, count table, minimum sentence length for this order)
    ('Unigram', unigram, 0),
    ('Bigram', bigram, 0),
    ('Trigram', trigram, 3),
    ('4-gram', fourgram, 4),
    ('5-gram', fivegram, 5),
)

for test in testing_data:
    for model_label, model_counts, min_tokens in testing_models:
        # Trigram and higher orders are skipped for sentences shorter than the order
        if len(test) < min_tokens:
            continue

        score = calculate_perplexity_bidir(test, model_counts, count_all_tokens, unique_vocab_len, unique_vocab)
        perplexity_bidir_df.loc[index_to_add] = [score, model_label, 'Testing Data']
        index_to_add += 1

#### Plot Perplexity metric

In [217]:
# Function to remove outliers if necessary
def remove_outliers(df):
    """Drop perplexity outliers separately for each (n-gram order, dataset) cell.

    For every combination of ``ngram`` in Unigram, Bigram, Trigram, 4-gram,
    5-gram and ``group`` in Training Data, Testing Data, the quartiles of
    that cell's ``perplexity_score`` are computed and only the rows inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are kept.

    Args:
        df: DataFrame with ``perplexity_score``, ``ngram`` and ``group``
            columns.

    Returns:
        A new DataFrame holding the retained rows with their original index
        labels, ordered by n-gram order and then by group (training first).
        Rows whose ``ngram`` / ``group`` value is not one of those listed
        above are not included.
    """
    ngram_orders = ('Unigram', 'Bigram', 'Trigram', '4-gram', '5-gram')
    groups = ('Training Data', 'Testing Data')

    kept_cells = []
    for ngram_order in ngram_orders:
        for group in groups:
            in_cell = (df['ngram'] == ngram_order) & (df['group'] == group)
            cell_scores = df.loc[in_cell, 'perplexity_score']

            q1 = cell_scores.quantile(0.25)
            q3 = cell_scores.quantile(0.75)
            iqr = q3 - q1  # interquartile range

            # An empty cell has NaN quartiles, so both comparisons are False
            # and the cell contributes no rows.
            within_fences = (
                (df['perplexity_score'] >= q1 - 1.5 * iqr)
                & (df['perplexity_score'] <= q3 + 1.5 * iqr)
            )
            kept_cells.append(df.loc[in_cell & within_fences])

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(kept_cells)

In [None]:
# Plot perplexities
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'

# Build the outlier-filtered table that is plotted below and saved afterwards.
perplexity_bidir_outlier_df = remove_outliers(perplexity_bidir_df)

# One box per (n-gram order, dataset); individual points are hidden
fig = px.box(perplexity_bidir_outlier_df, x="ngram", y="perplexity_score", color="group", points=False)

fig.update_layout(
    title="Bidirectional N-grams",
    title_x=0.5,
    xaxis_title="N-gram order",
    yaxis_title="Perplexity metric",
    legend_title="Dataset",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()
# fig.write_image("images/bidir_ngram_perplexity.png")

In [None]:
# Median testing-set perplexity for each bidirectional n-gram order
testing_rows = perplexity_bidir_df[perplexity_bidir_df['group'] == 'Testing Data']
testing_medians = {
    ngram_label: st.median(
        testing_rows[testing_rows['ngram'] == ngram_label]['perplexity_score'].tolist()
    )
    for ngram_label in ('Unigram', 'Bigram', 'Trigram', '4-gram', '5-gram')
}

median_unigram = testing_medians['Unigram']
median_bigram = testing_medians['Bigram']
median_trigram = testing_medians['Trigram']
median_fourgram = testing_medians['4-gram']
median_fivegram = testing_medians['5-gram']

# Print one median per line, lowest order first
for median_value in (median_unigram, median_bigram, median_trigram, median_fourgram, median_fivegram):
    print(median_value)

In [219]:
# Save the bidirectional perplexity tables (full and outlier-filtered) as CSV
bidir_output_dir = Path('bidir_ngrams')
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
bidir_output_dir.mkdir(parents=True, exist_ok=True)
perplexity_bidir_df.to_csv(bidir_output_dir / 'perplexity_ngram_bidir.csv', index=False)
perplexity_bidir_outlier_df.to_csv(bidir_output_dir / 'perplexity_ngram_bidir_outlier.csv', index=False)