## Terminology improvement analysis through language modeling

In this notebook, we experiment with different language models to improve the terminology of manual test case descriptions. We use the following types of language models:

* Neural language models:
  * We evaluate the following pre-trained models and their fine-tuned versions:
    * BERT base uncased
    * DistilBERT base uncased
    * BERT large uncased whole word masking

#### Neural Language Model (BERT)

In [1]:
# Import necessary libraries
import collections
import copy
import itertools
import json
import math
import os
import random
import re
import statistics as st
import string
from collections import defaultdict
from functools import partial
from pathlib import Path
from pprint import pprint
from statistics import mean
from typing import Iterator

import attr
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as offline
import torch
from expects import (contain_exactly, equal, expect, have_keys)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.util import ngrams
from tabulate import tabulate
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer, BertForMaskedLM, BertTokenizerFast

#### Load pre-processed data
For the terminology improvement module, we perform neither stop-word removal nor lemmatization on the training and testing data.

In [3]:
# Directory holding the pre-processed training/testing files. The loading cell
# below concatenates file names onto this string, so keep the trailing slash.
data_dir = 'training_testing_data/with_name_objective/'

In [None]:
# Each line of the input files is one sentence stored as comma-separated tokens;
# it is loaded as a list of token strings.
# `with` guarantees that both file handles are closed, even if reading fails.
with open(data_dir + 'training_data_stopwords.txt', 'r') as read_handle:
    training_data = [line.replace('\n', '').split(',') for line in read_handle]
print(len(training_data))

with open(data_dir + 'testing_data_stopwords.txt', 'r') as read_handle:
    testing_data = [line.replace('\n', '').split(',') for line in read_handle]
print(len(testing_data))

#### Pre-trained models

In [87]:
# Helper function to compute perplexity using cross-entropy loss of the BERT-based masked language model
def calculate_perplexity_neural_model_direct(sentences: list, bert_model, bert_model_name, bert_tokenizer):
    """Score every sentence with a masked language model and return one perplexity per sentence.

    For each sentence the first and last elements are dropped (presumably
    boundary markers added during pre-processing -- TODO confirm), the remaining
    tokens are joined with single spaces and tokenized. Every token position
    except the first and the last (for BERT tokenizers these hold [CLS] and
    [SEP]) is then replaced, one at a time, by the tokenizer's mask token, and
    the loss returned by the model for that masked input is recorded. The
    sentence score is ``exp(mean(losses))``.

    Args:
        sentences: Iterable of token lists.
        bert_model: Masked-LM model; called with the tokenizer output plus
            ``labels`` and expected to return an object exposing ``.loss``.
        bert_model_name: Label written to the ``model`` column.
        bert_tokenizer: Tokenizer callable returning a mapping with
            ``"input_ids"`` and exposing ``mask_token_id``.

    Returns:
        pandas.DataFrame with columns ``perplexity_score`` and ``model``, one
        row per input sentence. Sentences without any maskable token get NaN
        instead of raising ``statistics.StatisticsError``.
    """
    # Ask the tokenizer for its mask id instead of hard-coding 103, which is
    # only correct for the original BERT vocabularies.
    mask_token_id = bert_tokenizer.mask_token_id
    records = []

    for sentence in tqdm(sentences):
        clean_sentence = ' '.join(sentence[1:-1])

        # Tokenize once per sentence; masked variants are cloned from this encoding
        encoding = bert_tokenizer(clean_sentence, return_tensors="pt")
        input_ids = encoding["input_ids"]
        other_inputs = {key: value for key, value in encoding.items() if key != "input_ids"}

        # NOTE(review): the labels keep the ids of *all* positions, so the loss
        # reported by Hugging Face masked-LM heads is averaged over the whole
        # sequence, not only over the masked token. Setting the non-masked
        # labels to -100 would give the usual pseudo-perplexity; left unchanged
        # so that scores stay comparable with earlier runs -- confirm intent.
        labels = input_ids

        # Get output (logits) and loss from model; no gradients are needed for scoring
        cross_entropy_loss_list = []
        with torch.no_grad():
            for index in range(1, input_ids.shape[1] - 1):
                masked_ids = input_ids.clone()
                masked_ids[0, index] = mask_token_id

                outputs = bert_model(input_ids=masked_ids, labels=labels, **other_inputs)
                cross_entropy_loss_list.append(outputs.loss.item())

        # Get average cross-entropy loss over the masked variants and compute perplexity
        if cross_entropy_loss_list:
            perplexity_neural = math.exp(mean(cross_entropy_loss_list))
        else:
            # Nothing to mask (empty sentence): keep the row but mark it as undefined
            perplexity_neural = float('nan')

        records.append([perplexity_neural, bert_model_name])

    # Build the frame once instead of growing it row by row
    return pd.DataFrame(records, columns=['perplexity_score', 'model'])

#### BERT base uncased

In [None]:
# Load the pre-trained model together with the tokenizer of the same checkpoint,
# so the vocabulary and special-token ids are guaranteed to match the model.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Score every test sentence with the pre-trained BERT base model
perplexity_neural_df_direct = calculate_perplexity_neural_model_direct(
    sentences=testing_data,
    bert_model=model,
    bert_model_name="BERT base uncased",
    bert_tokenizer=tokenizer,
)

In [162]:
# Remove outliers with the 1.5 * IQR rule.
# The plotting and CSV cells below read `perplexity_neural_df_direct_outlier`, so it
# must be defined here for a fresh-kernel run. To keep the outliers, assign
# `perplexity_neural_df_direct` to it instead of the filtered frame.
scores = pd.to_numeric(perplexity_neural_df_direct['perplexity_score'])
Q1 = scores.quantile(0.25)
Q3 = scores.quantile(0.75)
IQR = Q3 - Q1    # IQR is interquartile range.
within_iqr_fences = scores.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)  # bounds are inclusive
perplexity_neural_df_direct_outlier = perplexity_neural_df_direct.loc[within_iqr_fences]

In [None]:
# Plot results for quick visualization
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'
# px.box builds the figure itself; no empty go.Figure() is needed beforehand
fig = px.box(perplexity_neural_df_direct_outlier, x="model", y="perplexity_score", points=False)
fig.update_layout(
    title="BERT base uncased",
    title_x=0.5,
    xaxis_title="Model",
    yaxis_title="Perplexity metric",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

In [164]:
# Save dataframes as CSV to be read by R and generate plots for the paper
os.makedirs('pretrained_bert', exist_ok=True)  # to_csv does not create missing directories
perplexity_neural_df_direct.to_csv('pretrained_bert/perplexity_bert_base.csv', index=False)
perplexity_neural_df_direct_outlier.to_csv('pretrained_bert/perplexity_bert_base_outlier.csv', index=False)

#### DistilBERT base uncased

In [None]:
# 'distilbert-base-uncased' is a DistilBERT checkpoint, so it must not be loaded
# through the BERT model class. The Auto classes read the checkpoint
# configuration and instantiate the matching model and tokenizer classes.
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Score every test sentence with the pre-trained DistilBERT model
perplexity_neural_df_distill_bert_direct = calculate_perplexity_neural_model_direct(
    sentences=testing_data,
    bert_model=model,
    bert_model_name="DistilBERT base uncased",
    bert_tokenizer=tokenizer,
)

In [98]:
# Remove outliers with the 1.5 * IQR rule.
# The plotting and CSV cells below read `perplexity_neural_df_distill_bert_direct_outlier`,
# so it must be defined here for a fresh-kernel run. To keep the outliers, assign
# `perplexity_neural_df_distill_bert_direct` to it instead of the filtered frame.
scores = pd.to_numeric(perplexity_neural_df_distill_bert_direct['perplexity_score'])
Q1 = scores.quantile(0.25)
Q3 = scores.quantile(0.75)
IQR = Q3 - Q1    # IQR is interquartile range.
within_iqr_fences = scores.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)  # bounds are inclusive
perplexity_neural_df_distill_bert_direct_outlier = perplexity_neural_df_distill_bert_direct.loc[within_iqr_fences]

In [None]:
# Plot results for quick visualization
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'
# px.box builds the figure itself; no empty go.Figure() is needed beforehand
fig = px.box(perplexity_neural_df_distill_bert_direct_outlier, x="model", y="perplexity_score", points=False)
fig.update_layout(
    title="DistilBERT base uncased",
    title_x=0.5,
    xaxis_title="Model",
    yaxis_title="Perplexity metric",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

In [100]:
# Save dataframes as CSV to be read by R and generate plots for the paper
os.makedirs('pretrained_bert', exist_ok=True)  # to_csv does not create missing directories
perplexity_neural_df_distill_bert_direct.to_csv('pretrained_bert/perplexity_distilbert_base.csv', index=False)
perplexity_neural_df_distill_bert_direct_outlier.to_csv('pretrained_bert/perplexity_distilbert_base_outlier.csv', index=False)

#### BERT large uncased whole word masking

In [None]:
# Pre-trained BERT large (whole word masking): model and tokenizer come from the same checkpoint
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='bert-large-uncased-whole-word-masking'
)
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='bert-large-uncased-whole-word-masking'
)

In [None]:
# Score every test sentence with the pre-trained whole-word-masking model
perplexity_neural_df_whole_word_direct = calculate_perplexity_neural_model_direct(
    sentences=testing_data,
    bert_model=model,
    bert_model_name="BERT large uncased (whole word masking)",
    bert_tokenizer=tokenizer,
)

In [103]:
# Remove outliers with the 1.5 * IQR rule.
# The plotting and CSV cells below read `perplexity_neural_df_whole_word_direct_outlier`,
# so it must be defined here for a fresh-kernel run. To keep the outliers, assign
# `perplexity_neural_df_whole_word_direct` to it instead of the filtered frame.
scores = pd.to_numeric(perplexity_neural_df_whole_word_direct['perplexity_score'])
Q1 = scores.quantile(0.25)
Q3 = scores.quantile(0.75)
IQR = Q3 - Q1    # IQR is interquartile range.
within_iqr_fences = scores.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)  # bounds are inclusive
perplexity_neural_df_whole_word_direct_outlier = perplexity_neural_df_whole_word_direct.loc[within_iqr_fences]

In [None]:
# Plot results for quick visualization
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'
# px.box builds the figure itself; no empty go.Figure() is needed beforehand
fig = px.box(perplexity_neural_df_whole_word_direct_outlier, x="model", y="perplexity_score", points=False)
fig.update_layout(
    title="BERT whole word masking",
    title_x=0.5,
    xaxis_title="Model",
    yaxis_title="Perplexity metric",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

In [105]:
# Save dataframes as CSV to be read by R and generate plots for the paper
os.makedirs('pretrained_bert', exist_ok=True)  # to_csv does not create missing directories
perplexity_neural_df_whole_word_direct.to_csv('pretrained_bert/perplexity_bert_whole_word.csv', index=False)
perplexity_neural_df_whole_word_direct_outlier.to_csv('pretrained_bert/perplexity_bert_whole_word_outlier.csv', index=False)

#### Fine-tuned models
The code for fine-tuning the models and the obtained fine-tuned models are in the 'fine-tune' directory

In [166]:
# Define dataframe to store perplexity scores
# It is passed to calculate_perplexity_fine_tuned_bert, which adds one row per scored sentence in place.
perplexity_bert_df = pd.DataFrame(columns = ['perplexity_score', 'model', 'stopwords'])
# Next row label to write. calculate_perplexity_fine_tuned_bert reads and increments it
# through `global`, so re-run this cell before re-scoring to start from an empty frame.
index_to_add = 0

In [167]:
# Helper function to compute perplexity using cross-entropy loss of the BERT-based masked language model
def calculate_perplexity_fine_tuned_bert(sentences, bert_model, bert_tokenizer, df, model_name, stopwords_status):
    """Score every sentence with a masked language model and append the results to ``df``.

    For each sentence the first and last elements are dropped (presumably
    boundary markers added during pre-processing -- TODO confirm), the remaining
    tokens are joined with single spaces and tokenized. Every token position
    except the first and the last (for BERT tokenizers these hold [CLS] and
    [SEP]) is then replaced, one at a time, by the tokenizer's mask token, and
    the loss returned by the model for that masked input is recorded. The
    sentence score is ``exp(mean(losses))``.

    Args:
        sentences: Iterable of token lists.
        bert_model: Masked-LM model; called with the tokenizer output plus
            ``labels`` and expected to return an object exposing ``.loss``.
        bert_tokenizer: Tokenizer callable returning a mapping with
            ``"input_ids"`` and exposing ``mask_token_id``.
        df: DataFrame with three columns (score, model name, stop-word status)
            that is extended in place, one row per sentence.
        model_name: Label written to the second column.
        stopwords_status: Label written to the third column.

    Returns:
        None. Rows are written at label ``index_to_add``, a module-level counter
        that is advanced by one per sentence. Sentences without any maskable
        token get NaN instead of raising ``statistics.StatisticsError``.
    """
    global index_to_add

    # Ask the tokenizer for its mask id instead of hard-coding 103, which is
    # only correct for the original BERT vocabularies.
    mask_token_id = bert_tokenizer.mask_token_id

    for sentence in tqdm(sentences):
        clean_sentence = ' '.join(sentence[1:-1])

        # Tokenize once per sentence; masked variants are cloned from this encoding
        encoding = bert_tokenizer(clean_sentence, return_tensors="pt")
        input_ids = encoding["input_ids"]
        other_inputs = {key: value for key, value in encoding.items() if key != "input_ids"}

        # NOTE(review): the labels keep the ids of *all* positions, so the loss
        # reported by Hugging Face masked-LM heads is averaged over the whole
        # sequence, not only over the masked token. Left unchanged so that
        # scores stay comparable with earlier runs -- confirm intent.
        labels = input_ids

        # No gradients are needed for scoring
        cross_entropy_loss_list = []
        with torch.no_grad():
            for index in range(1, input_ids.shape[1] - 1):
                masked_ids = input_ids.clone()
                masked_ids[0, index] = mask_token_id

                outputs = bert_model(input_ids=masked_ids, labels=labels, **other_inputs)
                cross_entropy_loss_list.append(outputs.loss.item())

        if cross_entropy_loss_list:
            perplexity_neural = math.exp(mean(cross_entropy_loss_list))
        else:
            # Nothing to mask (empty sentence): keep the row but mark it as undefined
            perplexity_neural = float('nan')

        df.loc[index_to_add] = [perplexity_neural, model_name, stopwords_status]
        index_to_add += 1

#### Fine-tuned - BERT base uncased

In [182]:
# Load fine-tuned model weights
# NOTE(review): the leading '/' makes this an absolute path from the filesystem root -- confirm it is intended.
my_bert_model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='/fine_tuned_bert_models/my_bert_base_stopwords'
)
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='bert-large-uncased-whole-word-masking'
)

In [None]:
# Compute perplexity; rows are appended to perplexity_bert_df in place
calculate_perplexity_fine_tuned_bert(
    sentences=testing_data,
    bert_model=my_bert_model,
    bert_tokenizer=tokenizer,
    df=perplexity_bert_df,
    model_name="BERT base uncased",
    stopwords_status="With stopwords",
)

#### Fine-tuned - DistilBERT base uncased

In [None]:
# Load fine-tuned model weights
# NOTE(review): if this directory was saved from a DistilBERT-architecture model,
# BertForMaskedLM is the wrong class to load it with -- verify against the fine-tuning code.
# NOTE(review): the leading '/' makes this an absolute path from the filesystem root -- confirm it is intended.
my_bert_model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='/fine_tuned_bert_models/my_bert_distilbert_stopwords'
)
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='bert-large-uncased-whole-word-masking'
)

In [None]:
# Compute perplexity; rows are appended to perplexity_bert_df in place
calculate_perplexity_fine_tuned_bert(
    sentences=testing_data,
    bert_model=my_bert_model,
    bert_tokenizer=tokenizer,
    df=perplexity_bert_df,
    model_name="DistilBERT base uncased",
    stopwords_status="With stopwords",
)

#### Fine-tuned - BERT large uncased whole word masking

In [173]:
# Load fine-tuned model weights
my_bert_model = BertForMaskedLM.from_pretrained('/fine_tuned_bert_models/my_bert_whole_word_stopwords')
# Same tokenizer class as the other fine-tuned cells
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking')

In [None]:
# Compute perplexity; rows are appended to perplexity_bert_df in place
calculate_perplexity_fine_tuned_bert(
    sentences=testing_data,
    bert_model=my_bert_model,
    bert_tokenizer=tokenizer,
    df=perplexity_bert_df,
    model_name="BERT large uncased (whole word masking)",
    stopwords_status="With stopwords",
)

In [187]:
# Remove outliers all at once with the 1.5 * IQR rule (quartiles are computed over
# the scores of every model in the frame together).
# The plotting and CSV cells below read `perplexity_bert_df_outlier`, so it must be
# defined here for a fresh-kernel run. To keep the outliers, assign
# `perplexity_bert_df` to it instead of the filtered frame.
scores = pd.to_numeric(perplexity_bert_df['perplexity_score'])
Q1 = scores.quantile(0.25)
Q3 = scores.quantile(0.75)
IQR = Q3 - Q1    # IQR is interquartile range.
within_iqr_fences = scores.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)  # bounds are inclusive
perplexity_bert_df_outlier = perplexity_bert_df.loc[within_iqr_fences]

In [None]:
# Plot results for quick visualization
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'
# px.box builds the figure itself; no empty go.Figure() is needed beforehand
fig = px.box(perplexity_bert_df_outlier, x="model", y="perplexity_score", color="stopwords", points=False)
fig.update_layout(
    title="Fine-tuned BERT",
    title_x=0.5,
    xaxis_title="Model",
    yaxis_title="Perplexity metric",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

In [189]:
# Save dataframes as CSV to be read by R and generate plots for the paper
os.makedirs('fine_tuned_bert', exist_ok=True)  # to_csv does not create missing directories
perplexity_bert_df.to_csv('fine_tuned_bert/perplexity_finetuned_bert.csv', index=False)
perplexity_bert_df_outlier.to_csv('fine_tuned_bert/perplexity_finetuned_bert_outlier.csv', index=False)