### Reading and pre-processing data
Notebook with code to read the test case descriptions and apply pre-processing steps to them.

In [1]:
# Import requirements
import random
import os
import pandas as pd
import re
import numpy as np
import json
import statistics as st
import nltk
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.util import ngrams
import math
from statistics import mean
import string

import collections
from pprint import pprint
from pathlib import Path
from typing import Iterator
import itertools
from tqdm import tqdm

from expects import (contain_exactly, equal, expect, have_keys)
import attr
from functools import partial
from tabulate import tabulate

In [176]:
class DataManipulator:
    """Read test-case spreadsheets and pre-process their free-text fields.

    Attributes:
        data_dir: Root directory that is searched recursively for ``.xlsx`` files.
    """

    # Columns copied from every input spreadsheet, in output order
    _COLUMN_NAMES = ["Type", "Key", "Name", "Objective", "Labels", "Step_ID", "Step"]

    # Cleaning patterns, compiled once instead of being rebuilt for every row
    _URL_PATTERN = re.compile(r'http\S+')
    _DIGIT_WORD_PATTERN = re.compile(r'\w*\d\w*')
    # Every punctuation character except '-', so hyphens inside words survive
    _PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(string.punctuation.replace('-', '')))
    _MULTI_SPACE_PATTERN = re.compile(' +')

    def __init__(self, my_data_dir: str) -> None:
        self.data_dir = my_data_dir

    def load_data(self) -> pd.DataFrame:
        """Load all ``.xlsx`` files found under ``self.data_dir`` into one dataframe.

        Only the columns Type, Key, Name, Objective, Labels, Step_ID and Step are
        kept. Rows are numbered consecutively from 0 across all files.

        Returns:
            The combined dataframe; an empty dataframe with the expected columns
            when no spreadsheet (or no row) is found.

        Raises:
            KeyError: If a spreadsheet lacks one of the expected columns.
        """
        data_files = [os.path.join(root, name)
                      for root, _dirs, files in os.walk(self.data_dir)
                      for name in files
                      if name.endswith(".xlsx")]

        print("Reading input data...")
        # Select the wanted columns per file and concatenate once, instead of
        # enlarging the dataframe one row at a time (which re-allocates per row).
        frames = [pd.read_excel(test_file)[self._COLUMN_NAMES] for test_file in data_files]
        frames = [frame for frame in frames if not frame.empty]
        if frames:
            test_steps_df = pd.concat(frames, ignore_index=True)
        else:
            test_steps_df = pd.DataFrame(columns=self._COLUMN_NAMES)
        print("Done!")
        print("Shape of data: ", test_steps_df.shape)
        return test_steps_df

    @classmethod
    def _clean_text(cls, text: str) -> str:
        """Return ``text`` lower-cased with URLs, digits, dashes and punctuation removed."""
        text = text.lower()
        # Replace urls with the keyword 'URL' (done after lower-casing, so the keyword stays upper case)
        text = cls._URL_PATTERN.sub(' URL ', text)
        # Remove underscores
        text = text.replace('_', ' ').strip()
        # Remove digits and words with digits
        text = cls._DIGIT_WORD_PATTERN.sub('', text)
        # Remove dashes that touch a space; dashes inside a word are kept
        text = text.replace(' - ', ' ')
        text = text.replace('- ', ' ')
        text = text.replace(' -', ' ')
        # Remove the remaining punctuations
        text = cls._PUNCTUATION_PATTERN.sub(' ', text)
        # Remove extra spaces
        return cls._MULTI_SPACE_PATTERN.sub(' ', text)

    def _clean_and_tokenize(self, df: pd.DataFrame, field: str) -> pd.DataFrame:
        """Clean and tokenize column ``field`` of ``df`` in place; null cells are left as they are."""
        toknz = nltk.tokenize.TweetTokenizer()  # use tweet tokenizer as it does not split apostrophes

        def clean_and_tokenize(value):
            if pd.isnull(value):
                return value
            return toknz.tokenize(self._clean_text(value))

        df[field] = df[field].apply(clean_and_tokenize)
        return df

    # Function to pre-process field (test case name, objective, or steps), such as tokenization, stop word removal, lemmatization
    def preprocess_test_data(self, df: pd.DataFrame, field: str) -> pd.DataFrame:
        """Fully pre-process one text column: clean, tokenize, drop stop words, lemmatize.

        The column is overwritten in the dataframe that is passed in, and that same
        dataframe is returned. Null cells are left untouched.

        Args:
            df: Dataframe holding the column to process.
            field: Name of the text column (test case name, objective, or steps).

        Returns:
            ``df``, with ``field`` replaced by lists of lemmatized tokens.
        """
        print("Pre-processing test cases...")
        df = self._clean_and_tokenize(df, field)

        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        def drop_stopwords_and_lemmatize(tokens):
            # Null cells (and empty token lists) are returned as they are
            if np.all(pd.isnull(tokens)):
                return tokens
            return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

        df[field] = df[field].apply(drop_stopwords_and_lemmatize)

        print("Pre-processing finished!")
        return df

    # Function to pre-process field (test case name, objective, or steps) without 'stop word removal' and 'lemmatization' (to be used in the language models)
    def preprocess_lightweight_test_data(self, df: pd.DataFrame, field: str) -> pd.DataFrame:
        """Clean and tokenize one text column, keeping stop words and word forms.

        The column is overwritten in the dataframe that is passed in, and that same
        dataframe is returned. Null cells are left untouched.

        Args:
            df: Dataframe holding the column to process.
            field: Name of the text column (test case name, objective, or steps).

        Returns:
            ``df``, with ``field`` replaced by lists of tokens.
        """
        print("Pre-processing test cases...")
        df = self._clean_and_tokenize(df, field)
        print("Lighweight pre-processing finished!")
        return df

In [177]:
# Data directory: root folder that load_data() walks recursively looking for .xlsx files
data_dir = '/dataset/unpreprocessed_data'

# Instantiate data manipulator class (only stores the directory; nothing is read yet)
data_manipulator = DataManipulator(data_dir)

In [None]:
# Load the data from every spreadsheet under data_dir into one dataframe and visualize head
test_step_df = data_manipulator.load_data()
test_step_df.head()

In [179]:
# Make a copy of the data so that we can apply the lightweight version of the pre-processing steps.
# The pre-processing methods overwrite the processed column of the dataframe they are given,
# so the lightweight variant needs its own copy of the raw data.
test_step_stopwords_df = test_step_df.copy()

In [None]:
# Full pre-processing of each free-text column, one column at a time
for text_field in ("Name", "Objective", "Step"):
    test_step_df = data_manipulator.preprocess_test_data(test_step_df, text_field)
test_step_df.head()

In [None]:
# Lightweight pre-processing of each free-text column (stop words are kept, no lemmatization)
for text_field in ("Name", "Objective", "Step"):
    test_step_stopwords_df = data_manipulator.preprocess_lightweight_test_data(test_step_stopwords_df, text_field)
test_step_stopwords_df.head()

#### Concatenate test steps, names, and objectives to build the dataset

In [None]:
# Build one flat list of token lists: all steps first, then all names, then all objectives
test_steps_data = [
    tokens
    for column in ('Step', 'Name', 'Objective')
    for tokens in test_step_df[column].tolist()
]
print(len(test_steps_data))

In [185]:
# Add beginning and end of sentence indicators.
# Null cells are passed through the pre-processing unchanged, so they are not token
# lists and cannot be concatenated with the markers; such entries are left out.
test_steps_data[:] = [
    ['[START]'] + step + ['[END]']
    for step in test_steps_data
    if isinstance(step, list)
]

# Save tokenized steps (one comma-separated step per line)
tokenized_steps = test_steps_data
output_path = "training_testing_data/with_name_objective/tokenized_steps.txt"
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # do not fail on a fresh checkout
with open(output_path, 'w', encoding='utf-8') as write_handle:
    write_handle.writelines(','.join(step) + '\n' for step in tokenized_steps)

In [None]:
# Same flat list for the data that keeps stop words: steps, then names, then objectives
test_steps_data_stopwords = [
    tokens
    for column in ('Step', 'Name', 'Objective')
    for tokens in test_step_stopwords_df[column].tolist()
]
print(len(test_steps_data_stopwords))

In [188]:
# Add beginning and end of sentence indicators.
# Null cells are passed through the pre-processing unchanged, so they are not token
# lists and cannot be concatenated with the markers; such entries are left out.
test_steps_data_stopwords[:] = [
    ['[START]'] + step + ['[END]']
    for step in test_steps_data_stopwords
    if isinstance(step, list)
]

# Save tokenized stopwords steps (one comma-separated step per line)
tokenized_steps_stopwords = test_steps_data_stopwords
output_path = "training_testing_data/with_name_objective/tokenized_steps_stopwords.txt"
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # do not fail on a fresh checkout
with open(output_path, 'w', encoding='utf-8') as write_handle:
    write_handle.writelines(','.join(step) + '\n' for step in tokenized_steps_stopwords)

#### Remove duplicates

In [None]:
# Drop repeated steps: sort in place so equal steps become adjacent, then keep one per group
print(len(tokenized_steps))
tokenized_steps.sort()
tokenized_steps = [unique_step for unique_step, _group in itertools.groupby(tokenized_steps)]
print(len(tokenized_steps))

In [192]:
# Save unique tokenized steps (one comma-separated step per line)
output_path = "training_testing_data/with_name_objective/tokenized_steps_unique.txt"
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # do not fail on a fresh checkout
# Explicit encoding so the file does not depend on the platform default
with open(output_path, 'w', encoding='utf-8') as write_handle:
    write_handle.writelines(','.join(step) + '\n' for step in tokenized_steps)

In [None]:
# Drop repeated steps from the stop-word data: sort in place, then keep one step per group
print(len(tokenized_steps_stopwords))
tokenized_steps_stopwords.sort()
tokenized_steps_stopwords = [unique_step for unique_step, _group in itertools.groupby(tokenized_steps_stopwords)]
print(len(tokenized_steps_stopwords))

In [195]:
# Save unique tokenized steps with stopwords (one comma-separated step per line)
output_path = "training_testing_data/with_name_objective/tokenized_steps_stopwords_unique.txt"
os.makedirs(os.path.dirname(output_path), exist_ok=True)  # do not fail on a fresh checkout
# Explicit encoding so the file does not depend on the platform default
with open(output_path, 'w', encoding='utf-8') as write_handle:
    write_handle.writelines(','.join(step) + '\n' for step in tokenized_steps_stopwords)

#### Split into training and testing sets

In [None]:
# Split into training and testing and save as text file
random.seed(12)  # fixed seed so the shuffle, and therefore the split, is reproducible
shuffled_data = random.sample(tokenized_steps, k=len(tokenized_steps))
training_size = int(len(shuffled_data) * 0.8)

# Get training and testing data (first 80% / remaining 20% of the shuffled steps)
training_data = shuffled_data[:training_size]
testing_data = shuffled_data[training_size:]

# Get validation data from training data (only for BERT fine-tuning)
training_bert_size = int(len(training_data) * 0.8)
training_data_bert = training_data[:training_bert_size]
validation_data_bert = training_data[training_bert_size:]

# Remove [START] and [END] as BERT adds its own sentence beginning and ending tokens
training_data_bert = [x[1:-1] for x in training_data_bert]
validation_data_bert = [x[1:-1] for x in validation_data_bert]

# Save all the obtained sets, one comma-separated step per line
output_dir = 'training_testing_data/with_name_objective'
os.makedirs(output_dir, exist_ok=True)  # do not fail on a fresh checkout
for file_name, token_rows in (
    ('training_data.txt', training_data),
    ('testing_data.txt', testing_data),
    ('training_data_bert.txt', training_data_bert),
    ('validation_data_bert.txt', validation_data_bert),
):
    # Explicit encoding so the files do not depend on the platform default
    with open(output_dir + '/' + file_name, 'w', encoding='utf-8') as write_handle:
        write_handle.writelines(','.join(step) + '\n' for step in token_rows)

In [None]:
# Split into training and testing for data with stop words and save as text file
random.seed(12)  # fixed seed so the shuffle, and therefore the split, is reproducible
shuffled_data = random.sample(tokenized_steps_stopwords, k=len(tokenized_steps_stopwords))
training_size = int(len(shuffled_data) * 0.8)

# Get training and testing data (first 80% / remaining 20% of the shuffled steps)
training_data_stopwords = shuffled_data[:training_size]
testing_data_stopwords = shuffled_data[training_size:]

# Get validation data from training data (only for BERT fine-tuning)
training_bert_size = int(len(training_data_stopwords) * 0.8)
training_data_stopwords_bert = training_data_stopwords[:training_bert_size]
validation_data_stopwords_bert = training_data_stopwords[training_bert_size:]

# Remove [START] and [END] as BERT adds its own sentence beginning and ending tokens
training_data_stopwords_bert = [x[1:-1] for x in training_data_stopwords_bert]
validation_data_stopwords_bert = [x[1:-1] for x in validation_data_stopwords_bert]

# Save all the obtained sets, one comma-separated step per line
output_dir = 'training_testing_data/with_name_objective'
os.makedirs(output_dir, exist_ok=True)  # do not fail on a fresh checkout
for file_name, token_rows in (
    ('training_data_stopwords.txt', training_data_stopwords),
    ('testing_data_stopwords.txt', testing_data_stopwords),
    ('training_data_stopwords_bert.txt', training_data_stopwords_bert),
    ('validation_data_stopwords_bert.txt', validation_data_stopwords_bert),
):
    # Explicit encoding so the files do not depend on the platform default
    with open(output_dir + '/' + file_name, 'w', encoding='utf-8') as write_handle:
        write_handle.writelines(','.join(step) + '\n' for step in token_rows)

#### Save pandas df with existing test cases

In [None]:
# Persist the dataframe of existing test cases as a pickle file.
# NOTE(review): `existing_test_cases_df` is not assigned in any of the visible cells of this
# notebook, so this cell raises NameError unless the name is defined elsewhere — confirm
# which dataframe is meant here (possibly `test_step_df`).
existing_test_cases_df.to_pickle('training_testing_data/existing_test_cases.pkl')