## Baseline 1 for test case similarity

Check if test cases have the exact same steps

In [141]:
# Import libraries
import os
import gc
import pandas as pd
import numpy as np
import math
import statistics as st
import re
import string
import time
import matplotlib.pyplot as plt
from collections import defaultdict  # For word frequency

from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer 
import nltk

### Load clusters obtained by the best approach (ensemble)

In [2]:
experiment_results_dir = 'experiments/'

In [3]:
approach_ensemble_dir = experiment_results_dir + 'results_approach_ensemble/'

In [4]:
# Build dictionary to indicate the cluster ID of each test step
approach_ensemble_dict = {}
cluster_file = open(approach_ensemble_dir + 'ensemble_cluster_labels.txt')
for line in cluster_file:
    full_line = line.split()
    cluster_id = int(full_line[0].replace('[', '').replace(']', '').replace(':', ''))
    step_id_list = full_line[1].split(',')
    for step_id in step_id_list:
        approach_ensemble_dict[int(step_id)] = cluster_id

In [5]:
print("Number of test steps which were clustered by the approach: ", len(approach_ensemble_dict))

Number of test steps which were clustered by the approach:  15644


### Data preprocessing functions

In [6]:
# Function to compute number of unique words in df
def get_number_unique_words(df):
    words_list = list()
    test_steps = list(df["Steps"])
    for step in test_steps:
        for word in step:
            words_list.append(word)
    number_unique_words = len(set(words_list))
    return number_unique_words

In [8]:
# Function to get list of words that occur less than a certain number of times
def get_word_frequency(df):
    words_list = list()
    test_steps = list(df["Steps"])
    for step in test_steps:
        for word in step:
            words_list.append(word)
    unique_words_list = set(words_list)
    word_occurrence_dict = {}
    for each_word in unique_words_list:
        word_occurrence_dict[each_word] = 0

    for step in test_steps:
        for word in step:
            word_occurrence_dict[word] += 1
            
    ten_times_occurrence_words = list()
    # get list of words that occur only once
    for word, occurrence in word_occurrence_dict.items():
        if occurrence < 2:
            ten_times_occurrence_words.append(word)

    return ten_times_occurrence_words

In [10]:
# Function to remove problematic/mispelled words from vocabulary
def remove_problematic_words(df):
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)
    
    # load file with problematic words that exist in the test data
    problematic_words = open('word2vec_vocab_problematic.txt', 'r')
    problematic_words_list = list()
    for word in problematic_words:
        problematic_words_list.append(word.lstrip().rstrip())
    
    for index, row in df.iterrows():
        step = row["Steps"]
        df.loc[index]["Steps"] = [elem for elem in step if not elem in problematic_words_list]
        
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after removing problematic words: ", number_unique_words)

In [11]:
# function to fix problematic/mispelled words from vocabulary
def fix_problematic_words(df):
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)
    
    # load file with problematic words that exist in the test data
    problematic_words = open('word2vec_vocab_to_fix.txt', 'r')
    problematic_words_dict = {}
    for line in problematic_words:
        full_line = line.split(':')
        try:
            problematic_words_dict[full_line[0]] = [x.replace('\n', '') for x in full_line[1].split(',')]
        except:
            problematic_words_dict[full_line[0]] = full_line[1].replace('\n', '')
    
    for index, row in df.iterrows():
        step = row["Steps"]
        modified_step = list()
        for word in step:
            if word in problematic_words_dict:
                modified_step.extend(problematic_words_dict[word])
            else:
                modified_step.append(word)
        df.loc[index]["Steps"] = modified_step 
        
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after fixing problematic words: ", number_unique_words)

In [8]:
def preprocess_clean_data(df):

    # Preprocessing and clean test steps
    print("Cleaning test step field...")
    
    df["Steps"] = df["Steps"].apply(lambda x: re.sub(r'http\S+', 'URL', x))
    df["Steps"] = df["Steps"].apply(lambda x: re.sub('\/[\w-]*', '', x))
    df["Steps"] = df["Steps"].apply(lambda x: re.sub(r'\{[^)]*\}', '', x))

    # lowercase the step descriptions
    df["Steps"] = df["Steps"].apply(lambda x: x.lower())

    # remove digits and words with digits
    df["Steps"] = df["Steps"].apply(lambda x: re.sub('\w*\d\w*','', x))

    # remove punctuations
    df["Steps"] = df["Steps"].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x))

    # remove extra spaces
    df["Steps"] = df["Steps"].apply(lambda x: re.sub(' +',' ',x))

    # tokenization
    df["Steps"] = df["Steps"].apply(lambda x: TweetTokenizer().tokenize(x))
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)

    remove_problematic_words(df)
    fix_problematic_words(df)
    
    # stopword removal
    stop_words = set(stopwords.words('english'))
    df["Steps"] = df["Steps"].apply(lambda x: [w for w in x if not w in stop_words])
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words in test steps after stopword removal: ", number_unique_words)

    # lemmatization
    lemmatizer = WordNetLemmatizer() 
    df["Steps"] = df["Steps"].apply(lambda x: [lemmatizer.lemmatize(w) for w in x])
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words in test steps after lemmatization: ", number_unique_words)

    # remove words that occur less than 10 times
    ten_times_occurrence_words = get_word_frequency(df)
    print("Number of words that occurred less than 10 times in test steps: ", len(ten_times_occurrence_words))

    for index, row in df.iterrows():
        current_test_step = row["Steps"]
        list_words_to_remove = list()
        for word in current_test_step:
            if word in ten_times_occurrence_words:
                list_words_to_remove.append(word)

        test_steps_df.loc[index]["Steps"] = [elem for elem in current_test_step if not elem in list_words_to_remove]

    print("Dataset size after preprocessing: " , df.shape)

### Read and preprocess files with test cases and build dataframe

In [20]:
# Get data directory and list of xlsx files
current_dir = os.getcwd() 
parent_dir = os.path.dirname(current_dir) + "\\filtered_data\\"
xlsxfiles = [os.path.join(root, name)
             for root, dirs, files in os.walk(parent_dir)
             for name in files
             if name.endswith((".xlsx"))]

In [21]:
# Declare pandas df to be populated
column_names = ["Type", "Key", "Case_Name", "Step_ID", "Steps"]
test_steps_df = pd.DataFrame(columns = column_names)

# Index to add data to the df
index_to_add = 0

print("Reading input data...")   
for test_file in xlsxfiles:
    # load data and iterate through it
    test_data_df = pd.read_excel(test_file)
    for index, row in test_data_df.iterrows():
        current_type = row["Type"]
        current_key = row["Key"]
        current_name = row["Case_Name"]
        current_step_id = row["Step_ID"]
        current_steps = row["Steps"]
        test_steps_df.loc[index_to_add] = [current_type, current_key, current_name, current_step_id, current_steps]
        index_to_add += 1

print("Done!")
print("Shape of data => ", test_steps_df.shape)

Reading input data...
Done!
Shape of data =>  (15668, 5)


In [22]:
# Call preprocessing function
preprocess_clean_data(test_steps_df)

Cleaning test case name field...
Dataset size before preprocessing:  (15668, 5)
Number of unique words across all test names:  1519
Number of unique words in test names after stopword removal:  1447
Number of words that occurred only once in test case names:  164
Number of unique words in test names in the end:  1138
Dataset size after preprocessing:  (15668, 5)


In [25]:
# Build tuples with (step_id, step_text) - used to retrieve the step ID in the end (after the clustering) - and get only test steps for clustering
step_id_text_tuple_list = list()
test_steps_clustering_list = list()
for index, row in test_steps_df.iterrows():
    step_id = row["Step_ID"]
    step_text = row["Steps"]
    step_id_text_tuple_list.append((step_id,step_text))

    temp_list = list()
    if isinstance(row["Steps"], list):
        for elem in row["Steps"]:
            temp_list.append(elem)
    else:
        if isinstance(row["Steps"], str):
            temp_list.append(row["Steps"])
        
    # Build list of lists of tokens (words)
    test_steps_clustering_list.append(temp_list)
    
print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))

Length of list of tuples: 15668
Length of list with test steps:  15668


In [26]:
# Remove empty steps
index = 0
steps_to_remove = list()
for step in test_steps_clustering_list:
    if len(step) == 0:
        steps_to_remove.append(index)
    index += 1

step_id_text_tuple_list = [step_id_text_tuple_list[index] for index in range(len(step_id_text_tuple_list)) if not index in steps_to_remove]
test_steps_clustering_list = [test_steps_clustering_list[index] for index in range(len(test_steps_clustering_list)) if not index in steps_to_remove]
print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))

Length of list of tuples: 15644
Length of list with test steps:  15644


### Build dictionary of [test_cases] x [test steps] to indicate which steps each test case has

In [166]:
test_case_steps_dict = {}
for index, row in test_steps_df.iterrows():
    test_case_key = row['Key']
    test_step_id = row['Step_ID']
    test_steps = row['Steps']
    if len(test_steps) == 0:
        continue
        
    if test_case_key in test_case_steps_dict:
        existing_list = test_case_steps_dict[test_case_key]
        existing_list.append(test_steps)
        test_case_steps_dict[test_case_key] = existing_list
    else:
        test_case_steps_dict[test_case_key] = [test_steps]
print("Number of test cases: ", len(test_case_steps_dict))

Number of test cases:  3323


In [169]:
test_case_steps_keys_list = list(test_case_steps_dict.keys())

In [170]:
duplicate_test_case_tuples = list()
for i in range(len(test_case_steps_keys_list)-1):
    for j in range(i+1, len(test_case_steps_keys_list)):
        case_key_1 = test_case_steps_keys_list[i]
        case_key_2 = test_case_steps_keys_list[j]
        step_list_1 = test_case_steps_dict[case_key_1]
        step_list_2 = test_case_steps_dict[case_key_2]
        step_list_1 = [tuple(x) for x in step_list_1]
        step_list_2 = [tuple(x) for x in step_list_2]
        if set(step_list_1) == set(step_list_2):
            duplicate_test_case_tuples.append((i,j))

In [172]:
# Merge test cases that are similar
duplicate_test_cases_list = list()
for test_case_tuple in duplicate_test_case_tuples:
    index_1 = test_case_tuple[0]
    index_2 = test_case_tuple[1]
    found = False
    for test_case_set in duplicate_test_cases_list:
        if (index_1 in test_case_set) or (index_2 in test_case_set):
            test_case_set.add(index_1)
            test_case_set.add(index_2)
            found = True
            break
    if not found:
        temp_set = set()
        temp_set.add(index_1)
        temp_set.add(index_2)
        duplicate_test_cases_list.append(temp_set)
print("Number of groups of similar test cases: ", len(duplicate_test_cases_list))

Number of groups of similar test cases:  329


In [174]:
baseline_1_dict = {}
cluster_id = 0
indices_of_similar_cases = list()
for each_set in duplicate_test_cases_list:
    for elem in each_set:
        indices_of_similar_cases.append(elem)
        case_key = test_case_steps_keys_list[elem]
        baseline_1_dict[case_key] = cluster_id
    cluster_id += 1
for elem in range(len(test_case_steps_keys_list)):
    if elem not in indices_of_similar_cases:
        case_key = test_case_steps_keys_list[elem]
        baseline_1_dict[case_key] = cluster_id
        cluster_id += 1

In [221]:
print("Number of test cases that have at least another similar case: ", len(indices_of_similar_cases))
print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(indices_of_similar_cases) ))

Number of test cases that have at least another similar case:  1558
Number of test cases that do NOT have any similar case:  1765


In [219]:
file_name = 'results_baseline_1/baseline_1_similar_test_cases.txt'
output_file = open(file_name, 'w')

In [220]:
counter = 0
for key in baseline_1_dict:
    output_file.write(key + ":" + str(baseline_1_dict[key]) + "\n")
output_file.close()