## Technique 4 for test case similarity

* Technique 4 combines cosine similarity (of test case representation vectors) with distance between test case name embeddings (using Word2Vec)

In [1]:
# import libraries
import os
import gc
import pandas as pd
import numpy as np
import math
import statistics as st
import re
import string
import time
import matplotlib.pyplot as plt
from collections import defaultdict  # For word frequency

from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer 
import nltk

### Load clusters obtained by the best approach (ensemble)

In [2]:
experiment_results_dir = 'experiments/'

In [3]:
approach_ensemble_dir = experiment_results_dir + 'results_approach_ensemble/'

In [4]:
# Build dictionary to indicate the cluster ID of each test step
approach_ensemble_dict = {}
cluster_file = open(approach_ensemble_dir + 'ensemble_cluster_labels.txt')
for line in cluster_file:
    full_line = line.split()
    cluster_id = int(full_line[0].replace('[', '').replace(']', '').replace(':', ''))
    step_id_list = full_line[1].split(',')
    for step_id in step_id_list:
        approach_ensemble_dict[int(step_id)] = cluster_id

In [5]:
print("Number of test steps which were clustered by the approach: ", len(approach_ensemble_dict))

Number of test steps which were clustered by the approach:  15644


### Data preprocessing functions

In [6]:
# Function to compute number of unique words in df
def get_number_unique_words(df):
    words_list = list()
    test_steps = list(df["Steps"])
    for step in test_steps:
        for word in step:
            words_list.append(word)
    number_unique_words = len(set(words_list))
    return number_unique_words

In [8]:
# Function to get list of words that occur less than a certain number of times
def get_word_frequency(df):
    words_list = list()
    test_steps = list(df["Steps"])
    for step in test_steps:
        for word in step:
            words_list.append(word)
    unique_words_list = set(words_list)
    word_occurrence_dict = {}
    for each_word in unique_words_list:
        word_occurrence_dict[each_word] = 0

    for step in test_steps:
        for word in step:
            word_occurrence_dict[word] += 1
            
    ten_times_occurrence_words = list()
    # get list of words that occur only once
    for word, occurrence in word_occurrence_dict.items():
        if occurrence < 2:
            ten_times_occurrence_words.append(word)

    return ten_times_occurrence_words

In [10]:
# Function to remove problematic/mispelled words from vocabulary
def remove_problematic_words(df):
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)
    
    # load file with problematic words that exist in the test data
    problematic_words = open('word2vec_vocab_problematic.txt', 'r')
    problematic_words_list = list()
    for word in problematic_words:
        problematic_words_list.append(word.lstrip().rstrip())
    
    for index, row in df.iterrows():
        step = row["Steps"]
        df.loc[index]["Steps"] = [elem for elem in step if not elem in problematic_words_list]
        
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after removing problematic words: ", number_unique_words)

In [11]:
# function to fix problematic/mispelled words from vocabulary
def fix_problematic_words(df):
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)
    
    # load file with problematic words that exist in the test data
    problematic_words = open('word2vec_vocab_to_fix.txt', 'r')
    problematic_words_dict = {}
    for line in problematic_words:
        full_line = line.split(':')
        try:
            problematic_words_dict[full_line[0]] = [x.replace('\n', '') for x in full_line[1].split(',')]
        except:
            problematic_words_dict[full_line[0]] = full_line[1].replace('\n', '')
    
    for index, row in df.iterrows():
        step = row["Steps"]
        modified_step = list()
        for word in step:
            if word in problematic_words_dict:
                modified_step.extend(problematic_words_dict[word])
            else:
                modified_step.append(word)
        df.loc[index]["Steps"] = modified_step 
        
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after fixing problematic words: ", number_unique_words)

In [8]:
def preprocess_clean_data(df):

    # Preprocessing and clean test steps
    print("Cleaning test step field...")
    
    df["Steps"] = df["Steps"].apply(lambda x: re.sub(r'http\S+', 'URL', x))
    df["Steps"] = df["Steps"].apply(lambda x: re.sub('\/[\w-]*', '', x))
    df["Steps"] = df["Steps"].apply(lambda x: re.sub(r'\{[^)]*\}', '', x))

    # lowercase the step descriptions
    df["Steps"] = df["Steps"].apply(lambda x: x.lower())

    # remove digits and words with digits
    df["Steps"] = df["Steps"].apply(lambda x: re.sub('\w*\d\w*','', x))

    # remove punctuations
    df["Steps"] = df["Steps"].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x))

    # remove extra spaces
    df["Steps"] = df["Steps"].apply(lambda x: re.sub(' +',' ',x))

    # tokenization
    df["Steps"] = df["Steps"].apply(lambda x: TweetTokenizer().tokenize(x))
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)

    remove_problematic_words(df)
    fix_problematic_words(df)
    
    # stopword removal
    stop_words = set(stopwords.words('english'))
    df["Steps"] = df["Steps"].apply(lambda x: [w for w in x if not w in stop_words])
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words in test steps after stopword removal: ", number_unique_words)

    # lemmatization
    lemmatizer = WordNetLemmatizer() 
    df["Steps"] = df["Steps"].apply(lambda x: [lemmatizer.lemmatize(w) for w in x])
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words in test steps after lemmatization: ", number_unique_words)

    # remove words that occur less than 10 times
    ten_times_occurrence_words = get_word_frequency(df)
    print("Number of words that occurred less than 10 times in test steps: ", len(ten_times_occurrence_words))

    for index, row in df.iterrows():
        current_test_step = row["Steps"]
        list_words_to_remove = list()
        for word in current_test_step:
            if word in ten_times_occurrence_words:
                list_words_to_remove.append(word)

        test_steps_df.loc[index]["Steps"] = [elem for elem in current_test_step if not elem in list_words_to_remove]

    print("Dataset size after preprocessing: " , df.shape)

### Read and preprocess files with test cases and build dataframe

In [20]:
# Get data directory and list of xlsx files
current_dir = os.getcwd() 
parent_dir = os.path.dirname(current_dir) + "\\filtered_data\\"
xlsxfiles = [os.path.join(root, name)
             for root, dirs, files in os.walk(parent_dir)
             for name in files
             if name.endswith((".xlsx"))]

In [21]:
# Declare pandas df to be populated
column_names = ["Type", "Key", "Case_Name", "Step_ID", "Steps"]
test_steps_df = pd.DataFrame(columns = column_names)

# Index to add data to the df
index_to_add = 0

print("Reading input data...")   
for test_file in xlsxfiles:
    # load data and iterate through it
    test_data_df = pd.read_excel(test_file)
    for index, row in test_data_df.iterrows():
        current_type = row["Type"]
        current_key = row["Key"]
        current_name = row["Case_Name"]
        current_step_id = row["Step_ID"]
        current_steps = row["Steps"]
        test_steps_df.loc[index_to_add] = [current_type, current_key, current_name, current_step_id, current_steps]
        index_to_add += 1

print("Done!")
print("Shape of data => ", test_steps_df.shape)

Reading input data...
Done!
Shape of data =>  (15668, 5)


In [22]:
# Call preprocessing function
preprocess_clean_data(test_steps_df)

Cleaning test case name field...
Dataset size before preprocessing:  (15668, 5)
Number of unique words across all test names:  1519
Number of unique words in test names after stopword removal:  1447
Number of words that occurred only once in test case names:  164
Number of unique words in test names in the end:  1138
Dataset size after preprocessing:  (15668, 5)


In [25]:
# Build tuples with (step_id, step_text) - used to retrieve the step ID in the end (after the clustering) - and get only test steps for clustering
step_id_text_tuple_list = list()
test_steps_clustering_list = list()
for index, row in test_steps_df.iterrows():
    step_id = row["Step_ID"]
    step_text = row["Steps"]
    step_id_text_tuple_list.append((step_id,step_text))

    temp_list = list()
    if isinstance(row["Steps"], list):
        for elem in row["Steps"]:
            temp_list.append(elem)
    else:
        if isinstance(row["Steps"], str):
            temp_list.append(row["Steps"])
        
    # Build list of lists of tokens (words)
    test_steps_clustering_list.append(temp_list)
    
print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))

Length of list of tuples: 15668
Length of list with test steps:  15668


In [26]:
# Remove empty steps
index = 0
steps_to_remove = list()
for step in test_steps_clustering_list:
    if len(step) == 0:
        steps_to_remove.append(index)
    index += 1

step_id_text_tuple_list = [step_id_text_tuple_list[index] for index in range(len(step_id_text_tuple_list)) if not index in steps_to_remove]
test_steps_clustering_list = [test_steps_clustering_list[index] for index in range(len(test_steps_clustering_list)) if not index in steps_to_remove]
print("Length of list of tuples:" , len(step_id_text_tuple_list))
print("Length of list with test steps: " , len(test_steps_clustering_list))

Length of list of tuples: 15644
Length of list with test steps:  15644


### Build numeric matrix of [test_cases] x [clusters] to indicate which clusters are related to each test case and how many steps the test case has in a cluster

In [24]:
test_case_steps_dict = {}
for index, row in test_steps_df.iterrows():
    test_case_key = row['Key']
    test_step_id = row['Step_ID']
    test_steps = row['Steps']
    if len(test_steps) == 0:
        continue
    if test_case_key in test_case_steps_dict:
        existing_list = test_case_steps_dict[test_case_key]
        existing_list.append(test_step_id)
        test_case_steps_dict[test_case_key] = existing_list
    else:
        test_case_steps_dict[test_case_key] = [test_step_id]
print("Number of test cases: ", len(test_case_steps_dict))

Number of test cases:  3323


In [25]:
numeric_matrix = np.zeros((len(test_case_steps_dict),number_clusters))

In [26]:
row_index = 0
for test_case_key in test_case_steps_dict:
    steps_ids_list = test_case_steps_dict[test_case_key]
    cluster_ids_to_fill_list = list()
    for each_step_id in steps_ids_list:
        cluster_id = approach_ensemble_dict[each_step_id]
        cluster_ids_to_fill_list.append(cluster_id)
    
    # Fill in matrix with specific column indices (cluster ids)
    tuple_count_clusters = list()
    cluster_ids_set = set(cluster_ids_to_fill_list)
    for elem in cluster_ids_set:
        counter = 0
        for cluster_id in cluster_ids_to_fill_list:
            if cluster_id == elem:
                counter += 1
        tuple_count_clusters.append((elem,counter))

    first_tuple_elements = [a_tuple[0] for a_tuple in tuple_count_clusters] # indices of columns to be filled in
    second_tuple_elements = [a_tuple[1] for a_tuple in tuple_count_clusters] # number of steps in each cluster (Data to be filled in)

    numeric_matrix[row_index,first_tuple_elements] = second_tuple_elements
    row_index += 1

In [28]:
row_names = [test_case_key for test_case_key in test_case_steps_dict]
numeric_matrix_df = pd.DataFrame(numeric_matrix, index=row_names)

In [None]:
# Check numeric matrix as a dataframe
numeric_matrix_df

### Compute cosine similarity score and build distance matrix with this score

In [32]:
simil_matrix = np.zeros((len(test_case_steps_dict),len(test_case_steps_dict)))

In [33]:
row_names = [test_case_key for test_case_key in test_case_steps_dict]
col_names = row_names
step_simil_matrix_df = pd.DataFrame(simil_matrix, index = row_names, columns = col_names)

In [None]:
# Check similarity matrix as dataframe
step_simil_matrix_df 

In [None]:
test_case_dict_keys_list = list(test_case_steps_dict.keys())
for i in range(len(test_case_steps_dict)):
    print(i)
    for j in range(i, len(test_case_steps_dict)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[j]
        computed_dist = 1 - spatial.distance.cosine(np.array(numeric_matrix_df.loc[case_key_1,:].tolist()), np.array(numeric_matrix_df.loc[case_key_2,:].tolist()))  
        step_dist_matrix_df.loc[case_key_1,case_key_2] = step_dist_matrix_df.loc[case_key_2,case_key_1] = computed_dist

In [None]:
step_simil_matrix_df

In [97]:
# The shape of step simil matrix should be [# test cases] x [# test cases] = [3323] x [3323]
step_dist_matrix_df.shape

(3323, 3323)

### Embed test case names and compute distance between them

In [98]:
# Load the word2vec model
my_model = Word2Vec.load('results_approach_1/appr_1_my_model.model')

In [99]:
# Function to compute number of unique words in df ('test case name' field)
def get_number_unique_words_name(df):
    words_list = list()
    test_names = list(df["Case_Name"])
    for name in test_names:
        for word in name:
            words_list.append(word)
    number_unique_words = len(set(words_list))
    return number_unique_words

In [100]:
# Function to get list of words that occur less than a certain number of times ('test case name' field)
def get_word_frequency_name(df):
    words_list = list()
    test_names = list(df["Case_Name"])
    for name in test_names:
        for word in name:
            words_list.append(word)
    unique_words_list = set(words_list)
    word_occurrence_dict = {}
    for each_word in unique_words_list:
        word_occurrence_dict[each_word] = 0

    for name in test_names:
        for word in name:
            word_occurrence_dict[word] += 1
            
    ten_times_occurrence_words = list()
    # get list of words that occur only once
    for word, occurrence in word_occurrence_dict.items():
        if occurrence < 2:
            ten_times_occurrence_words.append(word)

    return ten_times_occurrence_words

In [10]:
# Function to remove problematic/mispelled words from vocabulary
def remove_problematic_words(df):
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)
    
    # load file with problematic words that exist in the test data
    problematic_words = open('word2vec_vocab_problematic.txt', 'r')
    problematic_words_list = list()
    for word in problematic_words:
        problematic_words_list.append(word.lstrip().rstrip())
    
    for index, row in df.iterrows():
        step = row["Steps"]
        df.loc[index]["Steps"] = [elem for elem in step if not elem in problematic_words_list]
        
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after removing problematic words: ", number_unique_words)

In [11]:
# function to fix problematic/mispelled words from vocabulary
def fix_problematic_words(df):
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps: ", number_unique_words)
    
    # load file with problematic words that exist in the test data
    problematic_words = open('word2vec_vocab_to_fix.txt', 'r')
    problematic_words_dict = {}
    for line in problematic_words:
        full_line = line.split(':')
        try:
            problematic_words_dict[full_line[0]] = [x.replace('\n', '') for x in full_line[1].split(',')]
        except:
            problematic_words_dict[full_line[0]] = full_line[1].replace('\n', '')
    
    for index, row in df.iterrows():
        step = row["Steps"]
        modified_step = list()
        for word in step:
            if word in problematic_words_dict:
                modified_step.extend(problematic_words_dict[word])
            else:
                modified_step.append(word)
        df.loc[index]["Steps"] = modified_step 
        
    number_unique_words = get_number_unique_words(df)
    print("Number of unique words across all test steps after fixing problematic words: ", number_unique_words)

### Data preprocessing and cleaning

In [103]:
def preprocess_clean_data(df):
    print("Cleaning test case name field...")
    print("Dataset size before preprocessing: " , df.shape)
    
    # preprocessing and clean test name
    
    # replace url and similar structures (e.g, paths) with the keyword 'URL'
    df["Case_Name"] = df["Case_Name"].apply(lambda x: re.sub(r'http\S+', 'URL', x))
    df["Case_Name"] = df["Case_Name"].apply(lambda x: re.sub('\/[\w-]*', '', x))
    df["Case_Name"] = df["Case_Name"].apply(lambda x: re.sub(r'\{[^)]*\}', '', x))
    
    # lowercase the step descriptions
    df["Case_Name"] = df["Case_Name"].apply(lambda x: x.lower())
    
    # remove digits and words with digits
    df["Case_Name"] = df["Case_Name"].apply(lambda x: re.sub('\w*\d\w*','', x))
    
    # remove punctuations
    df["Case_Name"] = df["Case_Name"].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x))

    # remove extra spaces
    df["Case_Name"] = df["Case_Name"].apply(lambda x: re.sub(' +',' ',x))

    # tokenization
    df["Case_Name"] = df["Case_Name"].apply(lambda x: TweetTokenizer().tokenize(x))
    number_unique_words = get_number_unique_words_name(df)
    print("Number of unique words across all test names: ", number_unique_words)
    
    # stopword removal
    stop_words = set(stopwords.words('english'))
    df["Case_Name"] = df["Case_Name"].apply(lambda x: [w for w in x if not w in stop_words])
    number_unique_words = get_number_unique_words_name(df)
    print("Number of unique words in test names after stopword removal: ", number_unique_words)
    
    # lemmatization
    lemmatizer = WordNetLemmatizer() 
    df["Case_Name"] = df["Case_Name"].apply(lambda x: [lemmatizer.lemmatize(w) for w in x])
    
    # remove words that occur a certain number of times
    ten_times_occurrence_words = get_word_frequency_name(df)
    print("Number of words that occurred only once in test case names: ", len(ten_times_occurrence_words))
    
    # list of words to be removed
    for index, row in df.iterrows():
        current_test_name = row["Case_Name"]
        list_words_to_remove = list()
        for word in current_test_name:
            if word in ten_times_occurrence_words:
                list_words_to_remove.append(word)
        
        df.loc[index]["Case_Name"] = [elem for elem in current_test_name if not elem in list_words_to_remove]

#     # remove single letters present in the data
#     df["Name"] = df["Name"].apply(lambda x: [w for w in x if len(w.strip()) > 1])

    # remove instances with empty names
    df = df.loc[df["Case_Name"] != '']
    
    number_unique_words = get_number_unique_words_name(df)
    print("Number of unique words in test names in the end: ", number_unique_words)
    print("Dataset size after preprocessing: " , df.shape)

In [105]:
preprocess_clean_data(test_steps_df)

Cleaning test name field...
Dataset size before preprocessing:  (15668, 5)
Number of unique words across all test names:  1519
Number of unique words across all test steps:  1742
Number of unique words across all test steps after removing problematic words:  1742
Number of unique words across all test steps:  1742
Number of unique words across all test steps after fixing problematic words:  1507
Number of unique words in test names after stopword removal:  1443
Number of unique words in test names after lemmatization:  1298
Number of words that occurred only once in test names:  163
Number of unique words in test names after one-occurring-word removal:  1135
Number of unique words in test names in the end:  1135
Dataset size after preprocessing:  (15668, 5)


In [109]:
# Check one example
list_keys = list(set(test_steps_df['Key'].tolist()))
test_steps_df[test_steps_df['Key'] == list_keys[0]]

In [112]:
# Initialize matrix to store distances between test case names
dist_matrix_test_case_name = np.zeros((len(test_case_steps_dict),len(test_case_steps_dict)))

In [113]:
row_names = [test_case_key for test_case_key in test_case_steps_dict]
col_names = row_names
dist_matrix_test_case_name_df = pd.DataFrame(dist_matrix_test_case_name, index = row_names, columns = col_names)

In [None]:
dist_matrix_test_case_name_df

In [None]:
for i in range(len(test_case_steps_dict)):
    print(i)
    for j in range(i, len(test_case_steps_dict)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[j]
        test_case_name_1 = test_steps_df[test_steps_df['Key'] == case_key_1]['Case_Name'].tolist()[0]
        test_case_name_2 = test_steps_df[test_steps_df['Key'] == case_key_2]['Case_Name'].tolist()[0]
        computed_dist = my_model.wv.wmdistance(test_case_name_1, test_case_name_2)
        dist_matrix_test_case_name_df.loc[case_key_1,case_key_2] = dist_matrix_test_case_name_df.loc[case_key_2,case_key_1] = computed_dist

In [None]:
dist_matrix_test_case_name_df

### Normalize values of wmdistance to the range [0,1] and modify such that distance 1 corresponds to similar names

In [None]:
# Find max value
max_distance = 0
for i in range(len(dist_matrix_test_case_name_df)):
    print(i)
    for j in range(i, len(dist_matrix_test_case_name_df)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[i]
        value_to_compare = dist_matrix_test_case_name_df.loc[case_key_1, case_key_2]
        if value_to_compare  > max_distance:
            max_distance = value_to_compare

In [None]:
# Replace 'inf' values by 15 (which is close to 14, the max value without considering 'inf')
for i in range(len(dist_matrix_test_case_name_df)):
    print(i)
    for j in range(len(dist_matrix_test_case_name_df)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[j]
        value = dist_matrix_test_case_name_df.loc[case_key_1, case_key_2]
        if value > 15:
            dist_matrix_test_case_name_df.loc[case_key_1, case_key_2] = 15.0

In [None]:
# Find max value again to check if there is no 'inf'
max_distance = 0
for i in range(len(dist_matrix_test_case_name_df)):
    print(i)
    for j in range(i, len(dist_matrix_test_case_name_df)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[i]
        value_to_compare = dist_matrix_test_case_name_df.loc[case_key_1, case_key_2]
        if value_to_compare  > max_distance:
            max_distance = value_to_compare
print(max_distance)

In [None]:
# Normalize scores
for i in range(len(test_case_steps_dict)):
    print(i)
    for j in range(i, len(test_case_steps_dict)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[j]
        current_value = dist_matrix_test_case_name_df.loc[case_key_1,case_key_2]
        new_value = (abs(current_value - max_distance)/max_distance)
        dist_matrix_test_case_name_df.loc[case_key_1,case_key_2] = dist_matrix_test_case_name_df.loc[case_key_2,case_key_1] = new_value

In [None]:
dist_matrix_test_case_name_df

### Combine test step and test name distance/similarity matrices

In [150]:
final_dist_matrix = np.zeros((len(test_case_steps_dict),len(test_case_steps_dict)))

In [151]:
row_names = [test_case_key for test_case_key in test_case_steps_dict]
col_names = row_names
final_dist_matrix_df = pd.DataFrame(final_dist_matrix, index = row_names, columns = col_names)

### Parameter tuning

In [153]:
grid_search_step_name_weight = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
grid_search_cosine_threshold = np.array([0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0])

In [None]:
# Build dictionary with distances (faster to access)
step_name_distances_dict = {}
for i in range(len(test_case_steps_dict)):
    print(i)
    for j in range(i, len(test_case_steps_dict)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[j]
        step_distance = step_dist_matrix_df.loc[case_key_1,case_key_2]
        name_distance = dist_matrix_test_case_name_df.loc[case_key_1,case_key_2]
        step_name_distances_dict[(case_key_1,case_key_2)] = [step_distance, name_distance]

In [154]:
step_name_distances_dict[('TM4J-T5233','TM4J-T5234')]

[0.5773502691896257, 0.7860774358350118]

In [155]:
for weight in grid_search_step_name_weight:
    print("Weight: " , weight)
    for threshold in grid_search_cosine_threshold:
        print("Threshold: " , threshold)
        
        # Initialize final distance matrix
        final_dist_matrix = np.zeros((len(test_case_steps_dict),len(test_case_steps_dict)))
        row_names = [test_case_key for test_case_key in test_case_steps_dict]
        col_names = row_names
        final_dist_matrix_df = pd.DataFrame(final_dist_matrix, index = row_names, columns = col_names)
        
        test_cases_overlap_tuple_list = list()
        
        # Compute weighted distance
        for i in range(len(test_case_steps_dict)):
            for j in range(i, len(test_case_steps_dict)):
                case_key_1 = test_case_dict_keys_list[i]
                case_key_2 = test_case_dict_keys_list[j]
                distances = step_name_distances_dict[(case_key_1,case_key_2)]
                step_distance = distances[0]
                name_distance = distances[1]
                final_distance = (weight*step_distance) + ((1-weight)*name_distance)
                final_dist_matrix_df.loc[case_key_1,case_key_2] = final_dist_matrix_df.loc[case_key_2,case_key_1] = final_distance
        
                # Identify pairs of test cases that are similar (using dataframe) according to the threshold
                if i != j:
                    if (final_distance >= threshold):
                        test_cases_overlap_tuple_list.append((i,j))

        # Merge test cases that are similar
        similar_test_cases_list = list()
        for test_case_tuple in test_cases_overlap_tuple_list:
            index_1 = test_case_tuple[0]
            index_2 = test_case_tuple[1]
            found = False
            for test_case_set in similar_test_cases_list:
                if (index_1 in test_case_set) or (index_2 in test_case_set):
                    test_case_set.add(index_1)
                    test_case_set.add(index_2)
                    found = True
                    break
            if not found:
                temp_set = set()
                temp_set.add(index_1)
                temp_set.add(index_2)
                similar_test_cases_list.append(temp_set)
        print("Number of groups of similar test cases: ", len(similar_test_cases_list))

        test_case_key_unique = list()
        for elem in similar_test_cases_list:
            for index in elem:
                if index not in test_case_key_unique:
                    test_case_key_unique.append(index)
        print("Number of test cases that have at least another similar case: ", len(test_case_key_unique))
        print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(test_case_key_unique) ))

        approach_4_dict = {}
        cluster_id = 0
        for each_set in similar_test_cases_list:
            for elem in each_set:
                case_key = test_case_dict_keys_list[elem]
                approach_4_dict[case_key] = cluster_id
            cluster_id += 1

        for elem in range(len(test_case_steps_dict)):
            if elem not in test_case_key_unique:
                case_key = test_case_dict_keys_list[elem]
                approach_4_dict[case_key] = cluster_id
                cluster_id += 1
        
        # Save groups of similar test cases
        file_name = 'test_case_similarity/results_approach_4_name_embed/approach_4_name_embed_' + str(weight) + '_' + str(threshold) + '.txt'
        output_file = open(file_name, 'w')

        for key in approach_4_dict:
            output_file.write(key + ":" + str(approach_4_dict[key]) + "\n")
        output_file.close()

Weight:  0.1
Threshold:  0.1
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  3313
Number of test cases that do NOT have any similar case:  10
Threshold:  0.15
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  3311
Number of test cases that do NOT have any similar case:  12
Threshold:  0.2
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  3311
Number of test cases that do NOT have any similar case:  12
Threshold:  0.25
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  3311
Number of test cases that do NOT have any similar case:  12
Threshold:  0.3
Number of groups of similar test cases:  1
Number of test cases that have at least another similar case:  3311
Number of test cases that do NOT have any similar case:  12
Threshold:  0.35
Number of groups of similar test 

### Use the best model (after running the notebook for test case similarity evaluation) to compute distribution of similar test cases

Threshold = 0.75

Weight = 0.50

In [157]:
best_threhold = 0.75
best_weight = 0.50

# Initialize final distance matrix
final_dist_matrix = np.zeros((len(test_case_steps_dict),len(test_case_steps_dict)))
row_names = [test_case_key for test_case_key in test_case_steps_dict]
col_names = row_names
final_dist_matrix_df = pd.DataFrame(final_dist_matrix, index = row_names, columns = col_names)

test_cases_overlap_tuple_list = list()

# Compute weighted distance
for i in range(len(test_case_steps_dict)):
    for j in range(i, len(test_case_steps_dict)):
        case_key_1 = test_case_dict_keys_list[i]
        case_key_2 = test_case_dict_keys_list[j]
        distances = step_name_distances_dict[(case_key_1,case_key_2)]
        step_distance = distances[0]
        name_distance = distances[1]
        final_distance = (best_weight*step_distance) + ((1-best_weight)*name_distance)
        final_dist_matrix_df.loc[case_key_1,case_key_2] = final_dist_matrix_df.loc[case_key_2,case_key_1] = final_distance

        # Identify pairs of test cases that are similar (using dataframe)
        if i != j:
            if (final_distance >= best_threhold):
                test_cases_overlap_tuple_list.append((i,j))

# Merge test cases that are similar
similar_test_cases_list = list()
for test_case_tuple in test_cases_overlap_tuple_list:
    index_1 = test_case_tuple[0]
    index_2 = test_case_tuple[1]
    found = False
    for test_case_set in similar_test_cases_list:
        if (index_1 in test_case_set) or (index_2 in test_case_set):
            test_case_set.add(index_1)
            test_case_set.add(index_2)
            found = True
            break
    if not found:
        temp_set = set()
        temp_set.add(index_1)
        temp_set.add(index_2)
        similar_test_cases_list.append(temp_set)
print("Number of groups of similar test cases: ", len(similar_test_cases_list))

test_case_key_unique = list()
for elem in similar_test_cases_list:
    for index in elem:
        if index not in test_case_key_unique:
            test_case_key_unique.append(index)
print("Number of test cases that have at least another similar case: ", len(test_case_key_unique))
print("Number of test cases that do NOT have any similar case: ", ( len(test_case_steps_dict) - len(test_case_key_unique) ))

approach_4_dict = {}
cluster_id = 0
for each_set in similar_test_cases_list:
    for elem in each_set:
        case_key = test_case_dict_keys_list[elem]
        approach_4_dict[case_key] = cluster_id
    cluster_id += 1

for elem in range(len(test_case_steps_dict)):
    if elem not in test_case_key_unique:
        case_key = test_case_dict_keys_list[elem]
        approach_4_dict[case_key] = cluster_id
        cluster_id += 1
        
clusters_dict = {}
for key in approach_4_dict:
    cluster_id = approach_4_dict[key]
    if cluster_id not in clusters_dict:
        clusters_dict[cluster_id] = [key]
    else:
        existing_list = clusters_dict[cluster_id]
        existing_list.append(key)
        clusters_dict[cluster_id] = existing_list

Number of groups of similar test cases:  429
Number of test cases that have at least another similar case:  2153
Number of test cases that do NOT have any similar case:  1170


In [158]:
size_clusters_list = list()
for key in clusters_dict:
    elem = clusters_dict[key]
    size_clusters_list.append(len(elem))

In [None]:
# Plot the distribution of the number of test cases in the clusters
plt.figure()
pd.DataFrame(size_clusters_list).plot.box()
plt.title('Dist. similar test cases')
plt.show()
print(pd.DataFrame(size_clusters_list).describe(), end='\n\n')