### Experiments with zero-shot classification techniques to classify manual test cases (i.e., textual descriptions of test cases) into the game features that they cover.

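We experiment with the following models (each has its own section below), and compare them against a keyword-based baseline:

- **BartLargeMNLI** - `facebook/bart-large-mnli`
- **CrossEncoderNLI** - `cross-encoder/nli-distilroberta-base`
- **LatentEmb** - zero-shot with latent embeddings, combining an SBERT sentence embedding model with Word2Vec, Fasttext, or Glove word embeddings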

In [9]:
# Import necessary libraries
import os
import re
import time
import string
import pandas as pd
import numpy as np
from statistics import median, mean
import pathlib
from pathlib import Path
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize, TweetTokenizer
import nltk 
from nltk.corpus import stopwords
import gensim.downloader as api
from gensim.models import Word2Vec, Phrases, KeyedVectors
import fasttext
from scipy import spatial
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import multilabel_confusion_matrix, precision_recall_fscore_support
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
import warnings
from IPython.display import display, HTML
# Suppress every Python warning for the rest of the notebook session
# (keeps cell output short, but also hides deprecation notices).
warnings.filterwarnings("ignore")

In [None]:
# Notebook configurations
# Lift the pandas display limits so cell text, columns, and rows are never truncated
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Let the notebook container use the full width of the browser window
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [15]:
# Import modules with different classification methods
from zero_shot_nli import run_zero_shot_nli
from zero_shot_nli_metrics_per_class import run_zero_shot_nli_metrics_per_class
from zero_shot_latent_embedding import run_zero_shot_latent_emb
from baseline import run_baseline
import utils

---

### Load and pre-process labeled data

In [None]:
# Load labeled data
# (read_data is a project helper from utils; presumably it returns a DataFrame
# of labeled test cases - confirm against utils.py)
labeled_test_cases_df = utils.read_data()
# Preview the first rows
labeled_test_cases_df.head()

In [None]:
# Pre-process data
# The helper returns a pair of frames: the first is used below for the
# "test case name" runs, the second for the "test case name + objective" runs.
test_case_name_df, test_case_name_obj_df = utils.preprocess_data(labeled_test_cases_df)

In [None]:
# Get list of unique labels (game features), in order of first appearance
# (dict keys are unique and keep insertion order, so this de-duplicates without a manual membership test)
unique_labels = list(dict.fromkeys(
    lab
    for labels in test_case_name_df['labels']
    for lab in labels
))

In [None]:
# Dict with counter of unique labels: label -> number of occurrences across all test cases
unique_labels_count = {lab: 0 for lab in unique_labels}
for labels in test_case_name_df['labels']:
    for lab in labels:
        unique_labels_count[lab] += 1

In [None]:
# Average number of occurrences per unique label.
# `unique_labels_count` maps each label to how many times it occurs, so the mean of
# its values is "test cases per label" (not "number of unique labels"); the message
# is worded accordingly.
label_occurrences = list(unique_labels_count.values())
# statistics.mean raises StatisticsError on empty input; report 0 when there are no labels
mean_label_counter = mean(label_occurrences) if label_occurrences else 0
print("Each of the {n_labels} unique labels is assigned to {count} test cases on average.".format(
    n_labels=len(label_occurrences), count=mean_label_counter))

In [None]:
# Load labels (game features)
# `candidate_label_file` is the path of a plain-text file that is read line by line (one label per line).
candidate_label_file = "INSERT_DIR_OF_LIST_OF_GAME_FEATURES"
# The path is a string, so the file has to be opened before it can be read
# (calling .read() on the string itself raises AttributeError).
with open(candidate_label_file, "r", encoding="utf-8") as label_file:
    # Strip surrounding whitespace and skip blank lines so that no empty string ends up as a candidate label
    candidate_labels = [line.strip() for line in label_file.read().splitlines() if line.strip()]
print("There are {count} candidate labels.".format(count=len(candidate_labels)))

In [12]:
# Set Mlflow experiment dir
# Placeholder value: replace it before running. The same value is passed
# unchanged to every run_* helper called in the cells below.
experiment_dir = "INSERT_DIR_TO_RECORD_EXPERIMENTS_WITH_MLFLOW"

---

### Baseline

In [None]:
# Define name and description of the baseline experiment
experiment_name = "Baseline experiment - Test case name and objective"
experiment_note = "Evaluate keyword-based approach to classify test cases (with test case name and objective)."

# Make it the active MLflow experiment and keep its id
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id

# Store the description as the experiment note
mlflow_client = MlflowClient()
mlflow_client.set_experiment_tag(experiment_id, "mlflow.note.content", experiment_note)

In [None]:
# Replace dash by space in candidate labels with more than one word (achieves better performance)
candidate_labels_mod = [label.replace('-', ' ') for label in candidate_labels]

In [None]:
# Baseline run - test cases represented by name only
run_name = "Test case name"
run_baseline(
    test_case_name_df,
    candidate_labels,
    candidate_labels_mod,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# Baseline run - test cases represented by name + objective
run_name = "Test case name + objective"
run_baseline(
    test_case_name_obj_df,
    candidate_labels,
    candidate_labels_mod,
    experiment_name,
    run_name,
    experiment_dir,
)

### Experiments with individual zero-shot techniques

#### BartLargeMNLI - [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli)

In [None]:
# Define name and description of the BartLargeMNLI experiment
experiment_name = "BartLargeMNLI - Test case name and objective"
experiment_note = "Evaluate BartLargeMNLI to classify test cases (with test case name and objective)."

# Make it the active MLflow experiment and keep its id
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id

# Store the description as the experiment note
mlflow_client = MlflowClient()
mlflow_client.set_experiment_tag(experiment_id, "mlflow.note.content", experiment_note)

In [None]:
# Load zero-shot classifier from the HF pipeline.
# Use the first GPU (device=0) for faster inference when CUDA is available;
# otherwise fall back to the CPU (device=-1) instead of failing on GPU-less machines.
zero_shot_device = 0 if torch.cuda.is_available() else -1
zero_shot_nli_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=zero_shot_device)

In [None]:
# BartLargeMNLI run - test case represented by its name
run_name = "Test case name"
run_zero_shot_nli(
    zero_shot_nli_classifier,
    candidate_labels,
    test_case_name_df,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# BartLargeMNLI run - test case represented by its name + objective
run_name = "Test case name + objective"
run_zero_shot_nli(
    zero_shot_nli_classifier,
    candidate_labels,
    test_case_name_obj_df,
    experiment_name,
    run_name,
    experiment_dir,
)

#### CrossEncoderNLI - [cross-encoder/nli-distilroberta-base](https://huggingface.co/cross-encoder/nli-distilroberta-base)

In [None]:
# Define name and description of the CrossEncoderNLI experiment
experiment_name = "CrossEncoderNLI - Test case name and objective"
experiment_note = "Evaluate CrossEncoderNLI to classify test cases (with test case name and objective)."

# Make it the active MLflow experiment and keep its id
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id

# Store the description as the experiment note
mlflow_client = MlflowClient()
mlflow_client.set_experiment_tag(experiment_id, "mlflow.note.content", experiment_note)

In [None]:
# Load zero-shot classifier from the HF pipeline.
# Use the first GPU (device=0) for faster inference when CUDA is available;
# otherwise fall back to the CPU (device=-1) instead of failing on GPU-less machines.
zero_shot_device = 0 if torch.cuda.is_available() else -1
zero_shot_nli_cross_enc_classifier = pipeline("zero-shot-classification", model='cross-encoder/nli-distilroberta-base', device=zero_shot_device)

In [None]:
# CrossEncoderNLI run - test case represented by its name
run_name = "Test case name"
run_zero_shot_nli(
    zero_shot_nli_cross_enc_classifier,
    candidate_labels,
    test_case_name_df,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# CrossEncoderNLI run - test case represented by its name + objective
run_name = "Test case name + objective"
run_zero_shot_nli(
    zero_shot_nli_cross_enc_classifier,
    candidate_labels,
    test_case_name_obj_df,
    experiment_name,
    run_name,
    experiment_dir,
)

#### LatentEmb - [latent-embeddings](https://joeddav.github.io/blog/2020/05/29/ZSL.html)

We experiment with the Word2Vec, Fasttext, and Glove word embedding models, each combined with the SBERT sentence embedding model.

In [None]:
# Define name and description of the latent-embeddings experiment
# (the experiment name is kept exactly as recorded in MLflow)
experiment_name = "Zero-shot Latent embeddings - Word embbeding models"
experiment_note = "Evaluate different word embedding models for zero-shot with latent embeddings approach"

# Make it the active MLflow experiment and keep its id
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id

# Store the description as the experiment note
mlflow_client = MlflowClient()
mlflow_client.set_experiment_tag(experiment_id, "mlflow.note.content", experiment_note)

In [None]:
# Sentence embedding model shared by all latent-embedding experiments below
sbert_name = 'sentence-t5-large'
# Run on the GPU when CUDA is available; otherwise fall back to the CPU instead of failing
sbert_device = 'cuda' if torch.cuda.is_available() else 'cpu'
sbert_model = SentenceTransformer(sbert_name, device=sbert_device)

In [None]:
def ordinary_least_squares_lr(
    X: torch.Tensor, Y: torch.Tensor, alpha: float = 0) -> torch.Tensor:
    """Fit a linear map from X to Y with (optionally ridge-regularized) least squares.

    The rows of X and Y are L2-normalized first. The function then returns the
    closed-form solution of ``Y = X w``:

        w = (X.T X + alpha * I)^-1 X.T Y

    where I is the identity matrix and alpha is the amount of regularization.
    alpha = 0 is ordinary least squares; alpha > 0 is ridge regression
    (l2 regularization). Derivation:
    https://en.wikipedia.org/wiki/Ordinary_least_squares#Matrix/vector_formulation

    Args:
        X: 2-D tensor of source vectors, one per row.
        Y: 2-D tensor of target vectors, one per row, with as many rows as X.
            It is converted to the dtype and device of X, so mixed inputs
            (e.g. float32 X and float64 Y) are accepted.
        alpha: regularization strength.

    Returns:
        Tensor of shape (X.shape[1], Y.shape[1]) with the dtype and device of X.

    Note:
        The matrix is inverted explicitly, so the call fails if
        ``X.T X + alpha * I`` is singular (possible when alpha = 0).
    """
    # Align Y with X so the matrix products below never mix dtypes or devices
    Y = Y.to(device=X.device, dtype=X.dtype)

    X_norm = F.normalize(X, p=2, dim=1)
    Y_norm = F.normalize(Y, p=2, dim=1)

    # Build the identity with X's dtype/device (torch.eye defaults to float32 on the CPU)
    identity = torch.eye(X_norm.shape[1], dtype=X_norm.dtype, device=X_norm.device)

    inner = torch.matmul(X_norm.T, X_norm) + alpha * identity
    Z = torch.inverse(inner)
    Z = torch.matmul(Z, X_norm.T)
    w = torch.matmul(Z, Y_norm)
    return w

##### Zero-shot Latent embeddings - Word2Vec

In [None]:
# Load word2vec pre-trained model through the gensim downloader (`api`),
# selecting the 'word2vec-google-news-300' vectors by name
w2v_model = api.load('word2vec-google-news-300')

In [None]:
# Vocabulary used to fit the transfer matrix: the first 20000 entries of the model's
# vocabulary (presumably ordered by frequency - confirm), minus uninformative tokens
topk_words_pretrained = w2v_model.index_to_key[:20000]
print("Len of topk word vector", len(topk_words_pretrained))

# Remove stopwords (load the stopword list once into a set instead of re-reading it for every word)
english_stopwords = set(stopwords.words('english'))
topk_words_pretrained = [x for x in topk_words_pretrained if x not in english_stopwords]

# Remove punctuations: a token is kept only if it is not punctuation AND does not start with
# a punctuation character. With `or`, the leading-character check never had any effect,
# so tokens beginning with punctuation were kept.
topk_words_pretrained = [x for x in topk_words_pretrained if ( (x not in string.punctuation) and (x[0] not in string.punctuation) )]

# Remove single letters/digits
topk_words_pretrained = [x for x in topk_words_pretrained if len(x) > 1]

# Remove any remaining number
topk_words_pretrained = [x for x in topk_words_pretrained if not x.isdigit()]

print("Len of topk word vector after filtering", len(topk_words_pretrained))

In [None]:
# Get w2v embeddings: look up the vector of every retained word and stack them into one tensor (one row per word)
w2v_emb_vectors = torch.tensor(
    np.array([w2v_model.get_vector(word) for word in topk_words_pretrained])
)
print("Len of w2v embedding vector list", len(w2v_emb_vectors))

In [None]:
# Get SBERT embeddings for the same set of words (encoded one word at a time, same order as above)
sbert_emb_vectors = torch.tensor(
    np.array([sbert_model.encode(word) for word in topk_words_pretrained])
)
print("Len of sbert embedding vector list", len(sbert_emb_vectors))

In [None]:
# Compute transfer matrix: least-squares linear map from the SBERT embeddings to the
# Word2Vec embeddings of the same words (alpha=0 -> no regularization)
transfer_matrix = ordinary_least_squares_lr(sbert_emb_vectors, w2v_emb_vectors, alpha=0)
print(transfer_matrix.shape)

In [None]:
# Embed the candidate labels (dash-free variant) with SBERT
label_embeddings_np = sbert_model.encode(candidate_labels_mod)
print(label_embeddings_np.shape)

# Convert to tensor
candidate_label_embeddings = torch.tensor(label_embeddings_np)
print(candidate_label_embeddings.shape)

# Apply the linear transformation given by the transfer matrix
candidate_label_embeddings_transformed = candidate_label_embeddings.mm(transfer_matrix)
print(candidate_label_embeddings_transformed.shape)

In [None]:
# Word2Vec latent-embedding run - test case represented by its name
run_name = f"Test case name - Word2Vec + {sbert_name}"
run_zero_shot_latent_emb(
    test_case_name_df,
    candidate_labels_mod,
    candidate_label_embeddings_transformed,
    sbert_model,
    transfer_matrix,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# Word2Vec latent-embedding run - test case represented by its name + objective
run_name = f"Test case name + objective - Word2Vec + {sbert_name}"
run_zero_shot_latent_emb(
    test_case_name_obj_df,
    candidate_labels_mod,
    candidate_label_embeddings_transformed,
    sbert_model,
    transfer_matrix,
    experiment_name,
    run_name,
    experiment_dir,
)

##### Zero-shot Latent embeddings - Fasttext

In [None]:
# Optional download step, left disabled:
# fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the fastText model from a local file; the path is a placeholder to replace before running
fasttext_model = fasttext.load_model("INSERT_PATH_OF_MODEL")

In [None]:
# Vocabulary used to fit the transfer matrix: the first 20000 words reported by the model
# (presumably ordered by frequency - confirm), minus uninformative tokens
topk_words_pretrained = fasttext_model.get_words()[:20000]
print("Len of topk word vector", len(topk_words_pretrained))

# Remove stopwords (load the stopword list once into a set instead of re-reading it for every word)
english_stopwords = set(stopwords.words('english'))
topk_words_pretrained = [x for x in topk_words_pretrained if x not in english_stopwords]

# Remove punctuations: a token is kept only if it is not punctuation AND does not start with
# a punctuation character. With `or`, the leading-character check never had any effect,
# so tokens beginning with punctuation were kept.
topk_words_pretrained = [x for x in topk_words_pretrained if ( (x not in string.punctuation) and (x[0] not in string.punctuation) )]

# Remove single letters/digits
topk_words_pretrained = [x for x in topk_words_pretrained if len(x) > 1]

# Remove any remaining number
topk_words_pretrained = [x for x in topk_words_pretrained if not x.isdigit()]

print("Len of topk word vector after filtering", len(topk_words_pretrained))

In [None]:
# Get fasttext embeddings: look up the vector of every retained word and stack them into one tensor (one row per word)
ft_emb_vectors = torch.tensor(
    np.array([fasttext_model.get_word_vector(word) for word in topk_words_pretrained])
)
print("Len of fasttext embedding vector list", len(ft_emb_vectors))

In [None]:
# Get SBERT embeddings for the same set of words (encoded one word at a time, same order as above)
sbert_emb_vectors = torch.tensor(
    np.array([sbert_model.encode(word) for word in topk_words_pretrained])
)
print("Len of sbert embedding vector list", len(sbert_emb_vectors))

In [None]:
# Compute transfer matrix: least-squares linear map from the SBERT embeddings to the
# fastText embeddings of the same words (alpha=0 -> no regularization)
transfer_matrix = ordinary_least_squares_lr(sbert_emb_vectors, ft_emb_vectors, alpha=0)
print(transfer_matrix.shape)

In [None]:
# Embed the candidate labels (dash-free variant) with SBERT
label_embeddings_np = sbert_model.encode(candidate_labels_mod)
print(label_embeddings_np.shape)

# Convert to tensor
candidate_label_embeddings = torch.tensor(label_embeddings_np)
print(candidate_label_embeddings.shape)

# Apply the linear transformation given by the transfer matrix
candidate_label_embeddings_transformed = candidate_label_embeddings.mm(transfer_matrix)
print(candidate_label_embeddings_transformed.shape)

In [None]:
# Fasttext latent-embedding run - test case represented by its name
run_name = f"Test case name - Fasttext + {sbert_name}"
run_zero_shot_latent_emb(
    test_case_name_df,
    candidate_labels_mod,
    candidate_label_embeddings_transformed,
    sbert_model,
    transfer_matrix,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# Fasttext latent-embedding run - test case represented by its name + objective
run_name = f"Test case name + objective - Fasttext + {sbert_name}"
run_zero_shot_latent_emb(
    test_case_name_obj_df,
    candidate_labels_mod,
    candidate_label_embeddings_transformed,
    sbert_model,
    transfer_matrix,
    experiment_name,
    run_name,
    experiment_dir,
)

##### Zero-shot Latent embeddings - Glove

In [None]:
# Create dictionary with word embeddings from Glove
# Each line is parsed as: a word, then its space-separated vector components.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse
with open('INSERT_PATH_OF_MODEL', 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Drop the trailing newline/whitespace so it cannot end up as an empty "component"
        values = line.rstrip().split(' ')
        # Skip blank lines and lines that carry a word but no vector
        if len(values) < 2:
            continue
        word = values[0]
        vector = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = vector

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Get glove embeddings (using the same 'topk_words_pretrained' as before since we cannot get word frequency from glove)
glove_emb_vectors = []
sbert_emb_vectors = []

for word in topk_words_pretrained:
    # Only words present in the Glove dictionary are used. An explicit membership test
    # replaces the bare `except`, which hid every error and could leave the two lists
    # with different lengths if encoding failed after the Glove vector was appended.
    if word not in embeddings_index:
        continue
    glove_emb_vectors.append(embeddings_index[word])
    sbert_emb_vectors.append(sbert_model.encode(word))

# Stack the Glove vectors and convert from float64 (double) to float32
glove_emb_vectors = torch.tensor(np.array(glove_emb_vectors)).type(torch.float32)
print("Len of glove embedding vector list", len(glove_emb_vectors))

sbert_emb_vectors = torch.tensor(np.array(sbert_emb_vectors))
print("Len of sbert embedding vector list", len(sbert_emb_vectors))

In [None]:
# Compute transfer matrix: least-squares linear map from the SBERT embeddings to the
# Glove embeddings of the same words (alpha=0 -> no regularization)
transfer_matrix = ordinary_least_squares_lr(sbert_emb_vectors, glove_emb_vectors, alpha=0)
print(transfer_matrix.shape)

In [None]:
# Embed the candidate labels (dash-free variant) with SBERT
label_embeddings_np = sbert_model.encode(candidate_labels_mod)
print(label_embeddings_np.shape)

# Convert to tensor
candidate_label_embeddings = torch.tensor(label_embeddings_np)
print(candidate_label_embeddings.shape)

# Apply the linear transformation given by the transfer matrix
candidate_label_embeddings_transformed = candidate_label_embeddings.mm(transfer_matrix)
print(candidate_label_embeddings_transformed.shape)

In [None]:
# Glove latent-embedding run - test case represented by its name
run_name = f"Test case name - Glove + {sbert_name}"
run_zero_shot_latent_emb(
    test_case_name_df,
    candidate_labels_mod,
    candidate_label_embeddings_transformed,
    sbert_model,
    transfer_matrix,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# Glove latent-embedding run - test case represented by its name + objective
run_name = f"Test case name + objective - Glove + {sbert_name}"
run_zero_shot_latent_emb(
    test_case_name_obj_df,
    candidate_labels_mod,
    candidate_label_embeddings_transformed,
    sbert_model,
    transfer_matrix,
    experiment_name,
    run_name,
    experiment_dir,
)

### Experiments with **ensembles** of individual zero-shot techniques

#### EnsMajorVoting - Ensemble with majority voting

#### EnsFullInters - Ensemble with full intersection

#### EnsBackOffTwo - Ensemble with back-off using top-2 models

#### EnsBackOffComplete - Ensemble with back-off using all models