### Experiments with zero-shot classification techniques to classify manual test cases (i.e., textual descriptions of test cases) into the game features that they cover.

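We experiment with the following models and approaches:

- **Baseline**: keyword-based approach
- **BartLargeMNLI**: [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli)
- **CrossEncoderNLI**: [cross-encoder/nli-distilroberta-base](https://huggingface.co/cross-encoder/nli-distilroberta-base)
- **LatentEmb**: [latent-embeddings](https://joeddav.github.io/blog/2020/05/29/ZSL.html)
- **Ensembles** of the individual zero-shot techniques: EnsMajorVoting (majority voting), EnsFullInters (full intersection), EnsBackOffTwo (back-off using top-2 models), and EnsBackOffComplete (back-off using all models)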

In [9]:
# Import necessary libraries
import os
import re
import time
import string
import pandas as pd
import numpy as np
from statistics import median, mean
import pathlib
from pathlib import Path
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize, TweetTokenizer
import nltk 
from nltk.corpus import stopwords
import gensim.downloader as api
from gensim.models import Word2Vec, Phrases, KeyedVectors
import fasttext
from scipy import spatial
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import multilabel_confusion_matrix, precision_recall_fscore_support
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
import warnings
from IPython.display import display, HTML
warnings.filterwarnings("ignore")

In [None]:
# Notebook display configuration: lift every pandas truncation limit
# (column width, number of columns, number of rows) so frames render in full.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Inject a CSS rule that stretches the page's `.container` element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [7]:
# Import modules with different classification methods
from zero_shot_nli import run_zero_shot_nli
from zero_shot_nli_metrics_per_class import run_zero_shot_nli_metrics_per_class
from zero_shot_latent_w2v import run_zero_shot_latent_w2v
from baseline import run_baseline
import utils

---

### Load and pre-process labeled data

In [None]:
# Load labeled data
# `utils.read_data()` comes from the project-local `utils` module imported above;
# the trailing `.head()` shows the first rows as a quick check of what was loaded.
# NOTE(review): presumably a pandas DataFrame - confirm against `utils.read_data`.
labeled_test_cases_df = utils.read_data()
labeled_test_cases_df.head()

In [None]:
# Pre-process data
# `utils.preprocess_data` returns a pair that is unpacked into two frames. Going
# by their names and later use, the first represents each test case by its name
# and the second by its name + objective (TODO confirm in `utils`).
(test_case_name_df, test_case_name_obj_df) = utils.preprocess_data(labeled_test_cases_df)

In [None]:
# Get list of unique labels (game features), in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# the repeated linear `in` scan over a growing list and the per-row overhead
# of DataFrame.iterrows(). Each entry of the 'labels' column is iterated as a
# collection of labels.
unique_labels = list(dict.fromkeys(
    lab
    for labels in test_case_name_df['labels']
    for lab in labels
))

In [None]:
# Count how many times each unique label occurs across all test cases.
# Every label starts at zero, so the dict keeps the order of `unique_labels`.
unique_labels_count = {label: 0 for label in unique_labels}
for row_labels in test_case_name_df['labels']:
    for label in row_labels:
        unique_labels_count[label] += 1

In [None]:
# Average number of occurrences per unique label.
# `unique_labels_count` maps each label to how often it occurs in the 'labels'
# column, so the mean of its values is "occurrences per label", not a number
# of labels; the printed message is worded to match what is actually computed.
mean_label_counter = mean(unique_labels_count.values())
print("On average, each unique label occurs {count} times across the test cases.".format(count=mean_label_counter))

In [None]:
# Load labels (game features)
# `candidate_label_file` is the path of a text file with one label per line.
# The path is a plain string, so it has to be opened before it can be read
# (calling .read() on the string itself raises AttributeError); the context
# manager also closes the file handle once the labels are loaded.
# NOTE(review): UTF-8 is assumed here - adjust `encoding` if the file differs.
candidate_label_file = "INSERT_DIR_OF_LIST_OF_GAME_FEATURES"
with open(candidate_label_file, encoding="utf-8") as label_file:
    candidate_labels = label_file.read().splitlines()
print("There are {count} candidate labels.".format(count=len(candidate_labels)))

In [12]:
# Set Mlflow experiment dir
# Placeholder to replace with a real location before running; this value is
# passed as the last argument to the run_baseline / run_zero_shot_nli calls below.
experiment_dir = "INSERT_DIR_TO_RECORD_EXPERIMENTS_WITH_MLFLOW"

---

### Baseline

In [None]:
# Define name and description of the baseline experiment.
# The experiment returned by mlflow.set_experiment provides the id, which is
# then used to attach a description via the "mlflow.note.content" tag.
experiment_name = "Baseline experiment - Test case name and objective"
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id
MlflowClient().set_experiment_tag(
    experiment_id,
    "mlflow.note.content",
    "Evaluate keyword-based approach to classify test cases (with test case name and objective).",
)

In [None]:
# Replace dash by space in candidate labels with more than one word (achieves better performance).
# str.replace yields the same text as splitting on '-' and re-joining with ' '.
candidate_labels_mod = [label.replace('-', ' ') for label in candidate_labels]

In [None]:
# Baseline run with test cases represented by their name only.
run_name = "Test case name"
run_baseline(
    test_case_name_df,
    candidate_labels,
    candidate_labels_mod,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# Baseline run with test cases represented by their name + objective.
run_name = "Test case name + objective"
run_baseline(
    test_case_name_obj_df,
    candidate_labels,
    candidate_labels_mod,
    experiment_name,
    run_name,
    experiment_dir,
)

### Experiments with individual zero-shot techniques

#### BartLargeMNLI - [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli)

In [None]:
# Define name and description of the BartLargeMNLI experiment.
# The experiment returned by mlflow.set_experiment provides the id, which is
# then used to attach a description via the "mlflow.note.content" tag.
experiment_name = "BartLargeMNLI - Test case name and objective"
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id
MlflowClient().set_experiment_tag(
    experiment_id,
    "mlflow.note.content",
    "Evaluate BartLargeMNLI to classify test cases (with test case name and objective).",
)

In [None]:
# Load zero-shot classifier from the HF pipeline.
# Use the first GPU (device=0) when CUDA is available for faster inference and
# fall back to CPU (device=-1) otherwise, so the cell also runs without a GPU.
zero_shot_nli_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# BartLargeMNLI run with test cases represented by their name only.
run_name = "Test case name"
run_zero_shot_nli(
    zero_shot_nli_classifier,
    candidate_labels,
    test_case_name_df,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# BartLargeMNLI run with test cases represented by their name + objective.
run_name = "Test case name + objective"
run_zero_shot_nli(
    zero_shot_nli_classifier,
    candidate_labels,
    test_case_name_obj_df,
    experiment_name,
    run_name,
    experiment_dir,
)

#### CrossEncoderNLI - [cross-encoder/nli-distilroberta-base](https://huggingface.co/cross-encoder/nli-distilroberta-base)

In [None]:
# Define name and description of the CrossEncoderNLI experiment.
# The experiment returned by mlflow.set_experiment provides the id, which is
# then used to attach a description via the "mlflow.note.content" tag.
experiment_name = "CrossEncoderNLI - Test case name and objective"
experiment_active = mlflow.set_experiment(experiment_name)
experiment_id = experiment_active.experiment_id
MlflowClient().set_experiment_tag(
    experiment_id,
    "mlflow.note.content",
    "Evaluate CrossEncoderNLI to classify test cases (with test case name and objective).",
)

In [None]:
# Load zero-shot classifier from the HF pipeline.
# Use the first GPU (device=0) when CUDA is available for faster inference and
# fall back to CPU (device=-1) otherwise, so the cell also runs without a GPU.
zero_shot_nli_cross_enc_classifier = pipeline(
    "zero-shot-classification",
    model='cross-encoder/nli-distilroberta-base',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# CrossEncoderNLI run with test cases represented by their name only.
run_name = "Test case name"
run_zero_shot_nli(
    zero_shot_nli_cross_enc_classifier,
    candidate_labels,
    test_case_name_df,
    experiment_name,
    run_name,
    experiment_dir,
)

In [None]:
# CrossEncoderNLI run with test cases represented by their name + objective.
run_name = "Test case name + objective"
run_zero_shot_nli(
    zero_shot_nli_cross_enc_classifier,
    candidate_labels,
    test_case_name_obj_df,
    experiment_name,
    run_name,
    experiment_dir,
)

#### LatentEmb - [latent-embeddings](https://joeddav.github.io/blog/2020/05/29/ZSL.html)

### Experiments with **ensembles** of individual zero-shot techniques

#### EnsMajorVoting - Ensemble with majority voting

#### EnsFullInters - Ensemble with full intersection

#### EnsBackOffTwo - Ensemble with back-off using top-2 models

#### EnsBackOffComplete - Ensemble with back-off using all models