In [215]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from sklearn import svm
from typing import Dict
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import glob
import torch

from setfit import SetFitModel, SetFitTrainer
from datasets import load_dataset, logging
logging.set_verbosity_error()

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import XLNetForSequenceClassification, RobertaForSequenceClassification
from transformers import XLMRobertaForSequenceClassification, DistilBertForSequenceClassification
from transformers import RobertaTokenizer, XLMRobertaTokenizer, DistilBertTokenizer, XLNetTokenizer

### Common Helper Functions

In [336]:
def get_avg_report(results, folds):
    """
    Average the macro and weighted precision, recall and F1 over per-fold reports.

    Parameters
    ----------
    results : iterable of pandas.DataFrame
        One frame per fold. Each frame must have rows labelled 'macro avg'
        and 'weighted avg' and columns 'precision', 'recall' and 'f1-score'
        (the layout of a transposed ``classification_report(..., output_dict=True)``).
    folds : int
        Divisor used to turn the per-fold sums into means.

    Returns
    -------
    tuple
        ``(weighted_average, macro_average)``; each is a 3-tuple of
        (precision, recall, f1) rounded to two decimals.
    """
    metric_columns = ('precision', 'recall', 'f1-score')
    running_totals = {
        'weighted avg': [0.0, 0.0, 0.0],
        'macro avg': [0.0, 0.0, 0.0],
    }

    for result_df in results:
        for row_label, totals in running_totals.items():
            for position, column in enumerate(metric_columns):
                # Select the summary row by label rather than by its position
                # at the end of the frame, so row order cannot skew the result.
                totals[position] += float(result_df.loc[row_label, column])

    weighted_average = tuple(round(total / folds, 2) for total in running_totals['weighted avg'])
    macro_average = tuple(round(total / folds, 2) for total in running_totals['macro avg'])

    return weighted_average, macro_average

def get_accuracy(y_actual, y_predicted):
    """
    Return the fraction of positions where the predicted label equals the
    actual label, rounded to two decimals.

    The two sequences are paired position by position and the number of
    matches is divided by the length of ``y_actual``.
    """
    matches = sum(
        1 for actual, predicted in zip(y_actual, y_predicted) if actual == predicted
    )
    return round(matches / len(y_actual), 2)


### ML algorithms Pipeline

In [170]:
def load_ML_model_files(model_name, model_path, pca):
    """
    Load the pickled classifier and its vectorizers from one model directory.

    Parameters
    ----------
    model_name : str
        Base name of the classifier pickle; ``<model_name>.pickle`` is read.
    model_path : str
        Directory holding the pickles (with or without a trailing separator).
    pca : bool
        When truthy, ``pca_vectorizer.pickle`` is loaded as well.

    Returns
    -------
    tuple
        ``(ML_model, pca_vectorizer, tfidf_vectorizer)``; ``pca_vectorizer``
        is ``None`` when ``pca`` is falsy.
    """

    def _load_pickle(file_name):
        # os.path.join makes every file resolve the same way regardless of a
        # trailing slash; the context manager closes the handle on any outcome.
        with open(os.path.join(model_path, file_name), 'rb') as handle:
            # NOTE: unpickling can execute arbitrary code - only load trusted files.
            return pickle.load(handle)

    ML_model = _load_pickle(model_name + '.pickle')
    pca_vectorizer = _load_pickle('pca_vectorizer.pickle') if pca else None
    tfidf_vectorizer = _load_pickle('tfidf_vectorizer.pickle')

    return ML_model, pca_vectorizer, tfidf_vectorizer

In [171]:
# Collect the per-fold directories that hold the test splits.
fold_parent = './data/dronology_five_folds/'

sub_folders = [
    os.path.join(fold_parent, entry)
    for entry in os.listdir(fold_parent)
    if 'fold' in entry
]
        

In [213]:
# Name of the classical ML model to evaluate; it is used to build the model
# directory and the pickle file name in the evaluation loop.
model_name = 'SVM'
# Flag passed to load_ML_model_files to request the PCA vectorizer.
# NOTE(review): this rebinds the name `PCA`, which the import cell binds to
# sklearn.decomposition.PCA; after this cell runs, `PCA` is only this boolean.
PCA = True
# Class id -> display name, used as `target_names` in the classification reports.
map_labels = {0: 'information', 1: 'requirement'}

In [194]:
# Load each fold's test split, predict with that fold's pickled model and
# collect one classification report per fold.

ml_results = []
avg_accuracy = []
fold_count = 1

# NOTE(review): assumes the sorted folder order matches fold numbers 1..N,
# because the test file name is built from `fold_count` - confirm.
for subs in sorted(sub_folders):
    test_path = subs + '/test_' + 'fold_' + str(fold_count) + '.csv'

    df_test = pd.read_csv(test_path)
    df_test['STR.REQ'] = df_test['STR.REQ'].str.lower()
    X_test = df_test['STR.REQ']
    y_test = df_test['class']

    model_path = './models/ML_models/' + model_name + '/fold_' + str(fold_count) + '/'
    ML_model, pca_vectorizer, tfidf_vectorizer = load_ML_model_files(model_name, model_path, PCA)

    tfidf_vecs = tfidf_vectorizer.transform(X_test)
    normalized_tfidf = normalize(tfidf_vecs)

    if pca_vectorizer is not None:
        test_vecs = pca_vectorizer.transform(normalized_tfidf.toarray())
    else:
        # The loader returns None when the PCA flag is off; calling .transform
        # on it would raise AttributeError.
        # NOTE(review): assumes non-PCA models were trained directly on the
        # normalized TF-IDF matrix - confirm against the training notebook.
        test_vecs = normalized_tfidf
    predicted_labels = ML_model.predict(test_vecs)

    actual_list = y_test.tolist()
    predicted_list = predicted_labels.tolist()

    evaluation_results = classification_report(actual_list, predicted_list,
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    avg_accuracy.append(get_accuracy(actual_list, predicted_list))

    report_df = pd.DataFrame(evaluation_results).transpose()
    ml_results.append(report_df)

    print('\nResults for dataset fold number :',fold_count, 'on model :', model_name)
    print('\n',report_df)
    print('--------------------------------------')

    fold_count += 1


Results for dataset fold number : 1 on model : SVM

               precision    recall  f1-score    support
information    0.823529  1.000000  0.903226  56.000000
requirement    1.000000  0.400000  0.571429  20.000000
accuracy       0.842105  0.842105  0.842105   0.842105
macro avg      0.911765  0.700000  0.737327  76.000000
weighted avg   0.869969  0.842105  0.815911  76.000000
--------------------------------------

Results for dataset fold number : 2 on model : SVM

               precision    recall  f1-score    support
information    0.787879  0.928571  0.852459  56.000000
requirement    0.600000  0.300000  0.400000  20.000000
accuracy       0.763158  0.763158  0.763158   0.763158
macro avg      0.693939  0.614286  0.626230  76.000000
weighted avg   0.738437  0.763158  0.733391  76.000000
--------------------------------------

Results for dataset fold number : 3 on model : SVM

               precision    recall  f1-score  support
information    0.768116  0.963636  0.854839    

In [198]:
# Mean scores of the ML pipeline across the five folds

avg_acc_score = round(np.mean(avg_accuracy), 2)
weighted_avg, macro_avg = get_avg_report(ml_results, folds=5)

# Accuracy is a single number, so it is repeated under all three metric columns.
avg_scores = [weighted_avg, macro_avg, (avg_acc_score,) * 3]

final_df = pd.DataFrame(
    avg_scores,
    columns=['Precision', 'Recall', 'F1_score'],
    index=['weighted_avg', 'macro_avg', 'accuracy_avg'],
)

final_df.rename_axis('5-folds')

Unnamed: 0_level_0,Precision,Recall,F1_score
5-folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
weighted_avg,0.78,0.79,0.75
macro_avg,0.76,0.63,0.64
accuracy_avg,0.78,0.78,0.78


### BERT Family Pipeline

In [104]:
def load_tokenizer(model_name):
    """
    Build the pretrained tokenizer that belongs to ``model_name``.

    The plain and the 'p'-prefixed variant of a model share one tokenizer.
    Any name that is not listed in the table below falls through to the
    XLM-RoBERTa tokenizer.
    """
    # (accepted model names, tokenizer class, checkpoint id, extra keyword arguments)
    tokenizer_specs = (
        (('BERT_base_uncased', 'pBERT_base_uncased'),
         BertTokenizer, "bert-base-uncased", {'do_lower_case': True}),
        (('BERT_base_cased', 'pBERT_base_cased'),
         BertTokenizer, "bert-base-cased", {}),
        (('pXLNet_base', 'XLNet_base'),
         XLNetTokenizer, "xlnet-base-cased", {}),
        (('SciBERT_uncased', 'pSciBERT_uncased'),
         BertTokenizer, 'allenai/scibert_scivocab_uncased', {'do_lower_case': True}),
        (('pRoBERTa_base', 'RoBERTa_base'),
         RobertaTokenizer, "roberta-base", {}),
        (('DisBERT_base_cased', 'pDisBERT_base_cased'),
         DistilBertTokenizer, "distilbert-base-cased", {}),
        (('DisBERT_base_uncased', 'pDisBERT_base_uncased'),
         DistilBertTokenizer, "distilbert-base-uncased", {}),
    )

    for accepted_names, tokenizer_cls, checkpoint, extra_kwargs in tokenizer_specs:
        if model_name in accepted_names:
            return tokenizer_cls.from_pretrained(checkpoint, **extra_kwargs)

    # 'pXRBERT_base', 'XRBERT_base'
    return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        

In [337]:
def load_BERT_model(model_name, model_path):
    """
    Load the sequence-classification model stored at ``model_path`` using the
    model class that belongs to ``model_name``.

    Names that are not listed in the table below are loaded with the
    XLM-RoBERTa classification class.
    """
    # (accepted model names, model class)
    model_families = (
        (('BERT_base_uncased', 'pBERT_base_cased',
          'pBERT_base_uncased', 'BERT_base_cased',
          'SciBERT_uncased', 'pSciBERT_uncased'),
         BertForSequenceClassification),
        (('pXLNet_base', 'XLNet_base'),
         XLNetForSequenceClassification),
        (('pRoBERTa_base', 'RoBERTa_base'),
         RobertaForSequenceClassification),
        (('DisBERT_base_cased', 'DisBERT_base_uncased',
          'pDisBERT_base_cased', 'pDisBERT_base_uncased'),
         DistilBertForSequenceClassification),
    )

    # Default covers 'pXRBERT_base', 'XRBERT_base'
    model_cls = XLMRobertaForSequenceClassification
    for accepted_names, candidate_cls in model_families:
        if model_name in accepted_names:
            model_cls = candidate_cls
            break

    return model_cls.from_pretrained(model_path)

In [200]:
# Settings for the BERT-family evaluation run

# Class id -> display name used in the classification reports.
map_labels = {0: 'information', 1: 'requirement'}

prefix = './models/DL_models/'
model_name = 'DisBERT_base_uncased'

fold_parent = './data/dronology_five_folds/'

# Keep only the directory entries whose name mentions 'fold'.
sub_folders = [
    os.path.join(fold_parent, entry)
    for entry in os.listdir(fold_parent)
    if 'fold' in entry
]

MAX_SEQ_LENGTH = 128
tokenizer = load_tokenizer(model_name)

In [201]:
# Display the tokenizer object so its configuration can be checked against `model_name`.
tokenizer

PreTrainedTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [209]:
# Evaluate the fine-tuned model of every fold on that fold's test split and
# collect one classification report per fold.
fold_count = 1
results = []
avg_accuracy = []
for subs in sorted(sub_folders):
    test_path = subs + '/test_' + 'fold_' + str(fold_count) + '.csv'

    df_test = pd.read_csv(test_path)
    selected_test = df_test[['STR.REQ','class']]

    test_sequences = selected_test['STR.REQ'].tolist()
    actual_labels = selected_test['class'].tolist()

    test_encodings = tokenizer(test_sequences, truncation=True,
                               padding=True,
                               max_length=MAX_SEQ_LENGTH,
                               return_tensors="pt")
    # load model
    checkpoint_pattern = prefix + model_name + '/fold_' + str(fold_count) + '/*'
    checkpoint_paths = glob.glob(checkpoint_pattern)
    if not checkpoint_paths:
        # Fail with the searched pattern instead of a bare IndexError.
        raise FileNotFoundError('No model checkpoint found for pattern: ' + checkpoint_pattern)
    # NOTE(review): the first match is used; if a fold folder can contain
    # several checkpoints the choice depends on glob's ordering - confirm.
    model_path = checkpoint_paths[0]
    bert_model = load_BERT_model(model_name, model_path)

    with torch.no_grad():
        logits = bert_model(**test_encodings).logits

    # torch.argmax works on the tensor directly instead of relying on how
    # NumPy handles a torch tensor argument.
    predictions = torch.argmax(logits, dim=1)
    predicted_labels = predictions.tolist()

    evaluation_results = classification_report(actual_labels,
                                               predicted_labels,
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    avg_accuracy.append(get_accuracy(actual_labels, predicted_labels))

    report_df = pd.DataFrame(evaluation_results).transpose()
    results.append(report_df)

    print('\nResults for dataset fold number :',fold_count, 'on model :', model_name)
    print('\n',report_df)
    print('--------------------------------------')

    fold_count += 1


Results for dataset fold number : 1 on model : DisBERT_base_uncased

               precision    recall  f1-score    support
information    0.981132  0.928571  0.954128  56.000000
requirement    0.826087  0.950000  0.883721  20.000000
accuracy       0.934211  0.934211  0.934211   0.934211
macro avg      0.903610  0.939286  0.918925  76.000000
weighted avg   0.940331  0.934211  0.935600  76.000000
--------------------------------------

Results for dataset fold number : 2 on model : DisBERT_base_uncased

               precision    recall  f1-score    support
information    0.833333  0.892857  0.862069  56.000000
requirement    0.625000  0.500000  0.555556  20.000000
accuracy       0.789474  0.789474  0.789474   0.789474
macro avg      0.729167  0.696429  0.708812  76.000000
weighted avg   0.778509  0.789474  0.781408  76.000000
--------------------------------------

Results for dataset fold number : 3 on model : DisBERT_base_uncased

               precision    recall  f1-score  supp

In [210]:
# Mean scores of the BERT model across the five folds

avg_acc_score = round(np.mean(avg_accuracy), 2)
weighted_avg, macro_avg = get_avg_report(results, folds=5)

# Accuracy is a single number, so it is repeated under all three metric columns.
avg_scores = [weighted_avg, macro_avg, (avg_acc_score,) * 3]

final_df = pd.DataFrame(
    avg_scores,
    columns=['Precision', 'Recall', 'F1_score'],
    index=['weighted_avg', 'macro_avg', 'accuracy_avg'],
)

final_df.rename_axis('5-folds')

Unnamed: 0_level_0,Precision,Recall,F1_score
5-folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
weighted_avg,0.85,0.86,0.85
macro_avg,0.81,0.81,0.8
accuracy_avg,0.86,0.86,0.86


### Sentence-BERT Few-Shot Pipeline

In [322]:
def get_dataset(path):
    """
    Load the dataset located at ``path`` with ``load_dataset`` and return
    only its 'test' split.
    """
    return load_dataset(path)['test']


def _apply_column_mapping(dataset:dataset, column_mapping: Dict[str, str]):
    
    """
    apply the column mapping required for the loaded dataset
    
    """

        dataset = dataset.rename_columns(
            {
                **column_mapping,
                **{col: f"feat_{col}" for col in dataset.column_names if col not in column_mapping},
            }
        )
        dset_format = dataset.format
        dataset = dataset.with_format(
            type=dset_format["type"],
            columns=dataset.column_names,
            output_all_columns=dset_format["output_all_columns"],
            **dset_format["format_kwargs"],
        )
        return dataset

def evaluate_ST(test_data, Sent_tf_model):
    """
    Predict labels for ``test_data`` with the given Sentence-BERT model.

    The 'STR.REQ' and 'class' columns are first mapped to 'text' and 'label';
    the texts are then passed to ``Sent_tf_model.predict``.

    Returns ``(predicted_labels, y_test)`` where ``y_test`` is the 'label'
    column of the mapped dataset.
    """
    column_mapping = {"STR.REQ": "text", "class": "label"}
    mapped_dataset = _apply_column_mapping(dataset=test_data,
                                           column_mapping=column_mapping)

    texts, gold_labels = mapped_dataset["text"], mapped_dataset["label"]

    return Sent_tf_model.predict(texts), gold_labels

In [332]:
# Settings for the few-shot Sentence-BERT evaluation run
model_name = 'pS-BERT_20%'

prefix = './models/DL_models/'
fold_parent = './data/dronology_preprocess_five_folds/'

# Keep only the directory entries whose name mentions 'fold'.
sub_folders = [
    os.path.join(fold_parent, entry)
    for entry in os.listdir(fold_parent)
    if 'fold' in entry
]

In [333]:
# Evaluate the Sentence-BERT model of every fold on that fold's test split
# and collect one classification report per fold.
fold_count = 1
st_results = []
avg_accuracy = []

for subs in sorted(sub_folders):
    test_dataset = get_dataset(subs)

    model_path = prefix + model_name + '/fold_' + str(fold_count)
    ST_model = SetFitModel.from_pretrained(model_path)

    # The helper defined in this notebook is `evaluate_ST`; no function named
    # `evaluate` is defined in the visible code, so calling it raises NameError
    # on a fresh kernel.
    predicted_labels, y_test = evaluate_ST(test_dataset, ST_model)
    predicted_list = predicted_labels.tolist()

    evaluation_results = classification_report(y_test, predicted_list,
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    avg_accuracy.append(get_accuracy(y_test, predicted_list))

    report_df = pd.DataFrame(evaluation_results).transpose()
    st_results.append(report_df)

    print('\nResults for dataset fold number :',fold_count, 'on model :', model_name)
    print('\n',report_df)
    print('--------------------------------------')

    fold_count += 1


  0%|          | 0/2 [00:00<?, ?it/s]


Results for dataset fold number : 1 on model : pS-BERT_20%

               precision    recall  f1-score    support
information    0.840909  0.660714  0.740000  56.000000
requirement    0.406250  0.650000  0.500000  20.000000
accuracy       0.657895  0.657895  0.657895   0.657895
macro avg      0.623580  0.655357  0.620000  76.000000
weighted avg   0.726525  0.657895  0.676842  76.000000
--------------------------------------


  0%|          | 0/2 [00:00<?, ?it/s]


Results for dataset fold number : 2 on model : pS-BERT_20%

               precision    recall  f1-score    support
information    0.857143  0.642857  0.734694  56.000000
requirement    0.411765  0.700000  0.518519  20.000000
accuracy       0.657895  0.657895  0.657895   0.657895
macro avg      0.634454  0.671429  0.626606  76.000000
weighted avg   0.739938  0.657895  0.677806  76.000000
--------------------------------------


  0%|          | 0/2 [00:00<?, ?it/s]


Results for dataset fold number : 3 on model : pS-BERT_20%

               precision    recall  f1-score    support
information    0.885714  0.563636  0.688889  55.000000
requirement    0.400000  0.800000  0.533333  20.000000
accuracy       0.626667  0.626667  0.626667   0.626667
macro avg      0.642857  0.681818  0.611111  75.000000
weighted avg   0.756190  0.626667  0.647407  75.000000
--------------------------------------


  0%|          | 0/2 [00:00<?, ?it/s]


Results for dataset fold number : 4 on model : pS-BERT_20%

               precision    recall  f1-score  support
information    0.897436  0.636364  0.744681    55.00
requirement    0.444444  0.800000  0.571429    20.00
accuracy       0.680000  0.680000  0.680000     0.68
macro avg      0.670940  0.718182  0.658055    75.00
weighted avg   0.776638  0.680000  0.698480    75.00
--------------------------------------


  0%|          | 0/2 [00:00<?, ?it/s]


Results for dataset fold number : 5 on model : pS-BERT_20%

               precision    recall  f1-score    support
information    0.804348  0.660714  0.725490  56.000000
requirement    0.344828  0.526316  0.416667  19.000000
accuracy       0.626667  0.626667  0.626667   0.626667
macro avg      0.574588  0.593515  0.571078  75.000000
weighted avg   0.687936  0.626667  0.647255  75.000000
--------------------------------------


In [335]:
# Mean scores of the Sentence-BERT model across the five folds
avg_acc_score = round(np.mean(avg_accuracy), 2)
weighted_avg, macro_avg = get_avg_report(st_results, folds=5)

# Accuracy is a single number, so it is repeated under all three metric columns.
avg_scores = [weighted_avg, macro_avg, (avg_acc_score,) * 3]

final_df = pd.DataFrame(
    avg_scores,
    columns=['Precision', 'Recall', 'F1_score'],
    index=['weighted_avg', 'macro_avg', 'accuracy_avg'],
)

final_df.rename_axis('5-folds')

Unnamed: 0_level_0,Precision,Recall,F1_score
5-folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
weighted_avg,0.74,0.65,0.67
macro_avg,0.63,0.66,0.62
accuracy_avg,0.65,0.65,0.65


### Random Pipeline

In [119]:
def get_random_label(ranges):
    """
    Draw a random label according to the given ranges.

    Parameters
    ----------
    ranges : list
        Rows whose first element is the label, second element the exclusive
        lower bound and last element the inclusive upper bound. The last
        row's upper bound is the maximum value that can be drawn.

    Returns
    -------
    The label of the first row whose range contains the drawn integer, or
    ``None`` when no row matches.
    """
    # The bounds are built with round(x, 0) and are therefore floats;
    # random.randint needs an integer upper bound (floats are rejected by
    # newer Python versions).
    upper_bound = int(ranges[-1][-1])
    drawn = random.randint(1, upper_bound)

    for row in ranges:
        if row[1] < drawn <= row[-1]:
            return row[0]
    return None

def get_ranges(df):
    """
    Build class-frequency ranges from ``df`` and draw one random label per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'class' column (used for the frequencies and as the
        actual label) and an 'issueid' column.

    Returns
    -------
    tuple
        ``(ranges, r_out)``. ``ranges`` holds one ``[label, lower, upper]``
        row per class, where the bounds are cumulative rounded percentages
        in the order of ``value_counts()``. ``r_out`` holds one
        ``[issueid, class, [random_label]]`` row per row of ``df``.
    """
    total_rows = len(df)
    csum = 0
    ranges = []

    for label, count in df['class'].value_counts().to_dict().items():
        csum_old = csum
        csum += round((count / total_rows) * 100, 0)
        ranges.append([label, csum_old, csum])

    # Iterate the frame that was passed in, not a global `test_df`, so the
    # function works for any caller; exactly one label is drawn per row.
    r_out = [
        [issue_id, actual_class, [get_random_label(ranges)]]
        for issue_id, actual_class in zip(df['issueid'], df['class'])
    ]

    return ranges, r_out

In [212]:
# Collect the per-fold directories of the dataset.
fold_parent = './data/dronology_five_folds/'

sub_folders = [
    os.path.join(fold_parent, entry)
    for entry in os.listdir(fold_parent)
    if 'fold' in entry
]

In [128]:
# Random baseline: draw one label per test row from the class distribution of
# that fold's test split and report the resulting scores.

# Fixed seed so the random baseline gives the same numbers on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

fold_count = 1
for subs in sorted(sub_folders):

    test_path = subs + '/test_' + 'fold_' + str(fold_count) + '.csv'
    test_df = pd.read_csv(test_path)
    ranges, r_out = get_ranges(test_df)

    # Each r_out row is [issueid, class, [random_label]]; flatten the label.
    random_out = pd.DataFrame(
        [(issue_id, actual_class, labels[0]) for issue_id, actual_class, labels in r_out],
        columns=['issueid', 'class', 'top_label'],
    )
    evaluation_results = classification_report(random_out['class'], random_out['top_label'],
                                               target_names=list(map_labels.values()),
                                               output_dict=True)

    report_df = pd.DataFrame(evaluation_results).transpose()
    print('\nResults for fold number :',fold_count)
    print('\n',report_df)
    print('--------------------------------------')

    fold_count += 1


Results for fold number : 1

               precision    recall  f1-score    support
information    0.736842  0.750000  0.743363  56.000000
requirement    0.263158  0.250000  0.256410  20.000000
accuracy       0.618421  0.618421  0.618421   0.618421
macro avg      0.500000  0.500000  0.499887  76.000000
weighted avg   0.612188  0.618421  0.615217  76.000000
--------------------------------------

Results for fold number : 2

               precision    recall  f1-score    support
information    0.754386  0.767857  0.761062  56.000000
requirement    0.315789  0.300000  0.307692  20.000000
accuracy       0.644737  0.644737  0.644737   0.644737
macro avg      0.535088  0.533929  0.534377  76.000000
weighted avg   0.638966  0.644737  0.641754  76.000000
--------------------------------------

Results for fold number : 3

               precision    recall  f1-score    support
information    0.730769  0.690909  0.710280  55.000000
requirement    0.260870  0.300000  0.279070  20.000000
accu