# **Generate data and install Mosek**

Please install Mosek according to the instructions on [https://docs.mosek.com/latest/install/installation.html](https://docs.mosek.com/latest/install/installation.html).

Before running this cell, run the following in bash:

`export PATH=/root/mosek/9.3/tools/platform/linux64x86/bin:$PATH`.

Please download MOSEK from [here](https://drive.google.com/drive/folders/1ZIKizbByyQIZmWCPTgFvnnQ6bVzUcRvF?usp=sharing) and add all contents to the current working directory. 

Please replace *mosek.lic* with your own license file obtained from [here](https://www.mosek.com/products/academic-licenses/).

Please download *MT-DNN.zip* which contains all models and scripts used in this notebook from [here](https://drive.google.com/file/d/1Wq4V93ZZC3sraVRY62agwXIJC0PubuGF/view?usp=sharing) and place it in the current working directory.

In [None]:
! bash install-mosek.sh
! unzip mt-dnn.zip
! rm mt-dnn.zip
! bash create-leopard-environment.sh

# **LEOPARD**

Set up the environment required for the classification tasks defined by Bansal et al.

In [None]:
import os
import json

DATA_PATH = "/content/leopard/data/json"

def get_categories():
    """Return the category directory names to evaluate, with "emotion" last.

    Lists the entries of ``DATA_PATH``, drops "restaurant" and "conll", and
    moves "emotion" to the end of the list. ``list.remove`` raises
    ``ValueError`` if any of those three entries is missing.
    """
    names = os.listdir(DATA_PATH)
    for excluded in ("restaurant", "conll"):
        names.remove(excluded)

    # "emotion" is taken out of its listed position and re-added at the end.
    names.remove("emotion")
    return names + ["emotion"]

def get_labelled_training_sentences(category, shot, episode):
    """Load the training sentences for one category, shot count and episode.

    Reads every file in ``DATA_PATH/<category>/`` whose name ends with
    ``_<episode>_<shot>.json``. Each file is read as a JSON sequence of
    records carrying ``processed_sent`` and ``label`` entries.

    Returns:
        A tuple ``(sentences, labels, label_keys)``. ``sentences`` holds the
        processed sentences with the ``[CLS]``, ``[SEP]`` and ``[PAD]``
        markers removed, ``labels`` holds the integer index of each
        sentence's label, and ``label_keys`` maps every original label to
        its index (indices are assigned in order of first appearance).
    """
    sentences = []
    labels = []
    label_keys = {}
    data_path = DATA_PATH + "/" + category + "/"
    file_suffix = "_" + str(episode) + "_" + str(shot) + ".json"
    for file_name in os.listdir(data_path):
        if not file_name.endswith(file_suffix):
            continue
        # Close the file deterministically and do not depend on the locale
        # encoding when reading the JSON.
        with open(data_path + file_name, encoding="utf-8") as data_file:
            data = json.load(data_file)
        for record in data:
            sentence = record['processed_sent']
            label = record['label']
            for marker in ('[CLS]', '[SEP]', '[PAD]'):
                sentence = sentence.replace(marker, '')
            sentences.append(sentence)
            # convert categorical labels to numeric values
            label_keys.setdefault(label, len(label_keys))
            labels.append(label_keys[label])
    return sentences, labels, label_keys

def get_labelled_test_sentences(category):
    """Load the evaluation sentences and their raw labels for one category.

    Reads every file in ``DATA_PATH/<category>/`` whose name ends with
    ``_eval.json``. Each file is read as a JSON sequence of records carrying
    ``processed_sent`` and ``label`` entries.

    Returns:
        A tuple ``(sentences, labels)``. ``sentences`` holds the processed
        sentences with the ``[CLS]``, ``[SEP]`` and ``[PAD]`` markers
        removed; ``labels`` holds the labels exactly as stored in the file
        (they are not converted to numeric indices here).
    """
    sentences = []
    labels = []
    data_path = DATA_PATH + "/" + category + "/"
    for file_name in os.listdir(data_path):
        if not file_name.endswith("_eval.json"):
            continue
        # Close the file deterministically and do not depend on the locale
        # encoding when reading the JSON.
        with open(data_path + file_name, encoding="utf-8") as data_file:
            data = json.load(data_file)
        for record in data:
            sentence = record['processed_sent']
            label = record['label']
            for marker in ('[CLS]', '[SEP]', '[PAD]'):
                sentence = sentence.replace(marker, '')
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# **Set Up MT-DNN**

In [None]:
! pip install pytorch_pretrained_bert==0.4.0

import sys
# Make modules under /content/mt-dnn/ importable; the directory is inserted at
# index 1, i.e. directly after the first sys.path entry.
sys.path.insert(1, '/content/mt-dnn/')

# Shell command lines that run mt-dnn/extractor.py with
# mt-dnn/input_examples/single-input.txt as --finput and
# mt-dnn/input_examples/single-output.json as --foutput. The two commands
# differ only in the --checkpoint they pass (base vs. large). All paths are
# relative, so the commands depend on the current working directory.
# NOTE(review): the LARGE command still passes --bert_model bert-base-uncased
# alongside the large checkpoint — confirm this is intended.
EXTRACT_FEATURES_COMMAND_BASE = "python mt-dnn/extractor.py --do_lower_case --finput mt-dnn/input_examples/single-input.txt --foutput mt-dnn/input_examples/single-output.json --bert_model bert-base-uncased --checkpoint mt-dnn/mt_dnn_models/mt_dnn_base_uncased.pt"
EXTRACT_FEATURES_COMMAND_LARGE = "python mt-dnn/extractor.py --do_lower_case --finput mt-dnn/input_examples/single-input.txt --foutput mt-dnn/input_examples/single-output.json --bert_model bert-base-uncased --checkpoint mt-dnn/mt_dnn_models/mt_dnn_large_uncased.pt"

# **Less Than One Shot Classification (Sucholutsky et al, 2021)**

Original source can be found [here](https://github.com/ilia10000/LO-Shot/blob/master/Paper3/Soft%20Label%20Optimization.ipynb). Run the script to install the environment.

In [None]:
! bash create-lo-shot-environment.sh

Load the required scripts and definitions.

In [None]:
% run lo_shot_definitions.ipynb

# **Extracting the data and creating the training set**


Generate the sentence encoding for each sentence. Store this along with the labels; use it to generate lines and prototypes.

In [None]:
import ast

def get_labelled_training_data(category, shot, episode):
    """Encode the training sentences of one category, shot count and episode.

    The sentences are written one per line to the extractor input file,
    ``EXTRACT_FEATURES_COMMAND_BASE`` is run through ``os.system``, and the
    extractor output file is parsed back into NumPy arrays.

    Returns:
        A tuple ``(training_encodings, training_labels, label_keys)``; the
        labels and the label mapping come straight from
        ``get_labelled_training_sentences``.
    """
    sentences, training_labels, label_keys = get_labelled_training_sentences(category, shot, episode)

    # Hand the sentences to the extractor through its input file.
    with open("/content/mt-dnn/input_examples/single-input.txt", 'w', encoding='utf-8') as input_file:
        input_file.write('\n'.join(sentences))

    # The exit status of the command is not inspected.
    os.system(EXTRACT_FEATURES_COMMAND_BASE)

    with open('/content/mt-dnn/input_examples/single-output.json', 'r') as output_file:
        extracted = json.loads(output_file.read())

    # Each output record stores its vector as a string literal under key '11'
    # (presumably the encoder layer index — TODO confirm against extractor.py).
    training_encodings = [np.array(ast.literal_eval(entry['11'])) for entry in extracted]
    return training_encodings, training_labels, label_keys

Get centroids for the training data.

In [None]:
def get_labelled_centroids(training_encodings, training_labels):
    """Compute one centroid per distinct training label.

    For every distinct value in ``training_labels`` the encodings at the
    positions carrying that label are averaged along axis 0.

    Returns:
        A tuple ``(centroids, centroid_labels)`` of two parallel lists: the
        mean encoding of each label and the label it belongs to, in the
        iteration order of ``set(training_labels)``.
    """
    centroid_labels = list(set(training_labels))
    centroids = []
    for wanted in centroid_labels:
        members = [
            training_encodings[position]
            for position, current in enumerate(training_labels)
            if current == wanted
        ]
        centroids.append(np.mean(np.array(members), axis=0))
    return centroids, centroid_labels

Create the test dataset for the sentences.

In [None]:
def get_labelled_test_data(category):
    """Encode the evaluation sentences of one category.

    The sentences are written one per line to the extractor input file,
    ``EXTRACT_FEATURES_COMMAND_BASE`` is run through ``os.system``, and the
    extractor output file is parsed back into NumPy arrays.

    Returns:
        A tuple ``(test_encodings, test_labels)``; the labels are returned
        exactly as ``get_labelled_test_sentences`` produced them.
    """
    sentences, test_labels = get_labelled_test_sentences(category)

    # Hand the sentences to the extractor through its input file.
    with open("/content/mt-dnn/input_examples/single-input.txt", 'w', encoding='utf-8') as input_file:
        input_file.write('\n'.join(sentences))

    # The exit status of the command is not inspected.
    os.system(EXTRACT_FEATURES_COMMAND_BASE)

    with open('/content/mt-dnn/input_examples/single-output.json', 'r') as output_file:
        extracted = json.loads(output_file.read())

    # Each output record stores its vector as a string literal under key '11'
    # (presumably the encoder layer index — TODO confirm against extractor.py).
    test_encodings = [np.array(ast.literal_eval(entry['11'])) for entry in extracted]
    return test_encodings, test_labels

# **Classification using different flavours of KNN**

Perform the classification using soft label KNN

In [None]:
def get_soft_label_prototypes(lines, labeled_centroids):
    """Fit one ``SoftKNN(k=1)`` classifier per line.

    Args:
        lines: iterable of lines; each one is passed to
            ``get_line_prototypes`` together with ``labeled_centroids[0]``.
        labeled_centroids: indexable whose first element is handed to
            ``get_line_prototypes`` (the caller passes a
            ``(centroids, labels)`` pair).

    Returns:
        A list parallel to ``lines`` holding the fitted classifier for each
        line, or ``None`` where computing or fitting the prototypes failed.
    """
    classifiers = []
    for line in lines:
        try:
            distX, distY = get_line_prototypes(line, labeled_centroids[0])
            classifier = SoftKNN(k=1)
            classifier.fit(distX, distY)
        except Exception as error:
            # Best effort: keep the list aligned with `lines`, but report the
            # failure and let KeyboardInterrupt/SystemExit propagate.
            print("Skipping line", line, "because its prototypes could not be fitted:", repr(error))
            classifiers.append(None)
        else:
            classifiers.append(classifier)
            print("distX is", distX)
            print("distY is", distY)

    return classifiers

# **Classify and measure metrics**

Classify the test data using centroids from the training data.

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Fail early with an explicit error (not an assert, which is stripped under
# `python -O`) when the MOSEK solver is unavailable.
if cp.MOSEK not in cp.installed_solvers():
    raise RuntimeError("The MOSEK solver is not installed; see the installation instructions at the top of this notebook.")

# For every category, episode and shot count: encode the data, build lines
# through the class centroids, fit one soft-label classifier per line, send
# each test point to the classifier of its nearest line and report the metrics.
for category in get_categories():
    # The test encodings depend only on the category, so they are computed
    # once and reused for every episode and shot count.
    test_encodings, categorical_test_labels = get_labelled_test_data(category)
    for episode in range(10):
        for shot in [4, 8, 16]:
            predictions = []
            true_labels = []

            training_encodings, training_labels, label_keys = get_labelled_training_data(category, shot, episode)
            if not training_encodings:
                continue
            # convert categorical attributes to numeric indices
            test_labels = [label_keys[label] for label in categorical_test_labels]
            centroids, centroid_labels = get_labelled_centroids(training_encodings, training_labels)

            labeled_training_data_np = np.stack(training_encodings, axis=0)
            labeled_test_data_np = np.stack(test_encodings, axis=0)
            labeled_centroids_np = np.stack(centroids, axis=0)

            test_points = list(labeled_test_data_np)
            labeled_centroids = list(labeled_centroids_np), list(centroid_labels)

            # Numeric labels start at 0, so the largest centroid label plus one
            # is the number of classes.
            test_classifications = max(centroid_labels) + 1
            required_lines = test_classifications - 1
            dimensions = labeled_training_data_np.shape[1]

            candidate_lines = find_lines_R_multiD(dat=labeled_training_data_np, labels=training_labels, dims=dimensions, centroids=labeled_centroids_np, k=required_lines)
            lines = [line_order_no_endpoints(centroids=labeled_centroids_np, active_classes=np.array(line)) for line in candidate_lines]
            classifiers = get_soft_label_prototypes(lines, labeled_centroids)

            for class_index in range(test_classifications):
                points_required = [test_points[index] for index, label in enumerate(test_labels) if label == class_index]
                for point in points_required:
                    # Pick the line closest to the point, measured against the
                    # centroids at the line's first and last entries.
                    dists = [dist_to_line_multiD(point, labeled_centroids[0][line[0]], labeled_centroids[0][line[-1]]) for line in lines]
                    classifier = classifiers[np.argmin(dists)]
                    # Points whose nearest line has no fitted classifier are
                    # left out of the metrics.
                    if classifier is not None:
                        predictions.append(classifier.predict(point))
                        true_labels.append(class_index)

            print("For category", category, "and shot =", str(shot) + "...")
            print("Lines used are", len(lines))
            print("Number of classifications are", test_classifications)
            print("Macro f1 score is", f1_score(true_labels, predictions, average='macro'))
            print("Accuracy is", accuracy_score(true_labels, predictions))
            print("Correctly classified points are", np.sum(np.array(true_labels) == np.array(predictions)), "/", len(true_labels), "\n")