# Embedding Model with Instructions

## Datasets

In [122]:
def display_dataset_info(dataset):
    """Print a short summary of a dataset to stdout.

    The summary is read from ``dataset.info`` and lists the dataset name,
    the number of examples in every split, and each feature with its
    description.

    Args:
        dataset: Object exposing an ``info`` attribute with ``dataset_name``,
            ``splits`` (mapping of split name to an object that has
            ``num_examples``) and ``features`` (mapping of feature name to
            its description).
    """
    meta = dataset.info
    name, splits, schema = meta.dataset_name, meta.splits, meta.features

    print(f"Dataset Name: {name}")

    print("Splits Info:")
    for split, details in splits.items():
        print(f" - Split: {split}, Num Examples: {details.num_examples}")

    print("Features:")
    for column, spec in schema.items():
        print(f" - {column}: {spec}")

In [123]:
from datasets import load_dataset

def get_dataset(dataset_name, train_size=0, test_size=0):
    """Load a dataset and return its train and test splits.

    Args:
        dataset_name: Identifier handed unchanged to ``load_dataset``.
        train_size: Number of training examples to keep. ``0`` (the default)
            keeps the whole split.
        test_size: Number of test examples to keep. ``0`` (the default)
            keeps the whole split.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    def _sample(split, count):
        # A fixed seed keeps the sampled subset identical between runs.
        if count == 0:
            return split
        return split.shuffle(seed=42).select(range(count))

    splits = load_dataset(dataset_name)
    train_split, test_split = splits['train'], splits['test']

    return _sample(train_split, train_size), _sample(test_split, test_size)

## Embedding Models

In [124]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import warnings

def load_model(model_name):
    """Build the embedding model registered under ``model_name``.

    Supported names:
        * ``"Bert1"`` - feature-extraction pipeline for
          ``google-bert/bert-base-uncased``.
        * ``"Bert2"`` - feature-extraction pipeline for
          ``google-bert/bert-large-uncased``.
        * ``"Instructor"`` - ``SentenceTransformer`` for
          ``hkunlp/instructor-large``.

    Note that calling this function silences all Python warnings for the
    rest of the process.

    Args:
        model_name: One of the names listed above.

    Returns:
        The loaded model, or ``None`` when ``model_name`` is not recognised.
    """
    warnings.filterwarnings("ignore")

    if model_name == "Instructor":
        return SentenceTransformer("hkunlp/instructor-large")

    if model_name == "Bert1":
        checkpoint = "google-bert/bert-base-uncased"
    elif model_name == "Bert2":
        checkpoint = "google-bert/bert-large-uncased"
    else:
        return None

    # Both BERT variants are created with device=0.
    return pipeline("feature-extraction", model=checkpoint, device=0)

### BERT - Pipeline

In [125]:
from tqdm.auto import tqdm
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np

def encode_Bert(model, dataset, key="text", truncation=True, padding=True, max_length=512, use_cls=True):
    """Embed every example of ``dataset`` with a feature-extraction pipeline.

    Args:
        model: Callable pipeline. It is invoked once on the whole dataset
            with ``return_tensors=True`` and must yield one tensor per
            example.
        dataset: Dataset providing the ``key`` column and a ``"label"``
            column.
        key: Name of the column that holds the text to embed.
        truncation: Forwarded unchanged to ``model``.
        padding: Forwarded unchanged to ``model``.
        max_length: Forwarded unchanged to ``model``.
        use_cls: When true, keep only the vector at sequence position 0;
            otherwise average the vectors over the sequence axis.

    Returns:
        A ``(embeddings, labels)`` tuple of NumPy arrays, with one embedding
        row per example.
    """
    data = KeyDataset(dataset, key)
    # NOTE(review): confirm that the pipeline actually honours `padding` and
    # `max_length` when passed as call kwargs (it may expect tokenize_kwargs).
    outputs = model(data, return_tensors=True, truncation=truncation, padding=padding, max_length=max_length)

    embeddings = []
    for tensor in tqdm(outputs, desc="Encoding"):
        # Each output has shape [batch_size, sequence_length, hidden_size].
        if use_cls:
            pooled = tensor[0, 0, :]
        else:
            pooled = tensor.mean(dim=1).flatten()
        # Convert both pooling modes the same way so the list always holds
        # NumPy arrays (the mean branch used to append raw torch tensors).
        embeddings.append(pooled.detach().cpu().numpy())
    return np.array(embeddings), np.array(dataset["label"])

### Instructor - Sentence Transformers

In [126]:
def encode_ST(model, dataset):
    """Embed every example of ``dataset`` with a sentence-transformers model.

    Each example is encoded on its own as an ``[instruction, text]`` pair,
    taken from the ``"instruction"`` and ``"text"`` columns.

    Args:
        model: Object whose ``encode`` method accepts a list of
            ``[instruction, text]`` pairs and returns one embedding per pair.
        dataset: Dataset providing ``"text"``, ``"instruction"`` and
            ``"label"`` columns.

    Returns:
        A ``(embeddings, labels)`` tuple of NumPy arrays.
    """
    pairs = zip(dataset["text"], dataset["instruction"])
    progress = tqdm(pairs, total=len(dataset), desc="Encoding")
    vectors = [model.encode([[inst, txt]])[0] for txt, inst in progress]
    return np.array(vectors), np.array(dataset["label"])

## Augment Instruction

In [127]:
def mapper_affixes(example, prefix, suffix):
    """Wrap the ``'text'`` field of ``example`` with ``prefix`` and ``suffix``.

    The example is modified in place and also returned, so the function can
    be used directly as a ``map`` callback.
    """
    original_text = example['text']
    example['text'] = prefix + original_text + suffix
    return example

def mapper_instruct(example: dict, instruction: str) -> dict:
    """Store ``instruction`` under the ``'instruction'`` key of ``example``.

    Any existing value for that key is overwritten. The example is modified
    in place and also returned, so the function can be used directly as a
    ``map`` callback.
    """
    example['instruction'] = instruction
    return example

def augment_dataset_Affix(dataset, prefix, suffix):
    """Return ``dataset`` mapped so every text is wrapped in the affixes.

    Args:
        dataset: Object with a ``map`` method that applies a per-example
            function.
        prefix: String placed before each example's text.
        suffix: String placed after each example's text.

    Returns:
        Whatever ``dataset.map`` returns.
    """
    def _add_affixes(row):
        return mapper_affixes(row, prefix, suffix)

    return dataset.map(_add_affixes)

def augment_dataset_Inst(dataset, instruction):
    """Return ``dataset`` mapped so every example carries ``instruction``.

    Args:
        dataset: Object with a ``map`` method that applies a per-example
            function.
        instruction: Value stored in each example's ``'instruction'`` field.

    Returns:
        Whatever ``dataset.map`` returns.
    """
    def _attach_instruction(row):
        return mapper_instruct(row, instruction)

    return dataset.map(_attach_instruction)

## Evaluation

In [128]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

def evaluate(method, train_embeddings, test_embeddings, train_labels, test_labels):
    """Fit a classifier on the train embeddings and print a test report.

    Args:
        method: ``"SVM"`` for a linear-kernel ``SVC`` or ``"MLP"`` for an
            ``MLPClassifier`` with a single hidden layer of 100 units.
        train_embeddings: Feature matrix used to fit the classifier.
        test_embeddings: Feature matrix the classifier is evaluated on.
        train_labels: Labels matching ``train_embeddings``.
        test_labels: Ground-truth labels matching ``test_embeddings``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "SVM":
        model = SVC(kernel='linear')
    elif method == "MLP":
        model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, alpha=1e-4,
                              solver='sgd', verbose=1, random_state=1,
                              learning_rate_init=.1)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `model` at the fit call below.
        raise ValueError(
            f"Unknown evaluation method {method!r}; expected 'SVM' or 'MLP'"
        )

    model.fit(train_embeddings, train_labels)
    predicted_labels = model.predict(test_embeddings)
    print("Report on " + method + ": ")
    print(classification_report(y_true=test_labels, y_pred=predicted_labels))

## EmbedFlow

In [129]:
def EmbedFlow(model_name, dataset_name, train_size, test_size, evaluator, prefix, suffix):
    """Run one experiment: load data, embed both splits and evaluate.

    Args:
        model_name: Name understood by ``load_model``.
        dataset_name: Name understood by ``get_dataset``.
        train_size: Number of training examples to use (``0`` for all).
        test_size: Number of test examples to use (``0`` for all).
        evaluator: Classifier name understood by ``evaluate``.
        prefix: For ``"Instructor"`` this is the instruction attached to
            every example; for every other model it is prepended to the
            text.
        suffix: Appended to the text for non-Instructor models; ignored for
            ``"Instructor"``.
    """
    train_dataset, test_dataset = get_dataset(dataset_name, train_size, test_size)
    model = load_model(model_name)

    if model_name == "Instructor":
        # The instruction is kept in its own column and paired with the
        # text at encoding time.
        train_dataset = augment_dataset_Inst(train_dataset, prefix)
        test_dataset = augment_dataset_Inst(test_dataset, prefix)
        train_embeddings, train_labels = encode_ST(model, train_dataset)
        test_embeddings, test_labels = encode_ST(model, test_dataset)
    else:
        # Other models get the prefix/suffix spliced directly into the text
        # and are embedded with mean pooling (use_cls=False).
        train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
        test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)
        train_embeddings, train_labels = encode_Bert(model, train_dataset, use_cls=False)
        test_embeddings, test_labels = encode_Bert(model, test_dataset, use_cls=False)

    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

## Testing

In [130]:
# Options for the experiment; the call below picks one entry from each list.
models = ['Bert1', 'Bert2', 'Instructor']
datasets = ['stanfordnlp/imdb', 'yelp_review_full']
evaluator = ['SVM', 'MLP']

# Text added around each example (the prefix doubles as the instruction
# when the Instructor model is selected).
prefix = ['', 'Movie Review: ', 'Cat and Dog: ']
suffix = ['', '']

# Number of examples sampled from each split.
train_size = 1000
test_size = 1000

# Instructor on IMDB, evaluated with an SVM, using an empty instruction.
EmbedFlow(
    model_name=models[2],
    dataset_name=datasets[0],
    train_size=train_size,
    test_size=test_size,
    evaluator=evaluator[0],
    prefix=prefix[0],
    suffix=suffix[0],
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Encoding:   0%|          | 0/1000 [00:00<?, ?it/s]

Encoding:   0%|          | 0/1000 [00:00<?, ?it/s]

Report on SVM: 
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       512
           1       0.92      0.95      0.94       488

    accuracy                           0.94      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000

