# Embedding Model with Instructions

## Datasets

In [1]:
def display_dataset_info(dataset):
    """Print the name, split sizes and feature schema of a dataset.

    Args:
        dataset: Object exposing an ``info`` attribute that carries
            ``dataset_name``, ``splits`` (mapping of split name to an object
            with ``num_examples``) and ``features`` (mapping of feature name
            to its description).

    Returns:
        None. Everything is written to standard output.
    """
    info = dataset.info
    print(f"Dataset Name: {info.dataset_name}")

    print("Splits Info:")
    # ``splits`` / ``features`` are optional metadata and may be None;
    # treat that as an empty section instead of failing on ``.items()``.
    for split_name, split_info in (info.splits or {}).items():
        print(f" - Split: {split_name}, Num Examples: {split_info.num_examples}")

    print("Features:")
    for feature_name, feature_info in (info.features or {}).items():
        print(f" - {feature_name}: {feature_info}")

In [2]:
from datasets import load_dataset
import random
from collections import defaultdict

def get_dataset(dataset_name, train_size=0, test_size=0):
    """Load a dataset and return its train/test splits, optionally down-sampled.

    Args:
        dataset_name: Identifier passed unchanged to ``load_dataset``.
        train_size: Target number of training rows; ``0`` returns the
            ``'train'`` split untouched.
        test_size: Target number of test rows; ``0`` returns the ``'test'``
            split untouched.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.

    Note:
        Sampling is label-balanced: each label contributes at most
        ``size // number_of_labels`` rows, so the result can hold fewer than
        ``size`` rows when ``size`` is not a multiple of the label count or
        when a label has too few rows. Sampling uses the global ``random``
        state; seed it beforehand for reproducible subsets.
    """
    # Load the dataset
    dataset = load_dataset(dataset_name)

    # Access the train, test splits
    train_dataset = dataset['train']
    test_dataset = dataset['test']

    def shuffle(split, size):
        """Return a shuffled, label-balanced subset of at most ``size`` rows."""
        if size == 0:
            return split

        # Read only the label column: iterating whole examples would
        # materialise every other column just to discard it.
        label_to_indices = defaultdict(list)
        for idx, label in enumerate(split['label']):
            label_to_indices[label].append(idx)

        if not label_to_indices:
            # Empty split: nothing to sample (and no label count to divide by).
            return split.select([])

        # The per-label quota does not change inside the loop, compute it once.
        per_label = size // len(label_to_indices)

        balanced_indices = []
        for indices in label_to_indices.values():
            if len(indices) >= per_label:
                balanced_indices.extend(random.sample(indices, per_label))
            else:
                # Under-represented label: keep every row it has.
                balanced_indices.extend(indices)

        random.shuffle(balanced_indices)
        return split.select(balanced_indices[:size])

    # Shuffle and balance the datasets
    train_dataset = shuffle(train_dataset, train_size)
    test_dataset = shuffle(test_dataset, test_size)

    return train_dataset, test_dataset

## Embedding Models

### BERT - Pipeline

In [3]:
from tqdm.auto import tqdm
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np

def encode_Pipeline(model, dataset, max_length=512, use_cls=True, key="text"):
    """Embed every row of ``dataset`` with a feature-extraction pipeline.

    Args:
        model: Callable pipeline. It is invoked once on a ``KeyDataset`` and
            the result is iterated, one tensor per example.
        dataset: Dataset providing the ``key`` column and a ``"label"`` column.
        max_length: Forwarded to the pipeline call as ``max_length``.
        use_cls: If true, keep the vector at sequence position 0; otherwise
            average over the sequence dimension.
        key: Name of the text column to encode (defaults to ``"text"``).

    Returns:
        ``(embeddings, labels)`` as NumPy arrays; ``embeddings`` has one entry
        per example yielded by the pipeline.
    """
    data = KeyDataset(dataset, key)
    # NOTE(review): confirm that this pipeline honours ``padding`` and
    # ``max_length`` given as call kwargs; feature-extraction pipelines
    # document ``tokenize_kwargs`` for tokenizer options.
    pipe = model(data, return_tensors=True, truncation=True, padding=True, max_length=max_length)
    embeddings = []
    for tensor in tqdm(pipe, desc="Encoding"):
        # Tensor shape is expected to be [batch_size, sequence_length, hidden_size]
        if use_cls:
            embedding = tensor[:, 0, :]
        else:
            embedding = tensor.mean(dim=1)
        # Convert each vector to NumPy right away: building an array from a
        # list of torch tensors is slow and fails for tensors that are not on
        # the CPU.
        embeddings.append(embedding.squeeze().detach().cpu().numpy())
    return np.array(embeddings), np.array(dataset["label"])

### Instructor - Sentence Transformers

In [4]:
def encode_ST(model, dataset):
    """Embed each row as an ``[instruction, text]`` pair, one row per call.

    Args:
        model: Object whose ``encode`` method accepts a list holding a single
            ``[instruction, text]`` pair.
        dataset: Dataset providing ``"text"``, ``"instruction"`` and
            ``"label"`` columns and supporting ``len()``.

    Returns:
        ``(embeddings, labels)`` as NumPy arrays.
    """
    texts = dataset["text"]
    instructions = dataset["instruction"]
    rows = tqdm(zip(texts, instructions), total=len(dataset), desc="Encoding")
    # ``encode`` is given a one-element batch, so squeeze away the batch axis.
    vectors = [
        np.array(model.encode([[instruction, text]])).squeeze()
        for text, instruction in rows
    ]
    return np.array(vectors), np.array(dataset["label"])

### T5 - Transformers (SentencePiece Tokenizer)

In [5]:
from transformers import T5Tokenizer, T5Model
import torch
from tqdm import tqdm
import numpy as np

def encode_T5(dataset, key="text", truncation=True, padding=True, max_length=512,
              use_mean_pooling=True, model_name="google-t5/t5-base"):
    """Embed each row of ``dataset`` with the encoder of a T5 checkpoint.

    The tokenizer and model are loaded on every call and the model is moved
    to CUDA when available, otherwise it stays on the CPU.

    Args:
        dataset: Iterable of dict-like rows; each row must contain ``key`` and
            may contain ``"label"``.
        key: Name of the text field to encode.
        truncation: Forwarded to the tokenizer.
        padding: Forwarded to the tokenizer.
        max_length: Forwarded to the tokenizer.
        use_mean_pooling: If true, average the encoder's last hidden state
            over the positions selected by the attention mask; otherwise take
            the hidden state at sequence position 0.
        model_name: Checkpoint to load for both tokenizer and model
            (defaults to the previously hard-coded ``"google-t5/t5-base"``).

    Returns:
        ``(embeddings, labels)`` as NumPy arrays. Rows whose label is missing
        or ``None`` contribute an embedding but no label, so ``labels`` can be
        shorter than ``embeddings`` for partially labelled data.
    """
    # Check if CUDA is available and set device accordingly
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize the tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5Model.from_pretrained(model_name).to(device)

    embeddings = []
    labels = []

    for data in tqdm(dataset, desc="Encoding text"):
        text = data[key]
        label = data.get("label", None)

        # Tokenize the input text
        inputs = tokenizer(text, truncation=truncation, padding=padding, max_length=max_length, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Only the encoder is run; gradients are not needed for embedding.
        with torch.no_grad():
            encoder_outputs = model.encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        last_hidden_state = encoder_outputs.last_hidden_state

        # Pooling to get a single vector for each input
        if use_mean_pooling:
            attention_mask = inputs['attention_mask']
            mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
            # Clamp so an all-zero mask cannot cause a division by zero.
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            embedding = sum_embeddings / sum_mask
        else:
            embedding = last_hidden_state[:, 0, :]

        embeddings.append(embedding.cpu().numpy().flatten())  # Flatten the embeddings
        if label is not None:
            labels.append(label)

    return np.array(embeddings), np.array(labels)

## Augment Instruction

In [6]:
def mapper_affixes(example, prefix, suffix):
    """Wrap ``example['text']`` with ``prefix`` and ``suffix`` in place.

    Returns the same ``example`` object that was passed in.
    """
    original_text = example['text']
    example['text'] = prefix + original_text + suffix
    return example

def mapper_instruct(example, instruction):
    """Store ``instruction`` under ``example['instruction']`` in place.

    Any existing ``'instruction'`` value is overwritten. Returns the same
    ``example`` object that was passed in.
    """
    example['instruction'] = instruction
    return example

def augment_dataset_Affix(dataset, prefix, suffix):
    """Return ``dataset.map(...)`` with ``mapper_affixes`` applied to each row.

    Args:
        dataset: Object exposing a ``map`` method that takes a per-row callable.
        prefix: String placed before each row's text.
        suffix: String placed after each row's text.
    """
    def add_affixes(example):
        return mapper_affixes(example, prefix, suffix)

    return dataset.map(add_affixes)

def augment_dataset_Inst(dataset, instruction):
    """Return ``dataset.map(...)`` with ``mapper_instruct`` applied to each row.

    Args:
        dataset: Object exposing a ``map`` method that takes a per-row callable.
        instruction: Value stored in every row's ``'instruction'`` field.
    """
    def add_instruction(example):
        return mapper_instruct(example, instruction)

    return dataset.map(add_instruction)

## Evaluation

In [7]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

def evaluate(method, train_embeddings, test_embeddings, train_labels, test_labels):
    """Fit a classifier on the train embeddings and print a test report.

    Args:
        method: ``"SVM"`` for a linear-kernel ``SVC`` or ``"MLP"`` for an
            ``MLPClassifier`` with one hidden layer of 100 units.
        train_embeddings: Feature matrix used for fitting.
        test_embeddings: Feature matrix used for prediction.
        train_labels: Targets aligned with ``train_embeddings``.
        test_labels: Ground truth aligned with ``test_embeddings``.

    Returns:
        None. The classification report is printed with four decimals.

    Raises:
        ValueError: If ``method`` is neither ``"SVM"`` nor ``"MLP"``.
    """
    if method == "SVM":
        model = SVC(kernel='linear')
    elif method == "MLP":
        model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, alpha=1e-4,
                              solver='sgd', verbose=1, random_state=1,
                              learning_rate_init=.1)
    else:
        # Without this branch ``model`` stayed unbound and the failure only
        # surfaced later as an UnboundLocalError at ``model.fit``.
        raise ValueError(
            f"Unknown evaluation method {method!r}; expected 'SVM' or 'MLP'."
        )

    model.fit(train_embeddings, train_labels)
    predicted_labels = model.predict(test_embeddings)
    print("Report on " + method + ": ")
    print(classification_report(y_true=test_labels, y_pred=predicted_labels, digits=4))

## EmbedFlow

In [8]:
from transformers import pipeline
def EmbedFlow_Bert_1(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Run the affix -> embed -> classify flow with ``bert-base-uncased``.

    Each text is wrapped with ``prefix``/``suffix``, embedded through a
    feature-extraction pipeline using the position-0 vector (``use_cls=True``),
    and the embeddings are handed to ``evaluate``.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with ``"text"`` and ``"label"`` columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text.
        suffix: String appended to every text.
    """
    # Use the first GPU when CUDA is available; -1 selects the CPU so the
    # flow also runs on machines without a GPU.
    device = 0 if torch.cuda.is_available() else -1

    # Load Model
    model = pipeline("feature-extraction", model="google-bert/bert-base-uncased", device=device)

    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_Pipeline(model, train_dataset, max_length=512, use_cls=True)
    test_embeddings, test_labels = encode_Pipeline(model, test_dataset, max_length=512, use_cls=True)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [9]:
def EmbedFlow_Bert_2(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Run the affix -> embed -> classify flow with ``bert-large-uncased``.

    Each text is wrapped with ``prefix``/``suffix``, embedded through a
    feature-extraction pipeline with mean pooling over the sequence
    (``use_cls=False``), and the embeddings are handed to ``evaluate``.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with ``"text"`` and ``"label"`` columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text.
        suffix: String appended to every text.
    """
    # Use the first GPU when CUDA is available; -1 selects the CPU so the
    # flow also runs on machines without a GPU.
    device = 0 if torch.cuda.is_available() else -1

    # Load Model
    model = pipeline("feature-extraction", model="google-bert/bert-large-uncased", device=device)

    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_Pipeline(model, train_dataset, max_length=512, use_cls=False)
    test_embeddings, test_labels = encode_Pipeline(model, test_dataset, max_length=512, use_cls=False)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [10]:
from sentence_transformers import SentenceTransformer
def EmbedFlow_Instructor(train_dataset, test_dataset, evaluator, instruction):
    """Run the instruct -> embed -> classify flow with ``hkunlp/instructor-large``.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with ``"text"`` and ``"label"`` columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        instruction: Instruction attached to every row before encoding.
    """
    encoder = SentenceTransformer("hkunlp/instructor-large")

    # Attach the same instruction to every row of both splits.
    train_with_instruction = augment_dataset_Inst(train_dataset, instruction)
    test_with_instruction = augment_dataset_Inst(test_dataset, instruction)

    # Encode train first, then test, with the shared encoder.
    train_vectors, train_targets = encode_ST(encoder, train_with_instruction)
    test_vectors, test_targets = encode_ST(encoder, test_with_instruction)

    evaluate(evaluator, train_vectors, test_vectors, train_targets, test_targets)

In [11]:
def EmbedFlow_T5(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Run the affix -> embed -> classify flow with the T5 encoder.

    ``encode_T5`` loads its own tokenizer and model, so no pipeline is built
    here. The previous version created a ``t5-base`` feature-extraction
    pipeline on ``device=0`` that was never used, which cost load time and
    memory and failed outright on machines without a GPU.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` fields.
        test_dataset: Test split with ``"text"`` and ``"label"`` fields.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text.
        suffix: String appended to every text.
    """
    # Add Instruction
    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_T5(train_dataset)
    test_embeddings, test_labels = encode_T5(test_dataset)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [12]:
def EmbedFlow_GPT(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Run the affix -> embed -> classify flow with ``openai-community/gpt2``.

    Each text is wrapped with ``prefix``/``suffix``, embedded through a
    feature-extraction pipeline with mean pooling over the sequence
    (``use_cls=False``), and the embeddings are handed to ``evaluate``.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with ``"text"`` and ``"label"`` columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text.
        suffix: String appended to every text.
    """
    # Use the first GPU when CUDA is available; -1 selects the CPU so the
    # flow also runs on machines without a GPU.
    device = 0 if torch.cuda.is_available() else -1

    # Load Model
    model = pipeline("feature-extraction", model="openai-community/gpt2", device=device)

    # Add Instruction
    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_Pipeline(model, train_dataset, use_cls=False)
    test_embeddings, test_labels = encode_Pipeline(model, test_dataset, use_cls=False)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

## Testing
Logging: [Google Sheet](https://docs.google.com/spreadsheets/d/1iBDq7C59G6olf_of_sTF5oCY3Itj6_kImzeUl3XMpd8/edit#gid=1587051763)

In [13]:
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Dataset identifiers passed to get_dataset().
datasets = ['stanfordnlp/imdb',
            'yelp_review_full',
            'Voice49/arXiv-Abstract-Label-20k']


# Classifier names understood by evaluate().
evaluator = ['SVM', 'MLP']

# Short affix-style instructions; index 0 is the "no instruction" baseline.
instructions = ['',
                'Movie Review: ',
                'Restaurant Review: ',
                'Sentiment Analysis: ',
                'User Feedback: ',
                'Customer Experience: ',
                'Product Review: ',
                'Service Feedback: ',
                'Experience at: ',
                'Abstract: ',
                'Research Paper Abstract: ',
                'Paper Summary: ']

# Full-sentence prompts. Kept under their own name: this list used to be
# called `test` and was silently overwritten by the test split loaded below.
test_prompts = ["Represent the Review sentence for classifying emotion as positive or negative: ",
                "Represent the movie review sentence for sentiment analysis: ",
                "Classify the sentiment of the following movie review as positive or negative: "]

# Single smoke run: first dataset, SVM evaluator, empty instruction.
train, test = get_dataset(datasets[0], train_size=100, test_size=100)
EmbedFlow_Instructor(train, test, evaluator[0], instructions[0])

# Loop Through Everything (disabled; uses the loop variables rather than
# fixed indices so every dataset/instruction pair is actually covered)
# for dataset in datasets:
#     train, test = get_dataset(dataset, train_size=1000, test_size=1000)
#     for instruction in instructions:
#         print(f"Processing dataset: {dataset}, instruction: '{instruction}'")
#         print("Bert")
#         EmbedFlow_Bert_1(train, test, evaluator[0], instruction)
#         print("Bert2")
#         EmbedFlow_Bert_2(train, test, evaluator[0], instruction)
#         print("Instructor")
#         EmbedFlow_Instructor(train, test, evaluator[0], instruction)
#         print("T5")
#         EmbedFlow_T5(train, test, evaluator[0], instruction)
#         print("GPT2")
#         EmbedFlow_GPT(train, test, evaluator[0], instruction)

TypeError: EmbedFlow_Instructor() missing 2 required positional arguments: 'evaluator' and 'instruction'