# Embedding Model with Instructions

## Datasets

In [61]:
def display_dataset_info(dataset):
    """Print a short summary of a dataset's metadata.

    Reads ``dataset.info`` and prints its ``dataset_name``, the number of
    examples recorded for every entry in ``splits``, and every entry in
    ``features``.

    Args:
        dataset: Object exposing an ``info`` attribute with ``dataset_name``,
            ``splits`` (mapping of split name to an object that has
            ``num_examples``) and ``features`` (a mapping).
    """
    meta = dataset.info
    name, splits, feats = meta.dataset_name, meta.splits, meta.features

    print(f"Dataset Name: {name}")

    print("Splits Info:")
    for split, details in splits.items():
        print(f" - Split: {split}, Num Examples: {details.num_examples}")

    print("Features:")
    for column, spec in feats.items():
        print(f" - {column}: {spec}")

In [62]:
from datasets import load_dataset
import random
from collections import defaultdict

def get_dataset(dataset_name, train_size=0, test_size=0):
    """Load a dataset and return its train and test splits, optionally subsampled.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        train_size: Number of training examples to keep after a shuffle with
            a fixed seed. ``0`` keeps the whole split untouched. A value
            larger than the split keeps every example instead of failing.
        test_size: Same as ``train_size``, applied to the test split.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    dataset = load_dataset(dataset_name)

    # The loaded dataset must provide both a 'train' and a 'test' split.
    train_dataset = dataset['train']
    test_dataset = dataset['test']

    def shuffle(split, size):
        # Label-balanced alternative to ``subsample`` below: draws up to
        # ``size // number_of_labels`` examples per label. It is not called
        # at the moment; swap it in for ``subsample`` to use it.
        if size == 0:
            return split
        label_to_indices = defaultdict(list)
        for idx, example in enumerate(split):
            label_to_indices[example['label']].append(idx)

        balanced_indices = []
        for indices in label_to_indices.values():
            if len(indices) >= size // len(label_to_indices):
                balanced_indices.extend(random.sample(indices, size // len(label_to_indices)))
            else:
                balanced_indices.extend(indices)

        random.shuffle(balanced_indices)
        return split.select(balanced_indices[:size])

    def subsample(split, size):
        # Seeded shuffle so repeated runs pick the same examples.
        if size == 0:
            return split
        # Clamp the request so asking for more rows than the split holds
        # returns the full (shuffled) split rather than an index error.
        keep = min(size, len(split))
        return split.shuffle(seed=42).select(range(keep))

    train_dataset = subsample(train_dataset, train_size)
    test_dataset = subsample(test_dataset, test_size)

    return train_dataset, test_dataset

## Embedding Models

### BERT - Pipeline

In [63]:
from tqdm.auto import tqdm
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np

def encode_Pipeline(model, dataset, max_length=512, use_cls=True):
    """Embed the ``"text"`` column of ``dataset`` and return embeddings with labels.

    Args:
        model: Callable invoked as ``model(data, return_tensors=True, ...)``
            and iterated for one tensor per example (presumably a
            ``transformers`` feature-extraction pipeline - confirm at the
            call site).
        dataset: Dataset with a ``"text"`` column and a ``"label"`` column.
        max_length: Forwarded to the ``model`` call as ``max_length``.
        use_cls: When true, keep the vector at sequence position 0;
            otherwise average over the sequence dimension.

    Returns:
        ``(embeddings, labels)`` as NumPy arrays, one row per example.
    """
    key = "text"
    data = KeyDataset(dataset, key)
    pipe = model(data, return_tensors=True, truncation=True, padding=True, max_length=max_length)
    embeddings = []
    for tensor in tqdm(pipe, desc="Encoding"):
        # Tensor Shape [batch_size, sequence_length, hidden_size]
        if use_cls:
            # First position of the sequence (the [CLS] slot for BERT-style models).
            embedding = tensor[:, 0, :]
        else:
            # Plain mean over every sequence position.
            embedding = tensor.mean(dim=1)
        # Convert explicitly instead of letting np.array() walk a list of
        # torch tensors element by element; .cpu() also covers outputs that
        # are not already in host memory.
        embeddings.append(embedding.squeeze().detach().cpu().numpy())
    return np.array(embeddings), np.array(dataset["label"])

### Instructor - Sentence Transformers

In [64]:
def encode_ST(model, dataset):
    """Embed each example together with its instruction, one example at a time.

    Args:
        model: Object exposing ``encode``; it is called with a single
            ``[instruction, text]`` pair wrapped in a list for every example.
        dataset: Dataset with ``"text"``, ``"instruction"`` and ``"label"``
            columns.

    Returns:
        ``(embeddings, labels)`` as NumPy arrays, one row per example.
    """
    texts = dataset["text"]
    instructions = dataset["instruction"]
    pairs = tqdm(zip(texts, instructions), total=len(dataset), desc="Encoding")
    vectors = [
        np.array(model.encode([[prompt, passage]])).squeeze()
        for passage, prompt in pairs
    ]
    return np.array(vectors), np.array(dataset["label"])

### T5 - Transformer Sentence Piece

In [65]:
from transformers import T5Tokenizer, T5Model
import torch
from tqdm import tqdm
import numpy as np

def encode_T5(dataset, key="text", truncation=True, padding=True, max_length=512, use_mean_pooling=True):
    """Embed every example of ``dataset`` with the encoder of ``google-t5/t5-base``.

    The tokenizer and model are loaded on each call and moved to CUDA when it
    is available, otherwise to the CPU. Examples are encoded one at a time.

    Args:
        dataset: Iterable of mapping-like examples; ``example[key]`` is the
            text and ``example.get("label")`` the optional label.
        key: Name of the text field.
        truncation: Forwarded to the tokenizer.
        padding: Forwarded to the tokenizer.
        max_length: Forwarded to the tokenizer.
        use_mean_pooling: When true, average the encoder states weighted by
            the attention mask; otherwise take the state at position 0.

    Returns:
        ``(embeddings, labels)`` as NumPy arrays. ``labels`` only contains
        entries for examples whose label is not ``None``.
    """
    use_gpu = torch.cuda.is_available()
    device = torch.device('cuda' if use_gpu else 'cpu')

    checkpoint = "google-t5/t5-base"
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    model = T5Model.from_pretrained(checkpoint).to(device)

    vectors, targets = [], []

    for record in tqdm(dataset, desc="Encoding text"):
        sentence = record[key]
        target = record.get("label", None)

        encoded = tokenizer(
            sentence,
            truncation=truncation,
            padding=padding,
            max_length=max_length,
            return_tensors="pt",
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Only the encoder stack is run; gradients are not needed.
        with torch.no_grad():
            encoder_outputs = model.encoder(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
            )
        hidden = encoder_outputs.last_hidden_state

        if not use_mean_pooling:
            pooled = hidden[:, 0, :]
        else:
            # Mask-weighted mean: masked-out positions contribute nothing,
            # and the clamp guards against division by zero.
            weights = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
            totals = (hidden * weights).sum(1)
            counts = weights.sum(1).clamp(min=1e-9)
            pooled = totals / counts

        vectors.append(pooled.cpu().numpy().flatten())
        if target is not None:
            targets.append(target)

    return np.array(vectors), np.array(targets)

## Augment Instruction

In [66]:
def mapper_affixes(example, prefix, suffix):
    """Wrap ``example['text']`` with ``prefix`` and ``suffix`` in place.

    Returns the same ``example`` object after updating its ``'text'`` entry.
    """
    wrapped = prefix + example['text'] + suffix
    example['text'] = wrapped
    return example

def mapper_instruct(example, instruction):
    """Store ``instruction`` under ``example['instruction']`` in place.

    Returns the same ``example`` object; any existing ``'instruction'`` entry
    is overwritten.
    """
    tagged = example
    tagged['instruction'] = instruction
    return tagged

def augment_dataset_Affix(dataset, prefix, suffix):
    """Return ``dataset.map(...)`` with every example's text wrapped in affixes.

    Each example is passed through ``mapper_affixes`` with the given
    ``prefix`` and ``suffix``.
    """
    def add_affixes(row):
        return mapper_affixes(row, prefix, suffix)

    return dataset.map(add_affixes)

def augment_dataset_Inst(dataset, instruction):
    """Return ``dataset.map(...)`` with an ``'instruction'`` field on every example.

    Each example is passed through ``mapper_instruct`` with the given
    ``instruction``.
    """
    def add_instruction(row):
        return mapper_instruct(row, instruction)

    return dataset.map(add_instruction)

## Evaluation

In [67]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

def evaluate(method, train_embeddings, test_embeddings, train_labels, test_labels):
    """Fit a classifier on the training embeddings and print a test report.

    Args:
        method: ``"SVM"`` for a linear-kernel ``SVC`` or ``"MLP"`` for an
            ``MLPClassifier`` with one hidden layer of 100 units.
        train_embeddings: Feature matrix used for fitting.
        test_embeddings: Feature matrix used for prediction.
        train_labels: Targets matching ``train_embeddings``.
        test_labels: Targets matching ``test_embeddings``.

    Raises:
        ValueError: If ``method`` is neither ``"SVM"`` nor ``"MLP"``.
    """
    if method == "SVM":
        model = SVC(kernel='linear')
    elif method == "MLP":
        model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, alpha=1e-4,
                              solver='sgd', verbose=1, random_state=1,
                              learning_rate_init=.1)
    else:
        # Fail fast with a clear message instead of hitting an unbound
        # ``model`` a few lines further down.
        raise ValueError(f"Unknown evaluation method {method!r}; expected 'SVM' or 'MLP'")

    model.fit(train_embeddings, train_labels)
    predicted_labels = model.predict(test_embeddings)
    print("Report on " + method + ": ")
    print(classification_report(y_true=test_labels, y_pred=predicted_labels, digits=4))

## EmbedFlow

In [68]:
from transformers import pipeline
def EmbedFlow_Bert_1(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Embed both splits with ``bert-base-uncased`` (first-token vector) and evaluate.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with the same columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text before encoding.
        suffix: String appended to every text before encoding.
    """
    import torch  # local import so this cell does not depend on earlier cells

    # Use the first GPU when one is present, otherwise run on the CPU
    # (device=-1) instead of failing on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1

    # Load Model
    model = pipeline("feature-extraction", model="google-bert/bert-base-uncased", device=device)

    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_Pipeline(model, train_dataset, max_length=512, use_cls=True)
    test_embeddings, test_labels = encode_Pipeline(model, test_dataset, max_length=512, use_cls=True)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [69]:
def EmbedFlow_Bert_2(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Embed both splits with ``bert-large-uncased`` (mean over tokens) and evaluate.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with the same columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text before encoding.
        suffix: String appended to every text before encoding.
    """
    import torch  # local import so this cell does not depend on earlier cells

    # Use the first GPU when one is present, otherwise run on the CPU
    # (device=-1) instead of failing on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1

    # Load Model
    model = pipeline("feature-extraction", model="google-bert/bert-large-uncased", device=device)

    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_Pipeline(model, train_dataset, max_length=512, use_cls=False)
    test_embeddings, test_labels = encode_Pipeline(model, test_dataset, max_length=512, use_cls=False)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [70]:
from sentence_transformers import SentenceTransformer
def EmbedFlow_Instructor(train_dataset, test_dataset, evaluator, instruction):
    """Embed both splits with ``hkunlp/instructor-large`` and evaluate.

    The instruction is attached to every example as a separate field and is
    passed to the encoder alongside the text.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with the same columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        instruction: Instruction string stored on every example.
    """
    encoder = SentenceTransformer("hkunlp/instructor-large")

    # Attach the instruction to the train split first, then the test split.
    train_dataset, test_dataset = (
        augment_dataset_Inst(split, instruction)
        for split in (train_dataset, test_dataset)
    )

    # Encode in the same order: train, then test.
    (train_embeddings, train_labels), (test_embeddings, test_labels) = (
        encode_ST(encoder, split)
        for split in (train_dataset, test_dataset)
    )

    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [71]:
def EmbedFlow_T5(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Embed both splits with the T5 encoder and evaluate.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` fields.
        test_dataset: Test split with the same fields.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text before encoding.
        suffix: String appended to every text before encoding.
    """
    # No model is built here: ``encode_T5`` loads its own tokenizer and
    # model, so a separately constructed pipeline would never be used.

    # Add Instruction
    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_T5(train_dataset)
    test_embeddings, test_labels = encode_T5(test_dataset)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

In [72]:
def EmbedFlow_GPT(train_dataset, test_dataset, evaluator, prefix, suffix=''):
    """Embed both splits with ``openai-community/gpt2`` (mean over tokens) and evaluate.

    Args:
        train_dataset: Training split with ``"text"`` and ``"label"`` columns.
        test_dataset: Test split with the same columns.
        evaluator: Classifier name forwarded to ``evaluate``.
        prefix: String prepended to every text before encoding.
        suffix: String appended to every text before encoding.
    """
    import torch  # local import so this cell does not depend on earlier cells

    # Use the first GPU when one is present, otherwise run on the CPU
    # (device=-1) instead of failing on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1

    # Load Model
    model = pipeline("feature-extraction", model="openai-community/gpt2", device=device)

    # Add Instruction
    train_dataset = augment_dataset_Affix(train_dataset, prefix, suffix)
    test_dataset = augment_dataset_Affix(test_dataset, prefix, suffix)

    # Embed Dataset
    train_embeddings, train_labels = encode_Pipeline(model, train_dataset, use_cls=False)
    test_embeddings, test_labels = encode_Pipeline(model, test_dataset, use_cls=False)

    # Evaluate
    evaluate(evaluator, train_embeddings, test_embeddings, train_labels, test_labels)

## Testing
Logging: [Google Sheet](https://docs.google.com/spreadsheets/d/1iBDq7C59G6olf_of_sTF5oCY3Itj6_kImzeUl3XMpd8/edit#gid=1587051763)

In [73]:
import warnings
# Silence every warning for the remainder of the run.
warnings.filterwarnings("ignore")

# Earlier prompt set; the first entry is the empty (no-instruction) prompt.
instructions_old = [
    "",
    "Movie Review: ",
    "Restaurant Review: ",
    "Sentiment Analysis: ",
    "User Feedback: ",
    "Customer Experience: ",
    "Product Review: ",
    "Service Feedback: ",
    "Experience at: ",
    "Abstract: ",
    "Research Paper Abstract: ",
    "Paper Summary: ",
]

# Dataset-independent prompts: five short labels followed by five sentences.
instructions_general = [
    "Sentiment Analysis: ",
    "Summarize: ",
    "Review: ",
    "Evaluate: ",
    "Analyze: ",
    "Represent the text for determining the sentiment: ",
    "Summarize the main points of the text for a concise overview: ",
    "Evaluate the text to classify its sentiment and key themes: ",
    "Analyze the text for sentiment classification and thematic understanding: ",
    "Classify the sentiment and summarize the content of the given text: ",
]

# Prompts written for movie reviews.
instructions_imdb = [
    "Movie Review: ",
    "Film Critique: ",
    "Cinema Opinion: ",
    "Film Feedback: ",
    "Review Analysis: ",
    "Represent the Review sentence for classifying emotion as positive or negative: ",
    "Evaluate the sentiment of the movie review sentence: ",
    "Analyze the review to determine if the sentiment is positive or negative: ",
    "Determine the emotional tone of the review for sentiment classification: ",
    "Classify the given movie review as either positive or negative: ",
]

# Prompts written for customer / restaurant reviews.
instructions_yelp = [
    "Customer Experience at: ",
    "Opinion on: ",
    "Feed back on: ",
    "Restaurant Review: ",
    "User Feedback: ",
    "Describe the review in terms of customer satisfaction for the given place: ",
    "Evaluate the review to determine the customer's overall experience: ",
    "Analyze the review to classify the service quality as positive or negative: ",
    "Represent the customer's feedback on the establishment for sentiment analysis: ",
    "Determine the sentiment of the review, indicating whether the experience was positive or negative: ",
]

# Prompts written for research paper abstracts.
instructions_arXiv = [
    "Paper Summary: ",
    "Overview: ",
    "Abstract: ",
    "Key points: ",
    "Main findings: ",
    "Please read the following research paper abstract and summarize the key points and findings discussed: ",
    "Analyze the following abstract from a research paper and provide a detailed overview of its main ideas and conclusions: ",
    "Review the following abstract of a research paper and highlight the primary methodologies and results presented: ",
    "Examine the following abstract from a scientific paper and summarize the main topics and discoveries described: ",
    "Process the following research paper abstract and provide a concise summary of the core themes and contributions of the study: ",
]

In [74]:
# Dataset-specific instruction prompts, keyed by dataset name.
instructions_map = {
    'stanfordnlp/imdb': instructions_imdb,
    'yelp_review_full': instructions_yelp,
    'Voice49/arXiv-Abstract-Label-20k': instructions_arXiv
}

datasets = ['stanfordnlp/imdb',
            'yelp_review_full',
            'Voice49/arXiv-Abstract-Label-20k']

evaluator = ['SVM', 'MLP']

# Datasets handled by this run: ``datasets[-1:]`` is the last dataset only;
# switch to ``datasets[:2]`` to process the first two instead.
selected_datasets = datasets[-1:]

for dataset in selected_datasets:
    # Load the splits of the dataset being processed, so the embeddings and
    # the printed report belong to the dataset named in the log line below.
    # NOTE(review): the encoders read "text" and "label" columns - confirm
    # every dataset listed above provides them.
    train, test = get_dataset(dataset, train_size=1000, test_size=1000)

    # General prompts first, then the prompts written for this dataset.
    instructions = instructions_general + instructions_map[dataset]
    for instruction in instructions:
        print(f"Processing dataset: {dataset}, instruction: '{instruction}'")
        print("Bert")
        EmbedFlow_Bert_1(train, test, evaluator[0], instruction)
        print("Bert2")
        EmbedFlow_Bert_2(train, test, evaluator[0], instruction)
        print("Instructor")
        EmbedFlow_Instructor(train, test, evaluator[0], instruction)
        print("T5")
        EmbedFlow_T5(train, test, evaluator[0], instruction)
        print("GPT2")
        EmbedFlow_GPT(train, test, evaluator[0], instruction)

Processing dataset: Voice49/arXiv-Abstract-Label-20k, instruction: 'Sentiment Analysis: '
Bert


Encoding: 100%|██████████| 1000/1000 [00:10<00:00, 98.16it/s]
Encoding:   8%|▊         | 80/1000 [00:00<00:08, 105.44it/s]


KeyboardInterrupt: 