This notebook shows how to use Smoothie. 

What you will need for a given task:
* The list of sample inputs (`test_inputs`). In this tutorial, we load this from a jsonl file in `smoothie_data`.
* A set of models to route among, along with each model's generations for `test_inputs`. In this tutorial, each model's generations were saved beforehand in a separate JSON file; we load them and stack them into a numpy array `test_generations` (of shape `n_samples` x `n_models`).


We will walk through an example on the CNN/DailyMail dataset. To follow along, download `smoothie_data` from Hugging Face, `cd` into the directory, and run `git lfs pull`.

If you are interested in the mathematical details of the Smoothie algorithm, please see `algorithm.ipynb`.

In [2]:
import jsonlines
import json 
import numpy as np
from sentence_transformers import SentenceTransformer
from fastembed import TextEmbedding
from sklearn.neighbors import NearestNeighbors

import sys 
sys.path.append("..")
from src.model import Smoothie

ModuleNotFoundError: No module named 'jsonlines'

Load and format data

In [None]:
# Read every record of the CNN/DailyMail test split from its jsonl file.
test_dataset_path = "tutorial_data/datasets/cnn_dailymail_test.jsonl"
with jsonlines.open(test_dataset_path) as reader:
    test_dataset = [record for record in reader.iter()]

# Keep only the raw (unformatted) input text of each record.
test_inputs = [record['embedding_input'] for record in test_dataset]
n_samples = len(test_inputs)

In [None]:
# load test_generations, numpy array (n_samples x n_models) of generations

models = ["mistral-7b", "llama-2-7b", "vicuna-7b", "gemma-7b", "nous-capybara"]
n_models = len(models)

# one list of generations per model, in the same order as `models`
generations_per_model = []
for model in models:
    predictions_path = f"tutorial_data/generations/cnn_dailymail/{model}_test.json"
    # JSON is UTF-8; being explicit keeps non-ASCII generations from being decoded
    # with a locale-dependent default codec
    with open(predictions_path, "r", encoding="utf-8") as f:
        generations_per_model.append(json.load(f)['generations'])

# transpose from (n_models, n_samples) to (n_samples, n_models)
test_generations = np.array(generations_per_model).T

# every model must provide exactly one generation per test input; the later
# reshape and per-sample routing both index test_generations by sample
assert test_generations.shape == (n_samples, n_models), (
    f"expected generations of shape {(n_samples, n_models)}, got {test_generations.shape}"
)

In [None]:
# Embed the raw task inputs for sample-dependent routing.
# Smoothie-dependent runs KNN over these embeddings to decide which samples are
# used to learn the Smoothie weights for a given test sample.

model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name_or_path=model_name)

test_input_embeddings = model.encode(sentences=test_inputs)


In [None]:
# embed test_generations --- these are the embeddings used in the main Smoothie algorithm
def clean_generation(generation: str):
    """
    Strips special tokens from a raw model output and keeps only its first line.

    The tokens "<pad>", "<s>", "</s>" and "</eos>" are removed (in that order),
    literal backslash-n sequences are turned into real newlines, surrounding
    whitespace is trimmed, and everything after the first newline is discarded.
    """
    for special_token in ("<pad>", "<s>", "</s>", "</eos>"):
        generation = generation.replace(special_token, "")
    # some outputs carry an escaped "\n" instead of an actual line break
    unescaped = generation.replace("\\n", "\n")
    first_line, _, _ = unescaped.strip().partition("\n")
    return first_line

# Clean every generation, flattened sample by sample (all models of sample 0,
# then all models of sample 1, ...).
cleaned_test_generations = np.array(
    [
        clean_generation(raw_generation)
        for sample_generations in test_generations
        for raw_generation in sample_generations
    ]
)

embedding_model = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    providers=["CUDAExecutionProvider"],
)

# Embed the flat list, then restore the (n_samples, n_models, embed_dim) layout.
generation_vectors = list(embedding_model.embed(cleaned_test_generations))
smoothie_embeddings = np.array(generation_vectors).reshape(n_samples, n_models, -1)
embed_dim = smoothie_embeddings.shape[-1]

Use either smoothie-dependent or smoothie-independent (only run one of the two cells below!)

In [None]:
# Code for smoothie-dependent
# produces smoothie_dataset_weights, an n_samples x n_models numpy array of scores for each generation in test_generations
# for smoothie-dependent, each row of weights is learned from that sample's own neighborhood, so rows generally differ

# adjust the neighborhood size as you wish; it is capped at n_samples because
# kneighbors() raises a ValueError when asked for more neighbors than fitted samples
n_neighbors = min(20, n_samples)
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm="auto")
nbrs.fit(test_input_embeddings)
# the fitted points are queried against themselves, so a sample is normally among its own neighbors
_, test_indices = nbrs.kneighbors(test_input_embeddings)

smoothie_dataset_weights = []
for sample_idx in range(n_samples):
    # generation embeddings of this sample's neighbors: (n_neighbors, n_models, embed_dim)
    embs_per_sample = smoothie_embeddings[test_indices[sample_idx]]
    # fit a fresh Smoothie model per sample and keep its learned weights
    smoothie = Smoothie(n_voters=n_models, dim=embed_dim)
    smoothie.fit(embs_per_sample)
    smoothie_dataset_weights.append(smoothie.theta)

smoothie_dataset_weights = np.array(smoothie_dataset_weights)

In [None]:
# Smoothie-independent routing.
# A single Smoothie model is fit on the generation embeddings of the whole dataset,
# and its weights are repeated for every sample, so all rows of
# smoothie_dataset_weights are the same.
smoothie = Smoothie(n_voters=n_models, dim=embed_dim)
smoothie.fit(smoothie_embeddings)
smoothie_dataset_weights = np.tile(smoothie.theta, reps=(n_samples, 1))


Select the generation for each sample according to the Smoothie weights

In [None]:
# Finally, route every sample to the generation with the largest Smoothie weight.

# index of the highest-weighted model for each sample
best_model_indices = [
    smoothie_dataset_weights[sample_idx].argmax() for sample_idx in range(n_samples)
]

# the chosen generation and the name of the model that produced it, per sample
routed_texts = [
    test_generations[sample_idx][model_idx]
    for sample_idx, model_idx in enumerate(best_model_indices)
]
routed_models = [models[model_idx] for model_idx in best_model_indices]

In [None]:
# Display the generation selected for each test sample (one entry per sample, in input order).
routed_texts

In [None]:
# Display the name of the model whose generation was selected for each test sample.
routed_models