In [None]:
# Finetune Passage Embeddings with GenQ


## Part 3: Applications and Evaluation

## Create vectorstores for both models

In [None]:
import json
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Read the per-passage metadata records and the passage texts from disk.
with open('data/metadatas.json', 'r') as f:
    metadatas = json.loads(f.read())

with open('data/passages.json', 'r') as f:
    passages = json.loads(f.read())

# One embedder per model under comparison: the GenQ-finetuned one and the base one.
embeddings_genq = HuggingFaceEmbeddings(model_name='distilbert-finetuned-pubmed')
embeddings_base = HuggingFaceEmbeddings(model_name='distilbert-embedder')

# Index the same passages and metadata with each embedder, each store in its own directory.
print("Creating vector store for base model...")
db_base = Chroma.from_texts(
    passages,
    embeddings_base,
    metadatas=metadatas,
    persist_directory='db_base_pubmed',
)

print("Creating vector store for finetuned model...")
db_genq = Chroma.from_texts(
    passages,
    embeddings_genq,
    metadatas=metadatas,
    persist_directory='db_genq_pubmed',
)

**Compare the retrieval results for a couple of example queries**

The base model's results are of little use on this domain-specific embedding task. Moreover, it was not finetuned to associate (query, passage) pairs. However, a qualitative inspection of the GenQ model shows extremely promising results. For the more abstract query about the relationship between environment and health, it draws relevant results from multiple articles. For the more specific question about myriocin, all of the top results come from the same source.

In [None]:
queries = [
    "relationship between environment and health",
    "what is myriocin useful for?"
]

# For every example query, print the hits from the base store, then from the GenQ store.
for query_text in queries:
    print(f"Query: {query_text}\n")

    base_hits = db_base.similarity_search(query_text)
    print(f"Base Model: {base_hits}\n")

    genq_hits = db_genq.similarity_search(query_text)
    print(f"GenQ Model: {genq_hits}")

    print("\n")

## Retrieval Augmented Generation

Retrieval Augmented Generation (RAG) is a form of question answering in which a large language model (such as OpenAI's `gpt-3.5-turbo`) scopes its response to relevant content returned by our passage retrievers. RAG is useful when you want to efficiently limit responses to your own dataset. If the LLM deems that the presented context does not adequately answer the question, it will respond with `I don't know.`

In [None]:
from getpass import getpass

# Prompt only when no key is configured yet, so re-running this cell (or running the
# notebook with the variable exported beforehand) does not block on interactive input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API Key: ')

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

query = "what is myriocin successful in treating"


def _sources_chain_for(store):
    """Wrap `store`'s retriever in a map-reduce RetrievalQAWithSourcesChain using OpenAI(temperature=0)."""
    return RetrievalQAWithSourcesChain.from_chain_type(
        OpenAI(temperature=0),
        chain_type="map_reduce",
        retriever=store.as_retriever(),
    )


# Ask the same question through each store so the two answers can be compared.
chain_base = _sources_chain_for(db_base)
print("Base Model:")
print(chain_base({"question": query}))

chain_genq = _sources_chain_for(db_genq)
print("Finetuned Model:")
print(chain_genq({"question": query}))

For the base embedding model, the retrieved documents are not of much use, so the LLM responds `I don't know.` However, the finetuned model gives a concise, accurate response based on the effective passage retrieval. It even returns the source document so that users can dive deeper.

## Extractive Question Answering

Extractive Question Answering is another form of QA in which we use models specifically tuned to pull the relevant answer from the passage. This is useful because there is no generative element: the results come directly from the source material. It can also help highlight relevant information and sort results by relevance, all without relying on an external LLM API. In this instance, we will deploy the extractive QA model to a SageMaker endpoint in our own account to keep full control of our requests.

In [None]:
## Extractive Question Answering

from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role reported by the SageMaker SDK; handed to the model below.
role = sagemaker.get_execution_role()

# Environment passed to the container: the model id and the task to serve.
hub = dict(
    HF_MODEL_ID='deepset/roberta-base-squad2',
    HF_TASK='question-answering',
)

transformers_version, pytorch_version, py_version = '4.26.0', '1.13.1', 'py39'
use_gpu = True

# Image tags are listed at:
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-inference-containers
device = 'gpu' if use_gpu else 'cpu'
cuda_suffix = '-cu117' if use_gpu else ''  # only the GPU tag carries a CUDA suffix
image_tag = f"{pytorch_version}-transformers{transformers_version}-{device}-{py_version}{cuda_suffix}-ubuntu20.04"
image_uri = f"763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:{image_tag}"

# Model definition built from the container image, its environment and the role.
qa_model = HuggingFaceModel(image_uri=image_uri, env=hub, role=role)

# Deploy on a single ml.g4dn.xlarge instance.
qa_predictor = qa_model.deploy(
    initial_instance_count=1,
    instance_type='ml.g4dn.xlarge',
)


In [None]:
import html

# `IPython.display` is the public import location; `IPython.core.display` is deprecated for these names.
from IPython.display import display, HTML

query = "what cancers are most common in elderly patients"

docs = db_genq.similarity_search(query)

# Escape everything interpolated into markup so characters such as "<" or "&"
# in the query or in a passage are shown literally instead of being parsed as HTML.
display(HTML(f"<h3>Query: {html.escape(query)}</h3>"))

# Run the extractive QA endpoint on each retrieved passage and keep the passage
# alongside its prediction so the answer span can be highlighted below.
answers = []
for doc in docs:
    context = doc.page_content
    prediction = qa_predictor.predict({
        "inputs": {
            "question": query,
            "context": context
        }
    })
    prediction['context'] = context
    answers.append(prediction)

answers = sorted(answers, key=lambda x: x['score'], reverse=True)

# Show results sorted by relevance, with the predicted answer span in bold.
for ans in answers:
    context = ans['context']
    start, end = ans['start'], ans['end']
    # Slice on the raw text first (start/end index the unescaped passage), then escape each piece.
    before = html.escape(context[:start])
    span = html.escape(context[start:end])
    after = html.escape(context[end:])
    display(HTML(f"{before}<b>{span}</b>{after}<br><i>Relevance: {round(ans['score'], 2)}</i>"))

## Visualize Embedding Space

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
import textwrap
import plotly.offline as pyo


def prepare_embedding_data(db: Chroma, query: str, limit: int = 1000):
    """Build a 2-D PCA projection of stored embeddings, flagging the hits for `query`.

    Up to `limit` stored entries are read from the collection behind `db`. The
    entries returned for `query` are then appended as extra rows marked
    ``returned_doc=True`` with a larger marker size, so a returned entry can
    appear twice (once as a background point, once as a highlighted hit).

    Args:
        db: Vector store whose private `_collection` and `_embedding_function`
            attributes are used directly.
        query: Text to embed and look up in the collection.
        limit: Maximum number of stored entries fetched as background points.

    Returns:
        DataFrame with the columns ``ids``, ``documents``, ``embeddings``,
        ``metadatas``, ``returned_doc``, ``size``, ``source``, ``x0`` and ``x1``.
        ``documents`` is line-wrapped and joined with ``<br>``; ``source`` is the
        metadata's "source" value (None when absent); ``x0``/``x1`` are the first
        two principal components of the embeddings.
    """
    include = ["documents", "embeddings", "metadatas"]
    columns = ["ids", *include]

    # Keep only the fields used below, so any additional keys in the payload
    # cannot interfere with building the frame.
    stored = db._collection.get(limit=limit, include=include)
    df = pd.DataFrame({col: list(stored[col]) for col in columns})
    df["returned_doc"] = False
    df["size"] = 7.5

    query_emb = db._embedding_function.embed_query(query)
    res = db._collection.query(query_embeddings=query_emb, include=include)
    # Results are nested one level per query; a single query was sent, hence [0].
    df_res = pd.DataFrame({col: list(res[col][0]) for col in columns})
    df_res["returned_doc"] = True
    df_res["size"] = 20

    # ignore_index gives every row a unique label instead of repeating the hits' 0..k-1.
    df = pd.concat([df, df_res], ignore_index=True)

    # Tolerate entries without metadata or without a "source" key.
    df["source"] = df["metadatas"].apply(lambda meta: (meta or {}).get("source"))
    # Wrap long passages so the hover text stays readable.
    df["documents"] = df["documents"].apply(lambda text: "<br>".join(textwrap.wrap(text)))

    X = np.array(df["embeddings"].tolist())

    X_out = PCA(2).fit_transform(X)
    df["x0"] = X_out[:, 0]
    df["x1"] = X_out[:, 1]

    return df


def create_figure(df: pd.DataFrame, title: str):
    """Scatter-plot the projected points in `df`, coloured by `returned_doc` and sized by `size`.

    Hovering a point shows its `documents` text. Returns the Plotly figure.
    """
    scatter_options = dict(
        x="x0",
        y="x1",
        color="returned_doc",
        custom_data=["documents"],
        size="size",
        title=title,
    )
    # update_traces returns the figure itself, so the calls can be chained.
    return px.scatter(df, **scatter_options).update_traces(hovertemplate="%{customdata[0]}")


query = "relationship between environment and health"

# Project both stores for the same query so the two plots are directly comparable.
df_base = prepare_embedding_data(db_base, query=query)
df_genq = prepare_embedding_data(db_genq, query=query)

# Plotly titles render a small subset of HTML, not Markdown, so bold text needs
# <b> tags; "**...**" would be displayed as literal asterisks.
fig_base = create_figure(df_base, title=f"<b>Base Model</b>: {query}")
fig_genq = create_figure(df_genq, title=f"<b>GenQ Model</b>: {query}")

pyo.init_notebook_mode()
pyo.iplot(fig_genq)

In [None]:
# Render the base-model figure built in the previous cell, for comparison with the GenQ plot shown there.
pyo.iplot(fig_base)

In [None]:
from typing import Dict, List
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
import json


class ContentHandler(EmbeddingsContentHandler):
    """JSON request/response adapter for an embeddings endpoint.

    Requests are encoded as ``{"inputs": [...], **model_kwargs}`` and responses
    are expected to be a JSON object with a ``"vectors"`` key.
    """

    # Both the request body and the accepted response are JSON.
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: List[str], model_kwargs: Dict) -> bytes:
        """Serialize `inputs` plus any extra `model_kwargs` into a UTF-8 JSON body."""
        # Treat a missing kwargs mapping as empty rather than failing on `**None`.
        input_str = json.dumps({"inputs": inputs, **(model_kwargs or {})})
        return input_str.encode('utf-8')

    def transform_output(self, output: bytes) -> List[List[float]]:
        """Decode the endpoint response and return the value under its ``"vectors"`` key."""
        # The response arrives as a file-like body exposing .read(); plain bytes
        # (as the annotation suggests) are accepted as well.
        raw = output.read() if hasattr(output, "read") else output
        response_json = json.loads(raw.decode("utf-8"))
        return response_json["vectors"]

content_handler = ContentHandler()

# Strip surrounding whitespace: a trailing newline in the file would otherwise
# become part of the endpoint name passed to the client.
with open('.endpoint_name', 'r') as f:
    endpoint_name = f.read().strip()

embeddings = SagemakerEndpointEmbeddings(
    endpoint_name=endpoint_name,
    region_name="us-east-1",
    content_handler=content_handler
)

# now you can use this SagemakerEndpointEmbeddings object in a vectorstore as shown above
db = Chroma.from_texts(passages, embeddings, metadatas=metadatas)