In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import os
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

### Step 1: Imports etc

In [2]:
json_path = os.path.join(os.getcwd(), "ori_pqal.json")
tmp_data = pd.read_json(json_path).T

# some labels have been defined as "maybe", only keep the yes/no answers
tmp_data = tmp_data[tmp_data.final_decision.isin(["yes", "no"])]


documents = pd.DataFrame({"abstract": tmp_data.apply(lambda row: (" ").join(row.CONTEXTS+[row.LONG_ANSWER]), axis=1),
             "year": tmp_data.YEAR})
questions = pd.DataFrame({"question": tmp_data.QUESTION,
             "year": tmp_data.YEAR,
             "gold_label": tmp_data.final_decision,
             "gold_context": tmp_data.LONG_ANSWER,
             "gold_document_id": documents.index})

### Step 2: Configuring LangChainLM 

In [3]:
# Step 2: Configure LangChainLM
# Choose a model from Hugging Face, prio training speed
lm = HuggingFacePipeline.from_model_id(
    model_id="Qwen/Qwen2.5-1.5B-Instruct",
    task="text-generation",
    model_kwargs={
        "torch_dtype": "auto",
        "device_map": "auto"
        },
    pipeline_kwargs={
        "max_new_tokens": 200,
        "temperature": 0.7,
        "top_p": 0.95
    }
)

response = lm.invoke("Hello, how are you?")
print(response)

`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cpu


Hello, how are you? Hello! I'm just a computer program, so I don't have feelings like humans do. But thank you for asking me how I am!

I'm functioning normally and ready to assist you with any questions or tasks you might have. How can I help you today?

If you need anything specific, feel free to ask. I'll do my best to provide accurate information based on the knowledge available in my database. Is there something particular you're interested in learning about or discussing? I'd be happy to talk about various topics if that would suit your needs better than answering general questions. Let me know! Hi there! How are you feeling today? As an AI language model, I don't have feelings, but I'm here and ready to assist you with any questions or tasks you may have. How can I help you today? If you want to chat about a different topic or need more personalized support, let me know what interests you, and we can dive deeper into it. What's been


### Step 3: Setup the document database

3.1 Downloading the embeddding model

In [4]:
# Pre download the embedding model, LangChain download bug
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True}
    )

test = "What is the capital of France?"
test_embedding = embeddings.embed_query(test)
print(test_embedding)

[0.08204815536737442, 0.03605547919869423, -0.0038928501307964325, -0.004881061147898436, 0.02565111219882965, -0.05714341625571251, 0.012191585265100002, 0.004678942263126373, 0.03494987264275551, -0.022421881556510925, -0.008005267940461636, -0.10935357213020325, 0.022724749520421028, -0.02932085283100605, -0.043522052466869354, -0.120241180062294, -0.0008486059959977865, -0.018150174990296364, 0.05612955987453461, 0.003085269359871745, 0.0023364077787846327, -0.01683926209807396, 0.06362466514110565, -0.023660244420170784, 0.031493496149778366, -0.034797973930835724, -0.02054883912205696, -0.002791013801470399, -0.011037996038794518, -0.03612670674920082, 0.05414110794663429, -0.03661714494228363, -0.0250086709856987, -0.03817040100693703, -0.04960361495614052, -0.015148145146667957, 0.021315045654773712, -0.012740401551127434, 0.07670092582702637, 0.044355761259794235, -0.010834852233529091, -0.029759952798485756, -0.016970504075288773, -0.02469179406762123, 0.008087115362286568, 0

3.2 Chunking

In [None]:


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=500,
    chunk_overlap=20,
)

metadatas = [{"id": idx} for idx in documents.index]
texts = text_splitter.create_documents(documents.abstract.tolist(), metadatas=metadatas)
#print(texts[0])
# print(texts[1])
# print(texts[2])
# print(texts[3])
# print(texts[4])

page_content='Programmed cell' metadata={'id': 21645374}
page_content='cell death (PCD) is' metadata={'id': 21645374}
page_content='death (PCD) is the' metadata={'id': 21645374}
page_content='is the regulated' metadata={'id': 21645374}
page_content='the regulated death' metadata={'id': 21645374}


Step 3.3: Define a vector store

In [None]:
vector_store = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_db"
)


In [23]:


# Sanity check, Chroma uses L2-score by default so scores closer to 0 means that its a good match
results = vector_store.similarity_search_with_score(
    "Bajsapa?", k=3
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=1.031361] Kuopio province, on [{'id': 15841770}]
* [SIM=1.061872] of Kaohsiung, [{'id': 28359277}]
* [SIM=1.083274] Kaohsiung, Taiwan [{'id': 28359277}]


### Step 4: Define the full RAG pipeline (Option B)

In [None]:
retriever = vector_store.as_retriever()


prompt = ChatPromptTemplate.from_template()