In [1]:
import requests
import chromadb
import re
import uuid
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

def parse_experiments(url:str="https://raw.githubusercontent.com/cgrodrigues/rag-intro/main/coldf_secret_experiments.txt", timeout:float=30):
    """Download the ColdF secret-experiments file and split it into one string per experiment.

    Args:
        url: Location of the plain-text file. Defaults to the copy hosted in the
            rag-intro repository.
        timeout: Seconds to wait for the server before giving up, so an
            unreachable host cannot block the notebook forever.

    Returns:
        A list of strings. Each entry is "# Experiment " followed by the
        stripped text of one section of the file.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the file: {response.status_code}")

    # Splitting consumes the "# Experiment" marker, so it is put back in front
    # of every non-empty section.
    sections = response.text.split('# Experiment')
    return ['# Experiment ' + section.strip() for section in sections if section.strip()]
    

def init_chroma_db(store_name:str="documents"):
    """Open the on-disk ChromaDB database and fetch (or create) a collection.

    Args:
        store_name: Name of the collection to get or create.

    Returns:
        A ``(client, collection)`` tuple.
    """
    client = chromadb.PersistentClient(path="./cromadb")
    return client, client.get_or_create_collection(store_name)

def chunk_embed_text(input, context_encoder, context_tokenizer, chunk_size:int=0, overlap_size:int=0 ):
    """Split each text into character chunks and build an id and an embedding per chunk.

    Args:
        input: Iterable of strings to process.
        context_encoder: Callable invoked as ``context_encoder(**inputs)``; the
            embedding is read from the ``pooler_output`` of its result.
        context_tokenizer: Callable invoked as
            ``context_tokenizer(chunk, return_tensors='pt')``.
        chunk_size: Chunk length in characters. ``0`` keeps every text as a
            single chunk.
        overlap_size: Number of characters shared by consecutive chunks. When
            ``chunk_size`` is non-zero it must be smaller than ``chunk_size``.

    Returns:
        A ``(chunks, ids, embeddings)`` tuple of three parallel lists. Each id
        is ``"<prefix>_<n>"`` where ``prefix`` is the leading
        "# Experiment <number>..." line of the text (or a random UUID when the
        text has no such line) and ``n`` is the chunk's position in its text.

    Raises:
        ValueError: If ``chunk_size`` is negative, or if ``overlap_size`` is
            not smaller than a non-zero ``chunk_size``. Both cases would
            otherwise never advance through the text and loop forever.
    """
    if chunk_size < 0:
        raise ValueError(f"chunk_size must be >= 0, got {chunk_size}")
    if chunk_size > 0 and overlap_size >= chunk_size:
        raise ValueError(
            f"overlap_size ({overlap_size}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    ids = []
    embeddings = []
    # Anchored at the very start of the text; '.' stops at the first newline,
    # so the match is the whole header line.
    header_pattern = re.compile(r"^# Experiment \d+.*")

    for text in input:
        # chunk_size == 0 means "one chunk per text": the window is widened by
        # the overlap so that a single step covers the whole text.
        window = chunk_size if chunk_size else len(text) + overlap_size
        step = window - overlap_size

        header = header_pattern.match(text)
        base_id = header.group(0) if header else uuid.uuid4()

        start = 0
        position = 0
        while start < len(text):
            # get the chunk
            chunk = text[start:start + window]
            chunks.append(chunk)

            # get the embeddings
            inputs = context_tokenizer(chunk, return_tensors='pt')
            embedding = context_encoder(**inputs).pooler_output.detach().numpy()[0].tolist()
            embeddings.append(embedding)

            # get the id
            ids.append(f"{base_id}_{position}")

            position += 1
            start += step

    return chunks, ids, embeddings


def preprocess_text_to_chroma(text, vector_store, chunk_size:int=0, overlap_size:int=0): 
    """Chunk and embed the texts, then store them in a ChromaDB collection.

    Args:
        text: Iterable of strings to index.
        vector_store: ChromaDB collection that receives the chunks.
        chunk_size: Chunk length in characters, forwarded to chunk_embed_text
            (0 keeps each text as one chunk).
        overlap_size: Characters shared by consecutive chunks, forwarded to
            chunk_embed_text.

    Returns:
        None. The collection is modified in place.
    """

    # Get the encoder and tokenizer
    context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    
    # Create the chunks, ids and embeddings from the experiment text to put in the database
    chunks, ids, embeddings = chunk_embed_text(input=text, 
                                               context_encoder=context_encoder, 
                                               context_tokenizer=context_tokenizer, 
                                               chunk_size=chunk_size, 
                                               overlap_size=overlap_size)

    # Nothing to store (e.g. an empty list of texts): skip the database call.
    if not chunks:
        return

    # The collection lives on disk and the ids are derived from the experiment
    # headers, so re-running the notebook produces the same ids again. upsert
    # creates missing entries and refreshes existing ones, which keeps the
    # cell safe to re-run.
    vector_store.upsert(documents=chunks, embeddings=embeddings, ids=ids)
    
# Download the experiments: one string per experiment
text = parse_experiments()

# Open the persistent vector database and fetch (or create) the "documents" collection
chroma_client, vector_store = init_chroma_db(store_name="documents")

# Embed every experiment as a single chunk (chunk_size=0) and store it in the collection
preprocess_text_to_chroma(
    text=text,
    vector_store=vector_store,
    chunk_size=0,
    overlap_size=0,
)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this fu

In [4]:
from chromadb.api.types import QueryResult
from ollama import Client

def get_inference_prompt(question:str, context_encoder, context_tokenizer, n_results:int=3) -> tuple[str, QueryResult]:
    """Retrieve the chunks most relevant to the question and build the model prompt.

    The question is embedded with the given tokenizer and encoder, and the
    embedding is used to query the module-level ``vector_store`` collection,
    which must already exist when this function is called.

    Args:
        question: The end user's question.
        context_encoder: Callable invoked as ``context_encoder(**inputs)``; the
            embedding is read from the ``pooler_output`` of its result.
        context_tokenizer: Callable invoked as
            ``context_tokenizer(question, return_tensors='pt')``.
        n_results: How many chunks to retrieve from the database.

    Returns:
        A ``(prompt, results)`` tuple: the prompt text to send to the model and
        the raw result of the database query.
    """
    inputs = context_tokenizer(question, return_tensors='pt')
    query_embeddings = context_encoder(**inputs).pooler_output.detach().numpy()[0].tolist()

    results = vector_store.query(query_embeddings, n_results=n_results)

    # results['documents'] holds one list of documents per query embedding;
    # a single embedding was sent, so only the first list is used.
    context_text = "\n".join(results['documents'][0])

    prompt = f"""DOCUMENT:
{context_text}

QUESTION:
{question}

INSTRUCTIONS:
Answer the users QUESTION using the DOCUMENT text above.
Give short and concise answers.
Keep your answer ground in the facts of the DOCUMENT.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return 'NONE'"""

    return prompt, results


def get_inference(question, context_encoder, context_tokenizer):
    """Answer a question with the llama3 model served by Ollama.

    Builds the retrieval-augmented prompt for the question and sends it to the
    model as a single system message.

    Returns:
        A ``(response, prompt, db_results)`` tuple: the chat response, the
        prompt that was sent and the raw database query result.
    """
    prompt, db_results = get_inference_prompt(question, context_encoder, context_tokenizer)

    client = Client(host="")
    generation_options = {"seed": 42, "top_p": 0.9, "temperature": 0 }
    response = client.chat(
        model="llama3",
        messages=[{"role": "system", "content": prompt}],
        options=generation_options,
    )

    return response, prompt, db_results

# Ask the end user for a question
question = input("Please enter question: ")

# Load the encoder and tokenizer used to embed the question
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Retrieve the relevant chunks, build the prompt and query the model
response, prompt, db_results = get_inference(question, context_encoder, context_tokenizer)

# Report the prompt, the database hits and the model answer, each framed by a separator line
print(
    "\n================================\n",
    f"Prompt: {prompt}",
    "\n================================\n",
    f"Database Results: {db_results}",
    "\n================================\n",
    f"Response: {response['message']['content']}",
    "\n================================\n",
    sep="\n",
)

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz



Prompt: DOCUMENT:
# Experiment 1
## May 23, 2024
The first experiment focused on using palladium electrodes submerged in heavy water (deuterium oxide, D2O). Dr. Emily D. Jensen, Senior Physicist, led this trial. The procedure involved electrolysis at a constant current of 50 mA, aiming to induce cold fusion within the palladium lattice. Throughout the 12-hour process, temperatures were carefully monitored, maintaining a steady 25°C. Voltage readings were recorded every hour to observe any anomalies indicating fusion events. The experiment yielded promising preliminary results with minor heat generation detected, suggesting potential excess energy beyond chemical reactions.
To ensure accuracy, additional measurements included the analysis of gas output. Both hydrogen and deuterium gas levels were monitored using a gas chromatograph, which revealed an increase in deuterium gas concentration over time. This increase suggested that deuterium nuclei might be fusing within the palladium la

In [5]:
# Display the full chat response returned by get_inference (not only the message content)
response

{'model': 'llama3',
 'created_at': '2024-06-14T18:37:01.505996028Z',
 'message': {'role': 'assistant',
  'content': 'ANSWER:\nPalladium electrodes were used in Experiment 1.'},
 'done_reason': 'stop',
 'done': True,
 'total_duration': 7402812842,
 'load_duration': 1174717,
 'prompt_eval_count': 718,
 'prompt_eval_duration': 4826648000,
 'eval_count': 15,
 'eval_duration': 2410807000}

In [6]:
# # CODE 1


# from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# text = "apple"

# context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
# context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# inputs = context_tokenizer(text, return_tensors='pt')
# embeddings = context_encoder(**inputs).pooler_output.detach().numpy()[0].tolist()
# print(f"Embeddings:{embeddings}")
# print(f"Length embeddings:{len(embeddings)}")


In [7]:
import requests
import pandas as pd

def get_questions_and_answers():
    """Download the question/answer file and return it as a list of tuples.

    The file is pipe-separated with one ``question|answer`` pair per line. The
    first line is skipped and blank lines are ignored.

    Returns:
        A list of ``(question, answer)`` tuples with surrounding whitespace
        removed from both parts.

    Raises:
        Exception: If the server answers with a status code other than 200.
        ValueError: If a non-empty line contains no pipe separator.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # URL of the file
    url = 'https://raw.githubusercontent.com/cgrodrigues/rag-intro/main/coldf_question_and_answer.psv'

    # Fetch the file; fail loudly instead of parsing an error page into an
    # empty or bogus list of pairs.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the file: {response.status_code}")

    qa_pairs = []
    for line in response.text.split('\n')[1:]:
        if not line.strip():  # Ensure the line is not empty
            continue
        # Split on the first pipe only, so a pipe inside the answer does not
        # break the two-value unpacking.
        question, answer = line.split('|', 1)
        qa_pairs.append((question.strip(), answer.strip()))

    # Return the list of tuples
    return qa_pairs

In [8]:
# Load the reference (question, answer) pairs
qa_pais = get_questions_and_answers()

# Load the encoder and tokenizer used to embed every question
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

questions = []
reference_answers = []
generated_answers = []

for question, reference_answer in qa_pais:
    # get_inference returns (response, prompt, db_results); only the response
    # is needed here, but all three values must be unpacked.
    response, _prompt, _db_results = get_inference(question, context_encoder, context_tokenizer)
    generated_answer = response['message']['content']

    questions.append(question)
    reference_answers.append(reference_answer)
    generated_answers.append(generated_answer)

# Create the DataFrame
df = pd.DataFrame({
    'question': questions,
    'reference_answer': reference_answers,
    'generated_answer': generated_answers
})

df


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

{'ids': [['# Experiment 1_0', '# Experiment 6_0', '# Experiment 11_0']], 'distances': [[188.41211816189255, 191.2033599205535, 194.7212749923776]], 'metadatas': [[None, None, None]], 'embeddings': None, 'documents': [['# Experiment 1\n## May 23, 2024\nThe first experiment focused on using palladium electrodes submerged in heavy water (deuterium oxide, D2O). Dr. Emily D. Jensen, Senior Physicist, led this trial. The procedure involved electrolysis at a constant current of 50 mA, aiming to induce cold fusion within the palladium lattice. Throughout the 12-hour process, temperatures were carefully monitored, maintaining a steady 25°C. Voltage readings were recorded every hour to observe any anomalies indicating fusion events. The experiment yielded promising preliminary results with minor heat generation detected, suggesting potential excess energy beyond chemical reactions.\nTo ensure accuracy, additional measurements included the analysis of gas output. Both hydrogen and deuterium gas l

Unnamed: 0,question,reference_answer,generated_answer
0,Who led the first experiment?,Dr. Emily D. Jensen,"The user's question is: ""Who led the first exp..."
1,What material was used for the electrodes in E...,Palladium,ANSWER:\nPalladium electrodes were used in Exp...
2,What was the temperature maintained at during ...,25°C,The temperature maintained during Experiment 1...
3,How long did Experiment 1 last?,12 hours,"The first experiment, which was led by Dr. Emi..."
4,What type of water was used in all the experim...,"Heavy water (deuterium oxide, D2O)",The type of water used in all the experiments ...
...,...,...,...
84,What music was played in the lab during Experi...,NONE,NONE\n\nThe DOCUMENT does not mention any spec...
85,What color was the lab coat of Dr. David R. Mo...,NONE,NONE
86,What did Dr. Natasha R. Ivanov read before Exp...,NONE,The question is: What did Dr. Natasha R. Ivano...
87,What was the nationality of Dr. Benjamin K. Park?,NONE,NONE


In [9]:
# import numpy as np
# from sklearn.metrics import precision_score, recall_score, f1_score
# from nltk.translate.bleu_score import sentence_bleu
# from rouge import Rouge


In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

# These are classification metrics: every distinct answer string is treated as
# a class label, so a generated answer only counts as correct when it is
# identical to the reference answer.
# zero_division=0 yields the same values as the default ("warn" also scores
# undefined cases as 0) without emitting a warning for every label that never
# appears on one of the two sides.
precision = precision_score(df["reference_answer"], df["generated_answer"], average='weighted', zero_division=0)
print(precision)

recall = recall_score(df["reference_answer"], df["generated_answer"], average='weighted', zero_division=0)
print(recall)

f1 = f1_score(df["reference_answer"], df["generated_answer"], average='weighted', zero_division=0)
print(f1)

0.2247191011235955
0.0449438202247191
0.07490636704119849


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
