In [1]:
import requests

def parse_experiments(
    url: str = "https://raw.githubusercontent.com/cgrodrigues/rag-intro/main/coldf_secret_experiments.txt",
    timeout: float = 30.0,
):
    """Download the ColdF secret-experiments file and split it per experiment.

    Args:
        url: Location of the plain-text experiments file. Defaults to the
            copy hosted in the rag-intro GitHub repository.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A list of strings, one per experiment. Each string starts with
        ``"# Experiment "`` followed by the stripped text that came after
        that marker in the file.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch the file: {response.status_code}")

    # The marker is consumed by split(), so it is re-attached to every
    # non-empty segment to keep each experiment self-describing.
    segments = response.text.split('# Experiment')
    return ['# Experiment ' + seg.strip() for seg in segments if seg.strip()]


In [2]:
import chromadb
import re
import uuid


def init_chroma_db(store_name:str="documents"):
    """Open the persistent ChromaDB store and fetch (or create) a collection.

    Args:
        store_name: Name of the collection to get or create.

    Returns:
        A ``(client, collection)`` tuple: the persistent client rooted at
        ``./cromadb`` and the collection named ``store_name``.
    """
    client = chromadb.PersistentClient(path="./cromadb")
    return client, client.get_or_create_collection(store_name)

def chunk_embed_text(input, chunk_size:int=0, overlap_size:int=0):
    """Split each text into overlapping character chunks and build chunk ids.

    Args:
        input: Iterable of strings to chunk. (The name shadows the builtin
            inside this function; it is kept for interface compatibility.)
        chunk_size: Maximum number of characters per chunk. ``0`` means
            "use the full length of each text", resolved separately for
            every text.
        overlap_size: Number of characters shared between consecutive
            chunks. Must be smaller than the effective chunk size.

    Returns:
        A ``(chunks, ids)`` tuple of two parallel lists. Chunks of one text
        start every ``chunk_size - overlap_size`` characters, so the last
        chunk may be shorter. Each id is ``"<base>_<n>"`` where ``<base>``
        is the leading ``# Experiment <digits>]...`` line of the text when
        the text starts with one, otherwise a random UUID, and ``<n>`` is
        the chunk's position within its text.

    Raises:
        ValueError: If a size is negative, or if the overlap is not smaller
            than the effective chunk size (the window would never advance).
    """
    if chunk_size < 0 or overlap_size < 0:
        raise ValueError("chunk_size and overlap_size must be non-negative")

    chunks = []
    ids = []
    pattern = r"^\# Experiment \d+\].*"
    for text in input:
        if not text:
            # Nothing to slice; an empty text contributes no chunks.
            continue

        # Resolve the size per text so a 0 default does not leak the first
        # text's length into the following texts.
        size = chunk_size if chunk_size > 0 else len(text)
        step = size - overlap_size
        if step <= 0:
            raise ValueError(
                f"overlap_size ({overlap_size}) must be smaller than the "
                f"chunk size ({size})"
            )

        match = re.findall(pattern, text)
        # Fall back to a random id when the text has no recognizable header.
        base_id = match[0] if match else uuid.uuid4()

        for ct, start in enumerate(range(0, len(text), step)):
            chunks.append(text[start:start + size])
            ids.append(f"{base_id}_{ct}")

    return chunks, ids


def preprocess_text_to_chroma(text, vector_store, chunk_size:int=0, overlap_size:int=0):
    """Chunk the given texts and add the chunks to a ChromaDB collection.

    Args:
        text: Texts to chunk; forwarded unchanged to ``chunk_embed_text``.
        vector_store: Collection object whose ``add`` method receives the
            chunks and their ids.
        chunk_size: Chunk length forwarded to ``chunk_embed_text``.
        overlap_size: Chunk overlap forwarded to ``chunk_embed_text``.
    """
    documents, document_ids = chunk_embed_text(text, chunk_size, overlap_size)
    vector_store.add(documents=documents, ids=document_ids)
    
# Download the secret experiments as a list of strings, one per experiment.
text = parse_experiments()

# Open the persistent vector database and get the "documents" collection.
# `vector_store` is a notebook-level name that get_inference_prompt reads
# when it queries the collection.
chroma_client, vector_store = init_chroma_db("documents")

# Split every experiment into 300-character chunks that overlap by 50
# characters and add them to the collection.
# NOTE(review): the client is persistent and chunk ids fall back to random
# UUIDs, so re-running this cell presumably stores the same chunks again
# under new ids - confirm, and clear the collection first if that matters.
preprocess_text_to_chroma(text, vector_store, chunk_size=300, overlap_size=50)


In [3]:
def get_inference_prompt(question:str, n_results:int=5, collection=None)->str:
    """Build the RAG prompt for a question from the closest stored chunks.

    Args:
        question: The end user's question; used both as the query text and
            inside the prompt.
        n_results: How many chunks to retrieve from the collection.
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            and returning a mapping whose ``'documents'`` entry holds one
            list of chunk strings per query. When ``None``, the
            notebook-level ``vector_store`` is used.

    Returns:
        The prompt string: the retrieved chunks joined by newlines under
        ``DOCUMENT:``, then the question, then the answering instructions.
    """
    # Resolve the global lazily so the function also works with an
    # explicitly supplied collection before the notebook-level one exists.
    store = vector_store if collection is None else collection
    results = store.query(query_texts=question, n_results=n_results)
    # A single query was sent, so only the first result list is relevant.
    documents = "\n".join(results['documents'][0])

    prompt = f"""DOCUMENT:
{documents}

QUESTION:
{question}

INSTRUCTIONS:
Answer the users QUESTION using the DOCUMENT text above.
Keep your answer ground in the facts of the DOCUMENT.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return 'NONE'"""

    return prompt

In [8]:
# Run inference on the LLaMA 3 model served by Ollama.

from ollama import Client
host = ""
model = "llama3"

# Ask the end user for a question; the matching chunks are looked up in the
# vector database when the prompt is built below.
question = input("Please enter question: ")
# Sample questions to try instead of typing one:
#question = "What were the key findings in the last successful cold fusion experiment?"
# question = "What is the color of palladium?"

prompt = get_inference_prompt(question)

# The whole prompt (documents, question and instructions) is sent as a
# single system message.
system_message = {"role": "system", "content": prompt}
messages = [system_message]

ollama_client = Client(host=host)
# Fixed seed and zero temperature are passed as generation options.
chat_options = {"seed": 42, "top_p": 0.9, "temperature": 0}
response = ollama_client.chat(model=model, messages=messages, options=chat_options)
print(f"Prompt: {prompt}")
print(f"Response: {response['message']['content']}")

Prompt: DOCUMENT:
d at 90 mA, with temperatures controlled between 31°C and 36°C. Voltage and temperature readings were taken every 10 minutes. The experiment yielded significant heat generation, with energy output surpassing input by 23% after 7 hours. The combination of materials, magnetic field, and electrolyte co - [Experiment 17]
at 95 mA, with temperatures maintained at 32°C. Voltage and temperature readings were taken every 10 minutes. The experiment showed a significant increase in heat output, with energy exceeding input by 19% after 8 hours. The combination of palladium and gold proved to be effective in enhancing cold  - [Experiment 11]
es. Temperatures were maintained at 30°C, with readings taken every 15 minutes. The experiment showed a noticeable increase in heat output, with energy surpassing input by 22% after 6 hours. The high-pressure conditions appeared to enhance deuterium absorption, improving the cold fusion reaction.
Fu - [Experiment 16]
ent set at 90 mA. The tem

In [5]:
response

{'model': 'llama3',
 'created_at': '2024-06-14T11:06:59.71532956Z',
 'message': {'role': 'assistant',
  'content': "The DOCUMENT does not provide information about Experiment 1. Therefore, I must return 'NONE' as there is no material used for electrodes in Experiment 1 that can be found in the provided text."},
 'done_reason': 'stop',
 'done': True,
 'total_duration': 8779838658,
 'load_duration': 979959,
 'prompt_eval_count': 442,
 'prompt_eval_duration': 2791843000,
 'eval_count': 41,
 'eval_duration': 5837544000}

In [6]:
# CODE 1
# Embed a single word with the DPR context encoder and print the vector.

from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

text = "apple"

# Encoder and tokenizer are loaded from the same checkpoint name.
dpr_checkpoint = "facebook/dpr-ctx_encoder-single-nq-base"
context_encoder = DPRContextEncoder.from_pretrained(dpr_checkpoint)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(dpr_checkpoint)

inputs = context_tokenizer(text, return_tensors='pt')
encoder_output = context_encoder(**inputs)
# Take the pooled output of the first (only) input as a plain Python list.
embeddings = encoder_output.pooler_output.detach().numpy()[0].tolist()
print(f"Embeddings:{embeddings}")
print(f"Lenght embeddings:{len(embeddings)}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this fu

Embeddings:[-0.44394582509994507, 0.40732043981552124, -0.8182123303413391, 0.2211644947528839, -0.030159039422869682, 0.7341172695159912, -0.10371781885623932, 1.328649878501892, -0.6216631531715393, -0.4707707166671753, -0.06333217769861221, 0.620055079460144, 0.16868259012699127, 0.25832870602607727, -0.3458516001701355, -0.10463718324899673, 0.14825665950775146, 1.0864906311035156, 0.005231252405792475, -0.11455629765987396, -0.16026091575622559, 0.2526855766773224, -0.028649091720581055, -0.4456080198287964, 0.7062768936157227, -0.3689732849597931, -0.01471005193889141, -0.34969228506088257, 0.13449975848197937, 0.3538621664047241, -0.5931247472763062, 0.15402652323246002, 0.24266868829727173, -0.08608291298151016, -0.03772580251097679, -0.639232337474823, -0.11476082354784012, -0.7529443502426147, -0.5277695059776306, -0.30947256088256836, 0.17393971979618073, -0.2328023761510849, 0.383547306060791, 0.006160078104585409, -0.05022634565830231, 0.015225867740809917, -2.374244213104