In [28]:
import requests

def parse_experiments():
    """ Get a list of secret experiment from ColdF  """
    url = "https://raw.githubusercontent.com/cgrodrigues/rag-intro/main/coldf_secret_experiments.txt"

    response = requests.get(url)
    if response.status_code == 200:
        text = response.text

        # Split the text using the experiment identifier as a delimiter
        experiments = text.split('[Experiment')
        
        # Remove empty strings and reformat each experiment
        experiments = ['[Experiment' + exp.strip() for exp in experiments if exp.strip()]
        
        return experiments
    else:
        raise Exception(f"Failed to fetch the file: {response.status_code}")


In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import numpy as np
import chromadb


def get_context_encoder_tokenizer():
    """ Load the DPR context encoder and tokenizer. """
    context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    return context_encoder, context_tokenizer

def init_chroma_db(store_name:str="documents"):
    """ Initialize ChromaDB client. """
    chroma_client = chromadb.PersistentClient(path="./cromadb")
    vector_store = chroma_client.get_or_create_collection(store_name)
    return chroma_client, vector_store

def chunk_embed_text(text, chunk_size, overlap_size, context_encoder, context_tokenizer):
    """Chunk the text into overlapping segments and generate embeddings for the text using a transformer model.."""

    chunks = []
    embeddings = []
    ids = []
    ct = 0
    for chunk in text:
        chunks.append(chunk)
        inputs = context_tokenizer(chunk, return_tensors='pt')
        embedding = context_encoder(**inputs).pooler_output.detach().numpy()[0].tolist()
        embeddings.append(embedding)

        print(chunk, embedding, ct)
        
        ids.append(f"id_{str(ct)}")
        ct += 1


    # chunks = []
    # embeddings = []
    # ids = []
    # start = 0
    # ct = 0
    # while start < len(text):
    #     end = start + chunk_size
    #     chunk = text[start:end]
    #     chunks.append(chunk)
    #     start += chunk_size - overlap_size

    #     inputs = context_tokenizer(chunk, return_tensors='pt')
    #     embedding = context_encoder(**inputs).pooler_output.detach().numpy()[0].tolist()
    #     embeddings.append(embedding)

    #     print(chunk, embedding, ct)
        
    #     ids.append(f"id_{str(ct)}")
    #     ct += 1

    # print(f"------------====>{embeddings}")
    return chunks, embeddings, ids


def preprocess_text_to_chroma(text, vector_store, chunk_size, overlap_size, context_encoder, context_tokenizer): 
    """Process text and store chunks in ChromaDB."""
    
    chunks, embeddings, ids = chunk_embed_text(text, 
                                               chunk_size, 
                                               overlap_size, 
                                               context_encoder=context_encoder, 
                                               context_tokenizer=context_tokenizer)
    vector_store.add(documents=chunks, embeddings=embeddings, ids=ids)
    




In [None]:
# Configuration
chunk_size = 500  # Define your chunk size, each experiment has more or less 
overlap_size = 50  # Define your overlap size

# Example text corpus
text = parse_experiments()

context_encoder, context_tokenizer = get_context_encoder_tokenizer()
chroma_client, vector_store = init_chroma_db("documents")
preprocess_text_to_chroma(text, 
                          vector_store, 
                          chunk_size, 
                          overlap_size, 
                          context_tokenizer=context_tokenizer, 
                          context_encoder=context_encoder)

In [5]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import numpy as np
import chromadb


def init_chroma_db(store_name:str="documents"):
    """ Initialize ChromaDB client. """
    chroma_client = chromadb.PersistentClient(path="./cromadb")
    vector_store = chroma_client.get_or_create_collection(store_name)
    return chroma_client, vector_store

def chunk_embed_text(text):
    """Generate embeddings for the text using a transformer model.."""

    chunks = []
    ids = []
    ct = 0
    for chunk in text:
        chunks.append(chunk)
        ids.append(f"id_{str(ct)}")
        ct += 1

    return chunks, ids


def preprocess_text_to_chroma(text, vector_store): 
    """Process text and store chunks in ChromaDB."""
    
    chunks, ids = chunk_embed_text(text)
    vector_store.add(documents=chunks, ids=ids)
    
# Example text corpus
text = parse_experiments()

chroma_client, vector_store = init_chroma_db("documents")
preprocess_text_to_chroma(text, 
                          vector_store)


Add of existing embedding ID: id_0
Add of existing embedding ID: id_1
Add of existing embedding ID: id_2
Add of existing embedding ID: id_3
Add of existing embedding ID: id_4
Add of existing embedding ID: id_5
Add of existing embedding ID: id_6
Add of existing embedding ID: id_7
Add of existing embedding ID: id_8
Add of existing embedding ID: id_9
Add of existing embedding ID: id_10
Add of existing embedding ID: id_11
Add of existing embedding ID: id_12
Add of existing embedding ID: id_13
Add of existing embedding ID: id_14
Add of existing embedding ID: id_15
Add of existing embedding ID: id_16
Add of existing embedding ID: id_17
Add of existing embedding ID: id_18
Add of existing embedding ID: id_19
Insert of existing embedding ID: id_0
Insert of existing embedding ID: id_1
Insert of existing embedding ID: id_2
Insert of existing embedding ID: id_3
Insert of existing embedding ID: id_4
Insert of existing embedding ID: id_5
Insert of existing embedding ID: id_6
Insert of existing embed

{'ids': [['id_0', 'id_8', 'id_3', 'id_16', 'id_19', 'id_18', 'id_10', 'id_5', 'id_11', 'id_15']], 'distances': [[0.8525894004258584, 0.9353905196358957, 0.9482485195863827, 0.9590379392588978, 0.9841376880261145, 1.0198892389075616, 1.0525335040210608, 1.0530792117099146, 1.054134360354219, 1.0597632357084095]], 'metadatas': [[None, None, None, None, None, None, None, None, None, None]], 'embeddings': None, 'documents': [['[Experiment1]\n[May 23, 2024]\nThe first experiment focused on using palladium electrodes submerged in heavy water (deuterium oxide, D2O). Dr. Emily Jensen, Senior Physicist, led this trial. The procedure involved electrolysis at a constant current of 50 mA, aiming to induce cold fusion within the palladium lattice. Throughout the 12-hour process, temperatures were carefully monitored, maintaining a steady 25°C. Voltage readings were recorded every hour to observe any anomalies indicating fusion events. The experiment yielded promising preliminary results with minor 

In [24]:
question = "What were the key findings in the last successful cold fusion experiment?"
# question = "What is the color of palladium?"
results = vector_store.query(query_texts=question, n_results=5)

documents = "\n".join(results['documents'][0])

prompt = f"""DOCUMENT:
{documents}

QUESTION:
{question}

INSTRUCTIONS:
Answer the users QUESTION using the DOCUMENT text above.
Keep your answer ground in the facts of the DOCUMENT.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return 'NONE'"""

In [25]:
from ollama import Client
host = ""
model = "llama3"

system_message = {"role": "system", "content": prompt}
messages = [system_message]

response = Client(host=host).chat(model=model, messages=messages, options= {"seed": 42, "top_p": 0.9, "temperature": 0 })

In [26]:
response

{'model': 'llama3',
 'created_at': '2024-06-13T16:04:55.040275131Z',
 'message': {'role': 'assistant',
  'content': 'The key finding in the last successful cold fusion experiment (Experiment20) was that the combination of palladium and silver electrodes, with a lithium chloride electrolyte, proved to be highly effective in enhancing cold fusion reactions. The experiment showed significant heat generation, with energy output surpassing input by 25% after 8 hours.'},
 'done_reason': 'stop',
 'done': True,
 'total_duration': 20054003340,
 'load_duration': 1495696,
 'prompt_eval_count': 1125,
 'prompt_eval_duration': 7309602000,
 'eval_count': 66,
 'eval_duration': 12585410000}

In [None]:
# Configuration
chunk_size = 500  # Define your chunk size
overlap_size = 50  # Define your overlap size

# Example text corpus
text = get_coldf_experiment_text()
# text = """
# CHAPTER I
# A SHIFTING REEF
# The year 1866 was signalised by a remarkable incident, a mysterious and puzzling phenomenon, which doubtless no one has yet forgotten. Not to mention rumours which agitated the maritime population and excited the public mind, even in the interior of continents, seafaring men were particularly excited. Merchants, common sailors, captains of vessels, skippers, both of Europe and America, naval officers of all countries, and the Governments of several states on the two continents, were deeply interested in the matter.

# For some time past, vessels had been met by “an enormous thing,” a long object, spindle-shaped, occasionally phosphorescent, and infinitely larger and more rapid in its movements than a whale.

# The facts relating to this apparition (entered in various log-books) agreed in most respects as to the shape of the object or creature in question, the untiring rapidity of its movements, its surprising power of locomotion, and the peculiar life with which it seemed endowed. If it was a cetacean, it surpassed in size all those hitherto classified in science. Taking into consideration the mean of observations made at divers times,—rejecting the timid estimate of those who assigned to this object a length of two hundred feet, equally with the exaggerated opinions which set it down as a mile in width and three in length,—we might fairly conclude that this mysterious being surpassed greatly all dimensions admitted by the ichthyologists of the day, if it existed at all. And that it did exist was an undeniable fact; and, with that tendency which disposes the human mind in favour of the marvellous, we can understand the excitement produced in the entire world by this supernatural apparition. As to classing it in the list of fables, the idea was out of the question.

# On the 20th of July, 1866, the steamer Governor Higginson, of the Calcutta and Burnach Steam Navigation Company, had met this moving mass five miles off the east coast of Australia. Captain Baker thought at first that he was in the presence of an unknown sandbank; he even prepared to determine its exact position, when two columns of water, projected by the inexplicable object, shot with a hissing noise a hundred and fifty feet up into the air. Now, unless the sandbank had been submitted to the intermittent eruption of a geyser, the Governor Higginson had to do neither more nor less than with an aquatic mammal, unknown till then, which threw up from its blow-holes columns of water mixed with air and vapour.

# Similar facts were observed on the 23rd of July in the same year, in the Pacific Ocean, by the Columbus, of the West India and Pacific Steam Navigation Company. But this extraordinary cetaceous creature could transport itself from one place to another with surprising velocity; as, in an interval of three days, the Governor Higginson and the Columbus had observed it at two different points of the chart, separated by a distance of more than seven hundred nautical leagues.

# Fifteen days later, two thousand miles farther off, the Helvetia, of the Compagnie-Nationale, and the Shannon, of the Royal Mail Steamship Company, sailing to windward in that portion of the Atlantic lying between the United States and Europe, respectively signalled the monster to each other in 42° 15′ N. lat. and 60° 35′ W. long. In these simultaneous observations they thought themselves justified in estimating the minimum length of the mammal at more than three hundred and fifty feet, as the Shannon and Helvetia were of smaller dimensions than it, though they measured three hundred feet over all.

# Now the largest whales, those which frequent those parts of the sea round the Aleutian, Kulammak, and Umgullich islands, have never exceeded the length of sixty yards, if they attain that.

# These reports arriving one after the other, with fresh observations made on board the transatlantic ship Pereire, a collision which occurred between the Etna of the Inman line and the monster, a procès verbal directed by the officers of the French frigate Normandie, a very accurate survey made by the staff of Commodore Fitz-James on board the Lord Clyde, greatly influenced public opinion. Light-thinking people jested upon the phenomenon, but grave practical countries, such as England, America, and Germany, treated the matter more seriously.

# In every place of great resort the monster was the fashion. They sang of it in the cafés, ridiculed it in the papers, and represented it on the stage. All kinds of stories were circulated regarding it. There appeared in the papers caricatures of every gigantic and imaginary creature, from the white whale, the terrible “Moby Dick” of hyperborean regions, to the immense kraken whose tentacles could entangle a ship of five hundred tons, and hurry it into the abyss of the ocean. The legends of ancient times were even resuscitated, and the opinions of Aristotle and Pliny revived, who admitted the existence of these monsters, as well as the Norwegian tales of Bishop Pontoppidan, the accounts of Paul Heggede, and, last of all, the reports of Mr. Harrington (whose good faith no one could suspect), who affirmed that, being on board the Castillan, in 1857, he had seen this enormous serpent, which had never until that time frequented any other seas but those of the ancient “Constitutionnel.”

# Then burst forth the interminable controversy between the credulous and the incredulous in the societies of savants and the scientific journals. “The question of the monster” inflamed all minds. Editors of scientific journals, quarrelling with believers in the supernatural, spilled seas of ink during this memorable campaign, some even drawing blood; for, from the sea-serpent they came to direct personalities.

# For six months war was waged with various fortune in the leading articles of the Geographical Institution of Brazil, the Royal Academy of Science of Berlin, the British Association, the Smithsonian Institution of Washington, in the discussions of the “Indian Archipelago,” of the Cosmos of the Abbé Moigno, in the Mittheilungen of Petermann, in the scientific chronicles of the great journals of France and other countries. The cheaper journals replied keenly and with inexhaustible zest. These satirical writers parodied a remark of Linnæus, quoted by the adversaries of the monster, maintaining “that nature did not make fools,” and adjured their contemporaries not to give the lie to nature, by admitting the existence of krakens, sea-serpents, “Moby Dicks,” and other lucubrations of delirious sailors. At length an article in a well-known satirical journal by a favourite contributor, the chief of the staff, settled the monster, like Hippolytus, giving it the death-blow amidst an universal burst of laughter. Wit had conquered science.

# During the first months of the year 1867 the question seemed buried, never to revive, when new facts were brought before the public. It was then no longer a scientific problem to be solved, but a real danger seriously to be avoided. The question took quite another shape. The monster became a small island, a rock, a reef, but a reef of indefinite and shifting proportions.

# On the 5th of March, 1867, the Moravian, of the Montreal Ocean Company, finding herself during the night in 27° 30′ lat. and 72° 15′ long., struck on her starboard quarter a rock, marked in no chart for that part of the sea. Under the combined efforts of the wind and its four hundred horse-power, it was going at the rate of thirteen knots. Had it not been for the superior strength of the hull of the Moravian, she would have been broken by the shock and gone down with the 237 passengers she was bringing home from Canada.

# The accident happened about five o’clock in the morning, as the day was breaking. The officers of the quarter-deck hurried to the after-part of the vessel. They examined the sea with the most scrupulous attention. They saw nothing but a strong eddy about three cables’ length distant, as if the surface had been violently agitated. The bearings of the place were taken exactly, and the Moravian continued its route without apparent damage. Had it struck on a submerged rock, or on an enormous wreck? they could not tell; but on examination of the ship’s bottom when undergoing repairs, it was found that part of her keel was broken.

# This fact, so grave in itself, might perhaps have been forgotten like many others if, three weeks after, it had not been re-enacted under similar circumstances. But, thanks to the nationality of the victim of the shock, thanks to the reputation of the company to which the vessel belonged, the circumstance became extensively circulated.

# The 13th of April, 1867, the sea being beautiful, the breeze favourable, the Scotia, of the Cunard Company’s line, found herself in 15° 12′ long. and 45° 37′ lat. She was going at the speed of thirteen knots and a half.

# At seventeen minutes past four in the afternoon, whilst the passengers were assembled at lunch in the great saloon, a slight shock was felt on the hull of the Scotia, on her quarter, a little aft of the port-paddle.

# The Scotia had not struck, but she had been struck, and seemingly by something rather sharp and penetrating than blunt. The shock had been so slight that no one had been alarmed, had it not been for the shouts of the carpenter’s watch, who rushed on to the bridge, exclaiming, “We are sinking! we are sinking!” At first the passengers were much frightened, but Captain Anderson hastened to reassure them. The danger could not be imminent. The Scotia, divided into seven compartments by strong partitions, could brave with impunity any leak. Captain Anderson went down immediately into the hold. He found that the sea was pouring into the fifth compartment; and the rapidity of the influx proved that the force of the water was considerable. Fortunately this compartment did not hold the boilers, or the fires would have been immediately extinguished. Captain Anderson ordered the engines to be stopped at once, and one of the men went down to ascertain the extent of the injury. Some minutes afterwards they discovered the existence of a large hole, of two yards in diameter, in the ship’s bottom. Such a leak could not be stopped; and the Scotia, her paddles half submerged, was obliged to continue her course. She was then three hundred miles from Cape Clear, and after three days’ delay, which caused great uneasiness in Liverpool, she entered the basin of the company.

# The engineers visited the Scotia, which was put in dry dock. They could scarcely believe it possible; at two yards and a half below water-mark was a regular rent, in the form of an isosceles triangle. The broken place in the iron plates was so perfectly defined that it could not have been more neatly done by a punch. It was clear, then, that the instrument producing the perforation was not of a common stamp; and after having been driven with prodigious strength, and piercing an iron plate 1-3/8 inches thick, had withdrawn itself by a retrograde motion truly inexplicable.

# Such was the last fact, which resulted in exciting once more the torrent of public opinion. From this moment all unlucky casualties which could not be otherwise accounted for were put down to the monster. Upon this imaginary creature rested the responsibility of all these shipwrecks, which unfortunately were considerable; for of three thousand ships whose loss was annually recorded at Lloyd’s, the number of sailing and steam ships supposed to be totally lost, from the absence of all news, amounted to not less than two hundred!

# Now, it was the “monster” who, justly or unjustly, was accused of their disappearance, and, thanks to it, communication between the different continents became more and more dangerous. The public demanded peremptorily that the seas should at any price be relieved from this formidable cetacean. 
# """


context_encoder, context_tokenizer = get_context_encoder_tokenizer()
chroma_client, vector_store = init_chroma_db("documents")
preprocess_text_to_chroma(text, vector_store, chunk_size, overlap_size, context_tokenizer=context_tokenizer, context_encoder=context_encoder)



In [None]:
query_text = "Who is Alexandre Dumas?"
# _, query_embeddings, _   = chunk_embed_text(query_text, chunk_size, overlap_size,context_encoder, context_tokenizer)

# results = vector_store.query(query_embeddings, n_results=3)
results = vector_store.query(query_texts=query_text, n_results=3)

print(results)

In [None]:
# CODE 1


from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

text = "apple"

context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

inputs = context_tokenizer(text, return_tensors='pt')
embeddings = context_encoder(**inputs).pooler_output.detach().numpy()[0].tolist()
print(f"Embeddings:{embeddings}")
print(f"Lenght embeddings:{len(embeddings)}")


In [None]:
inputs

In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
embeddings = model(input_ids).pooler_output

print(embeddings.shape)