In [1]:
# Get the SciQ dataset from HuggingFace
from datasets import load_dataset

dataset = load_dataset("sciq", split="train")

# Filter the dataset to only include questions with a support
dataset = dataset.filter(lambda x: x["support"] != "")

print("Number of questions with support: ", len(dataset))

Number of questions with support:  10481


In [2]:
dataset[0]

{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
 'distractor3': 'viruses',
 'distractor1': 'protozoa',
 'distractor2': 'gymnosperms',
 'correct_answer': 'mesophilic organisms',
 'support': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}

In [14]:
from chromadb import HttpClient, EphemeralClient, Client
from chromadb.config import Settings
settings = Settings(chroma_api_impl="chromadb.api.fastapi.FastAPI")
client = Client(settings=settings)

ModuleNotFoundError: No module named 'chromadb.telemetry.posthog'

In [12]:
import chromadb
client = chromadb.EphemeralClient()

RuntimeError: Chroma is running in http-only client mode, and can only be run with 'chromadb.api.fastapi.FastAPI' as the chroma_api_impl.             see https://docs.trychroma.com/usage-guide?lang=py#using-the-python-http-only-client for more information.

In [4]:
# Create a new Chroma collection to store the supporting evidence. We don't need to specify an embedding fuction, and the default will be used.
collection = client.create_collection("sciq_supports")

In [5]:
# Embed and store the first 100 supports for this demo
collection.add(
    ids=[str(i) for i in range(0, 100)],  # IDs are just strings
    documents=dataset["support"][:100],
    metadatas=[{"type": "support"} for _ in range(0, 100)
    ],
)

In [6]:
results = collection.query(
    query_texts=dataset["question"][:10],
    n_results=1)

In [7]:
# Print the question and the corresponding support
for i, q in enumerate(dataset['question'][:10]):
    print(f"Question: {q}")
    print(f"Retrieved support: {results['documents'][i][0]}")
    print()

Question: What type of organism is commonly used in preparation of foods such as cheese and yogurt?
Retrieved support: Agents of Decomposition The fungus-like protist saprobes are specialized to absorb nutrients from nonliving organic matter, such as dead organisms or their wastes. For instance, many types of oomycetes grow on dead animals or algae. Saprobic protists have the essential function of returning inorganic nutrients to the soil and water. This process allows for new plant growth, which in turn generates sustenance for other organisms along the food chain. Indeed, without saprobe species, such as protists, fungi, and bacteria, life would cease to exist as all organic carbon became “tied up” in dead organisms.

Question: What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
Retrieved support: Without Coriolis Effect the global winds would blow north to south

In [8]:
from chromadb.utils import embedding_functions

In [16]:
default_ef = embedding_functions.HuggingFaceEmbeddingFunction( api_key="https://api-inference.huggingface.co/models/bert-base-uncased", model_name="bert-base-uncased")

In [17]:
val = default_ef(["zuid"])
val

{'error': 'Authorization header is correct, but the token seems invalid'}

In [3]:
from vector_database_manager import ChromaClient, Mode

In [6]:
client = ChromaClient(host="172.16.13.74", port_number=8000, mode=Mode.host)

In [10]:
collection = client.get_collection("SVD_for_documents_retrieval")

KeyboardInterrupt: 

In [11]:
documents = ["Chess is called the game of kings. It has been around for a long time. People have been playing it for "
             "over 500 years. Chess is based on an even older game from India. The chess we play today is from Europe.",
             "Chess is a two-player game. One player uses the white pieces. The other uses the"
             "black pieces. Each piece moves in a special way. One piece is called the king. Each player has one. The "
             "players take turns moving their pieces. If a player lands on a piece, he or she takes it. The game ends "
             "when a player loses his or her king. There are a few more rules, but those are the basics.",
             "Some people think that chess is more than a game. They think that it makes the mind stronger. Good "
             "chess players use their brains. They take their time. They think about what will happen next. These "
             "skills are useful in life and in chess. Chess is kind of like a work out for the mind",
             "You don't always have lots of time to think when playing chess. There is a type of chess with short "
             "time limits. It's called blitz chess. In blitz chess, each player gets ten minutes to use for the whole "
             "game. Your clock runs during your turn. You hit the time clock after your move. This stops your clock. "
             "It also starts the other player's clock. If you run out of time, you lose. Games of blitz chess are "
             "fast-paced.",
             "Chess is not just for people. Computers have been playing chess since the 1970s. At first they did not "
             "play well. They made mistakes. As time went on they grew stronger. In 1997, a computer beat the best "
             "player in the world for the first time. It was a computer called Deep Blue. Deep Blue was big. It took "
             "up a whole room. By 2006 a cell phone could beat the best players in the world. Chess sure has come a "
             "long way. Don't you think so ?"]
import torch
from sentence_transformers import SentenceTransformer, util
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

passages = documents

passage_embeddings = passage_encoder.encode(passages)

query_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-multiset-base')
query = "Computers have been playing chess since the 1970s. At first they did not"
print(query)
query_embedding = query_encoder.encode(query)

#Important: You must use dot-product, not cosine_similarity
scores = util.dot_score(query_embedding, passage_embeddings)
print("Scores:", scores)

Computers have been playing chess since the 1970s. At first they did not
Scores: tensor([[89.4950, 75.8594, 86.4711, 83.7911, 98.1524]])


In [12]:
# return the passages ranked by the highest score
ranked_scores = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)


In [13]:
# Print the top 5 results
for i, score in ranked_scores[:5]:
    print(f"Passage: {passages[i]}")
    print(f"Score: {score}")
    print()

Passage: Chess is called the game of kings. It has been around for a long time. People have been playing it for over 500 years. Chess is based on an even older game from India. The chess we play today is from Europe.
Score: tensor([89.4950, 75.8594, 86.4711, 83.7911, 98.1524])
