In [1]:
!pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/cvmfs/soft.ccr.buffalo.edu/versions/2023.01/easybuild/software/avx512/Compiler/gcccore/11.2.0/python/3.9.6/bin/python3.9 -m pip install --upgrade pip' command.


In [2]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/cvmfs/soft.ccr.buffalo.edu/versions/2023.01/easybuild/software/avx512/Compiler/gcccore/11.2.0/python/3.9.6/bin/python3.9 -m pip install --upgrade pip' command.


In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

In [2]:
# Load the fact dataset; the next cell reads its "input" and "output" columns.
df = pd.read_csv('fact_dataset.csv')

In [3]:
# Pair every "input" value with the "output" value from the same row,
# exposing them under the "question" / "context" keys used downstream.
facts_formatted_data = [
    {"question": question, "context": context}
    for question, context in zip(df["input"], df["output"])
]

In [4]:
# Preview the first five formatted records.
facts_formatted_data[:5]

[{'question': 'What is Artificial Intelligence?',
  'context': 'Artificial Intelligence refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation.\\n'},
 {'question': 'What are the two main categories of Artificial Intelligence?',
  'context': 'The two main categories of Artificial Intelligence are Narrow AI and General AI. Narrow AI focuses on a specific task, while General AI aims to have human-like intelligence and the ability to learn and perform various tasks.\\n'},
 {'question': 'What is Machine Learning?',
  'context': 'Machine Learning is a subset of Artificial Intelligence that focuses on the development of algorithms that can learn from and make predictions or decisions on data.\\n'},
 {'question': 'What is Deep Learning?',
  'context': 'Deep Learning is a subset of Machine Learning that uses deep neural networks with many layers

In [5]:
# Load the tokenizer and model for the checkpoint named below.
# `tokenizer` and `model` are read as globals by encode_text.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [8]:
def save_embeddings(facts_db, output_path='facts_embeddings_db.pt'):
    """Embed each fact's question and persist the records with ``torch.save``.

    Args:
        facts_db: Iterable of dicts that each provide 'question' and
            'context' keys.
        output_path: Destination file. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Each saved record keeps the original 'question' and 'context' and adds
    an 'embedding' entry holding ``encode_text(question)``.
    """
    # A single no_grad scope covers the whole pass instead of one per fact.
    with torch.no_grad():
        embeddings_db = [
            {
                'question': fact['question'],
                'context': fact['context'],
                'embedding': encode_text(fact['question']),
            }
            for fact in facts_db
        ]
    torch.save(embeddings_db, output_path)
# The formatted records are bound to `facts_formatted_data`; no visible cell
# defines `formatted_data`, so the previous argument raised NameError on a
# fresh kernel.
# NOTE: save_embeddings calls encode_text, which is defined in the following
# cell; that cell must have been run before this call.
save_embeddings(facts_formatted_data)

In [9]:
def encode_text(text):
    """Return the mean-pooled embedding of ``text``.

    The text is tokenized (padded, truncated to at most 512 tokens) with the
    global ``tokenizer``, run through the global ``model`` with gradients
    disabled, and the last hidden state is averaged over dimension 1.
    Only the first row of the pooled output is returned.
    """
    with torch.no_grad():
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        token_states = model(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1)
    return pooled[0]

def semantic_similarity(text_embedding1, test_embedding2):
    """Return the cosine similarity of two embedding tensors as a Python float.

    Args:
        text_embedding1: First embedding tensor.
        test_embedding2: Second embedding tensor, compared along dim 0.

    Returns:
        The similarity as a float (the result must be a single-element
        tensor for ``.item()`` to succeed).

    Cosine similarity is scale-invariant, so the inputs are not normalised
    by hand first. Dividing by the norm manually produced NaN for a
    zero-norm vector; ``cosine_similarity`` guards the denominator with its
    ``eps`` and returns 0.0 in that case.
    """
    cosine_sim = torch.nn.functional.cosine_similarity(text_embedding1, test_embedding2, dim=0)
    return cosine_sim.item()

# The formatted records are bound to `facts_formatted_data`; no visible cell
# defines `formatted_data`, so the previous right-hand side raised NameError.
facts_db = facts_formatted_data

def match_query(user_query, embeddings_db, min_similarity=0.5,
                fallback_response="I'm sorry, I don't have information on that topic."):
    """Return the stored context whose question best matches ``user_query``.

    Args:
        user_query: Text to look up; it is embedded with ``encode_text``.
        embeddings_db: Iterable of records providing 'embedding' and
            'context' keys.
        min_similarity: Score a record must strictly exceed to be accepted.
            Defaults to the previously hard-coded 0.5.
        fallback_response: Value returned when no record exceeds
            ``min_similarity``. Defaults to the previously hard-coded message.

    Returns:
        The 'context' of the highest-scoring record, or ``fallback_response``.
        On ties the earliest record wins because the comparison is strict.
    """
    user_query_embedding = encode_text(user_query)
    highest_similarity_score = min_similarity
    best_response = fallback_response
    for fact in embeddings_db:
        similarity = semantic_similarity(user_query_embedding, fact['embedding'])
        if similarity > highest_similarity_score:
            highest_similarity_score = similarity
            best_response = fact['context']
    return best_response

In [10]:
# Load the embedding records from the file save_embeddings writes to.
embeddings_db = torch.load('facts_embeddings_db.pt')  # using precomputed embeddings to test

# Look up each query with match_query and print the matched response.
user_query = ["name 4 sports that are typically played in summer"]
for query in user_query:
    response = match_query(query, embeddings_db)
    print(f"Query: {query}\nResponse: {response}\n")


Query: name 4 sports that are typically played in summer
Response: Swimming, volleyball, softball, and tennis are all sports typically played in the summer.



In [12]:
# Look up each query with match_query and print the matched response.
user_query = ["tell me about artificial intelligence"]
for query in user_query:
    response = match_query(query, embeddings_db)
    print(f"Query: {query}\nResponse: {response}\n")

Query: tell me about artificial intelligence
Response: Artificial Intelligence (AI) is an area of computer science that focuses on creating intelligent machines that are capable of performing tasks that would typically require human intelligence such as visual perception, speech recognition, decision-making, and language translation. It involves using algorithms, machine learning, and deep learning to analyze large data sets to accomplish tasks quickly and accurately.



In [14]:
# Look up each query with match_query and print the matched response.
user_query = ["what is summer"]
for query in user_query:
    response = match_query(query, embeddings_db)
    print(f"Query: {query}\nResponse: {response}\n")

Query: what is summer
Response: To me, summer represents a time of fun and relaxation, full of sunny days and warm nights.



In [15]:
# Look up each query with match_query and print the matched response.
# NOTE(review): the recorded output for this query is a list of books, not
# movies, so its similarity score exceeded the 0.5 floor used by match_query.
user_query = ["do you know about any movies?"]
for query in user_query:
    response = match_query(query, embeddings_db)
    print(f"Query: {query}\nResponse: {response}\n")

Query: do you know about any movies?
Response: 1. The Lord of the Rings by J.R.R. Tolkien\n2. 1984 by George Orwell\n3. Harry Potter by J. K. Rowling\n4. Gone With the Wind by Margaret Mitchell\n5. To Kill a Mockingbird by Harper Lee

