<a href="https://colab.research.google.com/github/alex-smith-uwec/AI_Spring2025/blob/main/RaG_Simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup: install the `datasets` and `sentence-transformers` packages
# (-q keeps pip's output quiet).
# NOTE(review): `transformers` is imported below but its install line is
# commented out — presumably it is already available in the runtime; confirm.
# !pip install transformers
!pip install datasets -q
!pip install sentence-transformers -q


In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import pandas as pd
import textwrap

# Dataset: eli5_category, filtered to "Physics"

In [None]:
# Fetch the 'train' split of the eli5_category dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally — confirm this is intended.
dataset = load_dataset(
    "eli5_category",
    split='train',
    trust_remote_code=True,
)

# Predicate used with Dataset.filter to keep only 'Physics' rows
def filter_physics(example):
    """Return True when the example's 'category' field equals 'Physics'.

    The comparison is exact (case-sensitive): 'physics' does not match.
    """
    category = example['category']
    return category == 'Physics'

# Keep only the rows that filter_physics accepts
physics_dataset = dataset.filter(filter_physics)

# Cap the working set at the first 5000 rows; when there are 5000 or fewer
# physics rows, every one of them is kept unchanged.
MAX_PHYSICS_ROWS = 5000
physics_row_count = len(physics_dataset)
if physics_row_count > MAX_PHYSICS_ROWS:
    physics_dataset = physics_dataset.select(range(MAX_PHYSICS_ROWS))



In [None]:
physics_dataset

Dataset({
    features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
    num_rows: 5000
})

In [None]:
# Flatten the nested 'answers' feature into top-level 'answers.*' columns
# (e.g. 'answers.text', 'answers.score') so they can be renamed or dropped individually.
phys = physics_dataset.flatten()

In [None]:
phys

Dataset({
    features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
    num_rows: 5000
})

In [None]:
# Give the two columns used downstream shorter names:
# 'answers.text' -> 'answers' and 'title' -> 'question'.
phys = (
    phys
    .rename_column('answers.text', 'answers')
    .rename_column('title', 'question')
)

In [None]:
phys

Dataset({
    features: ['q_id', 'question', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
    num_rows: 5000
})

In [None]:
# Only the question text and its list of answers are needed from here on
columns_to_keep = ['question', 'answers']

# Every other column currently in the dataset is dropped
columns_to_remove = [
    name
    for name in phys.column_names
    if name not in columns_to_keep
]

phys = phys.remove_columns(columns_to_remove)

In [None]:
phys

Dataset({
    features: ['question', 'answers'],
    num_rows: 5000
})

In [None]:
def extract_first_answer(row):
    """Pick the first entry of a row's 'answers' list as its 'best_answer'.

    Returns ``{'best_answer': None}`` when the list is empty.
    NOTE(review): treats the first answer as the best one — confirm the
    dataset orders answers that way.
    """
    answers = row['answers']
    # Guard clause: nothing to pick from
    if not answers:
        return {'best_answer': None}
    return {'best_answer': answers[0]}

# Run extract_first_answer over every row; the dict it returns becomes a new
# 'best_answer' column on the dataset.
phys = phys.map(extract_first_answer)


In [None]:
phys

Dataset({
    features: ['question', 'answers', 'best_answer'],
    num_rows: 5000
})

# Embed the questions (post titles)

In [None]:
# Sentence-embedding model; the same instance encodes both the dataset
# questions and the user's query so their vectors are comparable.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Function to compute embeddings
def compute_embeddings(row):
    """Embed one dataset row's question with the module-level ``model``.

    Args:
        row: Mapping with a 'question' entry holding the text to embed.

    Returns:
        dict: ``{'question_embed': [...]}`` with the embedding as a plain
        Python list, so it can be stored as a dataset column.
    """
    # Ask for a NumPy array instead of a torch tensor: calling .numpy() on a
    # tensor that lives on a GPU raises, so the tensor route breaks as soon as
    # the notebook runs on a GPU runtime.
    embedding = model.encode(row['question'], convert_to_numpy=True)
    return {'question_embed': embedding.tolist()}

In [None]:
# Embed every question (compute_embeddings is called once per row) and store
# the result in a new 'question_embed' column.
phys = phys.map(compute_embeddings)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# Stack the per-row embedding lists into a single 2D NumPy array
# (one row per question) for the cosine-similarity search below.
question_embeddings = np.array(phys['question_embed'])


# Bring in an LLM

In [None]:
# Text-generation pipeline that writes the final answer.
# Alternative model, currently disabled:
# generator = pipeline('text-generation', model='gpt2')
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

In [None]:
# query = "What is light?"
query = "What is temperature?"

# Encode the query with the same model as the dataset questions and reshape
# it to a single row, since cosine_similarity works on 2D inputs.
query_embedding = model.encode(query, convert_to_tensor=False).reshape(1, -1)

# Similarity of the query against every stored question; the result is 2D
# with one row (for the single query), so take that row as the score vector.
similarities = cosine_similarity(query_embedding, question_embeddings)
similarity_scores = similarities[0]

# Sort ascending, reverse to get best-first, and keep the top k indices
k = 1
ranked_indices = np.argsort(similarity_scores)[::-1]
most_similar_indices = ranked_indices[:k]

# Look up the question text for each retrieved index
most_similar_questions = [
    phys['question'][match_index]
    for match_index in most_similar_indices
]

In [None]:
most_similar_questions

['What do degrees (temperature) actually measure?']

In [None]:
most_similar_indices

array([3472])

In [None]:
# Uses `most_similar_indices`, `query` and the `phys` dataset from earlier cells.

# Stored first answer for each retrieved question
best_answers = [phys['best_answer'][match_index] for match_index in most_similar_indices]

# Prompt = user query + closest stored question + its answer + an instruction
top_question = phys['question'][most_similar_indices[0]]
prompt = (
    f"Original query: '{query}'\n"
    f"Relevant question: '{top_question}'\n"
    f"Best answer provided: {best_answers[0]}\n\n"
    "Given this context, can you provide a detailed explanation or additional insights?"
)




In [None]:
prompt

"Original query: 'What is temperature?'\nRelevant question: 'What do degrees (temperature) actually measure?'\nBest answer provided: At a basic level you can think of temperature as the average molecular energy of a system. Temperature can be measured in different ways, but they all come down to measuring changes in heat energy (molecular motion and jiggling). In everyday thermometers, what is measured is how a liquid (originally mercury) expands with temperature, travelling up the bulb. The expansion is due to the atoms in the liquid moving faster and pushing on each other harder. In devices called thermocouples, the temperature affects how well a piece of metal conducts electricity, which can be measured precisely. There are also infrared thermometers, which measure the heat energy in the form of light that objects give off. It's important to note that temperature isn't the same thing as energy, but the explanation requires delving into the wonders of thermodynamics, which is a bit t

In [None]:
# Generate the answer with the LLM pipeline initialized as `generator`.
#
# The generation input is the retrieved answer text only. It is kept under its
# own name so the `prompt` built in the previous cell is not overwritten and
# re-running this cell always generates from the same input. The model is also
# run only once: a previous extra call on `prompt` had its result immediately
# discarded.
# NOTE(review): the user's query is not part of this text — consider
# generating from `prompt` instead if the query should steer the answer.

# Skip rows without an answer (best_answer is None) so the join cannot fail.
context_prompt = " ".join(answer for answer in best_answers if answer)

# response = generator(context_prompt, max_length=50, truncation=True)
response = generator(context_prompt, max_length=200, truncation=True, length_penalty=1.1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
response

[{'generated_text': "At a basic level you can think of temperature as the average molecular energy of a system. Temperature can be measured in different ways, but they all come down to measuring changes in heat energy (molecular motion and jiggling). In everyday thermometers, what is measured is how a liquid (originally mercury) expands with temperature, travelling up the bulb. The expansion is due to the atoms in the liquid moving faster and pushing on each other harder. In devices called thermocouples, the temperature affects how well a piece of metal conducts electricity, which can be measured precisely. There are also infrared thermometers, which measure the heat energy in the form of light that objects give off. It's important to note that temperature isn't the same thing as energy, but the explanation requires delving into the wonders of thermodynamics, which is a bit too complicated for me to ELI5. A thermometer's readings can be used to calculate the temperature of a system. He

In [None]:
# Question text for every retrieved document index
reddit_questions = [phys['question'][doc_index] for doc_index in most_similar_indices]

# Comma-separated list of the retrieved indices
indices_string = ', '.join(str(doc_index) for doc_index in most_similar_indices)

# Comma-separated list of the retrieved questions, each wrapped in double quotes
questions_string = ', '.join('"{}"'.format(question) for question in reddit_questions)

# The pipeline returns a list of dicts; the text sits under 'generated_text'
generated_text = response[0]['generated_text']

# Header paragraphs, separated by blank lines
header_parts = [
    f"Your question was '{query}'",
    f"My answer is informed by the database documents with indices {indices_string}.",
    f"The Reddit questions for these documents are {questions_string}",
    "My answer:",
]
output = "\n\n".join(header_parts)

# Wrap the generated text at 80 characters and append it below the header
wrapped_text = textwrap.fill(generated_text, width=80)
final_output = output + "\n" + wrapped_text

print(final_output)



Your question was 'What is temperature?'

My answer is informed by the database documents with indices 3472.

The Reddit questions for these documents are "What do degrees (temperature) actually measure?"

My answer:
At a basic level you can think of temperature as the average molecular energy of
a system. Temperature can be measured in different ways, but they all come down
to measuring changes in heat energy (molecular motion and jiggling). In everyday
thermometers, what is measured is how a liquid (originally mercury) expands with
temperature, travelling up the bulb. The expansion is due to the atoms in the
liquid moving faster and pushing on each other harder. In devices called
thermocouples, the temperature affects how well a piece of metal conducts
electricity, which can be measured precisely. There are also infrared
thermometers, which measure the heat energy in the form of light that objects
give off. It's important to note that temperature isn't the same thing as
energy, but t

In [None]:
# Show the stored answer of the top retrieved question, for comparison with
# the generated text. Uses the retrieved index rather than a hard-coded row
# number so the cell stays correct when the query changes.
phys['best_answer'][most_similar_indices[0]]

"At a basic level you can think of temperature as the average molecular energy of a system. Temperature can be measured in different ways, but they all come down to measuring changes in heat energy (molecular motion and jiggling). In everyday thermometers, what is measured is how a liquid (originally mercury) expands with temperature, travelling up the bulb. The expansion is due to the atoms in the liquid moving faster and pushing on each other harder. In devices called thermocouples, the temperature affects how well a piece of metal conducts electricity, which can be measured precisely. There are also infrared thermometers, which measure the heat energy in the form of light that objects give off. It's important to note that temperature isn't the same thing as energy, but the explanation requires delving into the wonders of thermodynamics, which is a bit too complicated for me to ELI5."