In [1]:
def load_rules(file_path):
    """Read the rules document from disk.

    Args:
        file_path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        The complete file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as rules_file:
        return rules_file.read()

def preprocess_document(document):
    """Break a document into its non-blank lines.

    Args:
        document: Full document text with lines separated by newline characters.

    Returns:
        A list of lines with surrounding whitespace removed; lines that are
        empty or contain only whitespace are dropped.
    """
    chunks = []
    for raw_line in document.split('\n'):
        trimmed = raw_line.strip()
        if trimmed:  # skip blank / whitespace-only lines
            chunks.append(trimmed)
    return chunks

# Read the rules text from disk, then break it into one chunk per non-empty line
document = load_rules(file_path="ultimate_frisbee_rules-manual_copy_from_website-edited.txt")
chunks = preprocess_document(document=document)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fit a TF-IDF model on the chunks and turn every chunk into a weighted term vector
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(chunks)

# Map each chunk's position in the list back to its text
index = dict(enumerate(chunks))

In [7]:
def retrieve_relevant_chunks(query, vectorizer, X, index, top_n=5):
    """Return the chunks most similar to ``query``, best match first.

    Args:
        query: Free-text question to search for.
        vectorizer: Fitted vectorizer whose ``transform`` maps text into the
            same feature space as ``X``.
        X: Matrix of chunk vectors that the query vector is compared against.
        index: Mapping from a chunk's position in ``X`` to the chunk text.
        top_n: Maximum number of chunks to return.

    Returns:
        Up to ``top_n`` chunk strings ordered by decreasing cosine similarity;
        an empty list when ``top_n`` is not positive.
    """
    # Guard first: with top_n == 0 the slice below would be [-0:], i.e. the
    # whole array, so every chunk would come back instead of none. Negative
    # values would likewise produce a meaningless slice.
    if top_n <= 0:
        return []
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, X).flatten()
    # argsort is ascending, so take the tail and reverse it for best-first order.
    relevant_indices = np.argsort(similarities)[-top_n:][::-1]
    return [index[i] for i in relevant_indices]

# Try the TF-IDF retriever on a sample question and show the best matches
query = "Explain the timeout rules"
relevant_chunks = retrieve_relevant_chunks(query=query, vectorizer=vectorizer, X=X, index=index)
print(relevant_chunks)

['2.D.5. explain their viewpoint clearly and concisely;', '7.E.3. After the spirit timeout:', '20.E. If a novice player commits an infraction out of sincere ignorance of the rules, it should be common practice to stop play and explain the infraction.', '2.H. In the case where a novice player commits an infraction out of ignorance of the rules, experienced players are obliged to explain the infraction and clarify what should happen.', '15.A.5.b. Specific Rules:']


2025-03-06 16:15:11.547355: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-06 16:15:11.660686: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741295711.714132   39186 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741295711.727860   39186 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 16:15:11.816329: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Using device: cuda


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Generator checkpoint to load. Alternatives that were tried:
#   'gpt2-xl'  - this is the largest GPT-2 model from OpenAI and is open source
#   'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'  - works, best deepseek model I can get working
model_name = "EleutherAI/gpt-neo-2.7B"  # best GPT-related model for my laptop

# Question to answer (another one that was used: "Explain the timeout rules")
query = "What is the stall count?"

# Set to False to force CPU even when a CUDA GPU is present
use_gpu_if_available = True

# Pick the compute device once; the rest of the cell reads this global
if use_gpu_if_available and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

def load_rules(file_path):
    """Read the rules document from disk.

    Args:
        file_path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        The complete file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as rules_file:
        return rules_file.read()

def preprocess_document(document):
    """Break a document into its non-blank lines.

    Args:
        document: Full document text with lines separated by newline characters.

    Returns:
        A list of lines with surrounding whitespace removed; lines that are
        empty or contain only whitespace are dropped.
    """
    chunks = []
    for raw_line in document.split('\n'):
        trimmed = raw_line.strip()
        if trimmed:  # skip blank / whitespace-only lines
            chunks.append(trimmed)
    return chunks

def retrieve_relevant_chunks(query, model_retriever, chunk_embeddings, index, top_n=5):
    """Return the chunks whose embeddings are most similar to ``query``.

    Args:
        query: Free-text question to search for.
        model_retriever: Embedding model whose ``encode`` produced
            ``chunk_embeddings``; it is used here to embed the query.
        chunk_embeddings: Tensor of chunk embeddings the query is compared against.
        index: Mapping from a chunk's position in ``chunk_embeddings`` to its text.
        top_n: Maximum number of chunks to return.

    Returns:
        Up to ``top_n`` chunk strings ordered by decreasing cosine similarity;
        an empty list when ``top_n`` is not positive.
    """
    # Guard first: with top_n == 0 the slice below would be [-0:], i.e. the
    # whole array, so every chunk would come back instead of none. Negative
    # values would likewise produce a meaningless slice.
    if top_n <= 0:
        return []
    # The query is embedded on the module-level `device`.
    query_embedding = model_retriever.encode(query, convert_to_tensor=True, device=device)
    similarities = util.pytorch_cos_sim(query_embedding, chunk_embeddings)[0]
    similarities = similarities.cpu().numpy()  # Move to CPU and convert to NumPy array
    # argsort is ascending, so take the tail and reverse it for best-first order.
    relevant_indices = np.argsort(similarities)[-top_n:][::-1]
    return [index[i] for i in relevant_indices]

def generate_response(query, relevant_chunks, model_generator, tokenizer):
    """Generate an answer to ``query`` using the retrieved chunks as context.

    Args:
        query: The user's question.
        relevant_chunks: Chunk strings to supply as context, joined in the
            order given.
        model_generator: Causal language model exposing ``generate`` and a
            ``device`` attribute.
        tokenizer: Tokenizer matching ``model_generator``.

    Returns:
        The decoded text of the first sequence returned by ``generate``, with
        special tokens removed. Earlier recorded runs of this notebook show
        the prompt echoed at the start of that text.
    """
    # Combine the relevant chunks into a single context
    context = " ".join(relevant_chunks)
    # Build the prompt from the actual question and context. A hard-coded
    # debugging prompt used to sit here, which made the function ignore both
    # of its first two arguments.
    input_text = f"Query: Be concise. {query}\nContext: {context}\nAnswer:"
    # Calling the tokenizer (rather than tokenizer.encode) also yields the
    # attention mask. Deriving the mask from pad_token_id is unsafe when the
    # pad token is aliased to EOS: a genuine EOS token in the prompt would be
    # masked out as if it were padding.
    encoded = tokenizer(input_text, return_tensors='pt').to(model_generator.device)
    outputs = model_generator.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=10,  # kept short while comparing against the previous model
        temperature=0.3,
        top_k=5,
        do_sample=True,  # sampling is stochastic; output varies between runs
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Load the rules text and split it into one chunk per non-empty line
start_time = time.time()
document = load_rules("ultimate_frisbee_rules-manual_copy_from_website-edited.txt")
chunks = preprocess_document(document)
print(f"Preprocessing took {time.time() - start_time:.2f} seconds")

# Load the sentence-embedding model used for retrieval
start_time = time.time()
model_retriever = SentenceTransformer('all-MiniLM-L6-v2', device=device)  # device=device probably unnecessary for the retriever
print(f"Loading the retrieval model took {time.time() - start_time:.2f} seconds")

# Embed every chunk once so queries only need a similarity lookup
start_time = time.time()
chunk_embeddings = model_retriever.encode(chunks, convert_to_tensor=True, device=device)
print(f"Vectorizing the chunks took {time.time() - start_time:.2f} seconds")

# Map each embedding's position back to the chunk text
start_time = time.time()
index = {i: chunk for i, chunk in enumerate(chunks)}
print(f"Indexing took {time.time() - start_time:.2f} seconds")

# Load pre-trained model and tokenizer.
# A recorded run of this cell stopped at this step with "CUDA error: out of
# memory" when the generator was moved to the GPU in the default float32
# precision. On CUDA the weights are therefore loaded in float16, which halves
# their memory footprint; on CPU float32 is kept.
start_time = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator_dtype = torch.float16 if device.type == "cuda" else torch.float32
model_generator = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=generator_dtype).to(device)
print(f"Loading the generation model took {time.time() - start_time:.2f} seconds")

# Set pad_token_id to eos_token_id so the tokenizer has a padding token defined
start_time = time.time()
tokenizer.pad_token_id = tokenizer.eos_token_id
print(f"Setting pad_token_id took {time.time() - start_time:.2f} seconds")

# Retrieve the chunks most similar to the query
start_time = time.time()
relevant_chunks = retrieve_relevant_chunks(query, model_retriever, chunk_embeddings, index)
print(f"Retrieving relevant chunks took {time.time() - start_time:.2f} seconds")
print(relevant_chunks)

# Generate and show the answer
start_time = time.time()
response = generate_response(query, relevant_chunks, model_generator, tokenizer)
print(f"Generating the response took {time.time() - start_time:.2f} seconds")
print(response)

Using device: cuda
Preprocessing took 0.00 seconds
Loading the retrieval model took 0.88 seconds
Vectorizing the chunks took 3.74 seconds
Indexing took 0.00 seconds


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Query: What is the stall count?
Context: 15.A.1. The stall count consists of announcing “stalling” and counting from one to ten loudly enough for the thrower to hear. 15.A.5. If a stall count is interrupted by a call, the thrower and marker are responsible for agreeing on the correct count before the check. The count reached is the last number fully uttered by the marker before the call. The count is resumed with the word “stalling” followed by the number listed below: 7.D.4.a.2. If the technical timeout stopped play, the count resumes at the stall count reached plus one, or at six if over five. 15.A.2.b. However, unless 15.A.2.a applies, the stall count may not be initiated or resumed before a pivot is established: 15.B.6.c. If this (15.B.6.b) occurs in the same possession following a contested stall (either due to 15.B.6.a or 15.A.3.b), the stall count resumes at six.
Answer:
Context: 15.B.6.b. The technical timeout stopped play. 15.B.6.c. If this (15.B.6.b) occurs in the same posses

Query: Explain the timeout rules
Context: 7.B.2. Any player may call a timeout after a goal is scored and before both teams have signaled readiness to start play. Time limit counts between points are suspended for 70 seconds. A timeout may not be called between a re-pull call and the ensuing pull. 7.A. A timeout stops play and suspends time limit counts. 7.C.2. The timeout is retroactive to the time of the injury, unless the injured player chooses to continue play before the timeout is called, in which case, the timeout begins at the time of the call. If the disc is in the air or the thrower is in the act of throwing at the time of the injury or of the call when the player has continued play, the timeout begins when the play is completed. 7.C.7.a. This timeout takes effect when the call is made (i.e., is not retroactive to the time of injury). If the disc is in the air or the thrower is in the act of throwing at the time of the call, the timeout begins when the play is completed. Howev