In [1]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# Step 1: Load the rules text from the provided file
def load_rules(file_path):
    """Return the full contents of the text file at `file_path`.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path: Path of the rules file to read.

    Returns:
        The file contents as a single string.
    """
    # Read mode is the default for open(); the handle is closed when the block exits.
    with open(file_path, encoding='utf-8') as rules_file:
        return rules_file.read()


def fine_tune_model(model, tokenizer, rules_text, optimizer=None, lr=5e-5):
    """Run ONE gradient step of language-model fine-tuning on `rules_text`.

    The text is tokenized with truncation at 1024 tokens, so only the first
    1024 tokens of `rules_text` contribute to the update. The token ids are
    passed both as the model inputs and as `labels`.

    Args:
        model: Callable model whose output exposes a `.loss` attribute when
            called with `labels` (e.g. the GPT-2 LM head model loaded in this
            notebook).
        tokenizer: Callable returning a mapping that contains `"input_ids"`.
        rules_text: Text to train on.
        optimizer: Optional optimizer to reuse across calls so its internal
            state is kept between steps. When omitted, a new AdamW optimizer
            is created for this single step.
        lr: Learning rate used only when `optimizer` is not supplied.

    Returns:
        The loss of this step as a Python float.
    """
    # Remember the current mode so callers that use the model for inference
    # afterwards get it back in the state they left it.
    was_training = model.training
    model.train()
    try:
        inputs = tokenizer(rules_text, return_tensors="pt", max_length=1024, truncation=True)
        if optimizer is None:
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

        # Clear gradients left over from any previous call; without this,
        # backward() adds to the existing .grad and the step uses stale sums.
        optimizer.zero_grad()

        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    finally:
        model.train(was_training)

    return loss.item()


def retrieve_relevant_section(query, vectorizer, rules_vectors, rules_sections):
    """Return the section whose vector has the largest dot product with the query.

    Args:
        query: Question text to look up.
        vectorizer: Object with a `transform(list_of_str)` method that returns
            a one-row matrix for the query.
        rules_vectors: Matrix with one row per entry of `rules_sections`
            (scipy sparse or dense array).
        rules_sections: Sequence of section texts, indexed like the rows of
            `rules_vectors`.

    Returns:
        The element of `rules_sections` with the highest similarity score.
        When several sections tie (including the case where every score is
        zero), the first of them is returned, because `np.argmax` picks the
        first maximum.
    """
    query_vector = vectorizer.transform([query])

    # Use the matrix-product operator rather than np.dot: np.dot is not aware
    # of scipy sparse containers, whereas `@` is a real matrix product for
    # both sparse and dense operands.
    scores = query_vector @ rules_vectors.T
    if hasattr(scores, "toarray"):
        scores = scores.toarray()
    similarities = np.asarray(scores).ravel()

    most_similar_index = int(np.argmax(similarities))
    return rules_sections[most_similar_index]


def ask_question(question, vectorizer, rules_vectors, rules_sections, qa_pipeline):
    """Answer `question` by prompting a text-generation pipeline with the best-matching rules section.

    Args:
        question: The user's question.
        vectorizer: Passed through to `retrieve_relevant_section`.
        rules_vectors: Passed through to `retrieve_relevant_section`.
        rules_sections: Passed through to `retrieve_relevant_section`.
        qa_pipeline: Callable that takes the prompt and returns a list of
            dicts containing a `'generated_text'` entry.

    Returns:
        The `'generated_text'` value of the first returned sequence.
    """
    relevant_section = retrieve_relevant_section(question, vectorizer, rules_vectors, rules_sections)
    input_text = f"Context: {relevant_section}\nQuestion: {question}\nAnswer:"

    # Limit the number of NEWLY generated tokens. The previous `max_length=100`
    # capped prompt + completion together, and the prompt (context section plus
    # question) can by itself be longer than 100 tokens, leaving no room for an
    # answer.
    response = qa_pipeline(input_text, max_new_tokens=100, num_return_sequences=1)
    return response[0]['generated_text']

2025-03-05 17:25:24.009084: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-05 17:25:24.023284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741213524.037900   53036 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741213524.042148   53036 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-05 17:25:24.062455: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Read the manually copied rules text from disk.
rules_text = load_rules("ultimate_frisbee_rules-manual_copy_from_website.txt")

# Load the pretrained GPT-2 tokenizer and language-model head.
# (Nothing is fine-tuned in this cell; it only loads the pretrained weights.)
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [6]:
# Tokenize the rules text with truncation at 1024 tokens, requesting "pt" tensors.
inputs = tokenizer(rules_text, return_tensors="pt", max_length=1024, truncation=True)
# Bare expression so the notebook displays the encoded inputs.
inputs

{'input_ids': tensor([[10962,   286, 26714,  ...,   966,    11,   351]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [3]:
# Tokenize the whole rules text WITHOUT truncation and display the first row of input_ids.
# The recorded output warns that this yields 30938 tokens, above the model's 1024 limit.
tokenizer(rules_text, return_tensors="pt", truncation=False)["input_ids"][0]

Token indices sequence length is longer than the specified maximum sequence length for this model (30938 > 1024). Running this sequence through the model will result in indexing errors


tensor([10962,   286, 26714,  ...,  3747,    13,   198])

In [None]:
# Encode every line of the rules text separately (lines are split on '\n').
tokens_per_line = list(map(tokenizer, rules_text.split('\n')))
# Number of token ids produced for each line, in file order.
num_tokens_per_line = [len(encoding['input_ids']) for encoding in tokens_per_line]
num_tokens_per_line

[3,
 2,
 0,
 1,
 4,
 2,
 2,
 2,
 3,
 2,
 4,
 6,
 9,
 4,
 2,
 2,
 3,
 3,
 2,
 5,
 2,
 3,
 3,
 7,
 0,
 7,
 0,
 7,
 0,
 8,
 0,
 9,
 0,
 12,
 0,
 2,
 0,
 176,
 0,
 3,
 133,
 7,
 26,
 72,
 57,
 36,
 6,
 55,
 48,
 98,
 45,
 154,
 26,
 13,
 38,
 13,
 9,
 14,
 13,
 16,
 12,
 18,
 10,
 22,
 20,
 115,
 57,
 15,
 23,
 17,
 36,
 33,
 34,
 16,
 19,
 27,
 18,
 25,
 33,
 24,
 13,
 15,
 13,
 20,
 10,
 21,
 43,
 15,
 17,
 17,
 18,
 14,
 34,
 23,
 118,
 37,
 78,
 107,
 40,
 16,
 13,
 16,
 9,
 3,
 77,
 50,
 54,
 120,
 63,
 31,
 40,
 55,
 61,
 51,
 86,
 48,
 98,
 27,
 19,
 47,
 29,
 39,
 56,
 83,
 34,
 29,
 121,
 101,
 60,
 83,
 57,
 14,
 16,
 50,
 25,
 28,
 18,
 6,
 47,
 90,
 49,
 64,
 101,
 74,
 26,
 80,
 20,
 28,
 39,
 27,
 71,
 38,
 57,
 4,
 41,
 22,
 44,
 46,
 36,
 23,
 64,
 27,
 16,
 33,
 39,
 60,
 3,
 30,
 24,
 34,
 35,
 25,
 59,
 70,
 5,
 31,
 16,
 59,
 67,
 6,
 73,
 44,
 85,
 33,
 73,
 4,
 15,
 22,
 13,
 56,
 75,
 19,
 29,
 76,
 74,
 85,
 98,
 34,
 22,
 89,
 109,
 15,
 26,
 64,
 43,
 68,
 37,
 30

In [23]:
# Display the maximum sequence length the tokenizer reports (recorded output: 1024).
tokenizer.model_max_length

1024

In [None]:
# Extraction from PDF using PyMuPDF. Good but has newlines in paragraphs.

# Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's `get_text()` output, joined
        without any extra separator.
    """
    # PyMuPDF is imported here because no visible cell of this notebook
    # imports `fitz`; without this the function raises NameError on a fresh kernel.
    import fitz

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)

# Step 1: Extract text from the PDF
# NOTE: this rebinds `rules_text`, the same name an earlier cell fills from the
# manually copied text file.
pdf_path = "Official-Rules-of-Ultimate-2024-2025.pdf"
rules_text = extract_text_from_pdf(pdf_path)

# Save the extracted text to a file. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default encoding and can be read
# back by load_rules(), which opens its input as UTF-8.
with open("ultimate_frisbee_rules-pdf_extraction.txt", "w", encoding="utf-8") as file:
    file.write(rules_text)

In [None]:
# Extraction from HTML using BeautifulSoup. Not so great.

from bs4 import BeautifulSoup

# Load the HTML content from the file
with open('ultimate_frisbee_rules.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the main content of the rules. find() returns None when no matching
# element exists, so fail with an explicit message instead of an AttributeError
# on the chained get_text() call.
rules_container = soup.find('div', {'id': 'rules-of-ultimate'})
if rules_container is None:
    raise ValueError(
        "No <div id='rules-of-ultimate'> element found in 'ultimate_frisbee_rules.html'."
    )
rules_content = rules_container.get_text(separator='\n')

# Clean the extracted text
def clean_text(text):
    """Normalize whitespace in extracted text and put each sentence on its own line.

    Steps:
      1. Every run of whitespace (spaces, tabs, any number of newlines) is
         collapsed into a single space, and leading/trailing whitespace is
         dropped.
      2. Every period followed by a space becomes a period followed by a
         newline.

    Note that step 2 is purely textual: it also breaks after abbreviations
    such as "e.g. " and does not break after "?" or "!".

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text.
    """
    # split() with no arguments splits on any whitespace run, so re-joining
    # with one space collapses repeated newlines completely. The previous pair
    # of replace() calls left double spaces for 3+ newlines, which in turn
    # produced lines that started with a space.
    text = ' '.join(text.split())

    # One sentence per line.
    text = text.replace('. ', '.\n')

    return text

cleaned_rules_content = clean_text(rules_content)

# Save the cleaned rules to a text file. The path is kept in one variable so
# the confirmation message below always names the file that was really written
# (the old message named a different file).
output_path = 'ultimate_frisbee_rules-html_extraction.txt'
with open(output_path, 'w', encoding='utf-8') as output_file:
    output_file.write(cleaned_rules_content)

print(f"The rules have been successfully extracted and saved to '{output_path}'.")

The rules have been successfully extracted and saved to 'cleaned_ultimate_frisbee_rules.txt'.


In [None]:
# GPU test (pytorch)

import torch
import time

# Define a simple matrix multiplication task
def gpu_test(size=10000):
    """Time one square matrix multiplication with PyTorch and print the result.

    Uses CUDA when `torch.cuda.is_available()` is true, otherwise the CPU.
    Note that a later cell in this notebook defines another `gpu_test` (for
    TensorFlow) which replaces this one once that cell has run.

    Args:
        size: Side length of the two random square matrices. Defaults to
            10000, the value that was previously hard-coded.

    Returns:
        Elapsed wall-clock time of the multiplication in seconds, as a float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Create random tensors
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)

    # CUDA work is queued asynchronously: wait for the tensor creation to
    # finish before starting the clock, and for the matmul to finish before
    # stopping it. Otherwise only the kernel launch is measured.
    on_cuda = device.type == "cuda"
    if on_cuda:
        torch.cuda.synchronize(device)

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    torch.matmul(a, b)
    if on_cuda:
        torch.cuda.synchronize(device)
    elapsed = time.perf_counter() - start_time

    print(f"Matrix multiplication completed in {elapsed} seconds")
    return elapsed

# Call gpu_test() 10000 times in a row; each call prints the device and the measured time.
for i in range(10000):
    gpu_test()

In [None]:
# GPU test (tensorflow)

import tensorflow as tf
import time

# Ask TensorFlow which GPU devices it can see and report how many there are.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

# Either report that nothing was found, or list each detected device.
if not physical_devices:
    print("No GPUs detected by TensorFlow.")
else:
    print("TensorFlow GPU details:")
    for gpu in physical_devices:
        print(gpu)

# Define a simple matrix multiplication task
def gpu_test():
    """Time one 10000 x 10000 random-normal matrix multiplication with TensorFlow.

    Runs on "/GPU:0" when TensorFlow lists at least one GPU, otherwise on
    "/CPU:0". Prints the chosen device and the elapsed wall-clock time;
    returns nothing. This definition reuses the name of the PyTorch
    `gpu_test` defined in an earlier cell and replaces it once run.
    """
    # Pick the first GPU if one is visible, else fall back to the CPU.
    if tf.config.list_physical_devices('GPU'):
        device = "/GPU:0"
    else:
        device = "/CPU:0"
    print(f"Using device: {device}")

    shape = [10000, 10000]
    with tf.device(device):
        # Operands are created before the clock starts.
        lhs = tf.random.normal(shape)
        rhs = tf.random.normal(shape)

        # NOTE(review): the product is never read back; if TensorFlow dispatches
        # GPU work asynchronously this interval may not cover the whole
        # computation — confirm before trusting the numbers.
        start_time = time.time()
        product = tf.matmul(lhs, rhs)
        end_time = time.time()

    print(f"Matrix multiplication completed in {end_time - start_time} seconds")

# Call gpu_test() 10000 times in a row; each call prints the device and the measured time.
for i in range(10000):
    gpu_test()

2025-03-05 15:00:32.762867: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-05 15:00:32.772080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741204832.781773   12855 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741204832.784905   12855 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-05 15:00:32.795874: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Num GPUs Available:  1
TensorFlow GPU details:
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
