In [1]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# Step 1: Load the rules text from the provided file
def load_rules(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        rules_text = file.read()
    return rules_text


# Fine-tune the model using each line of the rules.
def fine_tune_model(model, tokenizer, rules_text):

    # Determine the device to use.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Ensure the model is on the selected device.
    model.to(device)

    # Ensure the model is in training mode
    model.train()

    # Tokenize each line in the rules.
    tokens_per_line = [tokenizer(line, return_tensors="pt").to(device) for line in rules_text.split('\n')]

    # Get the number of tokens for each line of the rules.
    num_tokens_per_line = [len(tokens['input_ids'][0]) for tokens in tokens_per_line]

    # Ensure the model can handle every line.
    model_max_length = tokenizer.model_max_length
    assert max(num_tokens_per_line) <= model_max_length, 'The maximum number of tokens in all lines is greater than the maximum number of tokens the model can handle.'

    # Get the total number of lines i.e. number of token sets.
    tot_num_tokens = len(num_tokens_per_line)

    # Initialize the optimizer.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # For each line in the rules...
    for itokens, tokens in enumerate(tokens_per_line):
        num_tokens = len(tokens["input_ids"][0])
        print(f'On line {itokens + 1} of {tot_num_tokens}: number of tokens: {num_tokens}')
        if num_tokens != 0:
            outputs = model(**tokens, labels=tokens["input_ids"])
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        else:
            print('Skipping line because it is blank.')


def retrieve_relevant_section(query, vectorizer, rules_vectors, rules_sections):
    query_vector = vectorizer.transform([query])
    similarities = np.dot(query_vector, rules_vectors.T).toarray()[0]
    most_similar_index = np.argmax(similarities)
    return rules_sections[most_similar_index]


def ask_question(question, vectorizer, rules_vectors, rules_sections, qa_pipeline):
    relevant_section = retrieve_relevant_section(question, vectorizer, rules_vectors, rules_sections)
    input_text = f"Context: {relevant_section}\nQuestion: {question}\nAnswer:"
    response = qa_pipeline(input_text, max_length=100, num_return_sequences=1)
    return response[0]['generated_text']

2025-03-05 20:55:26.270539: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-05 20:55:26.292011: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741226126.306103   86150 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741226126.310372   86150 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-05 20:55:26.332009: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
rules_text = load_rules("ultimate_frisbee_rules-manual_copy_from_website.txt")

# Step 2: Preprocess the Text (if needed)
# Here you can add any text preprocessing steps if required

# Step 3: Fine-Tune GPT-2
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)  # this downloads and caches (across sessions) some files including the model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)  # this also downloads and caches things
fine_tune_model(model, tokenizer, rules_text)

# # Step 4: Implement Retrieval Mechanism
# # Split the rules text into sections for retrieval
# rules_sections = rules_text.split('\n\n')

# # Create a TF-IDF vectorizer and fit it on the rules sections
# vectorizer = TfidfVectorizer().fit(rules_sections)
# rules_vectors = vectorizer.transform(rules_sections)

# # Step 5: Integrate Retrieval with Generation
# # Create a pipeline for question answering
# qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# # Example question
# question = "What is the stall count in ultimate frisbee?"
# answer = ask_question(question)
# print(answer)


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


On line 1 of 672: number of tokens: 3
On line 2 of 672: number of tokens: 2
On line 3 of 672: number of tokens: 0
Skipping line because it is blank.
On line 4 of 672: number of tokens: 1
On line 5 of 672: number of tokens: 4
On line 6 of 672: number of tokens: 2
On line 7 of 672: number of tokens: 2
On line 8 of 672: number of tokens: 2
On line 9 of 672: number of tokens: 3
On line 10 of 672: number of tokens: 2
On line 11 of 672: number of tokens: 4
On line 12 of 672: number of tokens: 6
On line 13 of 672: number of tokens: 9
On line 14 of 672: number of tokens: 4
On line 15 of 672: number of tokens: 2
On line 16 of 672: number of tokens: 2
On line 17 of 672: number of tokens: 3
On line 18 of 672: number of tokens: 3
On line 19 of 672: number of tokens: 2
On line 20 of 672: number of tokens: 5
On line 21 of 672: number of tokens: 2
On line 22 of 672: number of tokens: 3
On line 23 of 672: number of tokens: 3
On line 24 of 672: number of tokens: 7
On line 25 of 672: number of tokens: 0

In [None]:
# Extraction from PDF using PyMuPDF. Good but has newlines in paragraphs.

# Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

# Step 1: Extract text from the PDF
pdf_path = "Official-Rules-of-Ultimate-2024-2025.pdf"
rules_text = extract_text_from_pdf(pdf_path)

# Save the extracted text to a file
with open("ultimate_frisbee_rules-pdf_extraction.txt", "w") as file:
    file.write(rules_text)

In [None]:
# Extraction from HTML using BeautifulSoup. Not so great.

from bs4 import BeautifulSoup

# Load the HTML content from the file
with open('ultimate_frisbee_rules.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the main content of the rules
rules_content = soup.find('div', {'id': 'rules-of-ultimate'}).get_text(separator='\n')

# Clean the extracted text
def clean_text(text):
    # Replace multiple newlines with a single newline
    text = text.replace('\n\n', '\n')
    
    # Replace newlines in the middle of paragraphs with a space
    text = text.replace('\n', ' ')
    
    # Ensure paragraphs are separated by a single newline
    text = text.replace('. ', '.\n')
    
    return text

cleaned_rules_content = clean_text(rules_content)

# Save the cleaned rules to a text file
with open('ultimate_frisbee_rules-html_extraction.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(cleaned_rules_content)

print("The rules have been successfully extracted and saved to 'cleaned_ultimate_frisbee_rules.txt'.")

The rules have been successfully extracted and saved to 'cleaned_ultimate_frisbee_rules.txt'.


In [7]:
# GPU test (pytorch)

import torch
import time

# Define a simple matrix multiplication task
def gpu_test():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Create random tensors
    a = torch.randn(10000, 10000, device=device)
    b = torch.randn(10000, 10000, device=device)

    # Perform matrix multiplication
    start_time = time.time()
    c = torch.matmul(a, b)
    end_time = time.time()

    print(f"Matrix multiplication completed in {end_time - start_time} seconds")

for i in range(10000):
    gpu_test()

Using device: cuda
Matrix multiplication completed in 0.07192134857177734 seconds
Using device: cuda
Matrix multiplication completed in 8.177757263183594e-05 seconds
Using device: cuda
Matrix multiplication completed in 2.0742416381835938e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.6450881958007812e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.5497207641601562e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.52587890625e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.4781951904296875e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.5735626220703125e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.52587890625e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.5735626220703125e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.5497207641601562e-05 seconds
Using device: cuda
Matrix multiplication completed in 1.4781951904296875e-05 se

KeyboardInterrupt: 

In [None]:
# GPU test (tensorflow)

import tensorflow as tf
import time

# Check if TensorFlow can access the GPU
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

if physical_devices:
    print("TensorFlow GPU details:")
    for gpu in physical_devices:
        print(gpu)
else:
    print("No GPUs detected by TensorFlow.")

# Define a simple matrix multiplication task
def gpu_test():
    device = "/GPU:0" if tf.config.list_physical_devices('GPU') else "/CPU:0"
    print(f"Using device: {device}")

    # Create random tensors
    with tf.device(device):
        a = tf.random.normal([10000, 10000])
        b = tf.random.normal([10000, 10000])

        # Perform matrix multiplication
        start_time = time.time()
        c = tf.matmul(a, b)
        end_time = time.time()

    print(f"Matrix multiplication completed in {end_time - start_time} seconds")

for i in range(10000):
    gpu_test()

2025-03-05 15:00:32.762867: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-05 15:00:32.772080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741204832.781773   12855 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741204832.784905   12855 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-05 15:00:32.795874: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Num GPUs Available:  1
TensorFlow GPU details:
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
