In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
from openai import OpenAI
import os
import pickle
import faiss

import error: No module named 'triton'


In [None]:
# Step 1: Build Your Knowledge Base

# Load the fighter info and event data
fighter_df = pd.read_csv("data-raw/fighter_info.csv")
event_df = pd.read_csv("data-raw/event_data_sherdog.csv")

# Merge fighter stats for Fighter 1
merged = pd.merge(
    event_df,
    fighter_df,
    left_on=["Fighter 1", "Fighter 1 ID"],
    right_on=["Fighter", "Fighter_ID"],
    suffixes=('', '_fighter1')
)

# Merge fighter stats for Fighter 2; note that fighter columns for Fighter 2 get suffixed with _fighter2
merged = pd.merge(
    merged,
    fighter_df,
    left_on=["Fighter 2", "Fighter 2 ID"],
    right_on=["Fighter", "Fighter_ID"],
    suffixes=('', '_fighter2')
)

# Create a document (text summary) for each fight event that includes both fighters' stats and the outcome.
def create_document(row):
    # For Fighter 1, use the columns without suffix; for Fighter 2, columns have a _fighter2 suffix.
    doc = (
        f"Event on {row['Event Date']}: Fight between {row['Fighter 1']} and {row['Fighter 2']}.\n"
        f"{row['Fighter 1']} stats: Wins = {row['Wins']}, Losses = {row['Losses']}, "
        f"Height = {row['Height']}, Birth Date = {row['Birth Date']}.\n"
        f"{row['Fighter 2']} stats: Wins = {row['Wins_fighter2']}, Losses = {row['Losses_fighter2']}, "
        f"Height = {row['Height_fighter2']}, Birth Date = {row['Birth Date_fighter2']}.\n"
        f"Outcome: {row['Winning Fighter']} won by {row['Winning Method']} in round {row['Winning Round']} at {row['Winning Time']}."
    )
    return doc

# Apply the function to create a new 'document' column
merged['document'] = merged.apply(create_document, axis=1)

# Create our knowledge base: a list of document texts (and a corresponding list of IDs)
documents = merged['document'].tolist()
document_ids = merged.index.tolist()

print(f"Created {len(documents)} documents for the knowledge base.")
print("\nExample document:")
print(documents[0])


In [None]:
# Step 2: Compute Document Embeddings
# Use a SentenceTransformer to convert each document into an embedding.

# Load a pre-trained model (this model is fast and works well on a Mac)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each document
embeddings = model.encode(documents, convert_to_tensor=False, show_progress_bar=True)
# embeddings = model.encode(documents, batch_size=2, convert_to_tensor=False, show_progress_bar=True)

print("Generated embeddings for all documents.")


In [None]:
# Step 3: Build a FAISS Index
# We use FAISS to index the embeddings so we can quickly retrieve the most relevant documents.

# Convert the embeddings list to a numpy array of type float32
embeddings_np = np.array(embeddings).astype('float32')
d = embeddings_np.shape[1]  # dimensionality of the embeddings

# Create a FAISS index (using L2 distance)
index = faiss.IndexFlatL2(d)
index.add(embeddings_np)

print(f"FAISS index built with {index.ntotal} vectors.")


In [4]:
# Step 4: Create a Retrieval Function
# This function takes a query, computes its embedding, and then retrieves the top‑k most similar documents from the FAISS index.

def retrieve_documents(query, k=3):
    # Compute the query embedding
    query_embedding = model.encode([query], convert_to_tensor=False)
    query_embedding_np = np.array(query_embedding).astype('float32')
    
    # Search the FAISS index for the top k closest embeddings
    distances, indices = index.search(query_embedding_np, k)
    
    # Retrieve the corresponding documents
    retrieved_docs = [documents[i] for i in indices[0]]
    return retrieved_docs, distances[0]

# Example retrieval:
# sample_query = "Predict the potential outcome for a fight between Jon Jones and Tom Aspinall."
sample_query = "Predict the potential outcome for a fight between Belal Muhammad and Jack Delamadellena."
retrieved_docs, distances = retrieve_documents(sample_query, k=3)

print("Retrieved Documents:")
for doc, dist in zip(retrieved_docs, distances):
    print(f"Distance: {dist:.2f} - {doc}")


NameError: name 'model' is not defined

In [2]:
# Step 5: Integrate with an LLM for Generation
# Finally, create a prompt that includes your query and the retrieved documents. Then call OpenAI’s API to generate a prediction.

# sample_query = "Predict the potential outcome for a fight between Jon Jones and Tom Aspinall."
sample_query = "Predict the potential outcome for a fight between Belal Muhammad and Jack Delamadellena."

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")  # Or directly: api_key="your-key-here"
)

def generate_prediction(query, retrieved_docs):
    # Combine the retrieved documents into a single context string
    context = "\n\n".join(retrieved_docs)
    
    # Construct the prompt: it includes the context and the query
    prompt = (
        f"Analyze the following historical fight data:\n{context}\n\n"
        f"Given the matchup query: '{query}', predict the winner, method of victory, and the round in which the fight might end. "
        "Provide a concise answer in one sentence."
    )
    
    # Call the OpenAI ChatCompletion API
    response = client.chat.completions.create(
        # model="gpt-3.5-turbo",
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a knowledgeable MMA fight analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=100
    )
    # Change from ['content'] to .content
    print(response)
    answer = response.choices[0].message.content.strip()
    return answer

# Example generation:
prediction = generate_prediction(sample_query, retrieved_docs)
print("Prediction:", prediction)


NameError: name 'retrieved_docs' is not defined

In [None]:
# Save Model

# 1. Save model
model.save("mma_prediction_model")  # Creates a folder with model files
print("Saved model to 'mma_prediction_model' folder")

# 2. Save FAISS index
faiss.write_index(index, "mma_faiss_index.index")
print("Saved FAISS index to 'mma_faiss_index.index'")

# 3. Save documents
with open("mma_documents.pkl", "wb") as f:
    pickle.dump(documents, f)
print("Saved documents to 'mma_documents.pkl'")

# 4. Save document IDs (if needed)
with open("mma_document_ids.pkl", "wb") as f:
    pickle.dump(document_ids, f)
print("Saved document IDs to 'mma_document_ids.pkl'")

In [None]:
### Reload Model and ask Question (if you dont wanna rebuild) ###
 
model = SentenceTransformer('mma_prediction_model')
index = faiss.read_index("mma_faiss_index.index")

with open("mma_documents.pkl", "rb") as f:
    documents = pickle.load(f)
    
with open("mma_document_ids.pkl", "rb") as f:
    document_ids = pickle.load(f)
    
    
# Define the retrieval function AFTER loading resources
def retrieve_documents(query, k=3):
    query_embedding = model.encode([query], convert_to_tensor=False)
    query_embedding_np = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding_np, k)
    return [documents[i] for i in indices[0]], distances[0]


# After loading saved resources but before generation
sample_query = "Predict the potential outcome for a fight between Belal Muhammad and Jack Delamadellena."

# First retrieve documents
retrieved_docs, distances = retrieve_documents(sample_query, k=3)  # Add this line

# Then generate prediction
prediction = generate_prediction(sample_query, retrieved_docs)
print("Prediction:", prediction)