In [21]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
from openai import OpenAI
import os
import pickle
import faiss
from datetime import datetime

In [2]:
# Step 0: Clean Files
#
# Trims both CSVs down to fights dated 2010-01-01 or later and to the fighters
# who appear in those fights.
# NOTE(review): the filtered frames are written back to the same data-raw/
# paths they were read from, so the files on disk are replaced by this cell.

import pandas as pd

# Read the source tables and remember how many rows each one starts with.
fighter_df = pd.read_csv("data-raw/fighter_info.csv")
event_df = pd.read_csv("data-raw/event_data_sherdog.csv")
original_fighter_count = len(fighter_df)
original_event_count = len(event_df)

# Parse "Event Date"; anything that cannot be parsed becomes NaT.
event_df['Event Date'] = pd.to_datetime(event_df['Event Date'], errors='coerce')

# Strip timezone information (if any) so the column compares cleanly
# against a timezone-naive cutoff.
if event_df['Event Date'].dt.tz is not None:
    event_df['Event Date'] = event_df['Event Date'].dt.tz_convert(None)

# Events: keep rows dated on or after the cutoff. NaT rows fail the
# comparison and are therefore dropped as well.
on_or_after_cutoff = event_df['Event Date'] >= pd.Timestamp("2010-01-01")
event_df_filtered = event_df.loc[on_or_after_cutoff].reset_index(drop=True)
filtered_event_count = len(event_df_filtered)
removed_event_rows = original_event_count - filtered_event_count

# Fighters: collect every ID that shows up on either side of a kept event...
fighter_ids_after_2010 = (
    set(event_df_filtered["Fighter 1 ID"].unique())
    | set(event_df_filtered["Fighter 2 ID"].unique())
)

# ...and keep only the fighter rows whose ID is in that set.
has_kept_fight = fighter_df["Fighter_ID"].isin(fighter_ids_after_2010)
fighter_df_filtered = fighter_df.loc[has_kept_fight].reset_index(drop=True)
filtered_fighter_count = len(fighter_df_filtered)
removed_fighter_rows = original_fighter_count - filtered_fighter_count

# Persist the filtered tables (same paths as the inputs).
event_df_filtered.to_csv("data-raw/event_data_sherdog.csv", index=False)
fighter_df_filtered.to_csv("data-raw/fighter_info.csv", index=False)

# Report how much each table shrank.
print(f"Original event_data_sherdog.csv rows: {original_event_count}")
print(f"Filtered event_data_sherdog.csv rows: {filtered_event_count} (removed {removed_event_rows} rows)")
print(f"Original fighter_info.csv rows: {original_fighter_count}")
print(f"Filtered fighter_info.csv rows: {filtered_fighter_count} (removed {removed_fighter_rows} rows)")
print("\nFiltered files saved as 'event_data_sherdog.csv' and 'fighter_info.csv'.")


Original event_data_sherdog.csv rows: 8131
Filtered event_data_sherdog.csv rows: 6858 (removed 1273 rows)
Original fighter_info.csv rows: 2646
Filtered fighter_info.csv rows: 2163 (removed 483 rows)

Filtered files saved as 'event_data_sherdog.csv' and 'fighter_info.csv'.


In [3]:
# Step 1: Build Your Knowledge Base

# Read the fighter table and the event table from disk.
fighter_df = pd.read_csv("data-raw/fighter_info.csv")
event_df = pd.read_csv("data-raw/event_data_sherdog.csv")

# Both joins match an event-side (name, ID) pair against the same two
# columns of the fighter table.
fighter_key_columns = ["Fighter", "Fighter_ID"]

# Attach Fighter 1's profile. Columns already present on the left keep their
# names; a clashing fighter-table column would receive the _fighter1 suffix.
merged = event_df.merge(
    fighter_df,
    left_on=["Fighter 1", "Fighter 1 ID"],
    right_on=fighter_key_columns,
    suffixes=('', '_fighter1'),
)

# Attach Fighter 2's profile. The fighter-table columns now clash with the
# ones added by the first join, so Fighter 2's copies carry the _fighter2 suffix.
merged = merged.merge(
    fighter_df,
    left_on=["Fighter 2", "Fighter 2 ID"],
    right_on=fighter_key_columns,
    suffixes=('', '_fighter2'),
)

# Create a document (text summary) for each fight event that includes both fighters' stats and the outcome.
def create_document(row):
    """Render one merged fight row as a four-line text summary.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column
            name. Fighter 1's stats are read from the unsuffixed columns
            and Fighter 2's from the ``_fighter2``-suffixed ones.

    Returns:
        A string with the event line, one stats line per fighter, and the
        outcome line, separated by newlines.
    """
    first_name, second_name = row['Fighter 1'], row['Fighter 2']

    def stats_line(name, suffix):
        # One "<name> stats: ..." line; `suffix` selects which fighter's columns to read.
        return (
            f"{name} stats: Wins = {row['Wins' + suffix]}, Losses = {row['Losses' + suffix]}, "
            f"Height = {row['Height' + suffix]}, Birth Date = {row['Birth Date' + suffix]}."
        )

    lines = [
        f"Event on {row['Event Date']}: Fight between {first_name} and {second_name}.",
        stats_line(first_name, ''),
        stats_line(second_name, '_fighter2'),
        f"Outcome: {row['Winning Fighter']} won by {row['Winning Method']} in round {row['Winning Round']} at {row['Winning Time']}.",
    ]
    return "\n".join(lines)

# Render every merged row to text and keep the result as a 'document' column.
document_series = merged.apply(create_document, axis=1)
merged['document'] = document_series

# The knowledge base is the list of document texts, plus a parallel list of
# the row labels they came from.
documents = document_series.tolist()
document_ids = merged.index.tolist()

print(f"Created {len(documents)} documents for the knowledge base.")
print("\nExample document:", documents[0], sep="\n")


Created 6849 documents for the knowledge base.

Example document:
Event on 2024-12-14 00:00:00: Fight between joaquin buckley and colby covington.
joaquin buckley stats: Wins = 21, Losses = 6, Height = 5'8, Birth Date = apr 27, 1994.
colby covington stats: Wins = 17, Losses = 5, Height = 5'11, Birth Date = feb 22, 1988.
Outcome: joaquin buckley won by tko (doctor stoppage) in round 3 at 4:42.


In [4]:
# Step 2: Compute Document Embeddings
# Each document is turned into an embedding vector with a SentenceTransformer.

# Load the pre-trained 'all-MiniLM-L6-v2' model (author's note: fast and works well on a Mac).
# To force CPU usage, pass device='cpu' to the constructor.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every document in one call; a progress bar is shown while batches run.
# A smaller batch_size (e.g. batch_size=2) can be passed here if needed.
encode_options = dict(
    convert_to_tensor=False,
    show_progress_bar=True,
    convert_to_numpy=True,
)
embeddings = model.encode(documents, **encode_options)

print("Generated embeddings for all documents.")


Batches:   0%|          | 0/215 [00:00<?, ?it/s]

Generated embeddings for all documents.


In [5]:
# Step 3: Build a FAISS Index
# The embeddings are indexed with FAISS so the closest documents can be looked up quickly.

# Make a float32 NumPy copy of the embeddings and read the vector width
# from its second axis.
embeddings_np = np.array(embeddings).astype(np.float32)
d = embeddings_np.shape[1]

# Flat L2-distance index of that width, filled with every document vector.
index = faiss.IndexFlatL2(d)
index.add(embeddings_np)

print(f"FAISS index built with {index.ntotal} vectors.")


FAISS index built with 6849 vectors.


In [6]:
# Step 4: Create a Retrieval Function
# This function takes a query, computes its embedding, and then retrieves the top‑k most similar documents from the FAISS index.

def retrieve_documents(query, k=3):
    """Return the stored documents closest to ``query`` in embedding space.

    The query is encoded with the notebook-level ``model``, looked up in the
    notebook-level FAISS ``index``, and the hits are mapped back to text
    through ``documents``.

    Args:
        query: Text to search for.
        k: Maximum number of documents to return.

    Returns:
        A ``(retrieved_docs, distances)`` pair: a list of document strings and
        a NumPy array with the distance FAISS reported for each of them, in
        the same order. Both are empty when the index holds no vectors.
    """
    # FAISS pads its result with label -1 when it cannot supply k neighbours.
    # Indexing `documents` with -1 would silently hand back the *last*
    # document, so never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k == 0:
        return [], np.empty(0, dtype='float32')

    # Compute the query embedding as a float32 array, as FAISS expects.
    query_embedding = model.encode([query], convert_to_tensor=False)
    query_embedding_np = np.array(query_embedding).astype('float32')

    # Search the FAISS index for the top k closest embeddings.
    distances, indices = index.search(query_embedding_np, k)

    # Drop any padded (-1) slots so documents and distances stay aligned.
    found = indices[0] >= 0
    retrieved_docs = [documents[i] for i in indices[0][found]]
    return retrieved_docs, distances[0][found]

# Example retrieval.
# Alternative query:
#   "Predict the potential outcome for a fight between Jon Jones and Tom Aspinall."
# NOTE(review): confirm the second fighter's name below is spelled the way it
# appears in the dataset; a mismatch may hurt retrieval for that fighter.
sample_query = "Predict the potential outcome for a fight between Belal Muhammad and Jack Delamadellena."
retrieved_docs, distances = retrieve_documents(sample_query, k=3)

# One line per hit: the FAISS distance followed by the document text.
hit_lines = [
    f"Distance: {dist:.2f} - {doc}"
    for doc, dist in zip(retrieved_docs, distances)
]
print("\n".join(["Retrieved Documents:", *hit_lines]))


Retrieved Documents:
Distance: 0.67 - Event on 2017-11-18 21:00:00: Fight between belal muhammad and tim means.
belal muhammad stats: Wins = 24, Losses = 3, Height = 5'10, Birth Date = jul 9, 1988.
tim means stats: Wins = 33, Losses = 17, Height = 6'2, Birth Date = feb 20, 1984.
Outcome: belal muhammad won by decision (split) in round 3 at 5:00.
Distance: 0.67 - Event on 2018-06-01 00:00:00: Fight between belal muhammad and chance rencountre.
belal muhammad stats: Wins = 24, Losses = 3, Height = 5'10, Birth Date = jul 9, 1988.
chance rencountre stats: Wins = 16, Losses = 5, Height = 6'2, Birth Date = dec 31, 1986.
Outcome: belal muhammad won by decision (unanimous) in round 3 at 5:00.
Distance: 0.67 - Event on 2017-02-11 00:00:00: Fight between belal muhammad and randy brown.
belal muhammad stats: Wins = 24, Losses = 3, Height = 5'10, Birth Date = jul 9, 1988.
randy brown stats: Wins = 19, Losses = 6, Height = 6'3, Birth Date = jul 8, 1990.
Outcome: belal muhammad won by decision (unan

In [25]:
# Step 5: Integrate with an LLM for Generation
# Build a prompt from the query plus the retrieved documents, then call OpenAI's API to generate a prediction.

# Other queries used with this pipeline (swap one in to experiment):
#   "Predict the potential outcome for a fight between Jon Jones and Tom Aspinall."
#   "Predict the potential outcome for a fight between Belal Muhammad and Jack Delamadellena."
#   "research and tell me about Belal Muhammad in depth, including his five most recent fights, chronologically and general stats"
#   "Belal Muhammad vs Jack Delamadellena"
sample_query = "Jon Jones vs Tom Aspinall"

# The API key is taken from the OPENAI_API_KEY environment variable so it
# never has to be written into the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def generate_prediction(query, retrieved_docs):
    """Ask the chat model for a fight prediction and save it as a Markdown report.

    The retrieved documents are joined into one context string, embedded in
    the prompt together with ``query``, and sent through the notebook-level
    OpenAI ``client``. The reply is written to
    ``outputs/prediction_<query>.md`` and also returned.

    Args:
        query: The matchup / question to analyse. It is also used (with
            filename-unsafe characters replaced by ``_``) to name the report.
        retrieved_docs: Iterable of document strings used as context.

    Returns:
        The model's answer with surrounding whitespace stripped; an empty
        string if the response carried no text content.
    """
    # Combine the retrieved documents into a single context string.
    context = "\n\n".join(retrieved_docs)

    # Construct the prompt: it includes the context and the query.
    prompt = (
        f"Using the following historical fight data:{context}, given the potential matchup between '{query}', analyze the first fighter, then the 2nd fighters historical stats and recent performances in depth. Then, predict the winner of a potential matchup between them, including the method of victory and the round in which the fight might end and why you think that outcome is likely in detail."
    )

    # Call the OpenAI chat completions API.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a knowledgeable MMA fight analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    print(response)

    # `content` is optional on the response message; treat a missing value as
    # empty text instead of crashing on None.strip().
    answer = (response.choices[0].message.content or "").strip()

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Name the report after the `query` argument (not the notebook-global
    # `sample_query`, which may hold a different question). Characters that
    # are path separators or otherwise invalid in file names become "_";
    # ordinary queries keep exactly the same file name as before.
    unsafe_chars = '<>:"/\\|?*'
    safe_query = "".join(
        "_" if (ch in unsafe_chars or ord(ch) < 32) else ch for ch in query
    )
    os.makedirs("outputs", exist_ok=True)  # first run: the folder may not exist yet
    filename = f"outputs/prediction_{safe_query}.md"

    # Explicit UTF-8: model output routinely contains non-ASCII characters,
    # which the platform default encoding may not be able to write.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("# MMA Fight Prediction Report\n\n")
        f.write(f"**Date**: {timestamp.replace('_', ' ')}\n")
        f.write(f"**Model**: {response.model}\n")
        f.write(f"**Query**: {query}\n\n")
        f.write(f"## Prediction\n\n{answer}\n\n")
        f.write(f"## Raw Output\n\n{response}\n\n")
        f.write(f"---\n*Tokens Used: {response.usage.total_tokens}*")
    print(f"Saved prediction to {filename}")
    return answer

# Example generation:
# Retrieve context for the query defined in this cell first. Without this,
# `retrieved_docs` still holds the hits for whatever query the retrieval cell
# last ran, and the model would be given fight data unrelated to the question.
retrieved_docs, distances = retrieve_documents(sample_query, k=3)

prediction = generate_prediction(sample_query, retrieved_docs)
print("Prediction:", prediction)


ChatCompletion(id='chatcmpl-B4pM80RQ9DVgiQbQdZO4svaSJfoBU', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='To analyze a potential matchup between Jon Jones and Tom Aspinall, let\'s first examine their historical stats and recent performances in depth.\n\n### Jon Jones\n\n**Historical Stats:**\n- **Wins:** 26\n- **Losses:** 1 (DQ against Matt Hamill)\n- **Height:** 6\'4"\n- **Birth Date:** July 19, 1987\n\n**Recent Performance:**\nJon Jones is widely regarded as one of the greatest mixed martial artists of all time. He has an extensive and impressive resume, having competed against some of the best fighters in the light heavyweight and heavyweight divisions. His last fight was on March 4, 2023, against Ciryl Gane at UFC 285, where he won the heavyweight title by submission in the first round. This fight showcased his ability to adapt to a new weight class and his continued dominance in the sport.\n\n**Strengths:**\n- **Fight IQ:** Jo

In [8]:
# Save Model
# Persist everything needed to answer queries later without rebuilding.

# 1. Sentence-transformer model (written as a folder of model files)
model.save("mma_prediction_model")
print("Saved model to 'mma_prediction_model' folder")

# 2. FAISS index
faiss.write_index(index, "mma_faiss_index.index")
print("Saved FAISS index to 'mma_faiss_index.index'")

# 3 + 4. Document texts and their IDs, each pickled to its own file
pickle_jobs = (
    ("mma_documents.pkl", documents, "Saved documents to 'mma_documents.pkl'"),
    ("mma_document_ids.pkl", document_ids, "Saved document IDs to 'mma_document_ids.pkl'"),
)
for pickle_path, payload, done_message in pickle_jobs:
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)
    print(done_message)

Saved model to 'mma_prediction_model' folder
Saved FAISS index to 'mma_faiss_index.index'
Saved documents to 'mma_documents.pkl'
Saved document IDs to 'mma_document_ids.pkl'


In [None]:
### Reload Model and ask Question (if you don't want to rebuild) ###

# Restore the encoder and the FAISS index from the files written by the save cell.
model = SentenceTransformer('mma_prediction_model')
index = faiss.read_index("mma_faiss_index.index")

# Restore the document texts and their IDs.
# NOTE: unpickling runs code embedded in the file, so only load pickles you
# created yourself or otherwise trust.
with open("mma_documents.pkl", "rb") as documents_file:
    documents = pickle.load(documents_file)

with open("mma_document_ids.pkl", "rb") as document_ids_file:
    document_ids = pickle.load(document_ids_file)
    
    
# Define the retrieval function AFTER loading resources
def retrieve_documents(query, k=3):
    """Return the stored documents closest to ``query`` in embedding space.

    Uses the ``model``, ``index`` and ``documents`` loaded earlier in this cell.

    Args:
        query: Text to search for.
        k: Maximum number of documents to return.

    Returns:
        A ``(retrieved_docs, distances)`` pair: a list of document strings and
        a NumPy array with the distance FAISS reported for each of them, in
        the same order. Both are empty when the index holds no vectors.
    """
    # FAISS pads its result with label -1 when it cannot supply k neighbours;
    # documents[-1] would then silently return the last document. Cap k at
    # the index size to avoid asking for more than exists.
    k = min(k, index.ntotal)
    if k == 0:
        return [], np.empty(0, dtype='float32')

    query_embedding = model.encode([query], convert_to_tensor=False)
    query_embedding_np = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding_np, k)

    # Drop any padded (-1) slots so documents and distances stay aligned.
    found = indices[0] >= 0
    return [documents[i] for i in indices[0][found]], distances[0][found]


# Ask a question using the reloaded resources.
# NOTE: `generate_prediction` (and the OpenAI `client` it uses) are not defined
# in this cell; the cell that defines them must have been run in this kernel.
sample_query = "Predict the potential outcome for a fight between Belal Muhammad and Jack Delamadellena."

# Retrieval comes first so the prediction is grounded in documents for this query.
retrieved_docs, distances = retrieve_documents(query=sample_query, k=3)

# Generate the prediction from the query plus the retrieved context.
prediction = generate_prediction(sample_query, retrieved_docs)
print("Prediction:", prediction)