In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import matplotlib.pyplot as plt
import seaborn as sns


# Assuming the file is in the current directory or mounted to Google Drive
# If not, update the file path accordingly
file_path = 'ufc-masterr.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to verify the data
df.head()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,4:42,882.0,300.0,175.0,1800.0,2000.0,1100.0,150.0
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punch,3.0,1:36,696.0,250.0,,1800.0,,450.0,
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punches,3.0,1:57,717.0,-105.0,550.0,900.0,1800.0,225.0,1100.0
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,Punch,3.0,3:44,824.0,240.0,500.0,550.0,3000.0,110.0,800.0
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,5:00,900.0,450.0,150.0,2200.0,2200.0,450.0,200.0


In [None]:
# We agree to drop these columns.
columns_to_drop = ['Location', 'Country', 'FinishDetails', 'FinishRoundTime']
df.drop(columns=columns_to_drop, inplace=True)


In [None]:
missing_finish_count = df['Finish'].isnull().sum()
missing_finish_count

238

In [None]:
# Assuming your DataFrame is named 'df'
unique_finishes = df['Finish'].unique()

# Print the unique values
print(unique_finishes)

# Calculate the mode of the 'Finish' column
mode_finish = df['Finish'].mode()[0]
# Impute missing values with the mode
df['Finish'].fillna(mode_finish, inplace=True)

['KO/TKO' 'S-DEC' 'U-DEC' 'SUB' 'M-DEC' 'DQ' nan 'Overturned']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Finish'].fillna(mode_finish, inplace=True)


In [None]:
# Create a boolean mask for missing 'FinishRound' values
missing_finish_round = df['FinishRound'].isnull()

# Create a boolean mask for missing 'TotalFightTimeSecs' values
missing_total_fight_time_secs = df['TotalFightTimeSecs'].isnull()

# Compare the masks
are_same = (missing_finish_round == missing_total_fight_time_secs).all()

are_same

True

In [None]:
# 1. Impute 'FinishRound' with the mode (most common value)
mode_finish_round = df['FinishRound'].mode()[0]
df['FinishRound'].fillna(mode_finish_round, inplace=True)

# Ensure 'FinishRound' is integer
df['FinishRound'] = df['FinishRound'].astype(int)

# 2. Define a function to impute 'TotalFightTimeSecs' based on 'FinishRound'
def impute_total_fight_time(row):
    if pd.isna(row['TotalFightTimeSecs']):
        finish_round = row['FinishRound']
        # Each round is 5 minutes = 300 seconds
        # Minimum fight time: (FinishRound - 1)*300 + 60 (at least 1 minute into the final round)
        # Maximum fight time: FinishRound * 300 - 1 second
        min_time = (finish_round - 1) * 300 + 60
        max_time = finish_round * 300
        return np.random.randint(min_time, max_time)
    else:
        return row['TotalFightTimeSecs']

# 3. Apply the function to impute missing 'TotalFightTimeSecs'
df['TotalFightTimeSecs'] = df.apply(impute_total_fight_time, axis=1)

# 4. Verify that there are no remaining missing values
missing_after = df[['FinishRound', 'TotalFightTimeSecs']].isnull().sum()
print("Missing values after imputation:")
print(missing_after)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FinishRound'].fillna(mode_finish_round, inplace=True)


Missing values after imputation:
FinishRound           0
TotalFightTimeSecs    0
dtype: int64


In [None]:
missing_after = df.isnull().sum()
print("Missing Values After Imputation:\n", missing_after[missing_after > 0])



Missing Values After Imputation:
 RedOdds                   227
BlueOdds                  226
RedExpectedValue          227
BlueExpectedValue         226
BlueAvgSigStrLanded       930
BlueAvgSigStrPct          765
BlueAvgSubAtt             832
BlueAvgTDLanded           833
BlueAvgTDPct              842
BlueStance                  3
RedAvgSigStrLanded        455
RedAvgSigStrPct           357
RedAvgSubAtt              357
RedAvgTDLanded            357
RedAvgTDPct               367
EmptyArena               1499
BMatchWCRank             5339
RMatchWCRank             4760
RWFlyweightRank          6445
RWFeatherweightRank      6532
RWStrawweightRank        6395
RWBantamweightRank       6387
RHeavyweightRank         6355
RLightHeavyweightRank    6357
RMiddleweightRank        6359
RWelterweightRank        6349
RLightweightRank         6357
RFeatherweightRank       6364
RBantamweightRank        6360
RFlyweightRank           6352
RPFPRank                 6288
BWFlyweightRank          6468
BWFeat

In [None]:
# =============================
# STEP 3.4: CREATE A SINGLE RANK CATEGORY PER FIGHTER CORNER
# =============================

# 3.4.1 Identify Red rank columns and Blue rank columns
# 3.4.1 Identify Red rank columns and Blue rank columns
red_rank_cols = [c for c in df.columns if c.startswith('R') and c.endswith('Rank') and 'weight' in c.lower() and c != 'BetterRank']
blue_rank_cols = [c for c in df.columns if c.startswith('B') and c.endswith('Rank') and 'weight' in c.lower() and  c != 'BetterRank']

def get_rank_label(row, rank_cols):
    """
    Given a row and a list of rank columns (e.g. for Red or Blue),
    return a label: high level, good level, or okay fighter.
    """
    # Gather all rank values
    ranks = row[rank_cols].dropna()
    if len(ranks) == 0:
        # No rank at all => okay fighter
        return "okay fighter"

    min_rank = ranks.min()
    if min_rank <= 5:
        return "high level"
    elif min_rank <= 15:
        return "good level"
    else:
        return "okay fighter"

# Create new columns:
df['RedRankCategory'] = df.apply(lambda x: get_rank_label(x, red_rank_cols), axis=1)
df['BlueRankCategory'] = df.apply(lambda x: get_rank_label(x, blue_rank_cols), axis=1)

# Now drop the old rank columns
df.drop(columns=red_rank_cols + blue_rank_cols, inplace=True)

print("Shape after consolidating rank columns:", df.shape)
df[['RedRankCategory','BlueRankCategory']].head(10)


Shape after consolidating rank columns: (6541, 92)


Unnamed: 0,RedRankCategory,BlueRankCategory
0,good level,good level
1,okay fighter,okay fighter
2,good level,good level
3,okay fighter,okay fighter
4,okay fighter,okay fighter
5,okay fighter,okay fighter
6,okay fighter,okay fighter
7,okay fighter,okay fighter
8,okay fighter,okay fighter
9,okay fighter,okay fighter


In [None]:
# Filter rows where either 'BMatchWCRank' or 'RMatchWCRank' is not null
filtered_df = df[df['BMatchWCRank'].notnull() | df['RMatchWCRank'].notnull()]

# Select only fighter-related columns and 'BMatchWCRank', 'RMatchWCRank'
fighter_cols = [col for col in df.columns if col.startswith(('R', 'B'))]
selected_cols = fighter_cols + ['BMatchWCRank', 'RMatchWCRank']
filtered_df = filtered_df[selected_cols]

# Display the filtered DataFrame
filtered_df.head()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,BlueCurrentLoseStreak,BlueCurrentWinStreak,BlueDraws,BlueAvgSigStrLanded,...,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds,RedRankCategory,BlueRankCategory,BMatchWCRank,RMatchWCRank
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,0,5,0,4.13,...,300.0,175.0,1800.0,2000.0,1100.0,150.0,good level,good level,9.0,6.0
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,0,4,0,3.32,...,-105.0,550.0,900.0,1800.0,225.0,1100.0,good level,good level,12.0,9.0
13,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0,215.0,0,0,0,0.0,...,300.0,800.0,150.0,2500.0,400.0,350.0,high level,okay fighter,,0.0
14,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.619,295.0,0,8,0,5.5,...,250.0,650.0,180.0,3000.0,240.0,700.0,high level,good level,7.0,3.0
15,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,0,4,0,5.13,...,-160.0,450.0,1100.0,3000.0,350.0,1100.0,high level,high level,3.0,2.0


In [None]:

# 1. Drop the EmptyArena column
if 'EmptyArena' in df.columns:
    df.drop(columns=['EmptyArena'], inplace=True)
    print("Dropped 'EmptyArena' column.")

# 2. Impute BlueStance with its mode (most frequent value)
if 'BlueStance' in df.columns:
    mode_blue_stance = df['BlueStance'].mode()[0]
    df['BlueStance'] = df['BlueStance'].fillna(mode_blue_stance)
    print("Imputed missing BlueStance with mode:", mode_blue_stance)

# 3. Impute 'Avg' columns and 'Odds' columns per fighter using group means.
#    Define the lists of columns for red and blue fighters.

# Average performance columns for red and blue fighters:
red_avg_cols = ['RedAvgSigStrLanded', 'RedAvgSigStrPct', 'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct']
blue_avg_cols = ['BlueAvgSigStrLanded', 'BlueAvgSigStrPct', 'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueAvgTDPct']

# Odds columns for red and blue fighters:
red_odds_cols = ['RedOdds', 'RedExpectedValue', 'RedDecOdds', 'RSubOdds', 'RKOOdds']
blue_odds_cols = ['BlueOdds', 'BlueExpectedValue', 'BlueDecOdds', 'BSubOdds', 'BKOOdds']

# Function to impute missing values per fighter group using the mean for that fighter.
def impute_by_fighter(df, fighter_col, cols_to_impute):
    for col in cols_to_impute:
        if col in df.columns:
            # Group by the fighter (e.g. RedFighter or BlueFighter) and fill missing with that fighter's mean.
            df[col] = df.groupby(fighter_col)[col].transform(lambda x: x.fillna(x.mean()))
    return df

# Impute for red fighter columns:
df = impute_by_fighter(df, 'RedFighter', red_avg_cols + red_odds_cols)

# Impute for blue fighter columns:
df = impute_by_fighter(df, 'BlueFighter', blue_avg_cols + blue_odds_cols)

# Optional: Print the number of missing values in these columns to check the imputation
missing_after = df[red_avg_cols + blue_avg_cols + red_odds_cols + blue_odds_cols].isnull().sum()
print("Missing values after imputation:")
print(missing_after)


Dropped 'EmptyArena' column.
Imputed missing BlueStance with mode: Orthodox
Missing values after imputation:
RedAvgSigStrLanded      95
RedAvgSigStrPct         87
RedAvgSubAtt            87
RedAvgTDLanded          87
RedAvgTDPct             87
BlueAvgSigStrLanded    149
BlueAvgSigStrPct       129
BlueAvgSubAtt          138
BlueAvgTDLanded        138
BlueAvgTDPct           139
RedOdds                 37
RedExpectedValue        37
RedDecOdds             333
RSubOdds               421
RKOOdds                420
BlueOdds                36
BlueExpectedValue       36
BlueDecOdds            384
BSubOdds               498
BKOOdds                498
dtype: int64


In [None]:
# List all columns we want to impute overall (after group-based imputation)
cols_to_impute = red_avg_cols + blue_avg_cols + red_odds_cols + blue_odds_cols

# For each column, fill missing values with the overall mean of that column
for col in cols_to_impute:
    overall_mean = df[col].mean()
    df[col] = df[col].fillna(overall_mean)
    print(f"Imputed remaining missing values in {col} with overall mean: {overall_mean:.2f}")

# Verify that missing values are handled:
missing_after_overall = df[cols_to_impute].isnull().sum()
print("Missing values after overall imputation:")
print(missing_after_overall)

Imputed remaining missing values in RedAvgSigStrLanded with overall mean: 21.27
Imputed remaining missing values in RedAvgSigStrPct with overall mean: 0.46
Imputed remaining missing values in RedAvgSubAtt with overall mean: 0.54
Imputed remaining missing values in RedAvgTDLanded with overall mean: 1.40
Imputed remaining missing values in RedAvgTDPct with overall mean: 0.34
Imputed remaining missing values in BlueAvgSigStrLanded with overall mean: 20.30
Imputed remaining missing values in BlueAvgSigStrPct with overall mean: 0.45
Imputed remaining missing values in BlueAvgSubAtt with overall mean: 0.49
Imputed remaining missing values in BlueAvgTDLanded with overall mean: 1.30
Imputed remaining missing values in BlueAvgTDPct with overall mean: 0.32
Imputed remaining missing values in RedOdds with overall mean: -117.34
Imputed remaining missing values in RedExpectedValue with overall mean: 96.30
Imputed remaining missing values in RedDecOdds with overall mean: 309.09
Imputed remaining mis

In [None]:
missing_after = df.isnull().sum()
print("Missing Values After Imputation:\n", missing_after[missing_after > 0])



Missing Values After Imputation:
 BMatchWCRank    5339
RMatchWCRank    4760
RPFPRank        6288
BPFPRank        6474
dtype: int64


In [None]:

# List of ranking columns to handle
ranking_cols = ['BMatchWCRank', 'RMatchWCRank', 'RPFPRank', 'BPFPRank']

for col in ranking_cols:
    # Compute the maximum observed value (ignoring missing values)
    max_rank = df[col].max(skipna=True)
    # Impute missing values with a constant worse than any observed rank.
    # Here, we use max observed + 1. Alternatively, you can use a fixed value like 999 if appropriate.
    impute_value = max_rank + 1
    df[col] = df[col].fillna(impute_value)
    print(f"For column {col}, imputed missing values with {impute_value}")

# Verify that there are no missing values in these columns now
print("Missing values after ranking imputation:")
print(df[ranking_cols].isnull().sum())


For column BMatchWCRank, imputed missing values with 16.0
For column RMatchWCRank, imputed missing values with 16.0
For column RPFPRank, imputed missing values with 16.0
For column BPFPRank, imputed missing values with 16.0
Missing values after ranking imputation:
BMatchWCRank    0
RMatchWCRank    0
RPFPRank        0
BPFPRank        0
dtype: int64


In [None]:
# Filter the DataFrame for rows where either RedFighter or BlueFighter is 'Conor McGregor'
mcgregor_rows = df[(df['RedFighter'] == 'Conor McGregor') | (df['BlueFighter'] == 'Conor McGregor')]

# Select the desired columns
selected_columns = ['RedFighter', 'BlueFighter', 'BlueTotalRoundsFought', 'RedTotalRoundsFought', 'BlueAvgSigStrLanded', 'RedAvgSigStrLanded']
mcgregor_data = mcgregor_rows[selected_columns]

# Display the last three rows using tail()
last_three_rows = mcgregor_data.tail(5)
print(last_three_rows)

          RedFighter     BlueFighter  BlueTotalRoundsFought  \
4811  Conor McGregor    Dennis Siver                     42   
4957  Dustin Poirier  Conor McGregor                      5   
5048  Conor McGregor   Diego Brandao                     12   
5490  Conor McGregor    Max Holloway                     12   
5622  Marcus Brimage  Conor McGregor                      0   

      RedTotalRoundsFought  BlueAvgSigStrLanded  RedAvgSigStrLanded  
4811                     6             39.55560             25.2500  
4957                    23             30.66670             47.4000  
5048                     4             23.66670             37.0000  
5490                     1             70.60000             21.0000  
5622                     9             30.06355             74.3333  


In [None]:
df.columns.to_list()

['RedFighter',
 'BlueFighter',
 'RedOdds',
 'BlueOdds',
 'RedExpectedValue',
 'BlueExpectedValue',
 'Date',
 'Winner',
 'TitleBout',
 'WeightClass',
 'Gender',
 'NumberOfRounds',
 'BlueCurrentLoseStreak',
 'BlueCurrentWinStreak',
 'BlueDraws',
 'BlueAvgSigStrLanded',
 'BlueAvgSigStrPct',
 'BlueAvgSubAtt',
 'BlueAvgTDLanded',
 'BlueAvgTDPct',
 'BlueLongestWinStreak',
 'BlueLosses',
 'BlueTotalRoundsFought',
 'BlueTotalTitleBouts',
 'BlueWinsByDecisionMajority',
 'BlueWinsByDecisionSplit',
 'BlueWinsByDecisionUnanimous',
 'BlueWinsByKO',
 'BlueWinsBySubmission',
 'BlueWinsByTKODoctorStoppage',
 'BlueWins',
 'BlueStance',
 'BlueHeightCms',
 'BlueReachCms',
 'BlueWeightLbs',
 'RedCurrentLoseStreak',
 'RedCurrentWinStreak',
 'RedDraws',
 'RedAvgSigStrLanded',
 'RedAvgSigStrPct',
 'RedAvgSubAtt',
 'RedAvgTDLanded',
 'RedAvgTDPct',
 'RedLongestWinStreak',
 'RedLosses',
 'RedTotalRoundsFought',
 'RedTotalTitleBouts',
 'RedWinsByDecisionMajority',
 'RedWinsByDecisionSplit',
 'RedWinsByDecisionU

**Assigning ID's To Fighters**

In [None]:
# 1) Gather all unique names from both Red and Blue columns.
all_fighters = pd.concat([df["RedFighter"], df["BlueFighter"]]).unique()

print(f"Total unique fighter names: {len(all_fighters)}")

Total unique fighter names: 2113


In [None]:
fighter_to_id = {name: idx for idx, name in enumerate(all_fighters)}

# Example:
# {
#   'Conor McGregor': 0,
#   'Khabib Nurmagomedov': 1,
#   ...
# }


In [None]:
df["RedFighterID"] = df["RedFighter"].map(fighter_to_id)
df["BlueFighterID"] = df["BlueFighter"].map(fighter_to_id)

df.head(10)


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Winner,TitleBout,WeightClass,...,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds,RedRankCategory,BlueRankCategory,RedFighterID,BlueFighterID
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,Blue,False,Welterweight,...,300.0,175.0,1800.0,2000.0,1100.0,150.0,good level,good level,0,281
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,Red,False,Featherweight,...,250.0,288.75,1800.0,818.75,450.0,410.0,okay fighter,okay fighter,1,327
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,Red,False,Flyweight,...,-105.0,550.0,900.0,1800.0,225.0,1100.0,good level,good level,2,610
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,Blue,False,Light Heavyweight,...,240.0,500.0,550.0,3000.0,110.0,800.0,okay fighter,okay fighter,3,456
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,Blue,False,Bantamweight,...,450.0,150.0,2200.0,2200.0,450.0,200.0,okay fighter,okay fighter,4,1662
5,Navajo Stirling,Tuco Tokkos,-750.0,525.0,13.3333,525.0,2024-12-14,Red,False,Light Heavyweight,...,275.0,1000.0,1200.0,2500.0,-185.0,1200.0,okay fighter,okay fighter,5,1663
6,Michael Johnson,Ottman Azaitar,-230.0,190.0,43.4783,190.0,2024-12-14,Red,False,Lightweight,...,300.0,650.0,1000.0,2500.0,120.0,350.0,okay fighter,okay fighter,6,468
7,Joel Alvarez,Drakkar Klose,-425.0,330.0,23.5294,330.0,2024-12-14,Red,False,Lightweight,...,240.0,650.0,165.0,3000.0,275.0,900.0,okay fighter,okay fighter,7,396
8,Sean Woodson,Fernando Padilla,-155.0,130.0,64.5161,130.0,2024-12-14,Red,False,Featherweight,...,100.0,350.0,2000.0,650.0,550.0,500.0,okay fighter,okay fighter,8,328
9,Miles Johns,Felipe Lima,210.0,-258.0,210.0,38.7597,2024-12-14,Blue,False,Featherweight,...,350.0,-110.0,2000.0,650.0,700.0,500.0,okay fighter,okay fighter,9,1664


In [None]:
df['Finish'].value_counts()

Unnamed: 0_level_0,count
Finish,Unnamed: 1_level_1
U-DEC,2647
KO/TKO,2016
SUB,1157
S-DEC,655
M-DEC,46
DQ,18
Overturned,2


**Aggregations**

In [None]:
df = df.sort_values(by='Date').reset_index(drop=True)
df['FightID'] = df.index  # or some other unique identifier

df.head()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Winner,TitleBout,WeightClass,...,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds,RedRankCategory,BlueRankCategory,RedFighterID,BlueFighterID,FightID
0,Eric Schafer,Jason Brilz,140.0,-160.0,140.0,62.5,2010-03-21,Blue,False,Light Heavyweight,...,429.849839,882.323429,1105.113259,518.980801,644.79167,okay fighter,okay fighter,1661,1652,0
1,Brandon Vera,Jon Jones,215.0,-235.0,215.0,42.5532,2010-03-21,Blue,False,Light Heavyweight,...,429.849839,882.323429,1105.113259,518.980801,644.79167,okay fighter,okay fighter,1584,40,1
2,Junior Dos Santos,Gabriel Gonzaga,-250.0,230.0,40.0,230.0,2010-03-21,Red,False,Heavyweight,...,851.666667,1536.111111,508.333333,64.222222,563.333333,okay fighter,okay fighter,783,1238,2
3,Cheick Kongo,Paul Buentello,-345.0,315.0,28.9855,315.0,2010-03-21,Red,False,Heavyweight,...,429.849839,1500.0,1105.113259,205.0,644.79167,okay fighter,okay fighter,1533,2110,3
4,Alessio Sakara,James Irvin,-120.0,100.0,83.3333,100.0,2010-03-21,Red,False,Middleweight,...,429.849839,465.0,1105.113259,170.0,644.79167,okay fighter,okay fighter,1446,1643,4


In [None]:
import pandas as pd

# ---------------------
# Red corner subset
# ---------------------
red_df = df[[
    'FightID', 'Date', 'RedFighterID',
    'RedAvgSigStrLanded', 'RedAvgSigStrPct',
    'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct',
    'FinishRound'  # if you want to do cumulative finish round as well
]].copy()

red_df.rename(columns={
    'RedFighterID': 'FighterID',
    'RedAvgSigStrLanded': 'AvgSigStrLanded',
    'RedAvgSigStrPct':    'AvgSigStrPct',
    'RedAvgSubAtt':       'AvgSubAtt',
    'RedAvgTDLanded':     'AvgTDLanded',
    'RedAvgTDPct':        'AvgTDPct'
}, inplace=True)

# ---------------------
# Blue corner subset
# ---------------------
blue_df = df[[
    'FightID', 'Date', 'BlueFighterID',
    'BlueAvgSigStrLanded', 'BlueAvgSigStrPct',
    'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueAvgTDPct',
    'FinishRound'
]].copy()

blue_df.rename(columns={
    'BlueFighterID':       'FighterID',
    'BlueAvgSigStrLanded': 'AvgSigStrLanded',
    'BlueAvgSigStrPct':    'AvgSigStrPct',
    'BlueAvgSubAtt':       'AvgSubAtt',
    'BlueAvgTDLanded':     'AvgTDLanded',
    'BlueAvgTDPct':        'AvgTDPct'
}, inplace=True)

# ---------------------
# Concatenate (long_df)
# ---------------------
fighter_long = pd.concat([red_df, blue_df], axis=0, ignore_index=True)

# Sort again by Date + FighterID (just to keep a consistent order)
#fighter_long.sort_values(by=['FighterID','Date'], inplace=True)

fighter_long.head(10)


Unnamed: 0,FightID,Date,FighterID,AvgSigStrLanded,AvgSigStrPct,AvgSubAtt,AvgTDLanded,AvgTDPct,FinishRound
0,0,2010-03-21,1661,15.6667,0.588,1.3333,0.8333,0.145,3
1,1,2010-03-21,1584,30.7273,0.573,0.0909,0.5455,0.364,1
2,2,2010-03-21,783,25.75,0.538,0.0,0.0,0.0,1
3,3,2010-03-21,1533,32.9091,0.604,0.2727,1.1818,0.476,3
4,4,2010-03-21,1446,20.8,0.455,0.0,1.0,0.15,1
5,5,2010-03-21,1480,40.5,0.405,0.0,3.5,0.52,3
6,6,2010-03-21,1660,27.0,0.343,0.6667,0.3333,0.333,3
7,7,2010-03-21,1562,13.6667,0.577,0.0,0.0,0.0,1
8,8,2010-03-21,1345,18.0,0.55,1.0,4.6667,0.79,1
9,9,2010-03-21,1457,12.0,0.25,0.0,0.0,0.0,1


In [None]:
# We'll define a helper function that uses expanding().mean().shift(1)
# to create a "HistoricalAvgSigStrLanded" column for each fighter.

def create_historical_column(df, col_name):
    """
    For the given col_name (e.g. 'AvgSigStrLanded'),
    compute the cumulative average for each fighter,
    shifted by 1 so the current row's fight is excluded.
    """
    # Group by fighter, then apply expanding mean, then shift(1)
    df[f'Historical_{col_name}'] = (
        df.groupby('FighterID')[col_name]
          .apply(lambda x: x.expanding().mean().shift(1))
          .reset_index(level=0, drop=True)  # Reset index to align with original DataFrame
    )
    return df

# Let's do it for each stat you care about:
stats_to_lag = [
    'AvgSigStrLanded',
    'AvgSigStrPct',
    'AvgSubAtt',
    'AvgTDLanded',
    'AvgTDPct',
    'FinishRound'  # if you want to get average finish round from past fights
]

for stat_col in stats_to_lag:
    fighter_long = create_historical_column(fighter_long, stat_col)

fighter_long.head(15)



Unnamed: 0,FightID,Date,FighterID,AvgSigStrLanded,AvgSigStrPct,AvgSubAtt,AvgTDLanded,AvgTDPct,FinishRound,Historical_AvgSigStrLanded,Historical_AvgSigStrPct,Historical_AvgSubAtt,Historical_AvgTDLanded,Historical_AvgTDPct,Historical_FinishRound
0,0,2010-03-21,1661,15.6667,0.588,1.3333,0.8333,0.145,3,,,,,,
1,1,2010-03-21,1584,30.7273,0.573,0.0909,0.5455,0.364,1,,,,,,
2,2,2010-03-21,783,25.75,0.538,0.0,0.0,0.0,1,,,,,,
3,3,2010-03-21,1533,32.9091,0.604,0.2727,1.1818,0.476,3,,,,,,
4,4,2010-03-21,1446,20.8,0.455,0.0,1.0,0.15,1,,,,,,
5,5,2010-03-21,1480,40.5,0.405,0.0,3.5,0.52,3,,,,,,
6,6,2010-03-21,1660,27.0,0.343,0.6667,0.3333,0.333,3,,,,,,
7,7,2010-03-21,1562,13.6667,0.577,0.0,0.0,0.0,1,,,,,,
8,8,2010-03-21,1345,18.0,0.55,1.0,4.6667,0.79,1,,,,,,
9,9,2010-03-21,1457,12.0,0.25,0.0,0.0,0.0,1,,,,,,


In [None]:
# 1) For red corner, we select columns from fighter_long we want
red_merge_cols = [
    'FightID', 'FighterID',
    'Historical_AvgSigStrLanded',
    'Historical_AvgSigStrPct',
    'Historical_AvgSubAtt',
    'Historical_AvgTDLanded',
    'Historical_AvgTDPct',
    'Historical_FinishRound'
    # etc.
]

red_historical = fighter_long[red_merge_cols].copy()

df = pd.merge(
    df,
    red_historical,
    how='left',
    left_on=['FightID','RedFighterID'],
    right_on=['FightID','FighterID']
)

# rename
df.rename(columns={
    'Historical_AvgSigStrLanded': 'RedHistorical_AvgSigStrLanded',
    'Historical_AvgSigStrPct':    'RedHistorical_AvgSigStrPct',
    'Historical_AvgSubAtt':       'RedHistorical_AvgSubAtt',
    'Historical_AvgTDLanded':     'RedHistorical_AvgTDLanded',
    'Historical_AvgTDPct':        'RedHistorical_AvgTDPct',
    'Historical_FinishRound':     'RedHistorical_FinishRound'
}, inplace=True)

# drop the right-merge FighterID
df.drop(columns='FighterID', inplace=True)

# 2) Do the same for the blue corner
blue_merge_cols = red_merge_cols  # same structure
blue_historical = fighter_long[blue_merge_cols].copy()

df = pd.merge(
    df,
    blue_historical,
    how='left',
    left_on=['FightID','BlueFighterID'],
    right_on=['FightID','FighterID']
)

df.rename(columns={
    'Historical_AvgSigStrLanded': 'BlueHistorical_AvgSigStrLanded',
    'Historical_AvgSigStrPct':    'BlueHistorical_AvgSigStrPct',
    'Historical_AvgSubAtt':       'BlueHistorical_AvgSubAtt',
    'Historical_AvgTDLanded':     'BlueHistorical_AvgTDLanded',
    'Historical_AvgTDPct':        'BlueHistorical_AvgTDPct',
    'Historical_FinishRound':     'BlueHistorical_FinishRound'
}, inplace=True)

df.drop(columns='FighterID', inplace=True, errors='ignore')



In [None]:
bo_nickal_id = fighter_to_id.get('Bo Nickal')

if bo_nickal_id is not None:
    print(f"Bo Nickal ID: {bo_nickal_id}")


    bo_nickal_rows = fighter_long[fighter_long['FighterID'] == bo_nickal_id]


    historical_cols = [col for col in bo_nickal_rows.columns if col.startswith('Historical_')]


    print(bo_nickal_rows[['FightID','FighterID'] + historical_cols])

else:
    print("no")

Bo Nickal ID: 42
      FightID  FighterID  Historical_AvgSigStrLanded  Historical_AvgSigStrPct  \
5598     5598         42                         NaN                      NaN   
5784     5784         42                        1.64                     0.62   
6179     6179         42                        1.64                     0.62   
6491     6491         42                        1.64                     0.62   

      Historical_AvgSubAtt  Historical_AvgTDLanded  Historical_AvgTDPct  \
5598                   NaN                     NaN                  NaN   
5784                   7.5                    7.46                  0.5   
6179                   7.5                    7.46                  0.5   
6491                   7.5                    7.46                  0.5   

      Historical_FinishRound  
5598                     NaN  
5784                1.000000  
6179                1.000000  
6491                1.333333  


In [None]:
historical_cols = [col for col in df.columns if col.startswith(('RedHistorical_', 'BlueHistorical_'))]
for col in historical_cols:
   df[col] = df[col].fillna(0)

**ELO**

In [None]:
from collections import defaultdict, deque

def get_finish_points(finish):
    """
    Returns (winner_base_pts, loser_base_pts)
    based on finish method.
    """
    if finish in ['DQ', 'Overturned', None]:
        # No change if it's a DQ or Overturned (custom choice)
        return (0.5, 0.5)

    finish = finish.upper()
    if finish in ['U-DEC', 'M-DEC']:
        return (5.0, -3.0)
    elif finish == 'S-DEC':
        return (3.0, -2.0)
    elif finish in ['KO/TKO', 'SUB']:
        return (6.5, -4.5)
    else:
        # Unrecognized finish, no change
        return (0, 0)

def get_winner_rank_bonus(opponent_rank):
    """
    Additional points for the winner based on the opponent's rank category.
    """
    if opponent_rank == 'okay fighter':
        return 1.0
    elif opponent_rank == 'good level':
        return 3
    elif opponent_rank == 'high level':
        return 6.0
    return 0.0

def get_loser_rank_penalty(my_rank, opponent_rank):
    """
    Additional penalty for the loser based on rank relationship.
    - Opponent > me => 0
    - Opponent == me => -1
    - Opponent < me => -2.5
    """
    rank_map = {
        'okay fighter': 1,
        'good level': 2,
        'high level': 3
    }
    my_val  = rank_map.get(my_rank, 1)
    opp_val = rank_map.get(opponent_rank, 1)

    if opp_val > my_val:
        return -0.5  # lost to higher rank => no extra penalty
    elif opp_val == my_val:
        return -2.0
    else:
        # opp_val < my_val
        return -3.0


def transitive_bonus(winner_id, loser_id, fighter_recent_wins, fighter_recent_losses):
    """
    +1.5 if there's at least one fighter that the winner has beaten (in last 5 fights)
    who also has beaten the loser (in last 5 fights).
    That means:
        intersection( winner's recent_wins, loser's recent_losses ) != empty
    """
    w_recent_wins = set(fighter_recent_wins[winner_id])
    l_recent_losses = set(fighter_recent_losses[loser_id])
    intersection = w_recent_wins.intersection(l_recent_losses)
    if len(intersection) > 0:
        return 1.5
    return 0.0




In [None]:
import pandas as pd
from collections import defaultdict, deque

# 1) Sort the DataFrame by Date
#df = df.sort_values('Date').reset_index(drop=True)

# 2) Initialize
BASE_ELO = 10.0

# For storing the last 5 fight deltas
fighter_deltas = defaultdict(lambda: deque(maxlen=5))
# For storing the last 5 opponents each fighter has beaten or lost to
fighter_recent_wins = defaultdict(lambda: deque(maxlen=5))
fighter_recent_losses = defaultdict(lambda: deque(maxlen=5))

# We'll store pre-fight ELO in these columns
df['RedElo'] = 0.0
df['BlueElo'] = 0.0

# 3) Iterate fights in chronological order
for idx, row in df.iterrows():
    red_id = row['RedFighterID']
    blue_id = row['BlueFighterID']
    red_rank = row['RedRankCategory']
    blue_rank = row['BlueRankCategory']
    finish   = row['Finish']
    winner   = row['Winner']  # 'Red','Blue', or something else

    # PRE-FIGHT ELOs (sum of last 5 deltas + baseline)
    red_pre_elo  = BASE_ELO + sum(fighter_deltas[red_id])
    blue_pre_elo = BASE_ELO + sum(fighter_deltas[blue_id])

    # Store them
    df.at[idx, 'RedElo']  = red_pre_elo
    df.at[idx, 'BlueElo'] = blue_pre_elo

    # If no clear winner, skip ELO updates
    if winner not in ['Red','Blue']:
        continue

    # Identify winner & loser
    if winner == 'Red':
        w_id, w_rank = red_id, red_rank
        l_id, l_rank = blue_id, blue_rank
    else:  # winner == 'Blue'
        w_id, w_rank = blue_id, blue_rank
        l_id, l_rank = red_id, red_rank

    # Base points from finish method
    w_pts, l_pts = get_finish_points(finish)  # w_pts > 0, l_pts < 0

    # Winner rank bonus
    # Opponent's rank is the loser's rank
    winner_rank_bonus = get_winner_rank_bonus(l_rank)

    # Loser rank penalty
    # from the loser's perspective, my_rank = l_rank, opp_rank = w_rank
    loser_rank_penalty = get_loser_rank_penalty(l_rank, w_rank)

    # Transitive bonus
    # only consider last 5 fights
    trans_bonus = transitive_bonus(w_id, l_id, fighter_recent_wins, fighter_recent_losses)

    # Sum up final deltas
    winner_delta = w_pts + winner_rank_bonus + trans_bonus   # typically positive
    loser_delta  = l_pts + loser_rank_penalty                # typically negative

    # Update the fighter's deque with new deltas
    fighter_deltas[w_id].append(winner_delta)
    fighter_deltas[l_id].append(loser_delta)

    # Update recent wins/losses
    fighter_recent_wins[w_id].append(l_id)
    fighter_recent_losses[l_id].append(w_id)



In [None]:
fighters = [
    "Dricus Du Plessis",
    "Sean Strickland",
    "Nassourdine Imavov",
    "Khamzat Chimaev",
    "Israel Adesanya",
    "Robert Whittaker",
    "Caio Borralho",
    "Jared Cannonier",
    "Marvin Vettori",
    "Brendan Allen",
    "Roman Dolidze",
    "Paulo Costa",
    "Anthony Hernandez",
    "Michel Pereira",
    "Roman Kopylov",
    "Bo Nickal",
]

# Create a dictionary to store the most recent Elo points for each fighter
most_recent_elo = {}

# Iterate through the fighters
for fighter in fighters:
    # Filter the DataFrame for fights involving the current fighter
    fighter_fights = df[(df['RedFighter'] == fighter) | (df['BlueFighter'] == fighter)]

    # Get the most recent fight (last row)
    most_recent_fight = fighter_fights.iloc[-1]

    # Extract the Elo point for the fighter from the most recent fight
    if most_recent_fight['RedFighter'] == fighter:
        elo_point = most_recent_fight['RedElo']
    else:
        elo_point = most_recent_fight['BlueElo']

    # Store the Elo point in the dictionary
    most_recent_elo[fighter] = elo_point

# Print the most recent Elo points for each fighter
for fighter, elo in most_recent_elo.items():
    print(f"{fighter}: {elo:.2f}")

Dricus Du Plessis: 61.50
Sean Strickland: 30.00
Nassourdine Imavov: 41.00
Khamzat Chimaev: 56.50
Israel Adesanya: 34.50
Robert Whittaker: 26.50
Caio Borralho: 45.00
Jared Cannonier: 28.50
Marvin Vettori: 27.00
Brendan Allen: 50.00
Roman Dolidze: 26.50
Paulo Costa: 13.50
Anthony Hernandez: 46.00
Michel Pereira: 44.50
Roman Kopylov: 35.00
Bo Nickal: 32.50


In [None]:
fighters = [
    "Islam Makhachev",
    "Arman Tsarukyan",
    "Charles Oliveira",
    "Justin Gaethje",
    "Dustin Poirier",
    "Max Holloway",
    "Dan Hooker",
    "Michael Chandler",
    "Mateusz Gamrot",
    "Beneil Dariush",
    "Renato Moicano",
    "Rafael Fiziev",
    "Paddy Pimblett",
    "Jalin Turner",
    "Benoit Saint Denis",
    "Grant Dawson"
]

# Create a dictionary to store the most recent Elo points for each fighter
most_recent_elo = {}

# Iterate through the fighters
for fighter in fighters:
    # Filter the DataFrame for fights involving the current fighter
    fighter_fights = df[(df['RedFighter'] == fighter) | (df['BlueFighter'] == fighter)]

    # Get the most recent fight (last row)
    if not fighter_fights.empty:
        most_recent_fight = fighter_fights.iloc[-1]

        # Extract the Elo point for the fighter from the most recent fight
        if most_recent_fight['RedFighter'] == fighter:
            elo_point = most_recent_fight['RedElo']
        else:
            elo_point = most_recent_fight['BlueElo']

        # Store the Elo point in the dictionary
        most_recent_elo[fighter] = elo_point
    else:
        print(f"No fights found for {fighter}, Elo set to base: {BASE_ELO}")
        most_recent_elo[fighter] = BASE_ELO # handle new fighters or those with no data yet

# Print the most recent Elo points for each fighter
for fighter, elo in most_recent_elo.items():
    print(f"{fighter}: {elo:.2f}")

Islam Makhachev: 64.50
Arman Tsarukyan: 40.50
Charles Oliveira: 38.50
Justin Gaethje: 28.50
Dustin Poirier: 26.50
Max Holloway: 52.00
Dan Hooker: 18.00
Michael Chandler: 11.00
Mateusz Gamrot: 38.00
Beneil Dariush: 36.00
Renato Moicano: 40.50
Rafael Fiziev: 39.00
Paddy Pimblett: 44.50
Jalin Turner: 26.50
Benoit Saint Denis: 37.00
Grant Dawson: 33.00


In [None]:
fighters = [
    "Belal Muhammad",
    "Leon Edwards",
    "Shavkat Rakhmonov",
    "Kamaru Usman",
    "Jack Della Maddalena",
    "Sean Brady",
    "Joaquin Buckley",
    "Ian Machado Garry",
    "Gilbert Burns",
    "Colby Covington",
    "Geoff Neal",
    "Stephen Thompson",
    "Michael Morales",
    "Carlos Prates",
    "Vicente Luque",
    "Michael Page"
]

# Create a dictionary to store the most recent Elo points for each fighter
most_recent_elo = {}

# Iterate through the fighters
for fighter in fighters:
    # Filter the DataFrame for fights involving the current fighter
    fighter_fights = df[(df['RedFighter'] == fighter) | (df['BlueFighter'] == fighter)]

    # Get the most recent fight (last row) if available
    if not fighter_fights.empty:
        most_recent_fight = fighter_fights.iloc[-1]

        # Extract the Elo point for the fighter from the most recent fight
        if most_recent_fight['RedFighter'] == fighter:
            elo_point = most_recent_fight['RedElo']
        else:
            elo_point = most_recent_fight['BlueElo']

        # Store the Elo point in the dictionary
        most_recent_elo[fighter] = elo_point
    else:
        print(f"No fights found for {fighter}, Elo set to base: {BASE_ELO}")
        most_recent_elo[fighter] = BASE_ELO  # Handle new or fighters with no data

# Print the most recent Elo points for each fighter
for fighter, elo in most_recent_elo.items():
    print(f"{fighter}: {elo:.2f}")

Belal Muhammad: 62.00
Leon Edwards: 64.50
Shavkat Rakhmonov: 55.00
Kamaru Usman: 34.50
Jack Della Maddalena: 42.50
Sean Brady: 37.50
Joaquin Buckley: 48.50
Ian Machado Garry: 50.50
Gilbert Burns: 10.50
Colby Covington: 15.50
Geoff Neal: 11.50
Stephen Thompson: 7.50
Michael Morales: 37.00
Carlos Prates: 32.50
Vicente Luque: 12.00
Michael Page: 18.00


In [None]:
# Filter for Colby Covington's fights
covington_fights = df[(df['RedFighter'] == 'Colby Covington') | (df['BlueFighter'] == 'Colby Covington')]

# Get the last 6 fights
last_6_fights = covington_fights.tail(6)

# Select relevant columns and display
selected_columns = ['Date', 'RedFighter', 'BlueFighter', 'RedElo', 'BlueElo', 'Winner']
covington_fight_data = last_6_fights[selected_columns]

# Display the data
print(covington_fight_data)

            Date       RedFighter      BlueFighter  RedElo  BlueElo Winner
4047  2019-12-14     Kamaru Usman  Colby Covington    56.5     55.5    Red
4375  2020-09-19  Colby Covington    Tyron Woodley    43.0     33.5    Red
4943  2021-11-06     Kamaru Usman  Colby Covington    71.0     47.5    Red
5089  2022-03-05  Colby Covington   Jorge Masvidal    31.5     33.0    Red
6016  2023-12-16     Leon Edwards  Colby Covington    58.0     30.0    Red
6540  2024-12-14  Colby Covington  Joaquin Buckley    15.5     48.5   Blue


In [None]:
df.columns.to_list()

['RedFighter',
 'BlueFighter',
 'RedOdds',
 'BlueOdds',
 'RedExpectedValue',
 'BlueExpectedValue',
 'Date',
 'Winner',
 'TitleBout',
 'WeightClass',
 'Gender',
 'NumberOfRounds',
 'BlueCurrentLoseStreak',
 'BlueCurrentWinStreak',
 'BlueDraws',
 'BlueAvgSigStrLanded',
 'BlueAvgSigStrPct',
 'BlueAvgSubAtt',
 'BlueAvgTDLanded',
 'BlueAvgTDPct',
 'BlueLongestWinStreak',
 'BlueLosses',
 'BlueTotalRoundsFought',
 'BlueTotalTitleBouts',
 'BlueWinsByDecisionMajority',
 'BlueWinsByDecisionSplit',
 'BlueWinsByDecisionUnanimous',
 'BlueWinsByKO',
 'BlueWinsBySubmission',
 'BlueWinsByTKODoctorStoppage',
 'BlueWins',
 'BlueStance',
 'BlueHeightCms',
 'BlueReachCms',
 'BlueWeightLbs',
 'RedCurrentLoseStreak',
 'RedCurrentWinStreak',
 'RedDraws',
 'RedAvgSigStrLanded',
 'RedAvgSigStrPct',
 'RedAvgSubAtt',
 'RedAvgTDLanded',
 'RedAvgTDPct',
 'RedLongestWinStreak',
 'RedLosses',
 'RedTotalRoundsFought',
 'RedTotalTitleBouts',
 'RedWinsByDecisionMajority',
 'RedWinsByDecisionSplit',
 'RedWinsByDecisionU

In [None]:
# 1. Define columns to exclude to prevent data leakage
exclude_cols = ['Finish', 'FinishDetails', 'FinishRound', 'FinishRoundTime', 'TotalFightTimeSecs',
                'RedAvgSigStrLanded', 'RedAvgSigStrPct', 'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct',
                'BlueAvgSigStrLanded', 'BlueAvgSigStrPct', 'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueAvgTDPct',
                'Winner','Date','SigStrDif', 'AvgSubAttDif','AvgTDDif',]  # Exclude 'Winner' as we'll create a binary version

# 2. Create final_feature_cols using list comprehension
final_feature_cols = [col for col in df.columns if col not in exclude_cols]

# 3. Encode 'Winner' column as binary
if 'WinnerBinary' not in df.columns:
        df['WinnerBinary'] = df['Winner'].map({'Red': 0, 'Blue': 1})

target_col = 'WinnerBinary'

# 4. Create df_model
df_model = df[final_feature_cols + [target_col]].copy()

print("Columns in df_model:", df_model.columns.tolist())
df_model.head()


Columns in df_model: ['RedFighter', 'BlueFighter', 'RedOdds', 'BlueOdds', 'RedExpectedValue', 'BlueExpectedValue', 'TitleBout', 'WeightClass', 'Gender', 'NumberOfRounds', 'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws', 'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought', 'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority', 'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous', 'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage', 'BlueWins', 'BlueStance', 'BlueHeightCms', 'BlueReachCms', 'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak', 'RedDraws', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought', 'RedTotalTitleBouts', 'RedWinsByDecisionMajority', 'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO', 'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins', 'RedStance', 'RedHeightCms', 'RedReachCms', 'RedWeightLbs', 'RedAge', 'BlueAge', 'LoseStreakDif', 'WinStreakDif', 'LongestWinStreakDif', 'Win

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,TitleBout,WeightClass,Gender,NumberOfRounds,...,RedHistorical_FinishRound,BlueHistorical_AvgSigStrLanded,BlueHistorical_AvgSigStrPct,BlueHistorical_AvgSubAtt,BlueHistorical_AvgTDLanded,BlueHistorical_AvgTDPct,BlueHistorical_FinishRound,RedElo,BlueElo,WinnerBinary
0,Eric Schafer,Jason Brilz,140.0,-160.0,140.0,62.5,False,Light Heavyweight,MALE,3,...,0.0,41.25,0.465,0.75,2.0,0.5,3.0,10.0,10.0,1
1,Brandon Vera,Jon Jones,215.0,-235.0,215.0,42.5532,False,Light Heavyweight,MALE,3,...,0.0,35.509918,0.566,0.529229,2.033182,0.587765,3.529412,10.0,10.0,1
2,Junior Dos Santos,Gabriel Gonzaga,-250.0,230.0,40.0,230.0,False,Heavyweight,MALE,3,...,0.0,13.6992,0.542857,0.6118,1.095386,0.502571,1.714286,10.0,10.0,0
3,Cheick Kongo,Paul Buentello,-345.0,315.0,28.9855,315.0,False,Heavyweight,MALE,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0,0
4,Alessio Sakara,James Irvin,-120.0,100.0,83.3333,100.0,False,Middleweight,MALE,3,...,0.0,5.3333,0.534,0.1111,0.0,0.0,1.0,10.0,10.0,0


In [None]:
categorical_cols = ['RedFighter', 'BlueFighter', 'WeightClass', 'Gender',
    'BlueStance', 'RedStance', 'BetterRank', 'RedRankCategory', 'BlueRankCategory','TitleBout']

In [None]:
# Label Encoding for fighters
!pip install scikit-learn
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

le_red = LabelEncoder()
le_blue = LabelEncoder()
df_model['RedFighter'] = le_red.fit_transform(df_model['RedFighter'])
df_model['BlueFighter'] = le_blue.fit_transform(df_model['BlueFighter'])



In [None]:
rank_level_mapping = {'okay fighter': 0, 'good level': 1, 'high level': 2}
df_model['RedRankCategory'] = df_model['RedRankCategory'].map(rank_level_mapping)
df_model['BlueRankCategory'] = df_model['BlueRankCategory'].map(rank_level_mapping)

In [None]:
# One-Hot Encoding for other categorical variables
# Exclude rank levels as they've been ordinally encoded
one_hot_cols = [col for col in categorical_cols if col not in ['RedRankCategory', 'BlueRankCategory', 'RedFighter', 'BlueFighter']]

df_model = pd.get_dummies(df_model, columns=one_hot_cols, drop_first=True)

print("Categorical variables encoded. One-Hot Encoding applied to:", one_hot_cols)

Categorical variables encoded. One-Hot Encoding applied to: ['WeightClass', 'Gender', 'BlueStance', 'RedStance', 'BetterRank', 'TitleBout']


**TRYING BRO**

In [None]:
df_model.dtypes.tolist()

[dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype(

In [None]:
fighter_elo_dict = {}


In [None]:
df_model.columns.tolist()

['RedFighter',
 'BlueFighter',
 'RedOdds',
 'BlueOdds',
 'RedExpectedValue',
 'BlueExpectedValue',
 'NumberOfRounds',
 'BlueCurrentLoseStreak',
 'BlueCurrentWinStreak',
 'BlueDraws',
 'BlueLongestWinStreak',
 'BlueLosses',
 'BlueTotalRoundsFought',
 'BlueTotalTitleBouts',
 'BlueWinsByDecisionMajority',
 'BlueWinsByDecisionSplit',
 'BlueWinsByDecisionUnanimous',
 'BlueWinsByKO',
 'BlueWinsBySubmission',
 'BlueWinsByTKODoctorStoppage',
 'BlueWins',
 'BlueHeightCms',
 'BlueReachCms',
 'BlueWeightLbs',
 'RedCurrentLoseStreak',
 'RedCurrentWinStreak',
 'RedDraws',
 'RedLongestWinStreak',
 'RedLosses',
 'RedTotalRoundsFought',
 'RedTotalTitleBouts',
 'RedWinsByDecisionMajority',
 'RedWinsByDecisionSplit',
 'RedWinsByDecisionUnanimous',
 'RedWinsByKO',
 'RedWinsBySubmission',
 'RedWinsByTKODoctorStoppage',
 'RedWins',
 'RedHeightCms',
 'RedReachCms',
 'RedWeightLbs',
 'RedAge',
 'BlueAge',
 'LoseStreakDif',
 'WinStreakDif',
 'LongestWinStreakDif',
 'WinDif',
 'LossDif',
 'TotalRoundDif',
 'To

In [None]:
import random
# Set random seeds for reproducibility
random.seed(40)
np.random.seed(40)
torch.manual_seed(40)

<torch._C.Generator at 0x7d4105cd76f0>

In [None]:
target_col = 'WinnerBinary'
X = df_model.drop(columns=[target_col])
y = df_model[target_col].values

print("Feature shape:", X.shape, "Target shape:", y.shape)

Feature shape: (6541, 107) Target shape: (6541,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (5232, 107) Test shape: (1309, 107)


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Convert X to a numpy array before scaling
X_train_array = X_train.values.astype(float)
X_test_array  = X_test.values.astype(float)

# Fit on train, transform train & test
X_train_scaled = scaler.fit_transform(X_train_array)
X_test_scaled  = scaler.transform(X_test_array)

print("After scaling: ", X_train_scaled.shape, X_test_scaled.shape)



After scaling:  (5232, 107) (1309, 107)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert to torch tensors
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)  # classification => long
X_test_t  = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t  = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_t, y_train_t)
test_dataset  = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [None]:
class UFCNet(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2):
        super(UFCNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        return self.net(x)

# Create model
input_dim = X_train_scaled.shape[1]  # number of features
model = UFCNet(input_dim=input_dim, hidden_dim=64, output_dim=2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-2)



In [None]:
import os
# ============== TRAINING LOOP ==============
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # For train accuracy
    train_correct = 0
    train_total = 0

    # --- Training Phase ---
    for features_batch, labels_batch in train_loader:
        # Forward pass
        optimizer.zero_grad()
        outputs = model(features_batch)
        loss = criterion(outputs, labels_batch)

        # Backprop
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item() * features_batch.size(0)

        # Compute train accuracy
        _, predicted = torch.max(outputs, dim=1)
        train_correct += (predicted == labels_batch).sum().item()
        train_total   += labels_batch.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    train_acc = train_correct / train_total

    # --- Evaluation Phase ---
    model.eval()
    test_correct, test_total = 0, 0

    with torch.no_grad():
        for feat_test, lab_test in test_loader:
            out_test = model(feat_test)
            _, pred_test = torch.max(out_test, dim=1)
            test_correct += (pred_test == lab_test).sum().item()
            test_total   += lab_test.size(0)

    test_acc = test_correct / test_total

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Loss: {epoch_loss:.4f}, "
          f"Train Accuracy: {train_acc:.4f}, "
          f"Test Accuracy: {test_acc:.4f}")


# Create the directory if it doesn't exist
os.makedirs('ufc_model', exist_ok=True)

# Save the model
torch.save(model.state_dict(), 'ufc_model/model.pth')

Epoch [1/50], Loss: 0.6136, Train Accuracy: 0.6569, Test Accuracy: 0.6860
Epoch [2/50], Loss: 0.5813, Train Accuracy: 0.6907, Test Accuracy: 0.6944
Epoch [3/50], Loss: 0.5697, Train Accuracy: 0.7043, Test Accuracy: 0.6952
Epoch [4/50], Loss: 0.5624, Train Accuracy: 0.7078, Test Accuracy: 0.7097
Epoch [5/50], Loss: 0.5612, Train Accuracy: 0.7118, Test Accuracy: 0.6952
Epoch [6/50], Loss: 0.5543, Train Accuracy: 0.7160, Test Accuracy: 0.7036
Epoch [7/50], Loss: 0.5566, Train Accuracy: 0.7173, Test Accuracy: 0.7036
Epoch [8/50], Loss: 0.5545, Train Accuracy: 0.7225, Test Accuracy: 0.6990
Epoch [9/50], Loss: 0.5514, Train Accuracy: 0.7250, Test Accuracy: 0.7005
Epoch [10/50], Loss: 0.5461, Train Accuracy: 0.7322, Test Accuracy: 0.7005
Epoch [11/50], Loss: 0.5498, Train Accuracy: 0.7229, Test Accuracy: 0.7013
Epoch [12/50], Loss: 0.5447, Train Accuracy: 0.7269, Test Accuracy: 0.7013
Epoch [13/50], Loss: 0.5405, Train Accuracy: 0.7273, Test Accuracy: 0.7028
Epoch [14/50], Loss: 0.5359, Train

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# ============== FINAL EVALUATION ==============
all_preds = []
all_true = []

model.eval()
with torch.no_grad():
    for feat_test, lab_test in test_loader:
        out_test = model(feat_test)
        _, pred_test = torch.max(out_test, dim=1)
        all_preds.extend(pred_test.cpu().numpy())
        all_true.extend(lab_test.cpu().numpy())

print("\n=== Final Test Evaluation ===")
print("Confusion Matrix:")
print(confusion_matrix(all_true, all_preds))
print()
print("Classification Report:")
print(classification_report(all_true, all_preds, target_names=["BlueWin","RedWin"]))



=== Final Test Evaluation ===
Confusion Matrix:
[[573 186]
 [203 347]]

Classification Report:
              precision    recall  f1-score   support

     BlueWin       0.74      0.75      0.75       759
      RedWin       0.65      0.63      0.64       550

    accuracy                           0.70      1309
   macro avg       0.69      0.69      0.69      1309
weighted avg       0.70      0.70      0.70      1309



In [None]:
# ============== OPTIONAL: PREDICTION FUNCTION ==============
def predict_new_fights(model, new_fights_features, scaler=None):
    """
    Predict outcomes (Blue=0 or Red=1) for new fights.
    new_fights_features: shape (n_samples, input_dim)
    If you used a scaler in training, pass the same 'scaler' to transform.
    Returns: (probabilities, predictions)
    """
    model.eval()

    if scaler is not None:
        new_fights_features = scaler.transform(new_fights_features)

    feats_t = torch.tensor(new_fights_features, dtype=torch.float32)

    with torch.no_grad():
        logits = model(feats_t)
        probs = nn.Softmax(dim=1)(logits)
        _, preds = torch.max(probs, dim=1)

    return probs.cpu().numpy(), preds.cpu().numpy()

# Example usage:
# new_data = df_new_fights.values.astype(float)  # preprocessed the same as training data
# probs, preds = predict_new_fights(model, new_data, scaler=scaler)
# for i, (p, pr) in enumerate(zip(probs, preds)):
#     print(f"Fight {i}: Prob(Blue, Red) = {p}, Prediction = {pr} (0=Blue,1=Red)")

In [None]:
for column in df_model.columns:
    print(f"{column}: {df_model[column].isnull().sum()}")

RedFighter: 0
BlueFighter: 0
RedOdds: 0
BlueOdds: 0
RedExpectedValue: 0
BlueExpectedValue: 0
NumberOfRounds: 0
BlueCurrentLoseStreak: 0
BlueCurrentWinStreak: 0
BlueDraws: 0
BlueLongestWinStreak: 0
BlueLosses: 0
BlueTotalRoundsFought: 0
BlueTotalTitleBouts: 0
BlueWinsByDecisionMajority: 0
BlueWinsByDecisionSplit: 0
BlueWinsByDecisionUnanimous: 0
BlueWinsByKO: 0
BlueWinsBySubmission: 0
BlueWinsByTKODoctorStoppage: 0
BlueWins: 0
BlueHeightCms: 0
BlueReachCms: 0
BlueWeightLbs: 0
RedCurrentLoseStreak: 0
RedCurrentWinStreak: 0
RedDraws: 0
RedLongestWinStreak: 0
RedLosses: 0
RedTotalRoundsFought: 0
RedTotalTitleBouts: 0
RedWinsByDecisionMajority: 0
RedWinsByDecisionSplit: 0
RedWinsByDecisionUnanimous: 0
RedWinsByKO: 0
RedWinsBySubmission: 0
RedWinsByTKODoctorStoppage: 0
RedWins: 0
RedHeightCms: 0
RedReachCms: 0
RedWeightLbs: 0
RedAge: 0
BlueAge: 0
LoseStreakDif: 0
WinStreakDif: 0
LongestWinStreakDif: 0
WinDif: 0
LossDif: 0
TotalRoundDif: 0
TotalTitleBoutDif: 0
KODif: 0
SubDif: 0
HeightDif: 0


In [None]:
# Filter for null values in historical columns
historical_cols = [col for col in df.columns if col.startswith(('RedHistorical_', 'BlueHistorical_'))]
null_rows = df[df[historical_cols].isnull().any(axis=1)]

# Get unique fighter IDs
unique_fighters = pd.concat([null_rows['RedFighterID'], null_rows['BlueFighterID']]).unique()

# Count fight appearances for each unique fighter
fighter_counts = {}
for fighter_id in unique_fighters:
    fighter_counts[fighter_id] = len(df[(df['RedFighterID'] == fighter_id) | (df['BlueFighterID'] == fighter_id)])

# Check if counts are 1 (indicating first fight)
first_fights = [fighter_id for fighter_id, count in fighter_counts.items() if count == 1]

print("Fighters with null historical values and only one fight (likely first fights):")
print(first_fights)

print("Total Number of these:")
print(len(first_fights))
# Filter the original DataFrame to see the rows corresponding to these fighters
first_fight_rows = df[(df['RedFighterID'].isin(first_fights)) | (df['BlueFighterID'].isin(first_fights))]
# Display or further analyze the first_fight_rows DataFrame
# ...

Fighters with null historical values and only one fight (likely first fights):
[]
Total Number of these:
0


In [None]:
# historical_cols = [col for col in df.columns if col.startswith(('RedHistorical_', 'BlueHistorical_'))]
# for col in historical_cols:
 #    df[col] = df[col].fillna(0)

In [None]:
print(X_train.columns.tolist())

['RedFighter', 'BlueFighter', 'RedOdds', 'BlueOdds', 'RedExpectedValue', 'BlueExpectedValue', 'NumberOfRounds', 'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws', 'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought', 'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority', 'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous', 'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage', 'BlueWins', 'BlueHeightCms', 'BlueReachCms', 'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak', 'RedDraws', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought', 'RedTotalTitleBouts', 'RedWinsByDecisionMajority', 'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO', 'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins', 'RedHeightCms', 'RedReachCms', 'RedWeightLbs', 'RedAge', 'BlueAge', 'LoseStreakDif', 'WinStreakDif', 'LongestWinStreakDif', 'WinDif', 'LossDif', 'TotalRoundDif', 'TotalTitleBoutDif', 'KODif', 'SubDif', 'HeightDif',

In [None]:
model_columns = X_train.columns

In [None]:
# 1. Filter for rows where either RedFighter or BlueFighter is Ian Garry
ian_garry_rows = df_model[(df_model['RedFighter'] == le_red.transform(['Ian Machado Garry'])[0]) | (df_model['BlueFighter'] == le_blue.transform(['Ian Machado Garry'])[0])]


# 2. Get the last row using tail(1)
last_ian_garry_row = ian_garry_rows.tail(1)

# 3. Display the row (optional: you can use to_string for full display)
print(last_ian_garry_row)  # or print(last_ian_garry_row.to_string())

      RedFighter  BlueFighter  RedOdds  BlueOdds  RedExpectedValue  \
6516        1442          718   -210.0     295.0            47.619   

      BlueExpectedValue  NumberOfRounds  BlueCurrentLoseStreak  \
6516              295.0               3                      0   

      BlueCurrentWinStreak  BlueDraws  ...  BlueStance_Orthodox  \
6516                     8          0  ...                 True   

      BlueStance_Southpaw  BlueStance_Switch  BlueStance_Switch   \
6516                False              False               False   

      RedStance_Orthodox  RedStance_Southpaw  RedStance_Switch  \
6516                True               False             False   

      BetterRank_Red  BetterRank_neither  TitleBout_True  
6516            True               False           False  

[1 rows x 108 columns]


In [None]:
# Assuming you used LabelEncoder to encode your fighter names:
# ... your LabelEncoder code from previous cells ...

# Get the original names back using inverse_transform
red_fighter_name  = le_red.inverse_transform([1441])[0]
blue_fighter_name = le_blue.inverse_transform([718])[0]

print(f"RedFighter 499: {red_fighter_name}")
print(f"BlueFighter 806: {blue_fighter_name}")

RedFighter 499: Shauna Bannon
BlueFighter 806: Ian Machado Garry


In [None]:


   model = UFCNet(input_dim=input_dim, hidden_dim=64, output_dim=2) # Create the model instance
   model.load_state_dict(torch.load('ufc_model/model.pth')) # Load the saved state
   model.eval()  # Set the model to evaluation mode



  model.load_state_dict(torch.load('ufc_model/model.pth')) # Load the saved state


UFCNet(
  (net): Sequential(
    (0): Linear(in_features=107, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.25, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.25, inplace=False)
    (6): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch.nn.functional as F

# ------------------------------------------------------------------------------
# Define expected JIM columns for each side.
# ------------------------------------------------------------------------------
red_columns = [
    'RedCurrentLoseStreak', 'RedCurrentWinStreak', 'RedDraws', 'RedLongestWinStreak',
    'RedLosses', 'RedTotalRoundsFought', 'RedTotalTitleBouts', 'RedWinsByDecisionMajority',
    'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO', 'RedWinsBySubmission',
    'RedWinsByTKODoctorStoppage', 'RedWins', 'RedHeightCms', 'RedReachCms', 'RedWeightLbs',
    'RedAge', 'RMatchWCRank', 'RPFPRank', 'RedRankCategory', 'RedFighterID',
    'RedHistorical_AvgSigStrLanded', 'RedHistorical_AvgSigStrPct', 'RedHistorical_AvgSubAtt',
    'RedHistorical_AvgTDLanded', 'RedHistorical_AvgTDPct', 'RedHistorical_FinishRound', 'RedElo'
]

blue_columns = [
    'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws', 'BlueLongestWinStreak',
    'BlueLosses', 'BlueTotalRoundsFought', 'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority',
    'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous', 'BlueWinsByKO', 'BlueWinsBySubmission',
    'BlueWinsByTKODoctorStoppage', 'BlueWins', 'BlueHeightCms', 'BlueReachCms', 'BlueWeightLbs',
    'BlueAge', 'BMatchWCRank', 'BPFPRank', 'BlueRankCategory', 'BlueFighterID',
    'BlueHistorical_AvgSigStrLanded', 'BlueHistorical_AvgSigStrPct', 'BlueHistorical_AvgSubAtt',
    'BlueHistorical_AvgTDLanded', 'BlueHistorical_AvgTDPct', 'BlueHistorical_FinishRound', 'BlueElo'
]

# Create mapping dictionaries to convert between blue and red feature names
blue_to_red = {blue: red for blue, red in zip(blue_columns, red_columns)}
red_to_blue = {red: blue for red, blue in zip(red_columns, blue_columns)}

# ------------------------------------------------------------------------------
# 1. Updated fighter stats extraction function (using encoded values)
# ------------------------------------------------------------------------------
def get_fighter_stats(fighter_name, desired_side, df_model, le_red, le_blue):
    """
    Extract the most recent match stats (JIM features) for a fighter from both sides
    and choose the one with the highest FightID. Then, convert the stats so that they
    are in the format corresponding to the desired side ('red' or 'blue').

    Parameters:
        fighter_name: the original fighter name (string)
        desired_side: 'red' or 'blue' (the side you want the stats returned in)
        df_model: the DataFrame with historical fight data (already encoded for fighter names)
        le_red, le_blue: the fitted label encoders for RedFighter and BlueFighter respectively.

    Returns:
        A dictionary of fighter stats with keys corresponding to the desired side.
    """
    # Get encoded values for the fighter
    fighter_encoded_red = le_red.transform([fighter_name])[0]
    fighter_encoded_blue = le_blue.transform([fighter_name])[0]

    # Filter matches where the fighter appears as Red or Blue
    subset_red = df_model[df_model['RedFighter'] == fighter_encoded_red]
    subset_blue = df_model[df_model['BlueFighter'] == fighter_encoded_blue]

    # If no matches found at all, raise an error.
    if subset_red.empty and subset_blue.empty:
        raise ValueError(f"No stats found for fighter {fighter_name} in either role.")

    # Choose the most recent match based on FightID (assuming higher FightID means more recent)
    red_latest = subset_red.loc[subset_red['FightID'].idxmax()] if not subset_red.empty else None
    blue_latest = subset_blue.loc[subset_blue['FightID'].idxmax()] if not subset_blue.empty else None

    # Compare FightID values if both exist; if only one exists, choose that one.
    if red_latest is not None and blue_latest is not None:
        if red_latest['FightID'] >= blue_latest['FightID']:
            chosen_row = red_latest
            source_side = 'red'
        else:
            chosen_row = blue_latest
            source_side = 'blue'
    elif red_latest is not None:
        chosen_row = red_latest
        source_side = 'red'
    else:
        chosen_row = blue_latest
        source_side = 'blue'

    # Based on the source side and the desired side, extract and possibly remap the stats.
    if desired_side.lower() == 'red':
        if source_side == 'red':
            stats = {col: chosen_row[col] for col in red_columns}
        else:
            # Remap blue stats to red keys
            stats_blue = {col: chosen_row[col] for col in blue_columns}
            stats = {blue_to_red[k]: v for k, v in stats_blue.items()}
    elif desired_side.lower() == 'blue':
        if source_side == 'blue':
            stats = {col: chosen_row[col] for col in blue_columns}
        else:
            # Remap red stats to blue keys
            stats_red = {col: chosen_row[col] for col in red_columns}
            stats = {red_to_blue[k]: v for k, v in stats_red.items()}
    else:
        raise ValueError("Side must be either 'red' or 'blue'.")

    return stats


# ------------------------------------------------------------------------------
# 2. Function to compute DIF columns.
# ------------------------------------------------------------------------------
def compute_differences(red_stats, blue_stats):
    """
    Compute the difference columns (DIF) from the red and blue stats.
    """
    diffs = {}
    diffs['LoseStreakDif']         = blue_stats['BlueCurrentLoseStreak'] - red_stats['RedCurrentLoseStreak']
    diffs['WinStreakDif']            = blue_stats['BlueCurrentWinStreak'] - red_stats['RedCurrentWinStreak']
    diffs['LongestWinStreakDif']     = blue_stats['BlueLongestWinStreak'] - red_stats['RedLongestWinStreak']
    diffs['WinDif']                  = blue_stats['BlueWins'] - red_stats['RedWins']
    diffs['LossDif']                 = blue_stats['BlueLosses'] - red_stats['RedLosses']
    diffs['TotalRoundDif']           = blue_stats['BlueTotalRoundsFought'] - red_stats['RedTotalRoundsFought']
    diffs['TotalTitleBoutDif']       = blue_stats['BlueTotalTitleBouts'] - red_stats['RedTotalTitleBouts']
    diffs['KODif']                   = (blue_stats['BlueWinsByKO'] + blue_stats['BlueWinsByTKODoctorStoppage']) - (red_stats['RedWinsByKO'] + red_stats['RedWinsByTKODoctorStoppage'])
    diffs['SubDif']                  = blue_stats['BlueWinsBySubmission'] - red_stats['RedWinsBySubmission']
    diffs['HeightDif']               = blue_stats['BlueHeightCms'] - red_stats['RedHeightCms']
    diffs['ReachDif']                = blue_stats['BlueReachCms'] - red_stats['RedReachCms']
    diffs['AgeDif']                  = blue_stats['BlueAge'] - red_stats['RedAge']
    return diffs

# ------------------------------------------------------------------------------
# 3. Function to create the new fight feature row combining manual inputs, fighter stats, and DIF columns.
# ------------------------------------------------------------------------------
def create_fight_feature_row(red_fighter, blue_fighter, df_model, manual_inputs, le_red, le_blue):
    """
    Build a new row (as a DataFrame) that the model can use to predict a fight.

    Parameters:
      - red_fighter, blue_fighter: original fighter names (strings)
      - df_model: the training dataframe (used here to extract last-match stats)
      - manual_inputs: a dictionary of values for columns you want to enter manually.
      - le_red, le_blue: fitted label encoders for fighter names.

    Returns:
      A pandas DataFrame with one row containing all pre-fight features.
    """
    # Get fighter stats (JIM features) using the updated extraction function.
    red_stats = get_fighter_stats(red_fighter, 'red', df_model, le_red, le_blue)
    blue_stats = get_fighter_stats(blue_fighter, 'blue', df_model, le_red, le_blue)

    # Compute difference columns (DIF)
    diffs = compute_differences(red_stats, blue_stats)

    # Create the new row dictionary.
    new_row = {}

    # Manual inputs: these are columns you want to enter manually.
    manual_columns = [
        'RedOdds', 'BlueOdds', 'RedExpectedValue', 'BlueExpectedValue', 'NumberOfRounds',
        'RedDecOdds', 'BlueDecOdds', 'RSubOdds', 'BSubOdds', 'RKOOdds', 'BKOOdds',
        'BetterRank_Red', 'BetterRank_neither', 'TitleBout_True', 'FightID'
    ]

    # Add fighter names (they'll be encoded later in prepare_features)
    new_row['RedFighter'] = red_fighter
    new_row['BlueFighter'] = blue_fighter

    for col in manual_columns:
        new_row[col] = manual_inputs.get(col)

    # Add the JIM features (extracted fighter stats)
    for key, value in red_stats.items():
        new_row[key] = value
    for key, value in blue_stats.items():
        new_row[key] = value

    # Add the DIF columns
    for key, value in diffs.items():
        new_row[key] = value

    return pd.DataFrame([new_row])

# ------------------------------------------------------------------------------
# 4. Function to prepare the features (encoding and scaling) before prediction.
# ------------------------------------------------------------------------------
def prepare_features(new_df, le_red, le_blue, scaler, model_columns):
    """
    Prepare the new fight DataFrame:
      - Label encode fighter names.
      - One-hot encode categorical variables.
      - Align the columns with the training set (model_columns).
      - Scale the features.
    """
    # Label encode fighter names using the already fitted encoders.
    new_df['RedFighter'] = le_red.transform(new_df['RedFighter'])
    new_df['BlueFighter'] = le_blue.transform(new_df['BlueFighter'])

    # One-hot encode any other categorical variables if needed.
    new_df = pd.get_dummies(new_df, drop_first=True)

    # Reindex to match the training model columns (fill missing with 0)
    new_df = new_df.reindex(columns=model_columns, fill_value=0)

    # Scale the features.
    scaled_features = scaler.transform(new_df.values.astype(float))

    return scaled_features

# ------------------------------------------------------------------------------
# 5. Function to predict a new fight.
# ------------------------------------------------------------------------------
def predict_new_fight(red_fighter, blue_fighter, manual_inputs, df_model, model, le_red, le_blue, scaler, model_columns):
    """
    Build the new fight feature row, prepare the features, and use the model to predict the outcome.

    Returns:
      predicted_winner: 'Red' or 'Blue'
      win_likelihood: probability for the predicted class
      new_fight_df: the constructed feature row (pre-scaled)
    """
     # Create new fight features.
    new_fight_df = create_fight_feature_row(red_fighter, blue_fighter, df_model, manual_inputs, le_red, le_blue)

    # Prepare features for prediction (encoding, aligning, scaling).
    X_new = prepare_features(new_fight_df.copy(), le_red, le_blue, scaler, model_columns)

     # Get prediction probabilities.
    with torch.no_grad():
        logits = model(torch.tensor(X_new, dtype=torch.float32)) # Get model output (logits)
        prob = F.softmax(logits, dim=1).cpu().numpy()[0]       # Apply softmax and get probabilities

    # Map model classes to fighter sides.
    # (Assuming WinnerBinary was mapped as {'Red': 0, 'Blue': 1})
    # Instead of model.classes_, use the index of the max probability
    predicted_class = np.argmax(prob)
    predicted_winner = 'Red' if predicted_class == 0 else 'Blue'
    win_likelihood = np.max(prob)

    return predicted_winner, win_likelihood, new_fight_df

# ------------------------------------------------------------------------------
# 6. Example usage
# ------------------------------------------------------------------------------
# Assume you already have:
# - df_model (the DataFrame with training data, where fighter names are encoded)
# - model (your trained classifier)
# - scaler (your fitted StandardScaler)
# - le_red and le_blue (fitted LabelEncoders for 'RedFighter' and 'BlueFighter')
# - model_columns (the columns used during training; e.g., X_train.columns)

# For example, if X_train is your training feature DataFrame:
# model_columns = X_train.columns

# Manual inputs for columns that need to be provided manually.
manual_inputs = {
    'RedOdds': 100,
    'BlueOdds': 100,
    'RedExpectedValue': 0.5,
    'BlueExpectedValue': 0.5,
    'NumberOfRounds': 3,
    'RedDecOdds': 100,
    'BlueDecOdds': 100,
    'RSubOdds': 100,
    'BSubOdds': 100,
    'RKOOdds': 100,
    'BKOOdds': 100,
    'BetterRank_Red': 1,         # e.g., 1 means Red is better ranked
    'BetterRank_neither': 0,     # 0 means not neutral
    'TitleBout_True': False,
    'FightID': 3263.5              # arbitrary ID since model needs it
}

# Specify fighter names for the new fight.
red_fighter = "Tom Aspinall"
blue_fighter = "Serghei Spivac"

# Example try/except for prediction.
try:
    predicted_winner, win_likelihood, fight_features = predict_new_fight(
        red_fighter, blue_fighter, manual_inputs,
        df_model, model, le_red, le_blue, scaler, model_columns
    )

    print(f"Predicted winner: {predicted_winner}")
    print(f"Win likelihood: {win_likelihood:.2f}")

    print("\nNew fight feature row (pre-scaled):")
    print(fight_features)

except ValueError as e:
    print("Error during prediction:", e)


Predicted winner: Red
Win likelihood: 0.70

New fight feature row (pre-scaled):
     RedFighter     BlueFighter  RedOdds  BlueOdds  RedExpectedValue  \
0  Tom Aspinall  Serghei Spivac      100       100               0.5   

   BlueExpectedValue  NumberOfRounds  RedDecOdds  BlueDecOdds  RSubOdds  ...  \
0                0.5               3         100          100       100  ...   

   LongestWinStreakDif  WinDif  LossDif  TotalRoundDif  TotalTitleBoutDif  \
0                   -2       0        3             12                 -1   

   KODif  SubDif  HeightDif  ReachDif  AgeDif  
0     -2       0      -5.08       0.0      -2  

[1 rows x 87 columns]


In [None]:
mean_fightid = df_model['FightID'].mean()
median_fightid = df_model['FightID'].median()

print("Mean FightID:", mean_fightid)
print("Median FightID:", median_fightid)

Mean FightID: 3270.0
Median FightID: 3270.0


In [None]:
fight_scaled = scaler.transform(fight_array)
print("Scaled fight row:", fight_scaled)


NameError: name 'fight_array' is not defined

In [None]:
highest_fightid = df_model['FightID'].max()
most_recent_match = df_model[df_model['FightID'] == highest_fightid]
most_recent_match

In [None]:
# Assuming 'df_model' is your DataFrame
class_balance = df_model['WinnerBinary'].value_counts()
print(class_balance)

In [None]:
input_dim = X_train_scaled.shape[1]
model = UFCNet(input_dim=input_dim, hidden_dim=64, output_dim=2)
input_dim

In [None]:
column_order = X_train.columns.tolist()
print(column_order)


In [None]:
print("Scaler scales:", scaler.scale_)
print("Scaler means:", scaler.mean_)
