In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

from sklearn.impute import SimpleImputer

import sys, warnings, os
from sklearn.dummy import DummyClassifier

In [2]:
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
pd.set_option("display.max_columns", 120, "display.max_rows", None)

### Baseline Model:

In [4]:
df_baseline = pd.read_csv('../data/df_baseline.csv')
print(df_baseline.shape)
df_baseline.head()

(6528, 117)


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,TitleBout,WeightClass,Gender,NumberOfRounds,BlueCurrentLoseStreak,BlueCurrentWinStreak,BlueDraws,BlueAvgSigStrLanded,BlueAvgSigStrPct,BlueAvgSubAtt,BlueAvgTDLanded,BlueAvgTDPct,BlueLongestWinStreak,BlueLosses,BlueTotalRoundsFought,BlueTotalTitleBouts,BlueWinsByDecisionMajority,BlueWinsByDecisionSplit,BlueWinsByDecisionUnanimous,BlueWinsByKO,BlueWinsBySubmission,BlueWinsByTKODoctorStoppage,BlueWins,BlueStance,BlueHeightCms,BlueReachCms,BlueWeightLbs,RedCurrentLoseStreak,RedCurrentWinStreak,RedDraws,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedLongestWinStreak,RedLosses,RedTotalRoundsFought,RedTotalTitleBouts,RedWinsByDecisionMajority,RedWinsByDecisionSplit,RedWinsByDecisionUnanimous,RedWinsByKO,RedWinsBySubmission,RedWinsByTKODoctorStoppage,RedWins,RedStance,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,BlueAge,LoseStreakDif,WinStreakDif,LongestWinStreakDif,WinDif,LossDif,TotalRoundDif,TotalTitleBoutDif,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,EmptyArena,BMatchWCRank,RMatchWCRank,RWFlyweightRank,RWFeatherweightRank,RWStrawweightRank,RWBantamweightRank,RHeavyweightRank,RLightHeavyweightRank,RMiddleweightRank,RWelterweightRank,RLightweightRank,RFeatherweightRank,RBantamweightRank,RFlyweightRank,RPFPRank,BWFlyweightRank,BWStrawweightRank,BWBantamweightRank,BHeavyweightRank,BLightHeavyweightRank,BMiddleweightRank,BWelterweightRank,BLightweightRank,BFeatherweightRank,BBantamweightRank,BFlyweightRank,BPFPRank,BetterRank,Finish,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,66,1009,-250.0,215.0,40.0,215.0,563,68,33,1,1.0,3,1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,172.72,175.26,125.0,0.0,6.0,0.0,4.41,0.49,0.8,2.61,0.47,6.0,3.0,42.0,3.0,0.0,2.0,4.0,2.0,4.0,0.0,12.0,1,165.1,170.18,125.0,34.0,31.0,0.0,-6.0,-6.0,-12.0,-3.0,-42.0,-3.0,-2.0,-4.0,7.62,5.08,-3.0,-4.41,-0.8,-2.61,0.0,10.0,0.0,15.0,0.0,0.0,0.0,8.0,4.0,0.0,0.0,2.0,0.0,8.0,0.0,11.0,9.0,9.0,5.0,9.0,12.0,10.0,14.0,4.0,9.0,10.0,12.0,14.0,1,5,28,2.0,118,425.0,300.0,800.0,150.0,2500.0,400.0,350.0
1,1441,718,-210.0,295.0,47.619,295.0,563,68,33,1,0.0,8,1,3.0,0.0,8.0,0.0,5.5,0.55,0.3,0.77,0.55,8.0,0.0,20.0,0.0,0.0,1.0,4.0,3.0,0.0,0.0,8.0,1,190.5,187.96,170.0,0.0,6.0,0.0,4.12,0.61,1.8,1.49,0.29,6.0,0.0,11.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,6.0,1,185.42,195.58,170.0,30.0,27.0,0.0,2.0,2.0,2.0,0.0,9.0,0.0,2.0,-5.0,5.08,-7.62,-3.0,1.38,-1.5,-0.72,0.0,7.0,3.0,15.0,0.0,0.0,0.0,8.0,4.0,0.0,3.0,2.0,0.0,8.0,0.0,3.0,9.0,9.0,5.0,9.0,12.0,10.0,7.0,4.0,9.0,10.0,12.0,14.0,1,6,39,5.0,293,1500.0,250.0,650.0,180.0,3000.0,240.0,700.0
2,307,67,-380.0,300.0,26.3158,300.0,563,68,33,1,0.0,4,1,3.0,0.0,4.0,0.0,5.13,0.57,0.2,0.45,0.63,4.0,4.0,44.0,0.0,0.0,1.0,4.0,6.0,1.0,0.0,12.0,1,200.66,203.2,250.0,0.0,1.0,0.0,5.49,0.6,0.5,0.58,0.21,7.0,2.0,33.0,3.0,0.0,0.0,3.0,4.0,2.0,0.0,9.0,1,193.04,205.74,245.0,34.0,36.0,0.0,3.0,-3.0,3.0,2.0,11.0,-3.0,2.0,-1.0,7.62,-2.54,2.0,-0.36,-0.3,-0.13,0.0,3.0,2.0,15.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,2.0,0.0,8.0,0.0,3.0,9.0,9.0,5.0,3.0,12.0,10.0,14.0,4.0,9.0,10.0,12.0,14.0,1,4,39,3.0,293,900.0,-160.0,450.0,1100.0,3000.0,350.0,1100.0
3,221,1071,-950.0,625.0,10.5263,625.0,563,68,33,1,0.0,2,1,3.0,2.0,0.0,0.0,3.74,0.44,0.5,0.47,0.25,1.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2,175.26,177.8,145.0,1.0,0.0,0.0,2.3,0.58,1.6,3.45,0.41,6.0,2.0,22.0,0.0,1.0,0.0,5.0,0.0,1.0,0.0,7.0,2,177.8,177.8,145.0,30.0,36.0,1.0,0.0,-5.0,-6.0,0.0,-15.0,0.0,0.0,0.0,-2.54,0.0,6.0,1.44,-1.1,-2.98,0.0,10.0,13.0,15.0,0.0,0.0,0.0,8.0,4.0,0.0,0.0,2.0,13.0,8.0,0.0,3.0,9.0,9.0,5.0,9.0,12.0,10.0,14.0,4.0,9.0,10.0,12.0,14.0,1,1,6,3.0,32,639.0,-200.0,1100.0,380.0,1400.0,500.0,4000.0
4,1183,524,-130.0,110.0,76.9231,110.0,563,68,33,0,0.0,2,1,3.0,0.0,1.0,1.0,4.41,0.53,0.8,0.75,0.37,3.0,3.0,15.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,1,177.8,177.8,145.0,0.0,1.0,0.0,6.25,0.46,1.0,1.0,0.41,3.0,3.0,17.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,5.0,1,175.26,182.88,145.0,36.0,33.0,0.0,0.0,0.0,-1.0,0.0,-2.0,0.0,3.0,-2.0,2.54,-5.08,-3.0,-1.84,-0.2,-0.25,0.0,10.0,0.0,15.0,0.0,0.0,0.0,8.0,4.0,0.0,0.0,2.0,0.0,8.0,0.0,3.0,9.0,9.0,5.0,9.0,12.0,10.0,14.0,4.0,9.0,10.0,12.0,14.0,2,1,6,3.0,194,801.0,275.0,550.0,500.0,700.0,300.0,250.0


In [5]:
label = df_baseline['Winner']
df_baseline.drop(['Winner'], axis = 1, inplace = True)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(df_baseline, label, random_state = 42, test_size = 0.3)

In [7]:
baseline_model = DummyClassifier(random_state = 42)
baseline_model.fit(X_train, y_train)

0,1,2
,strategy,'prior'
,random_state,42
,constant,


In [8]:
baseline_preds = baseline_model.predict(X_valid)
accuracy_score(y_valid, baseline_preds)

0.5855028075548749

### Enhanced Model:

In [9]:
ufc = pd.read_csv('../data/df_modeling.csv')
print(ufc.shape)
ufc.head()

(6528, 28)


Unnamed: 0,Winner,TitleBout,LoseStreakDif,WinStreakDif,LongestWinStreakDif,WinDif,LossDif,TotalRoundDif,TotalTitleBoutDif,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,M_DEC_diff,S_DEC_diff,U_DEC_diff,TKO_diff,odds_diff,ev_diff,Stance_diff
0,1,1,0,-6,-6,-12,-3,-42,-3,-2,-4,7.62,5.08,-3,-4.41,-0.8,-2.61,-1,0,-0.49,-0.47,0,-2,-4,0,465.0,175.0,0
1,1,0,0,2,2,2,0,9,0,2,-5,5.08,-7.62,-3,1.38,-1.5,-0.72,-1,0,-0.06,0.26,0,1,4,0,505.0,247.381,0
2,1,0,0,3,-3,3,2,11,-3,2,-1,7.62,-2.54,2,-0.36,-0.3,-0.13,-1,0,-0.03,0.42,0,1,1,0,680.0,273.6842,0
3,1,0,1,0,-5,-6,0,-15,0,0,0,-2.54,0.0,6,1.44,-1.1,-2.98,-1,0,-0.14,-0.16,-1,0,-5,0,1575.0,614.4737,0
4,0,0,0,0,0,-1,0,-2,0,3,-2,2.54,-5.08,-3,-1.84,-0.2,-0.25,0,1,0.07,-0.04,-1,0,-1,0,240.0,33.0769,0


In [16]:
from sklearn.tree import DecisionTreeClassifier


X = ufc.drop(columns=["Winner"])  # Replace 'Winner'
y = ufc["Winner"]

# === Time-based split ===
train_size = int(0.7 * len(ufc))
val_size = int(0.15 * len(ufc))

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]
X_val = X.iloc[train_size:train_size + val_size]
y_val = y.iloc[train_size:train_size + val_size]
X_test = X.iloc[train_size + val_size:]
y_test = y.iloc[train_size + val_size:]

# === Define Models and Param Grids ===
models = {
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [100, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
    }),
    'DecisionTree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
    }),
    'MLP': (MLPClassifier(max_iter=300, random_state=42), {
        'hidden_layer_sizes': [(32,), (64,), (64, 32)],
        'alpha': [0.0001, 0.001],
        'learning_rate_init': [0.001, 0.01],
    }),
}

# === Run Grid Search for each model ===
best_models = {}

for name, (model, params) in models.items():
    print(f"\n🔍 Tuning {name}...")
    clf = GridSearchCV(model, params, cv=3, scoring='f1', n_jobs=-1)
    clf.fit(X_train, y_train)
    best_models[name] = clf.best_estimator_

    preds = clf.predict(X_val)
    print(f"✅ {name} Results:")
    print("  Accuracy:", accuracy_score(y_val, preds))
    print("  F1 Score:", f1_score(y_val, preds))
    print("  Best Params:", clf.best_params_)

# === Final Evaluation on Test Set ===
print("\n🧪 Final Evaluation on Test Set:")
for name, model in best_models.items():
    preds = model.predict(X_test)
    print(f"\n📌 {name}")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("F1 Score:", f1_score(y_test, preds))
    print(classification_report(y_test, preds))



🔍 Tuning RandomForest...
✅ RandomForest Results:
  Accuracy: 0.651685393258427
  F1 Score: 0.7269815852682145
  Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}

🔍 Tuning DecisionTree...
✅ DecisionTree Results:
  Accuracy: 0.6026557711950971
  F1 Score: 0.6504941599281222
  Best Params: {'max_depth': 10, 'min_samples_split': 2}

🔍 Tuning MLP...
✅ MLP Results:
  Accuracy: 0.6598569969356486
  F1 Score: 0.7208717518860017
  Best Params: {'alpha': 0.0001, 'hidden_layer_sizes': (32,), 'learning_rate_init': 0.001}

🧪 Final Evaluation on Test Set:

📌 RandomForest
Accuracy: 0.6520408163265307
F1 Score: 0.7287191726332538
              precision    recall  f1-score   support

           0       0.58      0.46      0.51       392
           1       0.68      0.78      0.73       588

    accuracy                           0.65       980
   macro avg       0.63      0.62      0.62       980
weighted avg       0.64      0.65      0.64       980


📌 DecisionTree
Accurac