In [1]:

# third-party (GitPython) and standard library imports
import git, os, sys

# Locate the repository root so the notebook works from any subdirectory of the checkout.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

# Build the normalised absolute path to <repo>/src once and reuse it below.
src_dir = os.path.abspath(os.path.join(git_root, 'src'))
os.chdir(src_dir)

# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)
print(f'Changed working directory to {os.getcwd()}')

# local imports
from fight_stat_helpers import *
from data_handler import DataHandler

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


In [2]:
# Create the project's data-access helper (DataHandler is imported from data_handler).
dh = DataHandler()
# Fetch the dataset registered under the key 'ufc_fights_reported_derived_doubled'.
# NOTE(review): presumably "doubled" means each fight appears once per fighter perspective — confirm in DataHandler.
ufc_fights_reported_derived_doubled = dh.get('ufc_fights_reported_derived_doubled')

In [3]:
# Stage 1: derive the flattened per-fight difference features from the doubled data.
ufc_fights_predictive_flattened_diffs_uncleaned = dh.make_ufc_fights_predictive_flattened_diffs(
    ufc_fights_reported_derived_doubled
)
# Stage 2: apply the winner-prediction cleaning step to obtain the modelling frame.
ufc_fights_predictive_flattened_diffs = dh.clean_ufc_fights_for_winner_prediction(
    ufc_fights_predictive_flattened_diffs_uncleaned
)

In [4]:
# Number of rows available per division, largest first.
division_counts = ufc_fights_predictive_flattened_diffs['division'].value_counts()
print(division_counts)

division
Lightweight              1059
Welterweight             1015
Middleweight              819
Featherweight             668
Bantamweight              590
Light Heavyweight         576
Heavyweight               524
Flyweight                 304
Women's Strawweight       273
Women's Flyweight         222
Women's Bantamweight      178
Catch Weight               62
Women's Featherweight      26
Name: count, dtype: int64


In [5]:
# Restrict the data to the four women's divisions.
womens_divisions = [
    "Women's Strawweight",
    "Women's Flyweight",
    "Women's Bantamweight",
    "Women's Featherweight",
]
division_mask = ufc_fights_predictive_flattened_diffs['division'].isin(womens_divisions)
ufc_fights_predictive_flattened_diffs_womens = ufc_fights_predictive_flattened_diffs.loc[division_mask]
ufc_fights_predictive_flattened_diffs_womens.shape

(699, 325)

In [6]:
# Drop the fighter/opponent names plus the 'method' and 'division' columns; none are used as model inputs.
# 'result' is deliberately left in X because model_score expects to find it there.
non_feature_columns = ['fighter', 'opponent', 'method', 'division']
X = ufc_fights_predictive_flattened_diffs_womens.drop(columns=non_feature_columns)
# The prediction target.
y = X['result']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 72}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Use the entire data set with all features as a baseline

In [8]:
# Baseline feature set: every training column except the target, which stays in X_train for scoring.
all_features = [column for column in X_train.columns.tolist() if column != 'result']
model_test_score(
    X_train,
    X_test,
    all_features,
    _max_iter=20000,
    scaled=True,
)

Training set size: (559, 321) accuracy: 0.7567084078711985
Test set size: (140, 321) accuracy: 0.5571428571428572
Test set neg log loss: -0.8609906854495246. Probability to observe data given model: 0.4227430693892821


In [9]:
# Greedy feature search: per the log below it starts from an empty feature set and adds features one at a time,
# reporting train/test metrics after each addition.
# NOTE(review): search_doubles presumably also evaluates pairs of features — confirm in fight_stat_helpers.
best_features = additive_greedy(X_train, X_test, search_doubles=True, _max_iter=30000)

Starting with an empty feature set.
Added single feature: age_diff
Negative log loss on training set: -0.6772813100457699
Training set size: (559, 321) accuracy: 0.5885509838998211
Test set size: (140, 321) accuracy: 0.5214285714285715
Test set neg log loss: -0.7028449628886934. Probability to observe data given model: 0.49517454476113043
Added single feature: l1y_inf_takedowns_attempts_per_min_diff
Negative log loss on training set: -0.662882963747438
Training set size: (559, 321) accuracy: 0.6046511627906976
Test set size: (140, 321) accuracy: 0.55
Test set neg log loss: -0.6948110976598459. Probability to observe data given model: 0.499168733221342
Added single feature: l1y_abs_takedowns_landed_per_min_diff
Negative log loss on training set: -0.6557130307624255
Training set size: (559, 321) accuracy: 0.6189624329159212
Test set size: (140, 321) accuracy: 0.5571428571428572
Test set neg log loss: -0.6966019421918705. Probability to observe data given model: 0.4982755995953293
Added s

In [10]:
# Score the model using only the features selected by the greedy search, on the same train/test split as the baseline.
model_test_score(X_train, X_test, best_features, _max_iter = 30000, scaled=True)

Training set size: (559, 321) accuracy: 0.7048300536672629
Test set size: (140, 321) accuracy: 0.5785714285714286
Test set neg log loss: -0.8195394284577974. Probability to observe data given model: 0.440634551513057


In [11]:
# Hard-coded feature list. NOTE: this rebinds best_features, replacing the list returned by
# additive_greedy earlier in the notebook, so cells run after this one see this list instead.
# NOTE(review): presumably pasted from an earlier search run — confirm provenance.
best_features = ['l1y_inf_total_strikes_accuracy_diff',
 'age_diff',
 'all_wins_wins_diff',
 'l3y_abs_distance_strikes_landed_per_min_diff',
 'l1y_inf_takedowns_attempts_per_min_diff',
 'l1y_losses_losses_diff',
 'l5y_inf_leg_strikes_accuracy_diff',
 'l1y_inf_knockdowns_per_min_diff',
 'l3y_abs_distance_strikes_accuracy_diff',
 'l1y_abs_takedowns_landed_per_min_diff',
 'l3y_losses_sub_diff',
 'l3y_losses_ko_diff',
 'l3y_inf_sub_attempts_per_min_diff',
 'l5y_inf_sig_strikes_accuracy_diff',
 'l1y_losses_diff',
 'l1y_inf_takedowns_landed_per_min_diff',
 'l5y_abs_sub_attempts_per_min_diff',
 'l3y_inf_total_strikes_accuracy_diff',
 'all_wins_sub_diff',
 'l1y_abs_control_per_min_diff',
 'l3y_abs_total_strikes_accuracy_diff',
 'l3y_abs_head_strikes_accuracy_diff',
 'l1y_abs_body_strikes_landed_per_min_diff',
 'l1y_abs_sig_strikes_attempts_per_min_diff',
 'l5y_inf_clinch_strikes_landed_per_min_diff',
 'all_abs_distance_strikes_attempts_per_min_diff',
 'all_inf_sig_strikes_accuracy_diff',
 'l1y_abs_total_strikes_accuracy_diff',
 'l1y_abs_sig_strikes_accuracy_diff',
 'l3y_inf_control_per_min_diff',
 'l5y_wins_ko_diff',
 'all_inf_leg_strikes_landed_per_min_diff',
 'l3y_inf_leg_strikes_attempts_per_min_diff',
 'l3y_inf_total_strikes_landed_per_min_diff',
 'l5y_inf_ground_strikes_landed_per_min_diff',
 'all_abs_reversals_per_min_diff',
 'height_diff',
 'l3y_defensive_grappling_loss_diff',
 'l3y_abs_takedowns_attempts_per_min_diff',
 'l3y_inf_body_strikes_attempts_per_min_diff',
 'l5y_losses_sub_diff',
 'all_inf_clinch_strikes_accuracy_diff',
 'l1y_abs_knockdowns_per_min_diff',
 'all_abs_distance_strikes_accuracy_diff',
 'all_abs_total_strikes_accuracy_diff',
 'l3y_abs_sig_strikes_accuracy_diff',
 'l5y_abs_takedowns_attempts_per_min_diff',
 'l5y_abs_total_strikes_accuracy_diff',
 'l3y_abs_takedowns_landed_per_min_diff',
 'all_abs_control_per_min_diff',
 'l3y_wins_wins_diff',
 'l1y_abs_body_strikes_accuracy_diff',
 'all_inf_leg_strikes_attempts_per_min_diff',
 'l5y_inf_leg_strikes_attempts_per_min_diff',
 'all_inf_total_strikes_accuracy_diff',
 'l3y_inf_leg_strikes_accuracy_diff',
 'l3y_inf_sig_strikes_accuracy_diff',
 'l5y_inf_leg_strikes_landed_per_min_diff',
 'l3y_abs_leg_strikes_accuracy_diff',
 'l5y_abs_leg_strikes_accuracy_diff',
 'l3y_inf_distance_strikes_accuracy_diff',
 'l3y_inf_knockdowns_per_min_diff',
 'l5y_inf_head_strikes_accuracy_diff',
 'l3y_abs_distance_strikes_attempts_per_min_diff']

# Re-score with the hard-coded list, using the same split and settings as the greedy-feature run.
model_test_score(X_train, X_test, best_features, _max_iter = 30000, scaled=True)

Training set size: (559, 321) accuracy: 0.7227191413237924
Test set size: (140, 321) accuracy: 0.5714285714285714
Test set neg log loss: -0.696068156654517. Probability to observe data given model: 0.49854164290272746


In [12]:
# without fight math
# Training set size: (560, 317) accuracy: 0.725
# Test set size: (140, 317) accuracy: 0.5642857142857143
# Test set neg log loss: -0.7290313426064227. Probability to observe data given model: 0.4823760209564177