In [10]:

# standard library imports
import git, os, sys

# Find the enclosing git repository (searching upward from the current directory)
# so this cell works regardless of which sub-directory the kernel started in.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

# Build the <repo>/src path once and reuse it for both chdir and sys.path.
src_dir = os.path.abspath(os.path.join(git_root, 'src'))
os.chdir(src_dir)

# Guard the append: re-running this cell must not pile duplicate entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
print(f'Changed working directory to {os.getcwd()}')

# local imports
from fight_stat_helpers import *
from data_handler import DataHandler

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


In [11]:
# Create the project data handler and fetch the stored table registered under
# the key 'ufc_fights_reported_derived_doubled'.
dh = DataHandler()
ufc_fights_reported_derived_doubled = dh.get(
    'ufc_fights_reported_derived_doubled'
)

In [12]:
# Build the flattened "diffs" feature table from the doubled fight data and pass
# it straight through the winner-prediction cleaning step.
ufc_fights_predictive_flattened_diffs = dh.clean_ufc_fights_for_winner_prediction(
    dh.make_ufc_fights_predictive_flattened_diffs(ufc_fights_reported_derived_doubled)
)

In [13]:
# Print how many rows each division contributes, to see which divisions
# have enough data to model on their own.
print(
    ufc_fights_predictive_flattened_diffs['division'].value_counts()
)

division
Lightweight              1059
Welterweight             1016
Middleweight              820
Featherweight             669
Bantamweight              593
Light Heavyweight         578
Heavyweight               525
Flyweight                 305
Women's Strawweight       274
Women's Flyweight         222
Women's Bantamweight      178
Catch Weight               62
Women's Featherweight      26
Name: count, dtype: int64


In [14]:
# Keep only the rows that belong to one of the four women's divisions.
division_mask = ufc_fights_predictive_flattened_diffs['division'].isin(
    [
        "Women's Strawweight",
        "Women's Flyweight",
        "Women's Bantamweight",
        "Women's Featherweight",
    ]
)
ufc_fights_predictive_flattened_diffs_womens = ufc_fights_predictive_flattened_diffs.loc[division_mask]
# Last expression of the cell: display (rows, columns) of the filtered frame.
ufc_fights_predictive_flattened_diffs_womens.shape

(700, 321)

In [15]:
# Remove the columns that are not numeric model inputs: the two fighter-name
# columns plus 'method' and 'division'.
# NOTE: 'result' is intentionally left inside X because the scoring helper
# expects to find it there; it is excluded later when feature lists are built.
X = ufc_fights_predictive_flattened_diffs_womens.drop(
    ['fighter', 'opponent', 'method', 'division'],
    axis='columns',
)
# Target label for the train/test split.
y = X['result']

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=72,
)

# Use the entire data set with all features as a baseline

In [17]:
# Baseline feature list: every column of the training frame except the
# target label 'result', which must never be used as a predictor.
all_features = [column for column in X_train.columns if column != 'result']
model_test_score(
    X_train,
    X_test,
    all_features,
    _max_iter=20000,
    scaled=True,
)

Training set size: (560, 317) accuracy: 0.7571428571428571
Test set size: (140, 317) accuracy: 0.5428571428571428
Test set neg log loss: -0.8985363879349987. Probability to observe data given model: 0.40716515568107653


In [18]:
# Run the project's additive greedy feature search on the train/test split.
# NOTE(review): this cell prints one progress block per added feature and is
# presumably slow — confirm before re-running.
best_features = additive_greedy(
    X_train,
    X_test,
    search_doubles=True,
    _max_iter=30000,
)

Starting with an empty feature set.


Added single feature: age_diff
Negative log loss on training set: -0.6779215480179095
Training set size: (560, 317) accuracy: 0.5821428571428572
Test set size: (140, 317) accuracy: 0.5357142857142857
Test set neg log loss: -0.6885549184471669. Probability to observe data given model: 0.50230141135396
Added single feature: l1y_inf_takedowns_attempts_per_min_diff
Negative log loss on training set: -0.6661938730829038
Training set size: (560, 317) accuracy: 0.6
Test set size: (140, 317) accuracy: 0.6
Test set neg log loss: -0.673673842383155. Probability to observe data given model: 0.5098320901960643
Added single feature: l3y_abs_head_strikes_landed_per_min_diff
Negative log loss on training set: -0.6578137710753098
Training set size: (560, 317) accuracy: 0.6375
Test set size: (140, 317) accuracy: 0.5571428571428572
Test set neg log loss: -0.69952126489462. Probability to observe data given model: 0.49682309352375503
Added single feature: l1y_abs_takedowns_landed_per_min_diff
Negative lo

In [20]:
# Score the model restricted to the current `best_features` list.
# NOTE(review): this cell's execution count (20) is higher than the following
# cell's (19), so it was run out of order — verify it survives Restart & Run All.
model_test_score(
    X_train,
    X_test,
    best_features,
    _max_iter=30000,
    scaled=True,
)

Training set size: (560, 317) accuracy: 0.725
Test set size: (140, 317) accuracy: 0.5642857142857143
Test set neg log loss: -0.7290313426064227. Probability to observe data given model: 0.4823760209564177


In [19]:
# Hard-coded feature subset used for the final scoring call.
# NOTE(review): presumably a saved copy of an earlier greedy-search result so
# the search does not have to be repeated — confirm against the search output.
best_features = [
    "l1y_inf_total_strikes_accuracy_diff",
    "age_diff",
    "all_wins_wins_diff",
    "l3y_abs_distance_strikes_landed_per_min_diff",
    "l1y_inf_takedowns_attempts_per_min_diff",
    "l1y_losses_losses_diff",
    "l5y_inf_leg_strikes_accuracy_diff",
    "l1y_inf_knockdowns_per_min_diff",
    "l3y_abs_distance_strikes_accuracy_diff",
    "l1y_abs_takedowns_landed_per_min_diff",
    "l3y_losses_sub_diff",
    "l3y_losses_ko_diff",
    "l3y_inf_sub_attempts_per_min_diff",
    "l5y_inf_sig_strikes_accuracy_diff",
    "l1y_losses_diff",
    "l1y_inf_takedowns_landed_per_min_diff",
    "l5y_abs_sub_attempts_per_min_diff",
    "l3y_inf_total_strikes_accuracy_diff",
    "all_wins_sub_diff",
    "l1y_abs_control_per_min_diff",
    "l3y_abs_total_strikes_accuracy_diff",
    "l3y_abs_head_strikes_accuracy_diff",
    "l1y_abs_body_strikes_landed_per_min_diff",
    "l1y_abs_sig_strikes_attempts_per_min_diff",
    "l5y_inf_clinch_strikes_landed_per_min_diff",
    "all_abs_distance_strikes_attempts_per_min_diff",
    "all_inf_sig_strikes_accuracy_diff",
    "l1y_abs_total_strikes_accuracy_diff",
    "l1y_abs_sig_strikes_accuracy_diff",
    "l3y_inf_control_per_min_diff",
    "l5y_wins_ko_diff",
    "all_inf_leg_strikes_landed_per_min_diff",
    "l3y_inf_leg_strikes_attempts_per_min_diff",
    "l3y_inf_total_strikes_landed_per_min_diff",
    "l5y_inf_ground_strikes_landed_per_min_diff",
    "all_abs_reversals_per_min_diff",
    "height_diff",
    "l3y_defensive_grappling_loss_diff",
    "l3y_abs_takedowns_attempts_per_min_diff",
    "l3y_inf_body_strikes_attempts_per_min_diff",
    "l5y_losses_sub_diff",
    "all_inf_clinch_strikes_accuracy_diff",
    "l1y_abs_knockdowns_per_min_diff",
    "all_abs_distance_strikes_accuracy_diff",
    "all_abs_total_strikes_accuracy_diff",
    "l3y_abs_sig_strikes_accuracy_diff",
    "l5y_abs_takedowns_attempts_per_min_diff",
    "l5y_abs_total_strikes_accuracy_diff",
    "l3y_abs_takedowns_landed_per_min_diff",
    "all_abs_control_per_min_diff",
    "l3y_wins_wins_diff",
    "l1y_abs_body_strikes_accuracy_diff",
    "all_inf_leg_strikes_attempts_per_min_diff",
    "l5y_inf_leg_strikes_attempts_per_min_diff",
    "all_inf_total_strikes_accuracy_diff",
    "l3y_inf_leg_strikes_accuracy_diff",
    "l3y_inf_sig_strikes_accuracy_diff",
    "l5y_inf_leg_strikes_landed_per_min_diff",
    "l3y_abs_leg_strikes_accuracy_diff",
    "l5y_abs_leg_strikes_accuracy_diff",
    "l3y_inf_distance_strikes_accuracy_diff",
    "l3y_inf_knockdowns_per_min_diff",
    "l5y_inf_head_strikes_accuracy_diff",
    "l3y_abs_distance_strikes_attempts_per_min_diff",
]

# Evaluate the model using only the columns named in `best_features`.
model_test_score(
    X_train,
    X_test,
    best_features,
    _max_iter=30000,
    scaled=True,
)

Training set size: (560, 317) accuracy: 0.725
Test set size: (140, 317) accuracy: 0.5642857142857143
Test set neg log loss: -0.7290313426064227. Probability to observe data given model: 0.4823760209564177
