In [7]:

# standard library imports
import git, os, sys

# Find the enclosing git repository, searching parent directories so the
# notebook can be launched from any folder inside the repo.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

# Build the <repo>/src path once (normalised to an absolute path) and reuse it
# for both the working directory and the import search path.
src_dir = os.path.abspath(os.path.join(git_root, 'src'))
os.chdir(src_dir)

# Re-running this cell must not keep appending the same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
print(f'Changed working directory to {os.getcwd()}')

# local imports
from fight_stat_helpers import *
from data_handler import DataHandler

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


# NEW FEATURES

- Dominance Score for a single fight: (total score of fighter based on events in the fight) - (total score of opponent based on events in the fight)
- Average Dominance over time scales
- Fighter Scores
- Fight Math
- Fighter Score increments based on closeness of fights with opponents with high scores 

# Other things to try
- with or without regularization
- with or without scaling
- with or without sum features (seems like only age needs sum features)
- with or without squared features (seems like only age needs squared features)
- Decide whether or not split decisions (SDEC) should be counted as "bullshit" results
- Fit a linear regression on draws and SDEC fights and compare the resulting plane to the logistic-regression decision boundary (hypothesis: they are the same plane, since close fights should lie near the decision boundary)

In [8]:
# Create the project data handler and use it to fetch the
# 'ufc_fights_reported_derived_doubled' dataset by name.
dh = DataHandler()
ufc_fights_reported_derived_doubled = dh.get(
    'ufc_fights_reported_derived_doubled',
)

# Verify FIGHT MATH Stats

In [9]:
# Column total of 'all_fight_math'. The list selector keeps the result labelled
# by column name instead of collapsing it to a bare scalar.
ufc_fights_reported_derived_doubled.loc[:, ['all_fight_math']].sum()

all_fight_math    1832.0
dtype: float64

In [10]:
# Chain the two DataHandler steps: make_ufc_fights_predictive_flattened_diffs
# runs first and its result is fed straight into
# clean_ufc_fights_for_winner_prediction.
ufc_fights_predictive_flattened_diffs = dh.clean_ufc_fights_for_winner_prediction(
    dh.make_ufc_fights_predictive_flattened_diffs(ufc_fights_reported_derived_doubled)
)

In [11]:
# Tally how many rows belong to each division and print the result.
division_counts = ufc_fights_predictive_flattened_diffs['division'].value_counts()
print(division_counts)

division
Lightweight              1059
Welterweight             1015
Middleweight              819
Featherweight             668
Bantamweight              590
Light Heavyweight         576
Heavyweight               524
Flyweight                 304
Women's Strawweight       273
Women's Flyweight         222
Women's Bantamweight      178
Catch Weight               62
Women's Featherweight      26
Name: count, dtype: int64


In [12]:
# Columns removed before modelling: 'fighter', 'opponent', 'method' and 'division'.
# 'result' is deliberately left inside X because that is what model_score expects
# (for better or for worse); the same column is also exposed separately as y.
dropped_columns = ['fighter', 'opponent', 'method', 'division']
X = ufc_fights_predictive_flattened_diffs.drop(columns=dropped_columns)
y = X['result']

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=91,
)

# Use the entire data set with all features as a baseline

In [14]:
# Baseline feature set: every column of the training frame except the target
# column 'result', which must not be used as a predictor.
all_features = [column for column in X_train.columns if column != 'result']
model_test_score(X_train, X_test, all_features, _max_iter=20000, scaled=True)

Training set size: (5052, 321) accuracy: 0.6666666666666666
Test set size: (1264, 321) accuracy: 0.6218354430379747
Test set neg log loss: -0.6636982123513288. Probability to observe data given model: 0.5149434385792835


In [15]:
# Run the greedy forward feature search and keep the list it returns.
# NOTE(review): X_test is passed to the selector as well. Confirm it is only
# used for progress reporting and not for choosing features; otherwise the test
# metrics reported for `best_features` are optimistic.
best_features = additive_greedy(
    X_train,
    X_test,
    search_doubles=True,
    _max_iter=30000,
)

Starting with an empty feature set.
Added single feature: l5y_overall_fighter_score_diff
Negative log loss on training set: -0.680011906474426
Training set size: (5052, 321) accuracy: 0.5696753760886778
Test set size: (1264, 321) accuracy: 0.5862341772151899
Test set neg log loss: -0.6793573544798297. Probability to observe data given model: 0.5069426721432919
Added single feature: age_diff
Negative log loss on training set: -0.6706129950769397
Training set size: (5052, 321) accuracy: 0.5892715756136183
Test set size: (1264, 321) accuracy: 0.6107594936708861
Test set neg log loss: -0.6674214747863464. Probability to observe data given model: 0.5130297338414997
Added single feature: l5y_wins_diff
Negative log loss on training set: -0.664536820846416
Training set size: (5052, 321) accuracy: 0.5894695170229612
Test set size: (1264, 321) accuracy: 0.6257911392405063
Test set neg log loss: -0.6615886817490836. Probability to observe data given model: 0.516030874107386
Added single feature: 

In [16]:
# Score a model restricted to the selected features, using the same split.
model_test_score(
    X_train,
    X_test,
    best_features,
    _max_iter=30000,
    scaled=True,
)

Training set size: (5052, 321) accuracy: 0.6526128266033254
Test set size: (1264, 321) accuracy: 0.6416139240506329
Test set neg log loss: -0.6480221324476045. Probability to observe data given model: 0.52307933594839


In [17]:
# Show the current contents of best_features as this cell's output.
best_features

['l5y_overall_fighter_score_diff',
 'age_diff',
 'l5y_wins_diff',
 'l5y_wins_wins_diff',
 'l1y_inf_clinch_strikes_accuracy_diff',
 'all_abs_head_strikes_accuracy_diff',
 'all_inf_distance_strikes_accuracy_diff',
 'l5y_inf_takedowns_attempts_per_min_diff',
 'all_inf_total_strikes_attempts_per_min_diff',
 'l1y_inf_clinch_strikes_landed_per_min_diff',
 'all_abs_leg_strikes_attempts_per_min_diff',
 'all_abs_takedowns_attempts_per_min_diff',
 'all_abs_control_per_min_diff',
 'l3y_abs_takedowns_landed_per_min_diff',
 'reach_diff',
 'all_inf_sub_attempts_per_min_diff',
 'l3y_defensive_grappling_loss_diff',
 'l3y_inf_sub_attempts_per_min_diff',
 'all_inf_clinch_strikes_accuracy_diff',
 'l5y_inf_body_strikes_accuracy_diff',
 'l1y_inf_body_strikes_landed_per_min_diff',
 'l3y_abs_body_strikes_accuracy_diff',
 'l5y_inf_knockdowns_per_min_diff',
 'l1y_abs_total_strikes_attempts_per_min_diff',
 'all_wins_wins_diff',
 'all_losses_losses_diff',
 'l1y_inf_ground_strikes_attempts_per_min_diff',
 'all_de

In [None]:
# Hard-coded feature list. It appears to be a pasted copy of the `best_features`
# output displayed earlier in this notebook (the visible part of that output
# matches the first entries here). Running this cell overwrites whatever value
# `best_features` held before. The order of the entries is preserved as-is.
best_features = [
    'l5y_overall_fighter_score_diff',
    'age_diff',
    'l5y_wins_diff',
    'l5y_wins_wins_diff',
    'l1y_inf_clinch_strikes_accuracy_diff',
    'all_abs_head_strikes_accuracy_diff',
    'all_inf_distance_strikes_accuracy_diff',
    'l5y_inf_takedowns_attempts_per_min_diff',
    'all_inf_total_strikes_attempts_per_min_diff',
    'l1y_inf_clinch_strikes_landed_per_min_diff',
    'all_abs_leg_strikes_attempts_per_min_diff',
    'all_abs_takedowns_attempts_per_min_diff',
    'all_abs_control_per_min_diff',
    'l3y_abs_takedowns_landed_per_min_diff',
    'reach_diff',
    'all_inf_sub_attempts_per_min_diff',
    'l3y_defensive_grappling_loss_diff',
    'l3y_inf_sub_attempts_per_min_diff',
    'all_inf_clinch_strikes_accuracy_diff',
    'l5y_inf_body_strikes_accuracy_diff',
    'l1y_inf_body_strikes_landed_per_min_diff',
    'l3y_abs_body_strikes_accuracy_diff',
    'l5y_inf_knockdowns_per_min_diff',
    'l1y_abs_total_strikes_attempts_per_min_diff',
    'all_wins_wins_diff',
    'all_losses_losses_diff',
    'l1y_inf_ground_strikes_attempts_per_min_diff',
    'all_defensive_grappling_loss_diff',
    'l3y_losses_dec_diff',
    'l5y_defensive_grappling_loss_diff',
    'l3y_overall_fighter_score_diff',
    'l1y_abs_ground_strikes_landed_per_min_diff',
    'all_inf_head_strikes_accuracy_diff',
    'l3y_inf_head_strikes_accuracy_diff',
    'l3y_inf_distance_strikes_landed_per_min_diff',
    'l3y_inf_sig_strikes_attempts_per_min_diff',
    'l3y_inf_body_strikes_accuracy_diff',
    'l3y_inf_takedowns_accuracy_diff',
    'l3y_abs_total_strikes_attempts_per_min_diff',
    'l3y_abs_control_per_min_diff',
    'l5y_fight_math_diff',
    'l1y_wins_sub_diff',
    'l1y_wins_ko_diff',
    'all_abs_sig_strikes_accuracy_diff',
    'l1y_abs_clinch_strikes_attempts_per_min_diff',
    'l1y_abs_clinch_strikes_landed_per_min_diff',
    'all_abs_clinch_strikes_accuracy_diff',
    'l3y_abs_leg_strikes_attempts_per_min_diff',
    'all_wins_ko_diff',
    'l3y_wins_ko_diff',
    'l1y_losses_losses_diff',
    'l1y_inf_body_strikes_attempts_per_min_diff',
    'l3y_abs_body_strikes_attempts_per_min_diff',
    'l3y_abs_body_strikes_landed_per_min_diff',
    'l1y_abs_body_strikes_attempts_per_min_diff',
    'l1y_inf_distance_strikes_accuracy_diff',
    'l1y_inf_total_strikes_accuracy_diff',
    'l5y_abs_body_strikes_landed_per_min_diff',
    'all_offensive_grappling_score_diff',
    'l3y_inf_takedowns_landed_per_min_diff',
    'l3y_offensive_grappling_score_diff',
]