In [77]:

# standard library imports
import os
import sys

# third-party imports
import git
import pandas as pd  # `pd` is used by later cells; import it explicitly instead of relying on the star import below

# Resolve the repository root so the notebook can be started from any sub-directory.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")
src_dir = os.path.abspath(os.path.join(git_root, 'src'))
os.chdir(src_dir)
# Re-running this cell must not keep appending the same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
print(f'Changed working directory to {os.getcwd()}')

# local imports
# NOTE(review): the star import hides where model_test_score / additive_greedy
# come from; prefer importing the needed names explicitly once they are confirmed.
from fight_stat_helpers import *
from data_handler import DataHandler

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


# NEW FEATURES

- Dominance Score for a single fight: (total score of fighter based on events in the fight) - (total score of opponent based on events in the fight)
- Average Dominance over time scales
- Fighter Scores
- Fight Math
- Fighter Score increments based on closeness of fights with opponents with high scores 

# Other things to try
- with or without regularization
- with or without scaling
- with or without sum features (seems like only age needs sum features)
- with or without squared features (seems like only age needs squared features)
- Decide whether to include SDEC in bullshit or not
- Fit a linear regression on draws and SDEC fights and compare it to the logistic-regression decision plane (hypothesis: they are the same plane, since close fights should lie near the decision boundary)

In [78]:
# Handle used throughout the notebook to load and transform the fight tables
# (DataHandler is imported from data_handler in the setup cell).
dh = DataHandler()
# get reported derived doubled data
ufc_fights_reported_derived_doubled = dh.get('ufc_fights_reported_derived_doubled')

In [79]:
# Only keep fights from the last YEARS_OF_HISTORY years.
# NOTE: the cutoff is relative to "now", so the row count shown below drifts as
# time passes; pin `date_10_years_ago` to a fixed pd.Timestamp to reproduce a run.
YEARS_OF_HISTORY = 10
fight_dates = pd.to_datetime(ufc_fights_reported_derived_doubled['date'])
date_10_years_ago = pd.Timestamp.now() - pd.DateOffset(years=YEARS_OF_HISTORY)
# Build a new frame rather than writing the parsed dates back into the object
# returned by dh.get(); the explicit copy also keeps later column assignments on
# the filtered frame from raising SettingWithCopyWarning.
ufc_fights_reported_derived_doubled = (
    ufc_fights_reported_derived_doubled
    .assign(date=fight_dates)
    .loc[lambda fights: fights['date'] >= date_10_years_ago]
    .copy()
)
ufc_fights_reported_derived_doubled.shape

(9864, 326)

# Verify FIGHT MATH Stats

In [80]:
# Sanity check: column total of 'all_fight_math' over the filtered fights.
# The double brackets select a one-column DataFrame, so .sum() returns a labelled Series.
ufc_fights_reported_derived_doubled[['all_fight_math']].sum()

all_fight_math    1374.0
dtype: float64

In [81]:
# Stage 1: derive the flattened per-fight difference table from the doubled data.
flattened_diffs_uncleaned = dh.make_ufc_fights_predictive_flattened_diffs(
    ufc_fights_reported_derived_doubled
)
# Stage 2: apply the handler's cleaning step for winner prediction.
ufc_fights_predictive_flattened_diffs = dh.clean_ufc_fights_for_winner_prediction(
    flattened_diffs_uncleaned
)

In [82]:
# Columns whose name contains any of these fragments are left out of the candidate set.
excluded_fragments = ('score_diff', 'loss_diff')
all_features = [
    column
    for column in ufc_fights_predictive_flattened_diffs.columns
    if not any(fragment in column for fragment in excluded_fragments)
]

In [None]:
# Number of candidate columns; at this point the list still contains non-feature
# columns ('fighter', 'opponent', 'method', 'division', 'result') that are handled later.
len(all_features)

In [36]:
# show division counts
# Count the rows in each division, most frequent first, and print the table.
division_counts = ufc_fights_predictive_flattened_diffs['division'].value_counts()
print(division_counts)

division
Lightweight              609
Welterweight             591
Featherweight            496
Middleweight             473
Bantamweight             453
Light Heavyweight        328
Heavyweight              327
Women's Strawweight      256
Flyweight                241
Women's Flyweight        222
Women's Bantamweight     153
Catch Weight              52
Women's Featherweight     26
Name: count, dtype: int64


In [83]:
# Columns removed before modelling: fighter/opponent names plus 'method' and 'division'.
non_feature_columns = ['fighter', 'opponent', 'method', 'division']
candidate_frame = ufc_fights_predictive_flattened_diffs[all_features]
X = candidate_frame.drop(columns=non_feature_columns)
# 'result' is intentionally left inside X (as well as being the target) because
# that is what model_score is expecting, for better or for worse.
y = X['result']

In [None]:
# Inspect the full column list of X (includes the 'result' target column).
list(X.columns)

In [84]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=91)

# Use the entire data set with all features as a baseline

In [None]:
# Display the candidate column list built earlier (long output; no stored output in this export).
all_features

In [85]:
# Baseline: fit on every column of X_train except the target.
# A separate name is used on purpose. Rebinding `all_features` here would strip
# 'fighter', 'opponent', 'method', 'division' and 'result' from it, and the earlier
# cell that builds X (select `all_features`, drop those columns, read 'result')
# would then raise a KeyError when re-run.
baseline_features = [column for column in X_train.columns if column != 'result']
model_test_score(X_train, X_test, baseline_features, _max_iter=20000, scaled=False)

Training set size: (3381, 301) accuracy: 0.663117420881396
Test set size: (846, 301) accuracy: 0.640661938534279
Test set neg log loss: -0.6495917563945068. Probability to observe data given model: 0.5222589421198902


In [40]:
# Forward (additive) greedy feature search; the log below shows features being
# added one at a time starting from an empty set.
# NOTE(review): X_test is passed into the search. If additive_greedy uses
# test-set scores to decide which feature to add, later test metrics for
# `best_features` are optimistically biased — confirm against the helper.
best_features = additive_greedy(X_train, X_test, search_doubles=True, _max_iter=30000)

Starting with an empty feature set.
Added single feature: age_diff
Negative log loss on training set: -0.6786837474330236
Training set size: (3381, 301) accuracy: 0.5752735876959479
Test set size: (846, 301) accuracy: 0.6016548463356974
Test set neg log loss: -0.6665963713903177. Probability to observe data given model: 0.5134532110993928
Added single feature: l5y_wins_diff
Negative log loss on training set: -0.663604267733662
Training set size: (3381, 301) accuracy: 0.5950902099970423
Test set size: (846, 301) accuracy: 0.5886524822695035
Test set neg log loss: -0.6659579046675438. Probability to observe data given model: 0.5137811385626279
Added single feature: l5y_losses_losses_diff
Negative log loss on training set: -0.656850238518206
Training set size: (3381, 301) accuracy: 0.5986394557823129
Test set size: (846, 301) accuracy: 0.5886524822695035
Test set neg log loss: -0.6732123934493655. Probability to observe data given model: 0.5100674059594255
Added single feature: all_inf_co

In [41]:
# Score the greedy-selected feature set, this time with scaled=True.
model_test_score(X_train, X_test, best_features, _max_iter = 30000, scaled=True)

Training set size: (3381, 301) accuracy: 0.6521739130434783
Test set size: (846, 301) accuracy: 0.6276595744680851
Test set neg log loss: -0.652234546128243. Probability to observe data given model: 0.5208805437600875


In [42]:
# Display the feature list returned by the greedy search.
best_features

['age_diff',
 'l5y_wins_diff',
 'l5y_losses_losses_diff',
 'all_inf_control_per_min_diff',
 'l3y_abs_head_strikes_accuracy_diff',
 'all_inf_distance_strikes_accuracy_diff',
 'l5y_wins_wins_diff',
 'l1y_inf_takedowns_attempts_per_min_diff',
 'l1y_abs_takedowns_attempts_per_min_diff',
 'l3y_fight_math_diff',
 'reach_diff',
 'l3y_abs_clinch_strikes_landed_per_min_diff',
 'l3y_inf_ground_strikes_attempts_per_min_diff',
 'l5y_losses_ko_diff',
 'all_wins_wins_diff',
 'all_losses_losses_diff',
 'l3y_losses_sub_diff',
 'all_inf_body_strikes_accuracy_diff',
 'all_wins_diff',
 'l3y_inf_takedowns_attempts_per_min_diff',
 'l3y_abs_body_strikes_accuracy_diff',
 'all_inf_clinch_strikes_landed_per_min_diff',
 'l5y_inf_body_strikes_attempts_per_min_diff',
 'l5y_inf_clinch_strikes_attempts_per_min_diff',
 'l1y_abs_knockdowns_per_min_diff',
 'l1y_abs_clinch_strikes_accuracy_diff',
 'all_abs_takedowns_attempts_per_min_diff',
 'l3y_abs_clinch_strikes_accuracy_diff',
 'l1y_inf_body_strikes_attempts_per_min

In [86]:
# Hand-curated feature set. This replaces whatever `best_features` held before
# (e.g. the greedy-search result). Order is kept exactly as curated; entries
# that are commented out were tried and then left out.
best_features = [
    # --- age / reach ---
    'age_diff',
    'reach_diff',
    # --- win / loss record columns ---
    'l5y_wins_diff',
    'l5y_losses_ko_diff',
    'all_wins_wins_diff',
    'l5y_wins_wins_diff',
    'l5y_losses_losses_diff',
    'all_losses_losses_diff',
    'l3y_losses_sub_diff',
    'l1y_wins_sub_diff',
    'l1y_wins_diff',
    # excluded: 'all_wins_diff'
    # --- fight math ---
    'l3y_fight_math_diff',
    # --- `inf_` columns ---
    'all_inf_control_per_min_diff',
    'all_inf_distance_strikes_accuracy_diff',
    'l1y_inf_takedowns_landed_per_min_diff',
    # excluded: 'l1y_inf_takedowns_attempts_per_min_diff'
    'l3y_inf_takedowns_attempts_per_min_diff',
    'l3y_inf_ground_strikes_attempts_per_min_diff',
    'all_inf_body_strikes_accuracy_diff',
    'l1y_inf_body_strikes_attempts_per_min_diff',
    'l5y_inf_body_strikes_attempts_per_min_diff',
    'all_inf_clinch_strikes_landed_per_min_diff',
    # excluded: 'l5y_inf_clinch_strikes_attempts_per_min_diff'
    'l1y_inf_total_strikes_landed_per_min_diff',
    # --- `abs_` columns ---
    'l1y_abs_knockdowns_per_min_diff',
    'l1y_abs_takedowns_attempts_per_min_diff',
    'all_abs_takedowns_attempts_per_min_diff',
    'l3y_abs_head_strikes_accuracy_diff',
    'l1y_abs_body_strikes_accuracy_diff',
    'l3y_abs_body_strikes_accuracy_diff',
    # excluded: 'l1y_abs_clinch_strikes_accuracy_diff'
    'l3y_abs_clinch_strikes_landed_per_min_diff',
    'l3y_abs_clinch_strikes_accuracy_diff',
]

In [87]:
# Re-split the same X / y with a different seed (48 instead of the 91 used earlier).
# NOTE(review): `best_features` was chosen while working with the seed-91 split,
# so this new test set will contain rows that were in the earlier training/test
# data used during feature selection. Scores on it are not a clean hold-out
# estimate — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [88]:
# Score the hand-curated feature set on the seed-48 split, without scaling.
model_test_score(X_train, X_test, best_features, _max_iter = 30000, scaled=False)

Training set size: (3381, 301) accuracy: 0.6406388642413487
Test set size: (846, 301) accuracy: 0.6524822695035462
Test set neg log loss: -0.6233637775174445. Probability to observe data given model: 0.5361379522007008
