In [1]:

# standard library (os, sys) and third-party (git = GitPython) imports
import git, os, sys

# Locate the enclosing git repository by searching upward from the current
# directory, so the notebook can be launched from any folder inside the repo.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

# Build the src path once and reuse it for both the chdir and sys.path.
src_dir = os.path.abspath(os.path.join(git_root, 'src'))
os.chdir(src_dir)
# Re-running this cell in the same kernel must not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)
print(f'Changed working directory to {os.getcwd()}')

# local imports
from fight_stat_helpers import *
from data_handler import DataHandler

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


# TODO: import these functions from fight_stat_helpers, as is done in the women's notebook

# NEW FEATURES

- Dominance Score for a single fight: (total score of fighter based on events in the fight) - (total score of opponent based on events in the fight)
- Average Dominance over time scales
- Fighter Scores
- Fight Math
- Fighter Score increments based on closeness of fights with opponents with high scores 

# Other things to try
- with or without regularization
- with or without scaling
- with or without sum features (seems like only age needs sum features)
- with or without squared features (seems like only age needs squared features)
- Decide whether to include SDEC in bullshit or not
- Fit a linear regression on draws and SDEC fights and compare the resulting plane with the logistic-regression decision boundary (hypothesis: they are the same plane, since close fights should lie near the decision boundary)

In [2]:
# Instantiate the project's DataHandler (imported from data_handler above).
dh = DataHandler()
# Fetch the dataset stored under the name 'ufc_fights_reported_derived_doubled';
# it is the input to the flattened-diffs feature construction in the next cell.
ufc_fights_reported_derived_doubled = dh.get('ufc_fights_reported_derived_doubled')

In [3]:
# Build the flattened-diffs table from the doubled data and pass it straight
# through the winner-prediction cleaning step, binding the name only once.
ufc_fights_predictive_flattened_diffs = dh.clean_ufc_fights_for_winner_prediction(
    dh.make_ufc_fights_predictive_flattened_diffs(ufc_fights_reported_derived_doubled)
)

In [4]:
# Show the number of rows per division; useful context for the division
# filter applied in the next cell.
print(ufc_fights_predictive_flattened_diffs['division'].value_counts())

division
Lightweight              1059
Welterweight             1015
Middleweight              819
Featherweight             668
Bantamweight              590
Light Heavyweight         576
Heavyweight               524
Flyweight                 304
Women's Strawweight       273
Women's Flyweight         222
Women's Bantamweight      178
Catch Weight               62
Women's Featherweight      26
Name: count, dtype: int64


In [5]:
# Filter by division: keep only Welterweight, Middleweight, Light Heavyweight
# and Heavyweight rows.
welter_to_heavy_divisions = ['Welterweight', 'Middleweight', 'Light Heavyweight', 'Heavyweight']
division_mask = ufc_fights_predictive_flattened_diffs['division'].isin(welter_to_heavy_divisions)
ufc_fights_predictive_flattened_diffs_welter_heavy = ufc_fights_predictive_flattened_diffs.loc[division_mask]
# Display (rows, columns) of the filtered table.
ufc_fights_predictive_flattened_diffs_welter_heavy.shape

(2934, 325)

In [6]:
# Drop the fighter/opponent name columns plus 'method' and 'division';
# none of them are used as model inputs.
non_feature_columns = ['fighter', 'opponent', 'method', 'division']
X = ufc_fights_predictive_flattened_diffs_welter_heavy.drop(columns=non_feature_columns)
# 'result' is deliberately KEPT in X: per the original note, the scoring helper
# expects to find it there, for better or for worse.
y = X['result']

In [7]:
# NOTE(review): consider moving this import into the import cell at the top of the notebook.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs. X still contains the 'result' column at this point,
# so X_train and X_test both carry the label alongside the features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=67)

# Use the entire data set with all features as a baseline

In [8]:
# Baseline feature list: every column of the training frame except the
# 'result' label, which travels inside X_train/X_test but is not a feature.
all_features = [column for column in X_train.columns if column != 'result']
model_test_score(X_train, X_test, all_features, _max_iter = 20000, scaled=True)

Training set size: (2347, 321) accuracy: 0.6774605879846612
Test set size: (587, 321) accuracy: 0.5996592844974447
Test set neg log loss: -0.6972910848541268. Probability to observe data given model: 0.49793233491483996


In [9]:
# Greedy additive feature selection: the log below shows it starting from an
# empty feature set and adding one feature at a time.
# NOTE(review): search_doubles=True presumably also evaluates pairs of
# features — confirm against additive_greedy in fight_stat_helpers.
best_features = additive_greedy(X_train, X_test, search_doubles=True, _max_iter=30000)

Starting with an empty feature set.
Added single feature: l5y_overall_fighter_score_diff
Negative log loss on training set: -0.6804388011790548
Training set size: (2347, 321) accuracy: 0.5807413719642096
Test set size: (587, 321) accuracy: 0.5911413969335605
Test set neg log loss: -0.6894062793408626. Probability to observe data given model: 0.501873953561728
Added single feature: l5y_wins_wins_diff
Negative log loss on training set: -0.6727661101756038
Training set size: (2347, 321) accuracy: 0.5986365573072007
Test set size: (587, 321) accuracy: 0.5877342419080068
Test set neg log loss: -0.6913456292986958. Probability to observe data given model: 0.5009015875148386
Added single feature: l3y_wins_diff
Negative log loss on training set: -0.6646719559899675
Training set size: (2347, 321) accuracy: 0.6011930123561994
Test set size: (587, 321) accuracy: 0.5809199318568995
Test set neg log loss: -0.6853424084969709. Probability to observe data given model: 0.5039176543442778
Added single 

In [10]:
# Score the model again using only the greedily selected features, so the
# result can be compared with the all-features baseline above.
model_test_score(X_train, X_test, best_features, _max_iter = 30000, scaled=True)

Training set size: (2347, 321) accuracy: 0.6553046442266723
Test set size: (587, 321) accuracy: 0.6013628620102215
Test set neg log loss: -0.6809429903208845. Probability to observe data given model: 0.5061394826243101
