In [2]:

# standard library and third-party imports (git is provided by the GitPython package)
import git, os, sys

# Locate the repository root from the current directory (searching parent directories),
# so the notebook can be launched from any folder inside the repo.
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

# Build the src path once and use the same value for both chdir and sys.path.
src_dir = os.path.abspath(os.path.join(git_root, 'src'))
os.chdir(src_dir)
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)
print(f'Changed working directory to {os.getcwd()}')

# local imports
from fight_stat_helpers import *
from data_handler import DataHandler

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


In [3]:
# Instantiate the project's DataHandler (imported from data_handler); the same instance is
# used below to fetch the raw table and to build / clean the modelling table.
dh = DataHandler()
# get reported derived doubled data
# NOTE(review): "doubled" presumably means every fight appears once per fighter
# perspective — confirm in DataHandler.
ufc_fights_reported_derived_doubled = dh.get('ufc_fights_reported_derived_doubled')

In [4]:
# Restrict the data to fights from the past 10 years.
# The 'date' column is parsed to datetimes first so it can be compared with the cutoff.
ufc_fights_reported_derived_doubled['date'] = pd.to_datetime(
    ufc_fights_reported_derived_doubled['date']
)
# The cutoff is relative to "now", so the number of rows kept depends on the day the cell is run.
date_10_years_ago = pd.Timestamp.now() - pd.DateOffset(years=10)
ufc_fights_reported_derived_doubled = ufc_fights_reported_derived_doubled.loc[
    lambda fights: fights['date'] >= date_10_years_ago
]
ufc_fights_reported_derived_doubled.shape

(9862, 326)

In [5]:
# Build the modelling table from the date-filtered fights.
# NOTE(review): judging by the method name and the '*_diff' feature names used later, this
# presumably yields one row per fight with fighter-vs-opponent stat differences — confirm
# in DataHandler.make_ufc_fights_predictive_flattened_diffs.
ufc_fights_predictive_flattened_diffs = dh.make_ufc_fights_predictive_flattened_diffs(ufc_fights_reported_derived_doubled)
# Second pass: project-specific cleaning for the winner-prediction task; the result replaces
# the table built above (see DataHandler.clean_ufc_fights_for_winner_prediction for the rules).
ufc_fights_predictive_flattened_diffs = dh.clean_ufc_fights_for_winner_prediction(ufc_fights_predictive_flattened_diffs)

In [6]:
# Hand-picked feature subset; it scored about 64 % accuracy on a test set.
# Names that are commented out are candidates currently excluded from the set.
# The order of the entries is kept as-is because it fixes the column (and coefficient) order.
# NOTE(review): the l1y / l3y / l5y / all prefixes look like look-back windows and the
# inf / abs infixes presumably mean inflicted / absorbed — confirm against the feature builder.
amazing_feature_set = [
    # --- physical attributes ---
    'age_diff',
    'reach_diff',
    # --- win / loss record ---
    'l5y_wins_diff',
    'l5y_losses_ko_diff',
    'all_wins_wins_diff',
    'l5y_wins_wins_diff',
    'l5y_losses_losses_diff',
    'all_losses_losses_diff',
    'l3y_losses_sub_diff',
    'l1y_wins_sub_diff',
    'l1y_wins_diff',
    # 'all_wins_diff',
    'l3y_fight_math_diff',
    # --- *_inf_* statistics ---
    'all_inf_control_per_min_diff',
    'all_inf_distance_strikes_accuracy_diff',
    'l1y_inf_takedowns_landed_per_min_diff',
    # 'l1y_inf_takedowns_attempts_per_min_diff',
    'l3y_inf_takedowns_attempts_per_min_diff',
    'l3y_inf_ground_strikes_attempts_per_min_diff',
    'all_inf_body_strikes_accuracy_diff',
    'l1y_inf_body_strikes_attempts_per_min_diff',
    'l5y_inf_body_strikes_attempts_per_min_diff',
    'all_inf_clinch_strikes_landed_per_min_diff',
    # 'l5y_inf_clinch_strikes_attempts_per_min_diff',
    'l1y_inf_total_strikes_landed_per_min_diff',
    # --- *_abs_* statistics ---
    'l1y_abs_knockdowns_per_min_diff',
    'l1y_abs_takedowns_attempts_per_min_diff',
    'all_abs_takedowns_attempts_per_min_diff',
    'l3y_abs_head_strikes_accuracy_diff',
    'l1y_abs_body_strikes_accuracy_diff',
    'l3y_abs_body_strikes_accuracy_diff',
    # 'l1y_abs_clinch_strikes_accuracy_diff',
    'l3y_abs_clinch_strikes_landed_per_min_diff',
    'l3y_abs_clinch_strikes_accuracy_diff',
]

In [40]:
# X is the full modelling table (an alias of the frame above, not a copy). Nothing is dropped
# here: the non-feature columns ('result', 'fighter', 'opponent', 'method', 'division') stay in
# X and are removed later, only where a purely numeric frame is needed.
X = ufc_fights_predictive_flattened_diffs
# KEEP result in X as this is what model_score is expecting for better or for worse
# (model_test_score is called below without a separate target, so it presumably reads
# 'result' from the frames it is given — confirm in fight_stat_helpers).
y = X['result']

In [41]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows as a test set; random_state is fixed so the split is the same on
# every run. X still contains the 'result' column, so X_train / X_test also carry the labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

# Baseline: score the selected feature set on the entire test set (all divisions)

In [42]:
# Baseline run on the full train / test split, restricted to amazing_feature_set.
# The helper prints train / test accuracy and the test neg log loss, and returns the
# (train accuracy, test accuracy) pair shown as the cell output.
# NOTE(review): _max_iter and scaled are presumably the solver iteration cap and a
# feature-standardisation switch — confirm in fight_stat_helpers.
model_test_score(X_train, X_test, amazing_feature_set, _max_iter = 20000, scaled=True)

Training set size: (3381, 325) accuracy: 0.6418219461697723
Test set size: (846, 325) accuracy: 0.6536643026004728
Test set neg log loss: -0.6360888359968085. Average probability to observe data given model: 0.5293587895040919


(0.6418219461697723, 0.6536643026004728)

In [56]:
# List the distinct values of the 'division' column in the training split; these are the
# labels used to build the division groups in the next cell.
X_train['division'].unique()

array(['Lightweight', "Women's Bantamweight", 'Bantamweight',
       'Heavyweight', 'Featherweight', 'Middleweight',
       'Light Heavyweight', "Women's Flyweight", "Women's Strawweight",
       'Welterweight', 'Flyweight', 'Catch Weight',
       "Women's Featherweight"], dtype=object)

In [68]:
# Slice the test set down to one group of weight divisions so the model can be scored on
# that group alone. Three groups are defined; `division_list` selects the one to evaluate.
heavy_division_list = ['Middleweight', 'Light Heavyweight', 'Heavyweight']
light_division_list = ['Lightweight', 'Featherweight', 'Bantamweight', 'Flyweight', 'Welterweight']
women_division_list = ['Women\'s Bantamweight', 'Women\'s Featherweight', 'Women\'s Flyweight', 'Women\'s Strawweight']
# 'Catch Weight' occurs in the data but belongs to none of the groups, so it is not in this list.
all_division_list = heavy_division_list + light_division_list + women_division_list

# Currently evaluating the women's divisions; point this at another list to switch groups.
division_list = women_division_list

# Build the row mask once so features and labels are filtered with exactly the same rows.
in_selected_division = X_test['division'].isin(division_list)
y_test_division = y_test[in_selected_division]
# Drop the label and the non-numeric identifier columns; only model inputs remain.
X_test_division = X_test[in_selected_division].drop(
    columns=['result', 'fighter', 'opponent', 'method', 'division']
)
X_test_division.shape

(117, 320)

In [69]:
# Fit a logistic regression on the hand-picked features of the FULL training set, then
# evaluate it on the division subset of the test set selected in the previous cell.
#
# Every name used here is imported explicitly so the cell does not depend on whatever
# `from fight_stat_helpers import *` happens to export.
import numpy as np
import sklearn.metrics  # also binds the top-level `sklearn` name used below
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

best_model = LogisticRegression(solver='lbfgs', max_iter=20000, C=0.1, penalty='l2', fit_intercept=False)
scaler = StandardScaler()
X_train_cleaned = X_train[amazing_feature_set]
# Fit the scaler on the training rows only and reuse its statistics for the test subset.
X_train_scaled = scaler.fit_transform(X_train_cleaned)
X_test_scaled = scaler.transform(X_test_division[amazing_feature_set])

best_model.fit(X_train_scaled, y_train)

# evaluate the model on the training set
train_score = best_model.score(X_train_scaled, y_train)
print(f'Training set size: {X_train_cleaned.shape} accuracy: {train_score}')

# evaluate the model on the test set
# NOTE: the shape printed here is that of the division subset before feature selection;
# the model itself only sees the amazing_feature_set columns.
test_score = best_model.score(X_test_scaled, y_test_division)
print(f'Test set size: {X_test_division.shape} accuracy: {test_score}')

# get the neg log loss score of the test set and convert it to a probability
y_proba_test = best_model.predict_proba(X_test_scaled)
# predict_proba columns follow best_model.classes_; passing the same labels keeps the
# metric well-defined even if the division subset happens to contain a single class.
log_loss = sklearn.metrics.log_loss(y_test_division, y_proba_test, labels=best_model.classes_)
print(f'Test set neg log loss: {-log_loss}. Average probability to observe data given model: {np.exp(-log_loss)}')

# Model parameters: one coefficient per entry of amazing_feature_set, in the same order.
theta = list(best_model.coef_[0])
# With fit_intercept=False scikit-learn sets the intercept to zero, so b is 0.0 here.
b = best_model.intercept_[0]

Training set size: (3381, 30) accuracy: 0.640047323277137
Test set size: (117, 320) accuracy: 0.6410256410256411
Test set neg log loss: -0.6303558587867399. Average probability to observe data given model: 0.5324023072532715
