In [1]:
import git, os, sys
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")
os.chdir(f'{git_root}/src')
sys.path.append(os.path.abspath(os.path.join(f'{git_root}/src')))
print(f'Changed working directory to {os.getcwd()}')

Changed working directory to C:\Users\Alex\OneDrive\Documents\GitHub\UFC_Prediction_2022\src


In [2]:
#getting dependencies
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn' (disables SettingWithCopyWarning)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt
import random
import sklearn
import scipy
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
import itertools

# from fight_stat_helpers import *

In [None]:
from sklearn.metrics import get_scorer_names

In [None]:
print(get_scorer_names())

In [3]:
#scores a model
def model_score(dataframe, features, iloc_val = 3200, _max_iter = 2000, scoring='neg_log_loss', scaled=True):
    yyy=dataframe['result'].iloc[0:iloc_val] # TODO we use result or fighter result depending on the dataframe... this is not ideal
    XXX=dataframe[features].iloc[0:iloc_val]
    XXXscaler = preprocessing.StandardScaler().fit(XXX)
    XXX_scaled = XXXscaler.transform(XXX) 
    X = XXX_scaled if scaled else XXX
    winPredictionModel=LogisticRegression(solver='lbfgs', max_iter=_max_iter, fit_intercept=False)
    # find the cross val score with log loss
    return cross_val_score(winPredictionModel,X,yyy,cv=4,scoring=scoring).mean()

In [4]:
ufc_fights_predictive_doubled_path = f'{git_root}/src/content/data/processed/ufc_fights_predictive_doubled.csv'
ufc_fights_predictive_doubled = pd.read_csv(ufc_fights_predictive_doubled_path)
ufc_fights_predictive_doubled[['fighter_age', 'opponent_age', 'fighter_result']][::-1].head()

# try this later
# ufc_fights_predictive_doubled_path = f'{git_root}/src/content/data/processed/ufc_fights_predictive_doubled.csv'
# ufc_fights_predictive_doubled = pd.read_csv(ufc_fights_predictive_doubled_path)

Unnamed: 0,fighter_age,opponent_age,fighter_result
16457,40.424367,25.596167,W
16456,27.893224,42.414784,W
16455,33.138946,37.295003,W
16454,37.097878,29.711157,L
16453,27.871321,37.672827,W


In [5]:
# train a model just on fighter_age and opponent_age
# first drop rows with NaN in fighter_age or opponent_age
age_df = ufc_fights_predictive_doubled.dropna(subset=['fighter_age', 'opponent_age', 'fighter_result']).copy()
age_df['result'] = age_df['fighter_result']  # rename for consistency with model_score function
# add difference columns
age_df['age_diff'] = age_df['fighter_age'] - age_df['opponent_age']

# check the model score
features = ['fighter_age', 'opponent_age']
score = model_score(age_df, features, iloc_val=3200, _max_iter=2000, scoring='accuracy', scaled=True)
print(f'Score for fighter_age and opponent_age: {score}')

# check the model score
features = ['age_diff']
score = model_score(age_df, features, iloc_val=3200, _max_iter=2000, scoring='accuracy', scaled=True)
print(f'Score for fighter_age and opponent_age with diffs: {score}')


Score for fighter_age and opponent_age: 0.485625
Score for fighter_age and opponent_age with diffs: 0.5390625


In [12]:
# check this by grabbing the age diff from the predictive diffs dataframe
ufc_fights_predictive_diffs_path = f'{git_root}/src/content/data/processed/ufc_fights_predictive_flattened_diffs.csv'
ufc_fights_predictive_diffs = pd.read_csv(ufc_fights_predictive_diffs_path)
df = ufc_fights_predictive_diffs.dropna(subset=['age_diff', 'result']).copy()
features = ['age_diff']
score = model_score(df, features, iloc_val=3200, _max_iter=2000, scoring='accuracy', scaled=True)
print(f'Score for age_diff from predictive diffs: {score}')

Score for age_diff from predictive diffs: 0.54125


In [13]:
# filter by division
# print out all unique divisions
print(ufc_fights_predictive_doubled['fighter_division'].unique())

['Open Weight' 'Lightweight' 'Heavyweight' 'Middleweight' 'Welterweight'
 'Super Heavyweight' 'Light Heavyweight' 'Catch Weight' 'Featherweight'
 'Bantamweight' 'Flyweight' "Women's Bantamweight" "Women's Strawweight"
 "Women's Flyweight" "Women's Featherweight"]


In [14]:
# specify the divisions we care about
divisions = [
    'Flyweight',
    'Bantamweight',
    'Featherweight',
    'Lightweight',
    'Welterweight',
    'Middleweight',
    'Light Heavyweight',
    'Heavyweight',
    "Women's Bantamweight", 
    "Women's Strawweight",
    "Women's Flyweight",
    "Women's Featherweight",
]

In [15]:
# get counts of fights per division
division_counts = ufc_fights_predictive_doubled['fighter_division'].value_counts()
print(division_counts)

fighter_division
Lightweight              2794
Welterweight             2686
Middleweight             2172
Featherweight            1620
Heavyweight              1472
Bantamweight             1460
Light Heavyweight        1432
Flyweight                 760
Women's Strawweight       684
Women's Flyweight         514
Women's Bantamweight      454
Open Weight               204
Catch Weight              144
Women's Featherweight      60
Super Heavyweight           2
Name: count, dtype: int64


In [18]:
df = ufc_fights_predictive_doubled[ufc_fights_predictive_doubled['fighter_division'] == 'Lightweight'].copy()
age_cols_dict = {}
age_cols_diffs_dict = {}
for optimal_age in range(18, 40):
    fighter_age_over_optimal = df['fighter_age'] - optimal_age
    # take the min of this and 0
    fighter_age_over_optimal = fighter_age_over_optimal.clip(lower=0)
    opponent_age_over_optimal = df['opponent_age'] - optimal_age
    opponent_age_over_optimal = opponent_age_over_optimal.clip(lower=0)
    age_cols_dict[f'fighter_age_over_optimal_{optimal_age}'] = fighter_age_over_optimal
    age_cols_dict[f'opponent_age_over_optimal_{optimal_age}'] = opponent_age_over_optimal
    # do the same for under optimal
    fighter_age_under_optimal = optimal_age - df['fighter_age']
    fighter_age_under_optimal = fighter_age_under_optimal.clip(lower=0)
    opponent_age_under_optimal = optimal_age - df['opponent_age']
    opponent_age_under_optimal = opponent_age_under_optimal.clip(lower=0)
    age_cols_dict[f'fighter_age_under_optimal_{optimal_age}'] = fighter_age_under_optimal
    age_cols_dict[f'opponent_age_under_optimal_{optimal_age}'] = opponent_age_under_optimal
    # add the diffs
    age_cols_diffs_dict[f'fighter_age_over_optimal_{optimal_age}_diff'] = fighter_age_over_optimal - opponent_age_over_optimal
    age_cols_diffs_dict[f'fighter_age_under_optimal_{optimal_age}_diff'] = fighter_age_under_optimal - opponent_age_under_optimal
    
# make a dataframe from the age_cols_dict
age_cols_df = pd.DataFrame(age_cols_diffs_dict)
age_cols_df['result'] = df['fighter_result']
# remove rows with nan
age_cols_df = age_cols_df.dropna()
best_score = 0
best_optimal_age = None

for optimal_age in range(18, 40):
    # score a model with the diffs features
    features = [f'fighter_age_over_optimal_{optimal_age}_diff', f'fighter_age_under_optimal_{optimal_age}_diff']
    score = model_score(age_cols_df, features, iloc_val=3200, _max_iter=2000, scoring='accuracy', scaled=True)
    print(f'Optimal age: {optimal_age}, Score: {score}, Features: {features}')
    if score > best_score:
        best_score = score
        best_optimal_age = optimal_age
        print(f'New best score: {best_score} with optimal age {optimal_age} and features {features}')


Optimal age: 18, Score: 0.5872434017595308, Features: ['fighter_age_over_optimal_18_diff', 'fighter_age_under_optimal_18_diff']
New best score: 0.5872434017595308 with optimal age 18 and features ['fighter_age_over_optimal_18_diff', 'fighter_age_under_optimal_18_diff']
Optimal age: 19, Score: 0.5872434017595308, Features: ['fighter_age_over_optimal_19_diff', 'fighter_age_under_optimal_19_diff']
Optimal age: 20, Score: 0.5872434017595308, Features: ['fighter_age_over_optimal_20_diff', 'fighter_age_under_optimal_20_diff']
Optimal age: 21, Score: 0.5865102639296188, Features: ['fighter_age_over_optimal_21_diff', 'fighter_age_under_optimal_21_diff']
Optimal age: 22, Score: 0.5872434017595308, Features: ['fighter_age_over_optimal_22_diff', 'fighter_age_under_optimal_22_diff']
Optimal age: 23, Score: 0.5854105571847508, Features: ['fighter_age_over_optimal_23_diff', 'fighter_age_under_optimal_23_diff']
Optimal age: 24, Score: 0.5791788856304986, Features: ['fighter_age_over_optimal_24_diff',

In [17]:
df = ufc_fights_predictive_doubled[ufc_fights_predictive_doubled['fighter_division'] == 'Lightweight'].copy()
age_cols_dict = {}
age_cols_diffs_dict = {}
for optimal_age in range(18, 40):
    fighter_age_dist_from_optimal = (df['fighter_age'] - optimal_age ).abs()
    opponent_age_dist_from_optimal = (df['opponent_age'] - optimal_age ).abs()
    age_cols_diffs_dict[f'fighter_age_dist_from_optimal_{optimal_age}_diff'] = fighter_age_dist_from_optimal - opponent_age_dist_from_optimal
    
# make a dataframe from the age_cols_dict
age_cols_df = pd.DataFrame(age_cols_diffs_dict)
age_cols_df['result'] = df['fighter_result']
# remove rows with nan
age_cols_df = age_cols_df.dropna()
best_score = 0
best_optimal_age = None

for optimal_age in range(18, 40):
    # score a model with the diffs features
    features = [f'fighter_age_dist_from_optimal_{optimal_age}_diff']
    score = model_score(age_cols_df, features, iloc_val=3200, _max_iter=2000, scoring='accuracy', scaled=True)
    print(f'Optimal age: {optimal_age}, Score: {score}, Features: {features}')
    if score > best_score:
        best_score = score
        best_optimal_age = optimal_age
        print(f'New best score: {best_score} with optimal age {optimal_age} and features {features}')


Optimal age: 18, Score: 0.5872434017595308, Features: ['fighter_age_dist_from_optimal_18_diff']
New best score: 0.5872434017595308 with optimal age 18 and features ['fighter_age_dist_from_optimal_18_diff']
Optimal age: 19, Score: 0.5872434017595308, Features: ['fighter_age_dist_from_optimal_19_diff']
Optimal age: 20, Score: 0.5872434017595308, Features: ['fighter_age_dist_from_optimal_20_diff']
Optimal age: 21, Score: 0.5872434017595308, Features: ['fighter_age_dist_from_optimal_21_diff']
Optimal age: 22, Score: 0.5879765395894428, Features: ['fighter_age_dist_from_optimal_22_diff']
New best score: 0.5879765395894428 with optimal age 22 and features ['fighter_age_dist_from_optimal_22_diff']
Optimal age: 23, Score: 0.5865102639296188, Features: ['fighter_age_dist_from_optimal_23_diff']
Optimal age: 24, Score: 0.5821114369501467, Features: ['fighter_age_dist_from_optimal_24_diff']
Optimal age: 25, Score: 0.5769794721407625, Features: ['fighter_age_dist_from_optimal_25_diff']
Optimal age:

# TODO figure out whats going on with this... it should help to use an optimal age but these do not look right...