In [1]:
import math
import pandas as pd
import numpy as np

# Fight-card tables; `new_fights` and `old_fights` are stacked into `fights` in the next cell.
new_fights = pd.read_csv('../data/ufc_fight_night.csv')
old_fights = pd.read_csv('../data/fight_arrangements.csv')
# Raw per-fight statistics; this file is semicolon-delimited.
df = pd.read_csv('../data/raw_total_fight_data.csv', sep=';')
# Per-fighter attributes, indexed by the `fighter_name` column.
fighter_details = pd.read_csv('../data/raw_fighter_details.csv', index_col='fighter_name')

In [2]:
# New fights come first, old fights after; each part keeps its own index for now.
fights = pd.concat([new_fights, old_fights])

In [3]:
# Renumber rows 0..n-1; the previous index is kept as an `index` column.
fights = fights.reset_index()

In [4]:
# Drop the per-fighter rate/accuracy columns from the details table.
fighter_details = fighter_details.drop(
    columns=[
        "SLpM", "Str_Acc", "SApM", "Str_Def",
        "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",
    ]
)

In [5]:
# Raw "<landed> of <attempted>" columns: for every statistic, the red-corner
# column followed by the blue-corner column.
columns = [
    corner + stat
    for stat in ('SIG_STR.', 'TOTAL_STR.', 'TD', 'HEAD', 'BODY', 'LEG',
                 'DISTANCE', 'CLINCH', 'GROUND')
    for corner in ('R_', 'B_')
]

In [6]:
attempt_suffix = '_att'
landed_suffix = '_landed'

for column in columns:
    # Each value reads "<landed> of <attempted>"; split once and reuse both halves.
    pieces = df[column].apply(lambda text: text.split('of'))
    df[column + attempt_suffix] = pieces.apply(lambda parts: int(parts[1]))
    df[column + landed_suffix] = pieces.apply(lambda parts: int(parts[0]))

# The text columns are no longer needed once their numeric parts exist.
df = df.drop(columns, axis=1)

In [7]:
# Missing winners are recorded as 'Draw'. The result is assigned back because
# fillna(inplace=True) on the selection df['Winner'] is chained assignment:
# pandas warns about it, and it no longer updates `df` under Copy-on-Write.
df['Winner'] = df['Winner'].fillna('Draw')

In [8]:
# Columns holding percentage text; they are converted with pct_to_frac below.
pct_columns = ['R_SIG_STR_pct','B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']

def pct_to_frac(X):
    """Turn percentage text such as ``'45%'`` into a fraction (``0.45``).

    The placeholder ``'---'`` (a percentage of `0 of 0`) is deliberately
    treated as zero and returned as the integer ``0``.
    """
    if X == '---':
        return 0
    return float(X.replace('%', '')) / 100

# Replace each percentage-text column with its fractional value, element by element.
for column in pct_columns:
    df[column] = df[column].map(pct_to_frac)

In [9]:
# True when the fight-type text mentions 'Title Bout'.
df['title_bout'] = df['Fight_type'].apply(lambda X: 'Title Bout' in X)

In [10]:
def make_weight_class(X, classes=None):
    """Map a raw ``Fight_type`` label to a weight-class name.

    Parameters
    ----------
    X : str
        Fight-type text.
    classes : sequence of str, optional
        Candidate names, tested in order with a substring check; the first
        one contained in ``X`` is returned. Defaults to the module-level
        ``weight_classes`` list, looked up when the function is called.

    Returns
    -------
    str
        The first matching candidate; ``'Catch Weight'`` for the two
        catch-weight spellings; ``'Open Weight'`` for anything else.
    """
    if classes is None:
        classes = weight_classes
    for weight_class in classes:
        if weight_class in X:
            return weight_class
    # The former test `X == 'Catch Weight Bout' or 'Catchweight Bout'` was always
    # truthy (a non-empty string), so the 'Open Weight' fallback was unreachable.
    if X in ('Catch Weight Bout', 'Catchweight Bout'):
        return 'Catch Weight'
    return 'Open Weight'

In [11]:
# Order matters: make_weight_class returns the first name contained in the text,
# so longer names must come before the shorter names they contain.
weight_classes = [
    "Women's Strawweight", "Women's Bantamweight",
    "Women's Featherweight", "Women's Flyweight",
    "Lightweight", "Welterweight", "Middleweight",
    "Light Heavyweight", "Heavyweight",
    "Featherweight", "Bantamweight", "Flyweight",
    "Open Weight",
]

df['weight_class'] = df['Fight_type'].map(make_weight_class)

In [12]:
# `Fight_type` has been distilled into `title_bout` and `weight_class`.
df = df.drop(columns=['Fight_type'])

In [13]:
# Seconds per round for every format whose completed rounds are all charged at
# one duration (get_total_time multiplies this by the number of completed rounds).
# Fixed two typos: '1 Rnd (20)' was 1*20 and '1 Rnd + OT (31-5)' was 31*5;
# every other entry is <minutes>*60.
time_in_first_round = {
    '3 Rnd (5-5-5)': 5*60,
    '5 Rnd (5-5-5-5-5)': 5*60,
    '1 Rnd + OT (12-3)': 12*60,
    # NOTE(review): placeholder value; presumably never multiplied by a non-zero
    # round count because such fights end in round 1 — confirm against the data.
    'No Time Limit': 1,
    '3 Rnd + OT (5-5-5-5)': 5*60,
    '1 Rnd (20)': 20*60,
    '2 Rnd (5-5)': 5*60,
    '1 Rnd (15)': 15*60,
    '1 Rnd (10)': 10*60,
    '1 Rnd (12)': 12*60,
    '1 Rnd + OT (30-5)': 30*60,
    '1 Rnd (18)': 18*60,
    '1 Rnd + OT (15-3)': 15*60,
    '1 Rnd (30)': 30*60,
    '1 Rnd + OT (31-5)': 31*60,
    '1 Rnd + OT (27-3)': 27*60,
    '1 Rnd + OT (30-3)': 30*60,
}

# Formats with two overtime periods: [seconds in round one, seconds per overtime].
exception_format_time = {'1 Rnd + 2OT (15-3-3)': [15*60, 3*60], '1 Rnd + 2OT (24-3-3)': [24*60, 3*60]}

In [14]:
def _clock_to_seconds(X):
    """Convert 'M:SS' clock text to a whole number of seconds."""
    parts = X.split(':')
    return int(parts[0]) * 60 + int(parts[1])

# Converting to seconds
df['last_round_time'] = df['last_round_time'].apply(_clock_to_seconds)

In [15]:
def get_total_time(row):
    """Total seconds fought in the bout described by ``row``.

    Reads ``row['Format']``, ``row['last_round']`` and ``row['last_round_time']``.
    Formats listed in ``time_in_first_round`` charge every completed round at
    that single duration. Formats listed in ``exception_format_time`` charge the
    first entry for round one and the second entry for each later completed
    round. The time elapsed in the final round is always added on top.
    A format found in neither mapping yields ``None``.
    """
    fmt = row['Format']
    if fmt in time_in_first_round:
        return (row['last_round'] - 1) * time_in_first_round[fmt] + row['last_round_time']
    if fmt in exception_format_time:
        durations = exception_format_time[fmt]
        finished_rounds = row['last_round'] - 1
        if finished_rounds >= 2:
            # Round one in full, then one overtime length per further completed round.
            return durations[0] + (row['last_round'] - 2) * durations[1] + row['last_round_time']
        return finished_rounds * durations[0] + row['last_round_time']
    return None
    
# So if the fight ended in round 1, we only need last_round_time. 
# If it ended in round 2, we need the full time of round 1 and the last_round_time
# This works for fights with same time in each round and fights with only two rounds.

In [16]:
# Row-wise apply: get_total_time reads several columns of each row.
df['total_time_fought(seconds)'] = df.apply(get_total_time, axis=1)

In [17]:
def get_no_of_rounds(X):
    """Count the periods listed inside the parentheses of a format string.

    ``'3 Rnd (5-5-5)'`` gives 3; the special value ``'No Time Limit'`` gives 1.
    """
    if X == 'No Time Limit':
        return 1
    inside_parens = X.split('(')[1].replace(')', '')
    return len(inside_parens.split('-'))

# Number of periods listed in the Format text (1 for 'No Time Limit').
df['no_of_rounds'] = df['Format'].apply(get_no_of_rounds)

In [18]:
# Both columns have been folded into the derived time and round-count columns.
df = df.drop(columns=['Format', 'last_round_time'])

In [19]:
# Control-time columns holding 'M:SS' text or the '--' placeholder (see conv_to_sec).
CTRL_columns = ['R_CTRL','B_CTRL']

def conv_to_sec(X):
    """Convert 'M:SS' text to seconds; the '--' placeholder counts as 0 seconds."""
    if X == '--':
        # Deliberate choice: a missing control time is recorded as zero.
        return 0
    parts = X.split(':')
    return int(parts[0]) * 60 + int(parts[1])

# Add a numeric "<column>_time(seconds)" companion for each control-time column.
for column in CTRL_columns:
    df[column + '_time(seconds)'] = df[column].map(conv_to_sec)

In [20]:
# The text versions are superseded by the "_time(seconds)" columns.
df = df.drop(columns=['R_CTRL', 'B_CTRL'])

In [21]:
new_fights['R_fighter'].value_counts().index

Index(['Mike Jackson', 'Tatsuro Taira', 'Piera Rodriguez', 'Joanderson Brito',
       'Nick Maximov', 'Mana Martinez', 'Raphael Assuncao', 'Misha Cirkunov',
       'Jordan Wright', 'Cub Swanson', 'Alexa Grasso'],
      dtype='object')

In [22]:
# Rows that get placed in front of `df` further down.
new_empty_fights = pd.read_csv('../data/empty_fight_data.csv')

In [23]:
# One indicator column per distinct `win_by` value, replacing the original column.
win_by_dummies = pd.get_dummies(df['win_by'], prefix='win_by')
df = pd.concat([df, win_by_dummies], axis=1).drop(['win_by'], axis=1)

In [24]:
# new_empty_fights rows go first; ignore_index renumbers the combined rows from 0.
df = pd.concat([new_empty_fights,df], ignore_index=True)

In [25]:
# Distinct names per corner (value_counts() leaves out missing values).
red_fighters = df['R_fighter'].value_counts().index
blue_fighters = df['B_fighter'].value_counts().index

# Everyone who appears in either corner, each name once.
fighters = list(set(red_fighters).union(blue_fighters))

In [26]:
# Statistics that exist once per side.
_per_side_stats = ['KD', 'SIG_STR_pct', 'TD_pct', 'SUB_ATT', 'REV']
# Statistics that were split into "_att" / "_landed" pairs.
_att_landed_stats = ['SIG_STR.', 'TOTAL_STR.', 'TD', 'HEAD', 'BODY', 'LEG',
                     'DISTANCE', 'CLINCH', 'GROUND']

# Column order: hero before opp for every statistic, attempts before landed.
Numerical_columns = (
    [side + stat for stat in _per_side_stats for side in ('hero_', 'opp_')]
    + [side + stat + suffix
       for stat in _att_landed_stats
       for side in ('hero_', 'opp_')
       for suffix in ('_att', '_landed')]
    + ['hero_CTRL_time(seconds)', 'opp_CTRL_time(seconds)',
       'total_time_fought(seconds)']
)

Categorical_columns = ['win_by', 'last_round', 'Winner', 'title_bout']

In [27]:
import re

def lreplace(pattern, sub, string):
    """
    Return 'string' with a leading match of 'pattern' swapped for 'sub'.

    'pattern' is spliced into a regular expression anchored at the start of
    'string', so it is interpreted as a regex. 'string' comes back unchanged
    when it does not start with a match.
    """
    anchored = re.compile('^%s' % pattern)
    return anchored.sub(sub, string)

In [28]:
# Group the fights by the name in each corner; get_fighter_red / get_fighter_blue
# pull one fighter's rows out of these with get_group().
red = df.groupby('R_fighter')
blue = df.groupby('B_fighter')

In [29]:
def get_fighter_red(fighter_name):
    """Return the rows where ``fighter_name`` is the red-corner fighter, relabelled.

    Columns starting with ``R_`` are renamed to ``hero_*`` and columns starting
    with ``B_`` to ``opp_*``; every other column keeps its name.

    Returns ``None`` when the module-level ``red`` groupby has no group for
    ``fighter_name``.
    """
    try:
        fighter_red = red.get_group(fighter_name)
    except KeyError:
        # Only "no such group" is expected here; the former bare `except:` also
        # hid typos, interrupts and every other genuine failure.
        return None
    rename_columns = {}
    for column in fighter_red.columns:
        if column.startswith('R_'):
            rename_columns[column] = 'hero_' + column[len('R_'):]
        elif column.startswith('B_'):
            rename_columns[column] = 'opp_' + column[len('B_'):]
    return fighter_red.rename(rename_columns, axis='columns')

In [30]:
def get_fighter_blue(fighter_name):
    """Return the rows where ``fighter_name`` is the blue-corner fighter, relabelled.

    Columns starting with ``B_`` are renamed to ``hero_*`` and columns starting
    with ``R_`` to ``opp_*``; every other column keeps its name.

    Returns ``None`` when the module-level ``blue`` groupby has no group for
    ``fighter_name``.
    """
    try:
        fighter_blue = blue.get_group(fighter_name)
    except KeyError:
        # Only "no such group" is expected here; the former bare `except:` also
        # hid typos, interrupts and every other genuine failure.
        return None
    rename_columns = {}
    for column in fighter_blue.columns:
        if column.startswith('B_'):
            rename_columns[column] = 'hero_' + column[len('B_'):]
        elif column.startswith('R_'):
            rename_columns[column] = 'opp_' + column[len('R_'):]
    return fighter_blue.rename(rename_columns, axis='columns')

In [31]:
def get_result_stats(result_list):
    """Summarise a record from a list of results ordered newest first.

    Parameters
    ----------
    result_list : list of str
        Outcomes from most recent to oldest. ``'hero'`` counts as a win,
        ``'opp'`` as a loss and ``'draw'`` as a draw; any other value is
        ignored. The list is not modified.

    Returns
    -------
    tuple
        ``(current_win_streak, current_lose_streak, longest_win_streak,
        wins, losses, draw)``.
    """
    current_win_streak = 0
    current_lose_streak = 0
    longest_win_streak = 0
    wins = 0
    losses = 0
    draw = 0
    # Walk oldest -> newest through a reversed view; the previous in-place
    # result_list.reverse() silently flipped the caller's list as a side effect.
    for result in reversed(result_list):
        if result == 'hero':
            wins += 1
            current_win_streak += 1
            current_lose_streak = 0
            longest_win_streak = max(longest_win_streak, current_win_streak)
        elif result == 'opp':
            losses += 1
            current_win_streak = 0
            current_lose_streak += 1
        elif result == 'draw':
            # A draw ends both kinds of streak.
            draw += 1
            current_lose_streak = 0
            current_win_streak = 0

    return current_win_streak, current_lose_streak, longest_win_streak, wins, losses, draw

In [32]:
# Indicator columns (created by get_dummies with prefix 'win_by') that are
# totalled per fighter.
win_by_columns = [
    "win_by_Decision - Majority",
    "win_by_Decision - Split",
    "win_by_Decision - Unanimous",
    "win_by_KO/TKO",
    "win_by_Submission",
    "win_by_TKO - Doctor's Stoppage",
]

In [33]:
result_stats = ['current_win_streak', 'current_lose_streak', 'longest_win_streak', 'wins', 'losses', 'draw']


def _label_outcome(winner, fighter_name):
    """Label a raw ``Winner`` value from ``fighter_name``'s point of view."""
    if winner == fighter_name:
        return 'hero'
    # 'Draw' is the value written into missing ``Winner`` entries earlier in the
    # notebook. It used to fall through to 'opp', so every draw counted as a loss
    # and the 'draw' branch of get_result_stats could never fire.
    if winner == 'Draw':
        return 'draw'
    return 'opp'


# One-row frames are collected in plain lists and concatenated once at the end,
# instead of re-copying an ever-growing DataFrame on every iteration.
blue_rows = []
red_rows = []

for fighter_name in fighters:
    fighter_red = get_fighter_red(fighter_name)
    fighter_blue = get_fighter_blue(fighter_name)

    if fighter_red is None:
        fighter = fighter_blue
    elif fighter_blue is None:
        fighter = fighter_red
    else:
        fighter = pd.concat([fighter_red, fighter_blue]).sort_index()

    # assign() builds a new frame rather than writing into a get_group() result.
    fighter = fighter.assign(
        Winner=fighter['Winner'].apply(_label_outcome, args=(fighter_name,))
    )

    for i, index in enumerate(fighter.index):
        # Rows after position i, re-sorted by descending index so the exponentially
        # weighted mean below ends (tail(1)) on the row nearest the current one.
        fighter_slice = fighter[(i+1):].sort_index(ascending=False)
        s = fighter_slice[Numerical_columns].ewm(span=3, adjust=False).mean().tail(1)
        if len(s) == 0:
            # No earlier rows: keep one all-NaN row so this fight still gets a line.
            # (np.nan, because the np.NaN alias no longer exists in NumPy 2.0.)
            s.loc[0] = [np.nan] * len(s.columns)
        s['total_rounds_fought'] = fighter_slice['last_round'].sum()
        s['total_title_bouts'] = fighter_slice[fighter_slice['title_bout']==True]['title_bout'].count()
        s['hero_fighter'] = fighter_name
        # get_result_stats reverses its input before walking it, so it must be
        # given the opposite order of fighter_slice; passing fighter_slice as-is
        # made the "current" streaks come from the wrong end of the history.
        results = get_result_stats(fighter_slice['Winner'].tolist()[::-1])
        for result_stat, result in zip(result_stats, results):
            s[result_stat] = result
        win_by_results = fighter_slice[fighter_slice['Winner'] == 'hero'][win_by_columns].sum()
        for win_by_column, win_by_result in zip(win_by_columns, win_by_results):
            s[win_by_column] = win_by_result
        s.index = [index]

        # File the row under the corner this fighter occupied in that fight.
        if fighter_blue is not None and index in fighter_blue.index:
            blue_rows.append(s)
        elif fighter_red is not None and index in fighter_red.index:
            red_rows.append(s)

temp_blue_frame = pd.concat(blue_rows) if blue_rows else pd.DataFrame()
temp_red_frame = pd.concat(red_rows) if red_rows else pd.DataFrame()

In [34]:
def convert_to_cms(X):
    """Convert a length string to centimetres.

    Accepts feet-and-inches text such as ``5' 11"`` or inches-only text such
    as ``72"``. Missing values (NaN / None) are returned unchanged.
    """
    # pd.isna replaces `X is np.NaN`: the identity test misses NaN objects that
    # are not the numpy singleton, and the np.NaN alias was removed in NumPy 2.0.
    if pd.isna(X):
        return X
    parts = X.split("'")
    if len(parts) == 2:
        feet = float(parts[0])
        inches = int(parts[1].replace(' ', '').replace('"', ''))
        return (feet * 30.48) + (inches * 2.54)
    return float(X.replace('"', '')) * 2.54

In [35]:
temp_blue_frame

Unnamed: 0,hero_KD,opp_KD,hero_SIG_STR_pct,opp_SIG_STR_pct,hero_TD_pct,opp_TD_pct,hero_SUB_ATT,opp_SUB_ATT,hero_REV,opp_REV,...,longest_win_streak,wins,losses,draw,win_by_Decision - Majority,win_by_Decision - Split,win_by_Decision - Unanimous,win_by_KO/TKO,win_by_Submission,win_by_TKO - Doctor's Stoppage
6340,,,,,,,,,,,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5900,0.0625,1.009766,0.491013,0.438183,0.401138,0.047189,0.829254,0.345703,0.0,0.3125,...,4,11,5,0,0.0,0.0,2.0,4.0,4.0,1.0
5961,0.1250,0.019531,0.552026,0.436366,0.472275,0.094377,1.658508,0.691406,0.0,0.6250,...,4,11,4,0,0.0,0.0,2.0,4.0,4.0,1.0
6345,0.5000,0.078125,0.428103,0.365464,0.139102,0.377510,0.634033,0.765625,0.0,0.5000,...,4,10,3,0,0.0,0.0,2.0,4.0,3.0,1.0
6370,0.0000,0.156250,0.596206,0.380928,0.078203,0.755020,0.268066,0.531250,0.0,0.0000,...,4,10,2,0,0.0,0.0,2.0,4.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,0.0000,0.000000,0.580000,0.570000,0.400000,0.500000,1.000000,0.000000,0.0,0.0000,...,1,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0
3903,,,,,,,,,,,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5374,0.0000,0.500000,0.350000,0.515000,0.000000,0.355000,1.000000,1.000000,1.5,0.0000,...,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0
5460,0.0000,1.000000,0.160000,0.420000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,...,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Height and reach share one text-to-centimetre converter.
for source_column, target_column in (('Height', 'Height_cms'), ('Reach', 'Reach_cms')):
    fighter_details[target_column] = fighter_details[source_column].map(convert_to_cms)

In [37]:
# Strip the ' lbs.' suffix and convert to float; missing weights stay missing.
# pd.isna replaces the `X is not np.NaN` identity test, which misses NaN objects
# other than the numpy singleton and relies on an alias removed in NumPy 2.0.
fighter_details['Weight_lbs'] = fighter_details['Weight'].apply(
    lambda X: X if pd.isna(X) else float(X.replace(' lbs.', ''))
)

In [38]:
# The text columns are replaced by Height_cms, Reach_cms and Weight_lbs.
fighter_details = fighter_details.drop(['Height', 'Weight', 'Reach'], axis=1)

In [39]:
# Turn each index into an ordinary column so the merges below can use it
# (`fighter_name` on the details table, `index` on the two corner frames).
fighter_details = fighter_details.reset_index()
temp_red_frame = temp_red_frame.reset_index()
temp_blue_frame = temp_blue_frame.reset_index()

In [40]:
# Left merge keeps every blue-corner row even without a matching details entry;
# the `index` column is then restored as the index.
temp_blue_frame = (
    temp_blue_frame
    .merge(fighter_details, left_on='hero_fighter', right_on='fighter_name', how='left')
    .set_index('index')
)

In [41]:
# Left merge keeps every red-corner row even without a matching details entry;
# the `index` column is then restored as the index.
temp_red_frame = (
    temp_red_frame
    .merge(fighter_details, left_on='hero_fighter', right_on='fighter_name', how='left')
    .set_index('index')
)

In [42]:
temp_red_frame

Unnamed: 0_level_0,hero_KD,opp_KD,hero_SIG_STR_pct,opp_SIG_STR_pct,hero_TD_pct,opp_TD_pct,hero_SUB_ATT,opp_SUB_ATT,hero_REV,opp_REV,...,win_by_Decision - Unanimous,win_by_KO/TKO,win_by_Submission,win_by_TKO - Doctor's Stoppage,fighter_name,Stance,DOB,Height_cms,Reach_cms,Weight_lbs
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6278,0.25,0.039062,0.524052,0.372732,0.444551,0.188755,1.317017,0.382812,0.0,1.25,...,2.0,4.0,3.0,1.0,Evan Tanner,Orthodox,"Feb 11, 1971",182.88,187.96,185.0
6391,0.00,0.312500,0.522412,0.321855,0.156406,0.510039,0.536133,0.062500,0.0,0.00,...,2.0,3.0,3.0,1.0,Evan Tanner,Orthodox,"Feb 11, 1971",182.88,187.96,185.0
6403,0.00,0.625000,0.694824,0.433711,0.312812,0.020078,0.072266,0.125000,0.0,0.00,...,2.0,3.0,2.0,1.0,Evan Tanner,Orthodox,"Feb 11, 1971",182.88,187.96,185.0
6416,0.00,1.250000,0.679648,0.527422,0.215625,0.040156,0.144531,0.250000,0.0,0.00,...,1.0,3.0,2.0,1.0,Evan Tanner,Orthodox,"Feb 11, 1971",182.88,187.96,185.0
6439,0.00,0.500000,0.539297,0.524844,0.181250,0.080313,0.289062,0.500000,0.0,0.00,...,1.0,2.0,2.0,1.0,Evan Tanner,Orthodox,"Feb 11, 1971",182.88,187.96,185.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1875,0.00,0.000000,0.485000,0.130000,0.665000,0.000000,0.500000,0.500000,0.0,0.00,...,0.0,0.0,1.0,0.0,Mara Romero Borella,Orthodox,"Jun 03, 1986",167.64,175.26,125.0
2498,,,,,,,,,,,...,0.0,0.0,0.0,0.0,Mara Romero Borella,Orthodox,"Jun 03, 1986",167.64,175.26,125.0
703,0.00,1.000000,0.505000,0.380000,0.250000,0.000000,0.000000,0.000000,0.0,0.00,...,0.0,1.0,0.0,0.0,Dusko Todorovic,Orthodox,"May 19, 1994",185.42,187.96,185.0
3621,0.00,0.000000,0.480000,0.405000,0.200000,0.055000,0.000000,1.000000,0.0,0.00,...,1.0,0.0,0.0,0.0,Mark Eddiva,Orthodox,"Feb 16, 1986",172.72,172.72,145.0


In [43]:
# `fighter_name` only served as the merge key.
temp_blue_frame = temp_blue_frame.drop('fighter_name', axis=1)
temp_red_frame = temp_red_frame.drop('fighter_name', axis=1)

In [44]:
# Prefix every column with its corner letter so the two frames have distinct
# column names when they are joined.
blue_frame = temp_blue_frame.add_prefix('B_')
red_frame = temp_red_frame.add_prefix('R_')

In [45]:
# Outer join on the index: an index value present on only one side is kept.
frame = blue_frame.join(red_frame, how='outer')

In [46]:
# (marker, rewrite) pairs, checked in this order. Every rule whose marker occurs
# in a column name is applied to the ORIGINAL name, and a later rule overwrites
# the result of an earlier one.
_rename_rules = (
    ('hero', lambda name: name.replace('_hero_', '_avg_').replace('.', '')),
    ('opp', lambda name: name.replace('_opp_', '_avg_opp_').replace('.', '')),
    ('win_by', lambda name: name.replace(' ', '').replace('-', '_').replace("'s", '_')),
)

rename_cols = {}
for col in frame.columns:
    for marker, rewrite in _rename_rules:
        if marker in col:
            rename_cols[col] = rewrite(col)

In [47]:
# Apply the new names, then discard the fighter-name columns
# (`*_hero_fighter` becomes `*_avg_fighter` under the rename).
frame = (
    frame
    .rename(rename_cols, axis='columns')
    .drop(['R_avg_fighter', 'B_avg_fighter'], axis=1)
)

In [48]:
# Attach the per-fighter feature columns to the fight list by index (outer join).
fights = fights.join(frame, how='outer')

In [49]:
fights

Unnamed: 0,index,R_fighter,B_fighter,date,title_bout,weight_class,no_of_rounds,Referee,location,Winner,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_DOB,R_Height_cms,R_Reach_cms,R_Weight_lbs
0,0,Mike Jackson,Pete Rodriguez,"October 15, 2022",False,Welterweight,3,,,,...,0.0,0.0,0.0,0.0,0.0,Orthodox,"Mar 22, 1985",187.96,187.96,170.0
1,1,Tatsuro Taira,CJ Vergara,"October 15, 2022",False,Flyweight,3,,,,...,0.0,1.0,0.0,0.0,0.0,Orthodox,"Jan 27, 2000",170.18,177.80,125.0
2,2,Piera Rodriguez,Sam Hughes,"October 15, 2022",False,Women's Strawweight,3,,,,...,0.0,1.0,0.0,0.0,0.0,Orthodox,"Nov 11, 1992",160.02,160.02,115.0
3,3,Joanderson Brito,Lucas Alexander,"October 15, 2022",False,Featherweight,3,,,,...,0.0,0.0,1.0,0.0,0.0,Orthodox,"Feb 11, 1995",172.72,182.88,145.0
4,4,Nick Maximov,Jacob Malkoun,"October 15, 2022",False,Middleweight,3,,,,...,1.0,1.0,0.0,0.0,0.0,Southpaw,"Dec 23, 1997",182.88,193.04,185.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,6802,Orlando Wiet,Robert Lucarelli,"March 11, 1994",False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Orlando Wiet,...,0.0,0.0,0.0,0.0,0.0,Southpaw,,177.80,,170.0
6814,6803,Frank Hamaker,Thaddeus Luster,"March 11, 1994",False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Frank Hamaker,...,0.0,0.0,0.0,0.0,0.0,,,,,
6815,6804,Scott Morris,Sean Daugherty,"March 11, 1994",False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Scott Morris,...,0.0,0.0,0.0,0.0,0.0,Orthodox,,177.80,,210.0
6816,6805,Patrick Smith,Ray Wizard,"March 11, 1994",False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Patrick Smith,...,0.0,0.0,0.0,0.0,0.0,Orthodox,"Aug 28, 1963",187.96,,225.0


In [50]:
# Parse the three date-like text columns into datetimes.
for date_column in ('R_DOB', 'B_DOB', 'date'):
    fights[date_column] = pd.to_datetime(fights[date_column])

In [51]:
def get_age(row):
    """Ages of both fighters on the fight date, in whole years.

    Reads ``row['date']``, ``row['B_DOB']`` and ``row['R_DOB']``. The day
    difference is divided by 365.25 and floored. When the day difference is
    NaN it is passed through unchanged.

    Returns a Series indexed ``['B_age', 'R_age']``.
    """
    ages = {}
    for corner in ('B', 'R'):
        days = (row['date'] - row[corner + '_DOB']).days
        ages[corner] = days if np.isnan(days) else math.floor(days / 365.25)
    return pd.Series([ages['B'], ages['R']], index=['B_age', 'R_age'])

In [52]:
# Compute both ages per row, then drop the birth dates they were derived from.
fights[['B_age', 'R_age']] = fights[['date', 'R_DOB', 'B_DOB']].apply(get_age, axis=1)
fights = fights.drop(['R_DOB', 'B_DOB'], axis=1)

In [53]:
fights

Unnamed: 0,index,R_fighter,B_fighter,date,title_bout,weight_class,no_of_rounds,Referee,location,Winner,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Mike Jackson,Pete Rodriguez,2022-10-15,False,Welterweight,3,,,,...,0.0,0.0,0.0,0.0,Orthodox,187.96,187.96,170.0,25.0,37.0
1,1,Tatsuro Taira,CJ Vergara,2022-10-15,False,Flyweight,3,,,,...,1.0,0.0,0.0,0.0,Orthodox,170.18,177.80,125.0,31.0,22.0
2,2,Piera Rodriguez,Sam Hughes,2022-10-15,False,Women's Strawweight,3,,,,...,1.0,0.0,0.0,0.0,Orthodox,160.02,160.02,115.0,30.0,29.0
3,3,Joanderson Brito,Lucas Alexander,2022-10-15,False,Featherweight,3,,,,...,0.0,1.0,0.0,0.0,Orthodox,172.72,182.88,145.0,27.0,27.0
4,4,Nick Maximov,Jacob Malkoun,2022-10-15,False,Middleweight,3,,,,...,1.0,0.0,0.0,0.0,Southpaw,182.88,193.04,185.0,27.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,6802,Orlando Wiet,Robert Lucarelli,1994-03-11,False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Orlando Wiet,...,0.0,0.0,0.0,0.0,Southpaw,177.80,,170.0,,
6814,6803,Frank Hamaker,Thaddeus Luster,1994-03-11,False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Frank Hamaker,...,0.0,0.0,0.0,0.0,,,,,,
6815,6804,Scott Morris,Sean Daugherty,1994-03-11,False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Scott Morris,...,0.0,0.0,0.0,0.0,Orthodox,177.80,,210.0,18.0,
6816,6805,Patrick Smith,Ray Wizard,1994-03-11,False,Open Weight,1,John McCarthy,"Denver, Colorado, USA",Patrick Smith,...,0.0,0.0,0.0,0.0,Orthodox,187.96,,225.0,,30.0


In [54]:
# Work on a copy so `fights` is left as it is by the steps applied to `df2`.
df2 = fights.copy()

In [55]:
# Results are assigned back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment, which pandas warns about and which
# no longer updates `df2` when Copy-on-Write is enabled.

# Missing reach falls back to the fighter's height.
df2['R_Reach_cms'] = df2['R_Reach_cms'].fillna(df2['R_Height_cms'])
df2['B_Reach_cms'] = df2['B_Reach_cms'].fillna(df2['B_Height_cms'])
# Remaining numeric gaps take the column median.
df2 = df2.fillna(df2.median(numeric_only=True))
# Missing stance defaults to 'Orthodox'.
df2['R_Stance'] = df2['R_Stance'].fillna('Orthodox')
df2['B_Stance'] = df2['B_Stance'].fillna('Orthodox')
df2 = df2.drop(columns=['Referee', 'location', 'date', 'R_fighter', 'B_fighter'])

In [56]:
def convert_bool_to_int(X):
    """Return 1 for a truthy value and 0 for a falsy one."""
    return 1 if X else 0

In [57]:
# Store the title-bout flag as 0/1.
df2['title_bout'] = df2['title_bout'].map(convert_bool_to_int)

In [58]:
# One-hot encode the three categorical columns and drop the originals.
_categorical = ['weight_class', 'B_Stance', 'R_Stance']
_encoded = pd.get_dummies(df2[_categorical])
df2 = pd.concat([df2, _encoded], axis=1).drop(columns=_categorical)

In [59]:
df2.columns[120:]

Index(['R_total_title_bouts', 'R_current_win_streak', 'R_current_lose_streak',
       'R_longest_win_streak', 'R_wins', 'R_losses', 'R_draw',
       'R_win_by_Decision_Majority', 'R_win_by_Decision_Split',
       'R_win_by_Decision_Unanimous', 'R_win_by_KO/TKO', 'R_win_by_Submission',
       'R_win_by_TKO_Doctor_Stoppage', 'R_Height_cms', 'R_Reach_cms',
       'R_Weight_lbs', 'B_age', 'R_age', 'weight_class_Bantamweight',
       'weight_class_Catch Weight', 'weight_class_Featherweight',
       'weight_class_Flyweight', 'weight_class_Heavyweight',
       'weight_class_Light Heavyweight', 'weight_class_Lightweight',
       'weight_class_Middleweight', 'weight_class_Open Weight',
       'weight_class_Welterweight', 'weight_class_Women's Bantamweight',
       'weight_class_Women's Featherweight', 'weight_class_Women's Flyweight',
       'weight_class_Women's Strawweight', 'B_Stance_Open Stance',
       'B_Stance_Orthodox', 'B_Stance_Sideways', 'B_Stance_Southpaw',
       'B_Stance_Switch',

In [63]:
df2[:20]

Unnamed: 0,index,title_bout,no_of_rounds,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,0,0,3,,0.0,1.0,0.36,0.53,0.0,0.0,...,0,0,0,0,1,0,1,0,0,0
1,1,0,3,,0.0,0.0,0.555,0.48,0.0,0.21,...,0,1,0,0,0,0,1,0,0,0
2,2,0,3,,0.0,0.125,0.5475,0.421875,0.385,0.17625,...,0,1,0,0,0,0,1,0,0,0
3,3,0,3,,0.015625,0.0,0.454297,0.435,0.2425,0.2,...,0,1,0,0,0,0,1,0,0,0
4,4,0,3,,0.0,0.125,0.51625,0.57625,0.38375,0.5,...,0,1,0,0,0,0,0,0,1,0
5,5,0,3,,0.0,0.625,0.308672,0.461406,0.237812,0.226719,...,0,1,0,0,0,0,1,0,0,0
6,6,0,3,,0.0,0.0,0.51,0.48,0.0,0.14,...,0,0,0,0,1,0,1,0,0,0
7,7,0,3,,0.007812,0.28125,0.746328,0.283359,0.5175,0.064062,...,0,1,0,0,0,0,1,0,0,0
8,8,0,3,,0.0,0.625,0.595625,0.62625,0.42125,0.075,...,0,1,0,0,0,0,1,0,0,0
9,9,0,3,,0.113281,0.316406,0.524629,0.354512,0.517578,0.006836,...,0,0,0,1,0,0,1,0,0,0


In [61]:
df2.columns

Index(['index', 'title_bout', 'no_of_rounds', 'Winner', 'B_avg_KD',
       'B_avg_opp_KD', 'B_avg_SIG_STR_pct', 'B_avg_opp_SIG_STR_pct',
       'B_avg_TD_pct', 'B_avg_opp_TD_pct',
       ...
       'B_Stance_Open Stance', 'B_Stance_Orthodox', 'B_Stance_Sideways',
       'B_Stance_Southpaw', 'B_Stance_Switch', 'R_Stance_Open Stance',
       'R_Stance_Orthodox', 'R_Stance_Sideways', 'R_Stance_Southpaw',
       'R_Stance_Switch'],
      dtype='object', length=162)

In [62]:
# `fights` was built with the new fights stacked first, so the leading
# len(new_fights) rows are the ones to export (previously hard-coded as 11).
# NOTE(review): assumes the row order of `df2` is still new-fights-first — confirm.
df2.head(len(new_fights)).to_csv('../data/prediction_data.csv', index=False)