### feature_engineering.ipynb
Goal: clean, select, and engineer features for modeling.

In [391]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, f1_score
from sklearn.impute import SimpleImputer

import sys, warnings, os
from sklearn.dummy import DummyClassifier

In [17]:
# Notebook display limits: show up to 120 columns and never truncate rows.
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', None)

In [18]:
# Silence every warning, but only when the interpreter was started without
# explicit warning options. The environment variable is set as well so that
# processes inheriting this environment presumably pick up the same setting.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

In [60]:
# Load the raw fight-level dataset, report its dimensions, then preview it.
df = pd.read_csv(filepath_or_buffer='../data/ufc-master.csv')
print(df.shape)
df.head(n=5)

(6528, 118)


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,TitleBout,WeightClass,Gender,NumberOfRounds,BlueCurrentLoseStreak,BlueCurrentWinStreak,BlueDraws,BlueAvgSigStrLanded,BlueAvgSigStrPct,BlueAvgSubAtt,BlueAvgTDLanded,BlueAvgTDPct,BlueLongestWinStreak,BlueLosses,BlueTotalRoundsFought,BlueTotalTitleBouts,BlueWinsByDecisionMajority,BlueWinsByDecisionSplit,BlueWinsByDecisionUnanimous,BlueWinsByKO,BlueWinsBySubmission,BlueWinsByTKODoctorStoppage,BlueWins,BlueStance,BlueHeightCms,BlueReachCms,BlueWeightLbs,RedCurrentLoseStreak,RedCurrentWinStreak,RedDraws,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedLongestWinStreak,RedLosses,RedTotalRoundsFought,RedTotalTitleBouts,RedWinsByDecisionMajority,RedWinsByDecisionSplit,RedWinsByDecisionUnanimous,RedWinsByKO,RedWinsBySubmission,RedWinsByTKODoctorStoppage,RedWins,RedStance,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,BlueAge,LoseStreakDif,WinStreakDif,LongestWinStreakDif,WinDif,LossDif,TotalRoundDif,TotalTitleBoutDif,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,EmptyArena,BMatchWCRank,RMatchWCRank,RWFlyweightRank,RWFeatherweightRank,RWStrawweightRank,RWBantamweightRank,RHeavyweightRank,RLightHeavyweightRank,RMiddleweightRank,RWelterweightRank,RLightweightRank,RFeatherweightRank,RBantamweightRank,RFlyweightRank,RPFPRank,BWFlyweightRank,BWFeatherweightRank,BWStrawweightRank,BWBantamweightRank,BHeavyweightRank,BLightHeavyweightRank,BMiddleweightRank,BWelterweightRank,BLightweightRank,BFeatherweightRank,BBantamweightRank,BFlyweightRank,BPFPRank,BetterRank,Finish,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0,215.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,True,Flyweight,MALE,5,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,Orthodox,172.72,175.26,125,0,6,0,4.41,0.49,0.8,2.61,0.47,6,3,42,3,0,2,4,2,4,0,12,Orthodox,165.1,170.18,125,34,31,0,-6,-6,-12,-3,-42,-3,-2,-4,7.62,5.08,-3,-4.41,-0.8,-2.61,,,0.0,,,,,,,,,,,,0.0,11.0,,,,,,,,,,,,,,Red,SUB,Rear Naked Choke,2.0,2:05,425.0,300.0,800.0,150.0,2500.0,400.0,350.0
1,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.619,295.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Welterweight,MALE,3,0,8,0,5.5,0.55,0.3,0.77,0.55,8,0,20,0,0,1,4,3,0,0,8,Orthodox,190.5,187.96,170,0,6,0,4.12,0.61,1.8,1.49,0.29,6,0,11,0,0,0,0,1,5,0,6,Orthodox,185.42,195.58,170,30,27,0,2,2,2,0,9,0,2,-5,5.08,-7.62,-3,1.38,-1.5,-0.72,,7.0,3.0,,,,,,,,3.0,,,,,,,,,,,,,7.0,,,,,,Red,U-DEC,,5.0,5:00,1500.0,250.0,650.0,180.0,3000.0,240.0,700.0
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Heavyweight,MALE,3,0,4,0,5.13,0.57,0.2,0.45,0.63,4,4,44,0,0,1,4,6,1,0,12,Orthodox,200.66,203.2,250,0,1,0,5.49,0.6,0.5,0.58,0.21,7,2,33,3,0,0,3,4,2,0,9,Orthodox,193.04,205.74,245,34,36,0,3,-3,3,2,11,-3,2,-1,7.62,-2.54,2,-0.36,-0.3,-0.13,,3.0,2.0,,,,,2.0,,,,,,,,,,,,,3.0,,,,,,,,,Red,S-DEC,,3.0,5:00,900.0,-160.0,450.0,1100.0,3000.0,350.0,1100.0
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Featherweight,MALE,3,2,0,0,3.74,0.44,0.5,0.47,0.25,1,2,7,0,0,0,0,0,1,0,1,Southpaw,175.26,177.8,145,1,0,0,2.3,0.58,1.6,3.45,0.41,6,2,22,0,1,0,5,0,1,0,7,Southpaw,177.8,177.8,145,30,36,1,0,-5,-6,0,-15,0,0,0,-2.54,0.0,6,1.44,-1.1,-2.98,,,13.0,,,,,,,,,,13.0,,,,,,,,,,,,,,,,,Red,KO/TKO,Elbows,3.0,0:39,639.0,-200.0,1100.0,380.0,1400.0,500.0,4000.0
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,False,Featherweight,MALE,3,0,1,1,4.41,0.53,0.8,0.75,0.37,3,3,15,0,0,0,0,4,0,0,4,Orthodox,177.8,177.8,145,0,1,0,6.25,0.46,1.0,1.0,0.41,3,3,17,0,1,0,1,1,2,0,5,Orthodox,175.26,182.88,145,36,33,0,0,0,-1,0,-2,0,3,-2,2.54,-5.08,-3,-1.84,-0.2,-0.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,Elbows,3.0,3:21,801.0,275.0,550.0,500.0,700.0,300.0,250.0


### DF for the Baseline Model:

In [502]:
# Baseline frame: the raw data minus BWFeatherweightRank, which the author
# reports as erroring later because all but one of its values are NaN.
X = df.drop(columns=['BWFeatherweightRank'])

In [503]:
# Preview the first rows of the baseline frame.
X.head(n=5)

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,TitleBout,WeightClass,Gender,NumberOfRounds,BlueCurrentLoseStreak,BlueCurrentWinStreak,BlueDraws,BlueAvgSigStrLanded,BlueAvgSigStrPct,BlueAvgSubAtt,BlueAvgTDLanded,BlueAvgTDPct,BlueLongestWinStreak,BlueLosses,BlueTotalRoundsFought,BlueTotalTitleBouts,BlueWinsByDecisionMajority,BlueWinsByDecisionSplit,BlueWinsByDecisionUnanimous,BlueWinsByKO,BlueWinsBySubmission,BlueWinsByTKODoctorStoppage,BlueWins,BlueStance,BlueHeightCms,BlueReachCms,BlueWeightLbs,RedCurrentLoseStreak,RedCurrentWinStreak,RedDraws,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedLongestWinStreak,RedLosses,RedTotalRoundsFought,RedTotalTitleBouts,RedWinsByDecisionMajority,RedWinsByDecisionSplit,RedWinsByDecisionUnanimous,RedWinsByKO,RedWinsBySubmission,RedWinsByTKODoctorStoppage,RedWins,RedStance,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,BlueAge,LoseStreakDif,WinStreakDif,LongestWinStreakDif,WinDif,LossDif,TotalRoundDif,TotalTitleBoutDif,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,EmptyArena,BMatchWCRank,RMatchWCRank,RWFlyweightRank,RWFeatherweightRank,RWStrawweightRank,RWBantamweightRank,RHeavyweightRank,RLightHeavyweightRank,RMiddleweightRank,RWelterweightRank,RLightweightRank,RFeatherweightRank,RBantamweightRank,RFlyweightRank,RPFPRank,BWFlyweightRank,BWStrawweightRank,BWBantamweightRank,BHeavyweightRank,BLightHeavyweightRank,BMiddleweightRank,BWelterweightRank,BLightweightRank,BFeatherweightRank,BBantamweightRank,BFlyweightRank,BPFPRank,BetterRank,Finish,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0,215.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,True,Flyweight,MALE,5,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,Orthodox,172.72,175.26,125,0,6,0,4.41,0.49,0.8,2.61,0.47,6,3,42,3,0,2,4,2,4,0,12,Orthodox,165.1,170.18,125,34,31,0,-6,-6,-12,-3,-42,-3,-2,-4,7.62,5.08,-3,-4.41,-0.8,-2.61,,,0.0,,,,,,,,,,,,0.0,11.0,,,,,,,,,,,,,Red,SUB,Rear Naked Choke,2.0,2:05,425.0,300.0,800.0,150.0,2500.0,400.0,350.0
1,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.619,295.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Welterweight,MALE,3,0,8,0,5.5,0.55,0.3,0.77,0.55,8,0,20,0,0,1,4,3,0,0,8,Orthodox,190.5,187.96,170,0,6,0,4.12,0.61,1.8,1.49,0.29,6,0,11,0,0,0,0,1,5,0,6,Orthodox,185.42,195.58,170,30,27,0,2,2,2,0,9,0,2,-5,5.08,-7.62,-3,1.38,-1.5,-0.72,,7.0,3.0,,,,,,,,3.0,,,,,,,,,,,,7.0,,,,,,Red,U-DEC,,5.0,5:00,1500.0,250.0,650.0,180.0,3000.0,240.0,700.0
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Heavyweight,MALE,3,0,4,0,5.13,0.57,0.2,0.45,0.63,4,4,44,0,0,1,4,6,1,0,12,Orthodox,200.66,203.2,250,0,1,0,5.49,0.6,0.5,0.58,0.21,7,2,33,3,0,0,3,4,2,0,9,Orthodox,193.04,205.74,245,34,36,0,3,-3,3,2,11,-3,2,-1,7.62,-2.54,2,-0.36,-0.3,-0.13,,3.0,2.0,,,,,2.0,,,,,,,,,,,,3.0,,,,,,,,,Red,S-DEC,,3.0,5:00,900.0,-160.0,450.0,1100.0,3000.0,350.0,1100.0
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Featherweight,MALE,3,2,0,0,3.74,0.44,0.5,0.47,0.25,1,2,7,0,0,0,0,0,1,0,1,Southpaw,175.26,177.8,145,1,0,0,2.3,0.58,1.6,3.45,0.41,6,2,22,0,1,0,5,0,1,0,7,Southpaw,177.8,177.8,145,30,36,1,0,-5,-6,0,-15,0,0,0,-2.54,0.0,6,1.44,-1.1,-2.98,,,13.0,,,,,,,,,,13.0,,,,,,,,,,,,,,,,Red,KO/TKO,Elbows,3.0,0:39,639.0,-200.0,1100.0,380.0,1400.0,500.0,4000.0
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,False,Featherweight,MALE,3,0,1,1,4.41,0.53,0.8,0.75,0.37,3,3,15,0,0,0,0,4,0,0,4,Orthodox,177.8,177.8,145,0,1,0,6.25,0.46,1.0,1.0,0.41,3,3,17,0,1,0,1,1,2,0,5,Orthodox,175.26,182.88,145,36,33,0,0,0,-1,0,-2,0,3,-2,2.54,-5.08,-3,-1.84,-0.2,-0.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,Elbows,3.0,3:21,801.0,275.0,550.0,500.0,700.0,300.0,250.0


In [504]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, everything else (including bool columns) as numerical.
categorical = [name for name, kind in X.dtypes.items() if kind == 'object']
numerical = [name for name, kind in X.dtypes.items() if kind != 'object']

In [505]:
# Integer-encode every object column in place. The single encoder is refit for
# each column, so after the loop it only holds the classes of the last column.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan', which then receives its own code -- confirm this is intended,
# since it would leave nothing for a later categorical imputer to fill.
enc = LabelEncoder()
for col_name in categorical:
    X[col_name] = enc.fit_transform(X[col_name].astype(str))

In [506]:
# Fill missing values in the non-object columns with each column's most
# frequent value. The imputer is fitted on the full baseline frame.
imp = SimpleImputer(strategy='most_frequent')
X[numerical] = imp.fit(X[numerical]).transform(X[numerical])

In [507]:
# Same most-frequent imputation, applied to the (already integer-encoded)
# categorical columns.
cat_imp = SimpleImputer(strategy='most_frequent')
X[categorical] = cat_imp.fit_transform(X[categorical])

In [508]:
# Persist the baseline frame without the pandas index.
X.to_csv(path_or_buf='../data/df_baseline.csv', index=False)

#### Now we'll build a richer, engineered dataset for better modeling

In [509]:
# Work on an independent copy so the raw frame `df` stays untouched.
ufc = df.copy(deep=True)
ufc.head(n=5)

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,TitleBout,WeightClass,Gender,NumberOfRounds,BlueCurrentLoseStreak,BlueCurrentWinStreak,BlueDraws,BlueAvgSigStrLanded,BlueAvgSigStrPct,BlueAvgSubAtt,BlueAvgTDLanded,BlueAvgTDPct,BlueLongestWinStreak,BlueLosses,BlueTotalRoundsFought,BlueTotalTitleBouts,BlueWinsByDecisionMajority,BlueWinsByDecisionSplit,BlueWinsByDecisionUnanimous,BlueWinsByKO,BlueWinsBySubmission,BlueWinsByTKODoctorStoppage,BlueWins,BlueStance,BlueHeightCms,BlueReachCms,BlueWeightLbs,RedCurrentLoseStreak,RedCurrentWinStreak,RedDraws,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedLongestWinStreak,RedLosses,RedTotalRoundsFought,RedTotalTitleBouts,RedWinsByDecisionMajority,RedWinsByDecisionSplit,RedWinsByDecisionUnanimous,RedWinsByKO,RedWinsBySubmission,RedWinsByTKODoctorStoppage,RedWins,RedStance,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,BlueAge,LoseStreakDif,WinStreakDif,LongestWinStreakDif,WinDif,LossDif,TotalRoundDif,TotalTitleBoutDif,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,EmptyArena,BMatchWCRank,RMatchWCRank,RWFlyweightRank,RWFeatherweightRank,RWStrawweightRank,RWBantamweightRank,RHeavyweightRank,RLightHeavyweightRank,RMiddleweightRank,RWelterweightRank,RLightweightRank,RFeatherweightRank,RBantamweightRank,RFlyweightRank,RPFPRank,BWFlyweightRank,BWFeatherweightRank,BWStrawweightRank,BWBantamweightRank,BHeavyweightRank,BLightHeavyweightRank,BMiddleweightRank,BWelterweightRank,BLightweightRank,BFeatherweightRank,BBantamweightRank,BFlyweightRank,BPFPRank,BetterRank,Finish,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0,215.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,True,Flyweight,MALE,5,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,Orthodox,172.72,175.26,125,0,6,0,4.41,0.49,0.8,2.61,0.47,6,3,42,3,0,2,4,2,4,0,12,Orthodox,165.1,170.18,125,34,31,0,-6,-6,-12,-3,-42,-3,-2,-4,7.62,5.08,-3,-4.41,-0.8,-2.61,,,0.0,,,,,,,,,,,,0.0,11.0,,,,,,,,,,,,,,Red,SUB,Rear Naked Choke,2.0,2:05,425.0,300.0,800.0,150.0,2500.0,400.0,350.0
1,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.619,295.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Welterweight,MALE,3,0,8,0,5.5,0.55,0.3,0.77,0.55,8,0,20,0,0,1,4,3,0,0,8,Orthodox,190.5,187.96,170,0,6,0,4.12,0.61,1.8,1.49,0.29,6,0,11,0,0,0,0,1,5,0,6,Orthodox,185.42,195.58,170,30,27,0,2,2,2,0,9,0,2,-5,5.08,-7.62,-3,1.38,-1.5,-0.72,,7.0,3.0,,,,,,,,3.0,,,,,,,,,,,,,7.0,,,,,,Red,U-DEC,,5.0,5:00,1500.0,250.0,650.0,180.0,3000.0,240.0,700.0
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Heavyweight,MALE,3,0,4,0,5.13,0.57,0.2,0.45,0.63,4,4,44,0,0,1,4,6,1,0,12,Orthodox,200.66,203.2,250,0,1,0,5.49,0.6,0.5,0.58,0.21,7,2,33,3,0,0,3,4,2,0,9,Orthodox,193.04,205.74,245,34,36,0,3,-3,3,2,11,-3,2,-1,7.62,-2.54,2,-0.36,-0.3,-0.13,,3.0,2.0,,,,,2.0,,,,,,,,,,,,,3.0,,,,,,,,,Red,S-DEC,,3.0,5:00,900.0,-160.0,450.0,1100.0,3000.0,350.0,1100.0
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,False,Featherweight,MALE,3,2,0,0,3.74,0.44,0.5,0.47,0.25,1,2,7,0,0,0,0,0,1,0,1,Southpaw,175.26,177.8,145,1,0,0,2.3,0.58,1.6,3.45,0.41,6,2,22,0,1,0,5,0,1,0,7,Southpaw,177.8,177.8,145,30,36,1,0,-5,-6,0,-15,0,0,0,-2.54,0.0,6,1.44,-1.1,-2.98,,,13.0,,,,,,,,,,13.0,,,,,,,,,,,,,,,,,Red,KO/TKO,Elbows,3.0,0:39,639.0,-200.0,1100.0,380.0,1400.0,500.0,4000.0
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,False,Featherweight,MALE,3,0,1,1,4.41,0.53,0.8,0.75,0.37,3,3,15,0,0,0,0,4,0,0,4,Orthodox,177.8,177.8,145,0,1,0,6.25,0.46,1.0,1.0,0.41,3,3,17,0,1,0,1,1,2,0,5,Orthodox,175.26,182.88,145,36,33,0,0,0,-1,0,-2,0,3,-2,2.54,-5.08,-3,-1.84,-0.2,-0.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,KO/TKO,Elbows,3.0,3:21,801.0,275.0,550.0,500.0,700.0,300.0,250.0


In [511]:
# Blue-minus-Red differences for draws and the two accuracy percentages.
ufc['draw_diff'] = ufc['BlueDraws'].sub(ufc['RedDraws'])
ufc['avg_sig_str_pct_diff'] = ufc['BlueAvgSigStrPct'].sub(ufc['RedAvgSigStrPct'])
ufc['avg_TD_pct_diff'] = ufc['BlueAvgTDPct'].sub(ufc['RedAvgTDPct'])

In [512]:
# Blue-minus-Red differences in wins by majority, split and unanimous decision.
for diff_name, blue_col, red_col in (
    ('M_DEC_diff', 'BlueWinsByDecisionMajority', 'RedWinsByDecisionMajority'),
    ('S_DEC_diff', 'BlueWinsByDecisionSplit', 'RedWinsByDecisionSplit'),
    ('U_DEC_diff', 'BlueWinsByDecisionUnanimous', 'RedWinsByDecisionUnanimous'),
):
    ufc[diff_name] = ufc[blue_col] - ufc[red_col]

In [513]:
# Blue-minus-Red differences for doctor-stoppage TKO wins, the betting odds
# and the expected-value columns.
ufc['TKO_diff'] = ufc['BlueWinsByTKODoctorStoppage'].sub(ufc['RedWinsByTKODoctorStoppage'])
ufc['odds_diff'] = ufc['BlueOdds'].sub(ufc['RedOdds'])
ufc['ev_diff'] = ufc['BlueExpectedValue'].sub(ufc['RedExpectedValue'])

In [514]:
# Per-corner source columns, listed as Blue/Red pairs. They are removed because
# a Blue-vs-Red difference column is kept for each pair instead (either one of
# the dataset's own *Dif columns or one of the *_diff columns computed above).
redundant = [
    'BlueOdds', 'RedOdds',
    'BlueExpectedValue', 'RedExpectedValue',
    'BlueCurrentLoseStreak', 'RedCurrentLoseStreak',
    'BlueCurrentWinStreak', 'RedCurrentWinStreak',
    'BlueLongestWinStreak', 'RedLongestWinStreak',
    'BlueWins', 'RedWins',
    'BlueLosses', 'RedLosses',
    'BlueTotalRoundsFought', 'RedTotalRoundsFought',
    'BlueTotalTitleBouts', 'RedTotalTitleBouts',
    'BlueWinsByKO', 'RedWinsByKO',
    'BlueWinsBySubmission', 'RedWinsBySubmission',
    'BlueHeightCms', 'RedHeightCms',
    'BlueReachCms', 'RedReachCms',
    'BlueAge', 'RedAge',
    'BlueAvgSigStrLanded', 'RedAvgSigStrLanded',
    'BlueAvgSubAtt', 'RedAvgSubAtt',
    'BlueAvgTDLanded', 'RedAvgTDLanded',
    'BlueDraws', 'RedDraws',
    'BlueAvgSigStrPct', 'RedAvgSigStrPct',
    'BlueAvgTDPct', 'RedAvgTDPct',
    'BlueWinsByDecisionMajority', 'RedWinsByDecisionMajority',
    'BlueWinsByDecisionSplit', 'RedWinsByDecisionSplit',
    'BlueWinsByDecisionUnanimous', 'RedWinsByDecisionUnanimous',
    'BlueWinsByTKODoctorStoppage', 'RedWinsByTKODoctorStoppage',
]

ufc.drop(columns=redundant, inplace=True)

In [515]:
# Event metadata, fight-outcome details and weight columns that are not used
# as model features. The Finish* / TotalFightTimeSecs columns describe how the
# fight ended, so they are presumably dropped to avoid leaking the result.
drop = [
    'Date', 'Location', 'Country', 'WeightClass', 'Gender', 'NumberOfRounds', 'EmptyArena',
    'Finish', 'FinishDetails', 'FinishRound', 'FinishRoundTime', 'TotalFightTimeSecs',
    'BlueWeightLbs', 'RedWeightLbs',
]

ufc.drop(columns=drop, inplace=True)

In [516]:
# Normalise the stray 'Switch ' label (trailing space) to 'Switch'.
# One .loc call on the frame replaces the chained form
# ufc['BlueStance'].loc[mask] = ..., which assigns into an intermediate Series
# and is not guaranteed to write back to `ufc` (it never does when pandas
# Copy-on-Write is active).
ufc.loc[ufc['BlueStance'] == 'Switch ', 'BlueStance'] = 'Switch'

In [517]:
# Map each stance to an integer code; anything not listed (including missing
# values) falls back to 1.
stance_codes = {'Orthodox': 4, 'Southpaw': 3, 'Switch': 2}
ufc['BlueStance'] = [stance_codes.get(stance, 1) for stance in ufc['BlueStance']]
ufc['RedStance'] = [stance_codes.get(stance, 1) for stance in ufc['RedStance']]

In [518]:
# BetterRank becomes a signed indicator (-1 = Red, 1 = Blue, 0 = anything else);
# TitleBout becomes a 0/1 flag based on the truthiness of each value.
better_rank_codes = {'Red': -1, 'Blue': 1}
ufc['BetterRank'] = [better_rank_codes.get(rank, 0) for rank in ufc['BetterRank']]
ufc['TitleBout'] = [int(bool(title_bout)) for title_bout in ufc['TitleBout']]

In [519]:
# Collapse the two encoded stance columns into one Blue-minus-Red difference,
# then discard the per-corner columns.
ufc['Stance_diff'] = ufc['BlueStance'].sub(ufc['RedStance'])
ufc.drop(columns=['BlueStance', 'RedStance'], inplace=True)

In [520]:
# Binary target: 1 when the Red corner won, 0 for every other value.
ufc['Winner'] = [int(winner == 'Red') for winner in ufc['Winner']]

In [521]:
# Count missing values per column across the ranking block (BMatchWCRank
# through BetterRank, inclusive).
ufc.loc[:, 'BMatchWCRank':'BetterRank'].isna().sum()

BMatchWCRank             5328
RMatchWCRank             4749
RWFlyweightRank          6432
RWFeatherweightRank      6519
RWStrawweightRank        6382
RWBantamweightRank       6374
RHeavyweightRank         6342
RLightHeavyweightRank    6344
RMiddleweightRank        6346
RWelterweightRank        6337
RLightweightRank         6344
RFeatherweightRank       6351
RBantamweightRank        6347
RFlyweightRank           6340
RPFPRank                 6275
BWFlyweightRank          6455
BWFeatherweightRank      6527
BWStrawweightRank        6428
BWBantamweightRank       6421
BHeavyweightRank         6380
BLightHeavyweightRank    6408
BMiddleweightRank        6391
BWelterweightRank        6409
BLightweightRank         6408
BFeatherweightRank       6404
BBantamweightRank        6409
BFlyweightRank           6398
BPFPRank                 6461
BetterRank                  0
dtype: int64

In [522]:
# Drop every ranking column from BMatchWCRank through BPFPRank (inclusive);
# BetterRank, which follows them, is kept.
ufc.drop(columns=ufc.loc[:, 'BMatchWCRank':'BPFPRank'].columns, inplace=True)

In [523]:
# Pull the encoded target out of the feature frame, then remove the fighter
# name columns as well.
# NOTE(review): `label` is not written to disk in the cells visible here --
# confirm the modeling step has another way to obtain the target.
label = ufc.pop('Winner')
ufc.drop(columns=['RedFighter', 'BlueFighter'], inplace=True)

In [489]:
# Remove the method-of-victory odds columns.
# NOTE(review): this cell's execution count (489) is lower than its neighbours'
# and the ufc.head() output further down still lists these six columns, so the
# saved outputs were produced without this drop. Re-run the notebook top to
# bottom and confirm which behaviour is intended.
method_odds_cols = ['RedDecOdds', 'BlueDecOdds', 'RSubOdds', 'BSubOdds', 'RKOOdds', 'BKOOdds']
ufc.drop(columns=method_odds_cols, inplace=True)

In [525]:
# Median-impute the four engineered difference columns listed below; the
# imputer is fitted on the full frame.
median_cols = ['avg_sig_str_pct_diff', 'avg_TD_pct_diff', 'odds_diff', 'ev_diff']
impute = SimpleImputer(strategy='median')
ufc[median_cols] = impute.fit(ufc[median_cols]).transform(ufc[median_cols])

In [526]:
# Preview the engineered feature frame.
ufc.head(n=5)

Unnamed: 0,TitleBout,LoseStreakDif,WinStreakDif,LongestWinStreakDif,WinDif,LossDif,TotalRoundDif,TotalTitleBoutDif,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,M_DEC_diff,S_DEC_diff,U_DEC_diff,TKO_diff,odds_diff,ev_diff,Stance_diff
0,1,0,-6,-6,-12,-3,-42,-3,-2,-4,7.62,5.08,-3,-4.41,-0.8,-2.61,-1,300.0,800.0,150.0,2500.0,400.0,350.0,0,-0.49,-0.47,0,-2,-4,0,465.0,175.0,0
1,0,0,2,2,2,0,9,0,2,-5,5.08,-7.62,-3,1.38,-1.5,-0.72,-1,250.0,650.0,180.0,3000.0,240.0,700.0,0,-0.06,0.26,0,1,4,0,505.0,247.381,0
2,0,0,3,-3,3,2,11,-3,2,-1,7.62,-2.54,2,-0.36,-0.3,-0.13,-1,-160.0,450.0,1100.0,3000.0,350.0,1100.0,0,-0.03,0.42,0,1,1,0,680.0,273.6842,0
3,0,1,0,-5,-6,0,-15,0,0,0,-2.54,0.0,6,1.44,-1.1,-2.98,-1,-200.0,1100.0,380.0,1400.0,500.0,4000.0,0,-0.14,-0.16,-1,0,-5,0,1575.0,614.4737,0
4,0,0,0,0,-1,0,-2,0,3,-2,2.54,-5.08,-3,-1.84,-0.2,-0.25,0,275.0,550.0,500.0,700.0,300.0,250.0,1,0.07,-0.04,-1,0,-1,0,240.0,33.0769,0


In [527]:
# Persist the engineered feature frame without the pandas index.
ufc.to_csv(path_or_buf='../data/df_modeling.csv', index=False)