In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the two input tables from CSV files located in the working directory:
# fight_data holds one row per fight, fighter_data one row per fighter.
fight_data = pd.read_csv('fight_data.csv')
fighter_data = pd.read_csv('fighter_data.csv')

In [3]:
fight_data.head()

Unnamed: 0,event,fighter_1,fighter_2,result,method,round,time
0,UFC 308: Topuria vs. Holloway,Ilia Topuria,Max Holloway,win,KO/TKO,3,1:34
1,UFC 308: Topuria vs. Holloway,Khamzat Chimaev,Robert Whittaker,win,SUB,1,3:34
2,UFC 308: Topuria vs. Holloway,Magomed Ankalaev,Aleksandar Rakic,win,U-DEC,3,5:00
3,UFC 308: Topuria vs. Holloway,Lerone Murphy,Dan Ige,win,U-DEC,3,5:00
4,UFC 308: Topuria vs. Holloway,Shara Magomedov,Armen Petrosyan,win,KO/TKO,2,4:52


In [4]:
fight_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   event      7906 non-null   object
 1   fighter_1  7906 non-null   object
 2   fighter_2  7906 non-null   object
 3   result     7906 non-null   object
 4   method     7906 non-null   object
 5   round      7906 non-null   int64 
 6   time       7906 non-null   object
dtypes: int64(1), object(6)
memory usage: 432.5+ KB


What should we do with this data? Maybe we can find the top fighters of all time. How would we do that?
Maybe we can start by counting how many wins each fighter has.


# Using fight data

In [5]:
# Count wins per fighter.
# In every row whose result is 'win', fighter_1 is the winner and fighter_2 the
# loser, so grouping those rows by fighter_1 gives each fighter's win total.
# Note: fighters with 0 wins do not appear in fighter_wins.
fight_wins = fight_data.loc[fight_data['result'].eq('win')]
fighter_wins = (
    fight_wins
    .groupby('fighter_1')
    .size()
    .rename('wins')
    .reset_index()
)

In [6]:
# Count losses per fighter: every row of fight_wins has a winner, so the
# fighter_2 of that row is the one who lost.
# Note: fighters with 0 losses do not appear in fighter_losses.
fighter_losses = (
    fight_wins
    .groupby('fighter_2')
    .size()
    .rename('losses')
    .reset_index()
)

In [7]:
# Outer join so that fighters who only have wins or only have losses are kept;
# the side they are missing from comes back as NaN.
fighter_win_loss = pd.merge(
    fighter_wins,
    fighter_losses,
    how='outer',
    left_on='fighter_1',
    right_on='fighter_2',
)

In [8]:
# Show the rows that exist on only one side of the join. NaN never compares
# equal to anything, so a missing name on either side makes the row appear here.
fighter_win_loss[fighter_win_loss['fighter_1'] != fighter_win_loss['fighter_2']]

Unnamed: 0,fighter_1,wins,fighter_2,losses
6,Abdul-Kerim Edilov,1.0,,
14,Adlan Amagov,2.0,,
45,Alex Hunter,2.0,,
64,Aliaskhab Khizriev,1.0,,
78,Amir Albazi,5.0,,
...,...,...,...,...
2521,,,Zane Frazier,2.0
2522,,,Zarah Fairn,4.0
2523,,,Zelim Imadaev,3.0
2524,,,Zviad Lazishvili,1.0


We now have lots of NaNs wherever a fighter has either no wins or no losses.

In [9]:
# A fighter missing from one side of the join has none of that outcome, so
# replace the missing counts with 0.
fighter_win_loss[['wins', 'losses']] = fighter_win_loss[['wins', 'losses']].fillna(0)

In [10]:
# Fill in NaNs in the fighter name columns by copying the name across from
# whichever side of the join has it.
# After the first line fighter_1 is populated wherever either name was present,
# so the second line can fill the gaps in fighter_2 from it.
fighter_win_loss['fighter_1'] = fighter_win_loss['fighter_1'].fillna(fighter_win_loss['fighter_2'])
fighter_win_loss['fighter_2'] = fighter_win_loss['fighter_2'].fillna(fighter_win_loss['fighter_1'])

In [11]:
# Sanity check: list any rows where the two name columns still disagree.
# There should be none at this point, i.e. the displayed frame should be empty.
fighter_win_loss[fighter_win_loss['fighter_1'] != fighter_win_loss['fighter_2']]

Unnamed: 0,fighter_1,wins,fighter_2,losses


In [12]:
# Clean up the columns: keep a single name column plus the two counts.
# The explicit .copy() makes the result an independent frame, so adding new
# columns to it later does not raise SettingWithCopyWarning (on pandas versions
# without copy-on-write, a bare column subset is flagged as a potential view).
fighter_win_loss['fighter'] = fighter_win_loss['fighter_1']
fighter_win_loss = fighter_win_loss[['fighter', 'wins', 'losses']].copy()

In [13]:
fighter_win_loss.sort_values('wins', ascending=False).head()

Unnamed: 0,fighter,wins,losses
771,Jim Miller,26.0,17.0
93,Andrei Arlovski,23.0,18.0
453,Donald Cerrone,23.0,14.0
421,Demian Maia,22.0,11.0
1146,Max Holloway,22.0,8.0


In [14]:
# Net record per fighter: wins minus losses.
fighter_win_loss['plus_minus'] = fighter_win_loss['wins'].sub(fighter_win_loss['losses'])

In [15]:
fighter_win_loss.sort_values('plus_minus', ascending=False).head(12)

Unnamed: 0,fighter,wins,losses,plus_minus
827,Jon Jones,21.0,1.0,20.0
577,Georges St-Pierre,20.0,2.0,18.0
662,Islam Makhachev,15.0,1.0,14.0
471,Dustin Poirier,22.0,8.0,14.0
1146,Max Holloway,22.0,8.0,14.0
75,Amanda Nunes,16.0,2.0,14.0
420,Demetrious Johnson,15.0,2.0,13.0
956,Khabib Nurmagomedov,13.0,0.0,13.0
66,Aljamain Sterling,16.0,4.0,12.0
913,Kamaru Usman,15.0,3.0,12.0


Okay, sick, that looks pretty good. Let's try something else now.

# Using fighter data

Let's merge with the fighter data, and then overwrite the mismatched win/loss values, since fight_data runs up to Oct 2024 and fighter_data is older.

In [16]:
fighter_data.head(2)

Unnamed: 0,name,nickname,wins,losses,draws,height_cm,weight_in_kg,reach_in_cm,stance,date_of_birth,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes
0,Robert Drysdale,,7,0,0,190.5,92.99,,Orthodox,1981-10-05 00:00:00.000,0.0,0,0.0,0,7.32,100,0,21.9
1,Daniel McWilliams,The Animal,15,37,0,185.42,83.91,,,,3.36,77,0.0,0,0.0,0,100,21.6


In [17]:
fighter_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4111 entries, 0 to 4110
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   name                                          4111 non-null   object 
 1   nickname                                      2257 non-null   object 
 2   wins                                          4111 non-null   int64  
 3   losses                                        4111 non-null   int64  
 4   draws                                         4111 non-null   int64  
 5   height_cm                                     3813 non-null   float64
 6   weight_in_kg                                  4024 non-null   float64
 7   reach_in_cm                                   2184 non-null   float64
 8   stance                                        3288 non-null   object 
 9   date_of_birth                                 2976 non-null   o

In [18]:
# date_of_birth is not a datetime column after loading, so convert it.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
fighter_data['date_of_birth'] = pd.to_datetime(fighter_data['date_of_birth'], errors='coerce')

In [19]:
# Compute each fighter's age in whole years as of the day the notebook is run.
import datetime
today = pd.to_datetime(datetime.datetime.today())

# Count calendar years and subtract one when this year's birthday has not
# happened yet. Dividing a day count by 365 ignores leap days, which makes the
# age roll over several days before the actual birthday.
birth_dates = fighter_data['date_of_birth']
birthday_passed = (
    (birth_dates.dt.month < today.month)
    | ((birth_dates.dt.month == today.month) & (birth_dates.dt.day <= today.day))
)
# Missing birth dates (NaT) yield a NaN year, so their age stays NaN.
fighter_data['age'] = today.year - birth_dates.dt.year - (~birthday_passed).astype(int)

In [20]:
# Join the record derived from fight_data onto the per-fighter stats table.
# Both tables carry wins/losses columns, so the suffixes tell them apart:
# *_new comes from fighter_win_loss, *_old from fighter_data.
# NOTE(review): rows are matched on the name string alone — presumably names
# are unique in both tables; verify, since a repeated name would duplicate rows.
fighter_stats = pd.merge(
    fighter_win_loss,
    fighter_data,
    how='outer',
    left_on='fighter',
    right_on='name',
    suffixes=('_new', '_old'),
)

In [21]:
fighter_stats.head()

Unnamed: 0,fighter,wins_new,losses_new,plus_minus,name,nickname,wins_old,losses_old,draws,height_cm,...,date_of_birth,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes,age
0,AJ Dobson,1.0,3.0,-2.0,AJ Dobson,,7.0,2.0,0.0,185.42,...,1992-01-18,4.38,47.0,5.25,47.0,1.82,75.0,64.0,0.3,33.0
1,AJ Fletcher,1.0,3.0,-2.0,AJ Fletcher,The Ghost,10.0,3.0,0.0,177.8,...,1997-02-18,3.36,49.0,4.61,46.0,1.54,35.0,33.0,0.9,28.0
2,Aaron Riley,3.0,6.0,-3.0,Aaron Riley,,30.0,14.0,1.0,172.72,...,1980-12-09,3.45,34.0,3.78,61.0,1.18,34.0,60.0,0.1,44.0
3,Aaron Rosa,1.0,2.0,-1.0,Aaron Rosa,,18.0,6.0,0.0,193.04,...,1983-05-28,4.03,47.0,4.41,45.0,0.36,33.0,85.0,0.4,41.0
4,Aaron Simpson,7.0,4.0,3.0,Aaron Simpson,A-Train,12.0,5.0,0.0,182.88,...,1974-07-20,3.6,52.0,2.53,56.0,3.78,34.0,83.0,0.5,50.0


Because fighter_data includes data from before a fighter came into the UFC (i.e. all pro fights), the win/loss record is different; see Khabib, for example. It should also be noted that the other stats include the fighters' pre-UFC era.

In [22]:
fighter_stats[fighter_stats['fighter'] == 'Khabib Nurmagomedov']

Unnamed: 0,fighter,wins_new,losses_new,plus_minus,name,nickname,wins_old,losses_old,draws,height_cm,...,date_of_birth,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes,age
958,Khabib Nurmagomedov,13.0,0.0,13.0,Khabib Nurmagomedov,The Eagle,29.0,0.0,0.0,177.8,...,1988-09-20,4.1,48.0,1.75,65.0,5.32,48.0,84.0,0.8,36.0


In [23]:
fighter_stats[fighter_stats['fighter'] == 'Alex Pereira']

Unnamed: 0,fighter,wins_new,losses_new,plus_minus,name,nickname,wins_old,losses_old,draws,height_cm,...,date_of_birth,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes,age
50,Alex Pereira,9.0,1.0,8.0,Alex Pereira,Poatan,9.0,2.0,0.0,193.04,...,1987-07-07,5.0,62.0,3.65,50.0,0.19,100.0,70.0,0.4,37.0


In [24]:
fight_data[(fight_data['fighter_1'] == 'Alex Pereira') |  (fight_data['fighter_2'] == 'Alex Pereira')]

Unnamed: 0,event,fighter_1,fighter_2,result,method,round,time
36,UFC 307: Pereira vs. Rountree Jr.,Alex Pereira,Khalil Rountree Jr.,win,KO/TKO,4,4:32
167,UFC 303: Pereira vs. Prochazka 2,Alex Pereira,Jiri Prochazka,win,KO/TKO\n\n \n\n Kick,2,0:13
278,UFC 300: Pereira vs. Hill,Alex Pereira,Jamahal Hill,win,KO/TKO\n\n \n\n Punch,1,3:14
490,UFC 295: Prochazka vs. Pereira,Alex Pereira,Jiri Prochazka,win,KO/TKO\n\n \n\n Elbows,2,4:08
643,UFC 291: Poirier vs. Gaethje 2,Alex Pereira,Jan Blachowicz,win,S-DEC,3,5:00
826,UFC 287: Pereira vs. Adesanya 2,Israel Adesanya,Alex Pereira,win,KO/TKO\n\n \n\n Punch,2,4:21
1011,UFC 281: Adesanya vs. Pereira,Alex Pereira,Israel Adesanya,win,KO/TKO\n\n \n\n Punches,5,2:01
1206,UFC 276: Adesanya vs. Cannonier,Alex Pereira,Sean Strickland,win,KO/TKO\n\n \n\n Punches,1,2:36
1379,UFC Fight Night: Santos vs. Ankalaev,Alex Pereira,Bruno Silva,win,U-DEC,3,5:00
1539,UFC 268: Usman vs. Covington 2,Alex Pereira,Andreas Michailidis,win,KO/TKO\n\n \n\n Flying Knee,2,0:18


See, Poatan is also an interesting case, since he had such a low number of "pro fights" before becoming champion and defeating big names. Also, it just so happens that, despite the time difference between the datasets, Alex has 9 wins in each dataset.

In [25]:
# Use the record derived from fight_data as the canonical wins/losses columns.
fighter_stats = fighter_stats.assign(
    wins=fighter_stats['wins_new'],
    losses=fighter_stats['losses_new'],
)

In [26]:
fighter_stats.columns

Index(['fighter', 'wins_new', 'losses_new', 'plus_minus', 'name', 'nickname',
       'wins_old', 'losses_old', 'draws', 'height_cm', 'weight_in_kg',
       'reach_in_cm', 'stance', 'date_of_birth',
       'significant_strikes_landed_per_minute',
       'significant_striking_accuracy',
       'significant_strikes_absorbed_per_minute', 'significant_strike_defence',
       'average_takedowns_landed_per_15_minutes', 'takedown_accuracy',
       'takedown_defense', 'average_submissions_attempted_per_15_minutes',
       'age', 'wins', 'losses'],
      dtype='object')

In [27]:
# Keep the name, the record derived from fight_data, and the numeric stats;
# the duplicated name column, nickname, stance, date_of_birth and the
# *_new / *_old helper columns are left out.
fighter_stats = fighter_stats.loc[:, [
    'fighter',
    'wins',
    'losses',
    'plus_minus',
    'draws',
    'height_cm',
    'weight_in_kg',
    'reach_in_cm',
    'significant_strikes_landed_per_minute',
    'significant_striking_accuracy',
    'significant_strikes_absorbed_per_minute',
    'significant_strike_defence',
    'average_takedowns_landed_per_15_minutes',
    'takedown_accuracy',
    'takedown_defense',
    'average_submissions_attempted_per_15_minutes',
    'age',
]]

In [28]:
# Rows with no 'fighter' value came only from fighter_data, so remove them to
# keep just the fighters that appear in the fight table.
fighter_stats = fighter_stats[fighter_stats['fighter'].notna()]

In [29]:
fighter_stats.sort_values('plus_minus', ascending=False).head()

Unnamed: 0,fighter,wins,losses,plus_minus,draws,height_cm,weight_in_kg,reach_in_cm,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes,age
829,Jon Jones,21.0,1.0,20.0,0.0,193.04,112.49,213.36,4.29,57.0,2.22,64.0,1.93,45.0,95.0,0.5,37.0
578,Georges St-Pierre,20.0,2.0,18.0,0.0,180.34,83.91,193.04,3.78,53.0,1.4,72.0,4.16,74.0,83.0,1.1,44.0
75,Amanda Nunes,16.0,2.0,14.0,0.0,172.72,61.23,175.26,4.55,51.0,2.62,57.0,2.73,54.0,82.0,0.7,36.0
1148,Max Holloway,22.0,8.0,14.0,0.0,180.34,65.77,175.26,7.17,47.0,4.75,59.0,0.27,53.0,84.0,0.3,33.0
663,Islam Makhachev,15.0,1.0,14.0,0.0,177.8,70.31,177.8,2.46,60.0,1.27,61.0,3.17,60.0,90.0,1.0,33.0


Okay, now to do some analysis we need to merge this with fight_data to build a predictive model.

We should probably remove the time, or maybe create a new combined round + time column.

In [30]:
# Total elapsed fight time in seconds: the rounds completed before the final
# one (counted as 5 minutes each) plus the clock time within the final round.
# 'time' is an m:ss string, so it is prefixed with '00:' to parse as h:m:s.
# NOTE(review): assumes every round lasts 5 minutes — confirm for all events.
fight_data['fight_time_seconds'] = (
    (fight_data['round'] - 1) * 5 * 60
    + pd.to_timedelta('00:' + fight_data['time']).dt.total_seconds().astype(int)
)

In [31]:
# The raw round/time columns are now captured by fight_time_seconds.
fight_data = fight_data.drop(['round', 'time'], axis='columns')

In [32]:
fight_data.head()

Unnamed: 0,event,fighter_1,fighter_2,result,method,fight_time_seconds
0,UFC 308: Topuria vs. Holloway,Ilia Topuria,Max Holloway,win,KO/TKO,694
1,UFC 308: Topuria vs. Holloway,Khamzat Chimaev,Robert Whittaker,win,SUB,214
2,UFC 308: Topuria vs. Holloway,Magomed Ankalaev,Aleksandar Rakic,win,U-DEC,900
3,UFC 308: Topuria vs. Holloway,Lerone Murphy,Dan Ige,win,U-DEC,900
4,UFC 308: Topuria vs. Holloway,Shara Magomedov,Armen Petrosyan,win,KO/TKO,592


Right now fighter_1 is always the winner (whenever there is one), so we'll need to change that for a predictive model; otherwise every training example has Y=1.

In [33]:
# Keep only the fights that have a winner.
# .copy() makes the filtered frame independent of the original, so assigning
# to its columns afterwards does not raise SettingWithCopyWarning.
fight_data = fight_data[fight_data['result'] == 'win'].copy()

In [34]:
# Encode the target as an integer: result = 1 means fighter_1 won,
# result = 0 means fighter_2 won.
# The whole column is set to 1 here; the 0 rows are created by mirroring later.
fight_data['result'] = 1

In [35]:
# Mirror every fight: swap the two fighters and label the row 0, meaning the
# fighter now listed as fighter_1 lost.
flipped = fight_data.assign(
    fighter_1=fight_data['fighter_2'],
    fighter_2=fight_data['fighter_1'],
    result=0,
)

In [36]:
# Stack the original rows (result = 1) on top of the mirrored rows (result = 0)
# and renumber the index, giving a target that is exactly half 1s and half 0s.
# NOTE(review): every fight now appears twice, once per orientation. A random
# row-level train/test split can therefore put the two copies of the same fight
# on opposite sides of the split — consider splitting by fight instead.
balanced_fight_data = pd.concat([fight_data, flipped], ignore_index=True)

We basically created training symmetry here. Now the model will have examples of what a fighter_1 loss looks like, so it won't always give "win" as the predicted result.

In [37]:
# Attach each fighter's stats to the fight rows, suffixing the stat columns
# with _f1 for fighter_1 and _f2 for fighter_2.
# Both joins are inner joins, so a fight is dropped when either fighter has no
# row in fighter_stats.
fight_stats_dummy = pd.merge(
    balanced_fight_data,
    fighter_stats.add_suffix('_f1'),
    how='inner',
    left_on='fighter_1',
    right_on='fighter_f1',
)
fight_stats = pd.merge(
    fight_stats_dummy,
    fighter_stats.add_suffix('_f2'),
    how='inner',
    left_on='fighter_2',
    right_on='fighter_f2',
)

## Logistic Regression

In [39]:
# Build the modelling table: keep the target and the per-fighter stat columns,
# and drop the identifiers and the outcome details (method, fight time).
model_data = fight_stats.drop(
    columns=[
        'event',
        'fighter_1',
        'fighter_2',
        'method',
        'fight_time_seconds',
        'fighter_f1',
        'fighter_f2',
    ]
)

In [61]:
model_data.head()

Unnamed: 0,result,wins_f1,losses_f1,plus_minus_f1,draws_f1,height_cm_f1,weight_in_kg_f1,reach_in_cm_f1,significant_strikes_landed_per_minute_f1,significant_striking_accuracy_f1,...,reach_in_cm_f2,significant_strikes_landed_per_minute_f2,significant_striking_accuracy_f2,significant_strikes_absorbed_per_minute_f2,significant_strike_defence_f2,average_takedowns_landed_per_15_minutes_f2,takedown_accuracy_f2,takedown_defense_f2,average_submissions_attempted_per_15_minutes_f2,age_f2
0,1,8.0,0.0,8.0,0.0,170.18,65.77,175.26,4.44,46.0,...,175.26,7.17,47.0,4.75,59.0,0.27,53.0,84.0,0.3,33.0
1,0,11.0,2.0,9.0,0.0,172.72,65.77,177.8,3.34,40.0,...,175.26,7.17,47.0,4.75,59.0,0.27,53.0,84.0,0.3,33.0
2,0,12.0,10.0,2.0,0.0,180.34,65.77,187.96,3.83,37.0,...,175.26,7.17,47.0,4.75,59.0,0.27,53.0,84.0,0.3,33.0
3,0,14.0,8.0,6.0,0.0,170.18,61.23,177.8,3.57,46.0,...,175.26,7.17,47.0,4.75,59.0,0.27,53.0,84.0,0.3,33.0
4,0,14.0,8.0,6.0,0.0,170.18,61.23,177.8,3.57,46.0,...,175.26,7.17,47.0,4.75,59.0,0.27,53.0,84.0,0.3,33.0


In [49]:
# For now, discard every row that has a missing value in any column.
model_data = model_data.dropna(axis='index', how='any')

In [50]:
# Features are every column except the target; the target is 'result'.
X = model_data.drop('result', axis='columns')
y = model_data.loc[:, 'result']

In [53]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (StandardScaler removes the mean and scales
# to unit variance).
# NOTE(review): the scaler is fitted on all rows of X here, so rows that end up
# in a held-out test set also contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [54]:
# Split the raw features first, then fit the scaler on the training rows only
# and apply it to both parts. Fitting the scaler on the full data before
# splitting lets the test rows influence the scaling statistics (leakage).
# The same rows land in each part as before: the split depends only on the
# number of rows, test_size and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [59]:
# Fit a logistic regression on the training split, allowing the solver up to
# 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [60]:
## Results for 1000 max iterations
# Predict the held-out rows with the fitted logistic regression.
y_pred = model.predict(X_test)

# Report accuracy, the confusion matrix and per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7518968133535661
Confusion Matrix:
 [[1011  349]
 [ 305  971]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.74      0.76      1360
           1       0.74      0.76      0.75      1276

    accuracy                           0.75      2636
   macro avg       0.75      0.75      0.75      2636
weighted avg       0.75      0.75      0.75      2636



That's not bad for a simple model. It might be better to create an Elo score for the fighters and use it in the modelling. However, I'm not so sure, because the beta coefficient would always favour the fighter with the higher Elo. It's also a little inaccurate, since the number of fights per fighter is always so low (usually < 30). I guess Elo would contribute to the fighter being picked, but the other features would weigh in as well. Side note: I thought a good feature to add could be average fight time. Actually no, I forgot that main event fights are five rounds. I guess maybe fight wins/losses by finish instead.

## Random Forest

Let's try a random forest now on what we have.

In [66]:
# Split the unscaled feature table X for the forest (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model: 1000 trees, with a fixed seed so that
# repeated runs build the same forest.
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42)

Accuracy: 0.7469650986342944
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      1360
           1       0.73      0.76      0.74      1276

    accuracy                           0.75      2636
   macro avg       0.75      0.75      0.75      2636
weighted avg       0.75      0.75      0.75      2636



In [None]:
# Train the model on the training split.
# NOTE(review): this cell has no execution count in the saved notebook —
# re-run the notebook top to bottom to confirm the recorded results.
rf_model.fit(X_train, y_train)

In [None]:
# Make predictions for the held-out rows; y_pred is overwritten with the
# forest's output.
y_pred = rf_model.predict(X_test)

In [67]:
# Report accuracy, the confusion matrix and per-class precision/recall/F1 for
# whatever predictions are currently stored in y_pred.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7469650986342944
Confusion Matrix:
 [[997 363]
 [304 972]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      1360
           1       0.73      0.76      0.74      1276

    accuracy                           0.75      2636
   macro avg       0.75      0.75      0.75      2636
weighted avg       0.75      0.75      0.75      2636



Wow, it did worse. Okay then. I guess it makes sense for a tree-based model, though. Everything should really be expressed as a comparison between fighter 1 and fighter 2: e.g. a reach of 190cm is great if your opponent has 170cm, but not great if your opponent has 200cm. The tree model doesn't work like that, though; it just picks a column and predicts based on its value. I think I need to do some feature engineering to express everything in terms of "difference".

## Feature engineering stat differences between fighters

All differences will be computed as fighter 1 - fighter 2 for consistency.

In [72]:
model_data.columns

Index(['result', 'wins_f1', 'losses_f1', 'plus_minus_f1', 'draws_f1',
       'height_cm_f1', 'weight_in_kg_f1', 'reach_in_cm_f1',
       'significant_strikes_landed_per_minute_f1',
       'significant_striking_accuracy_f1',
       'significant_strikes_absorbed_per_minute_f1',
       'significant_strike_defence_f1',
       'average_takedowns_landed_per_15_minutes_f1', 'takedown_accuracy_f1',
       'takedown_defense_f1',
       'average_submissions_attempted_per_15_minutes_f1', 'age_f1', 'wins_f2',
       'losses_f2', 'plus_minus_f2', 'draws_f2', 'height_cm_f2',
       'weight_in_kg_f2', 'reach_in_cm_f2',
       'significant_strikes_landed_per_minute_f2',
       'significant_striking_accuracy_f2',
       'significant_strikes_absorbed_per_minute_f2',
       'significant_strike_defence_f2',
       'average_takedowns_landed_per_15_minutes_f2', 'takedown_accuracy_f2',
       'takedown_defense_f2',
       'average_submissions_attempted_per_15_minutes_f2', 'age_f2'],
      dtype='object')

In [74]:
# For each listed stat, add a '<stat>_diff' column holding fighter 1's value
# minus fighter 2's value. The list order fixes the order of the new columns.
for stat in [
    'wins',
    'losses',
    'height_cm',
    'reach_in_cm',
    'significant_strikes_landed_per_minute',
    'significant_striking_accuracy',
    'significant_strike_defence',
    'average_takedowns_landed_per_15_minutes',
    'takedown_accuracy',
    'takedown_defense',
    'average_submissions_attempted_per_15_minutes',
    'age',
]:
    model_data[stat + '_diff'] = model_data[stat + '_f1'] - model_data[stat + '_f2']

In [76]:
# Keep only the target and the engineered difference columns; the per-fighter
# *_f1 / *_f2 columns are dropped from model_data at this point.
model_data = model_data.loc[:, [
    'result',
    'wins_diff',
    'losses_diff',
    'height_cm_diff',
    'reach_in_cm_diff',
    'significant_strikes_landed_per_minute_diff',
    'significant_striking_accuracy_diff',
    'significant_strike_defence_diff',
    'average_takedowns_landed_per_15_minutes_diff',
    'takedown_accuracy_diff',
    'takedown_defense_diff',
    'average_submissions_attempted_per_15_minutes_diff',
    'age_diff',
]]

In [77]:
# random forest on the difference features
# X and y are rebuilt here because model_data now holds only the *_diff columns.
X = model_data.drop(columns=['result'])
y = model_data['result']

# Split data into train/test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
# NOTE(review): this forest uses 100 trees while the earlier one used 1000, so
# the two accuracies are not a like-for-like comparison of the feature sets.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7405159332321699
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      1360
           1       0.73      0.75      0.74      1276

    accuracy                           0.74      2636
   macro avg       0.74      0.74      0.74      2636
weighted avg       0.74      0.74      0.74      2636



That's surprising, honestly; I thought it would get better. We can try the regression now, I guess.

In [80]:
# Logistic regression on the difference features.
# The split uses the same arguments and seed as the forest cell above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling from the training rows only, then apply it to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier and predict the held-out rows.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Report held-out performance.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7507587253414264
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.74      0.75      1360
           1       0.73      0.76      0.75      1276

    accuracy                           0.75      2636
   macro avg       0.75      0.75      0.75      2636
weighted avg       0.75      0.75      0.75      2636



Hmm, it looks like 75% is the highest accuracy we get here.