# Adding additional features to the current dataset

In [None]:
import pandas as pd
import numpy as np

Load Datasets

In [None]:
# Read the four raw CSV exports, in the same order as the names on the left.
(
    df_rankings_history,
    df_ufc_event_data,
    df_ufc_fight_data,
    df_ufc_fighter_data,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'rankings_history.csv',
        'ufc_event_data.csv',
        'ufc_fight_data.csv',
        'ufc_fighter_data.csv',
    )
)

Merge Fighter Data

In [None]:
# Attach fighter attributes to each fight twice: once for the fighter in
# column 'f_1' and once for the fighter in column 'f_2'.
# Both joins use pandas' default how='inner', so a fight whose fighter id has
# no match in the fighter table is dropped.
df_fight_data_f1 = df_ufc_fight_data.merge(
    df_ufc_fighter_data,
    left_on='f_1',
    right_on='fighter_id',
    suffixes=('_f1', '_f2'),
)

# The fighter columns already present from the first join collide with the
# ones added here, so the existing ones get '_f1' and the new ones '_f2'.
df_fight_data_combined = df_fight_data_f1.merge(
    df_ufc_fighter_data,
    left_on='f_2',
    right_on='fighter_id',
    suffixes=('_f1', '_f2'),
)

Merge Rankings Data

In [None]:
# Parse the ranking snapshot dates into datetimes.
df_rankings_history['date'] = df_rankings_history['date'].pipe(pd.to_datetime)

# NOTE(review): both joins match a fighter's *last name* against the rankings
# 'fighter' column and do not use 'date' at all. If 'fighter' holds full names
# nothing will match; if the rankings table has several rows per fighter each
# fight row is duplicated. Confirm against the rankings schema.

# Left-join rankings for fighter 1; only colliding rankings columns are
# suffixed with '_f1_rank'.
df_fight_data_combined = df_fight_data_combined.merge(
    df_rankings_history,
    how='left',
    left_on='fighter_l_name_f1',
    right_on='fighter',
    suffixes=('', '_f1_rank'),
)

# Left-join rankings for fighter 2; columns that collide with the first
# rankings join are split into '_f1' (existing) and '_f2' (new).
df_fight_data_combined = df_fight_data_combined.merge(
    df_rankings_history,
    how='left',
    left_on='fighter_l_name_f2',
    right_on='fighter',
    suffixes=('_f1', '_f2'),
)

Merge Event Data

In [None]:
# Add event-level columns to every fight row; the left join keeps fights whose
# 'event_id' has no match in the event table.
df_fight_data_final = df_fight_data_combined.merge(
    df_ufc_event_data,
    how='left',
    on='event_id',
)

Clean and create relevant features

In [None]:
# Keep only the columns needed for feature engineering.
# The explicit .copy() makes this an independent frame, so adding columns to it
# in later cells does not raise SettingWithCopyWarning.
df_fight_data_final_cleaned = df_fight_data_final[[
    'fighter_l_name_f1', 'fighter_l_name_f2', 'winner', 'num_rounds', 'title_fight',
    'weight_class', 'fighter_height_cm_f1', 'fighter_height_cm_f2', 'fighter_reach_cm_f1',
    'fighter_reach_cm_f2', 'rank_f1', 'rank_f2', 'event_name', 'event_city', 'event_country'
]].copy()

Feature Engineering

In [None]:
# Sentinel rank given to fighters with no ranking entry.
UNRANKED_RANK = 999

# Build the fighter-1-minus-fighter-2 difference features.
# .assign returns a new frame, which avoids both the SettingWithCopyWarning and
# the chained `df[col].fillna(..., inplace=True)` pattern that pandas 3.0 no
# longer supports. Keyword arguments are evaluated in order, so 'rank_diff'
# sees the already-filled rank columns.
df_fight_data_final_cleaned = df_fight_data_final_cleaned.assign(
    height_diff=lambda d: d['fighter_height_cm_f1'] - d['fighter_height_cm_f2'],
    reach_diff=lambda d: d['fighter_reach_cm_f1'] - d['fighter_reach_cm_f2'],
    rank_f1=lambda d: d['rank_f1'].fillna(UNRANKED_RANK),
    rank_f2=lambda d: d['rank_f2'].fillna(UNRANKED_RANK),
    rank_diff=lambda d: d['rank_f1'] - d['rank_f2'],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fight_data_final_cleaned['height_diff'] = df_fight_data_final_cleaned['fighter_height_cm_f1'] - df_fight_data_final_cleaned['fighter_height_cm_f2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fight_data_final_cleaned['reach_diff'] = df_fight_data_final_cleaned['fighter_reach_cm_f1'] - df_fight_data_final_cleaned['fighter_reach_cm_f2']
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For e

Final Dataset

In [None]:
# Columns exported to the feature file: identifiers, fight metadata, the three
# engineered difference features, and event metadata.
final_feature_columns = [
    'fighter_l_name_f1', 'fighter_l_name_f2',
    'winner', 'num_rounds', 'title_fight', 'weight_class',
    'height_diff', 'reach_diff', 'rank_diff',
    'event_name', 'event_city', 'event_country',
]
df_fight_data_final_with_features = df_fight_data_final_cleaned[final_feature_columns]

In [None]:
# Write the engineered features to disk without the index, then preview the
# first five rows.
df_fight_data_final_with_features.to_csv(
    path_or_buf='merged_fight_data_with_features.csv',
    index=False,
)
df_fight_data_final_with_features.head(n=5)

Unnamed: 0,fighter_l_name_f1,fighter_l_name_f2,winner,num_rounds,title_fight,weight_class,height_diff,reach_diff,rank_diff,event_name,event_city,event_country
0,Fiziev,Gamrot,2884.0,5,F,Lightweight,-5.08,2.54,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
1,Mitchell,Ige,1662.0,3,F,Featherweight,7.62,-2.54,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
2,Rodriguez,Waterson-Gomez,981.0,3,F,Women's Strawweight,7.62,7.62,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
3,Battle,Fletcher,3831.0,3,F,Welterweight,7.62,25.4,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
4,Ramos,Jourdain,2320.0,3,F,Featherweight,0.0,7.62,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA


Load previous dataset

In [None]:
# Load the previously prepared per-fight stat differentials and preview the
# first five rows.
cleaned_fight_data = pd.read_csv(filepath_or_buffer='cleaned_fight_data.csv')
cleaned_fight_data.head(n=5)

Unnamed: 0,winner,strikes_att_diff,strikes_succ_diff,sig_strikes_att_diff,sig_strikes_succ_diff,takedown_att_diff,takedown_succ_diff,submission_att_diff,ctrl_time_diff
0,1,8.0,-2.0,8.0,-2.0,6.0,1.0,0.0,88.0
1,0,17.0,20.0,0.0,7.0,9.0,4.0,1.0,365.0
2,0,13.0,-5.0,15.0,-5.0,-15.0,-5.0,-1.0,-322.0
3,0,58.0,58.0,47.0,52.0,1.0,1.0,0.0,-8.0
4,0,-93.0,-74.0,-70.0,-57.0,5.0,0.0,0.0,46.0


Merge Old and New Datasets

In [None]:
# Place the two frames side by side, pairing rows purely by position after
# resetting both indexes.
# NOTE(review): no fight key is used here, so this assumes both frames have the
# same row order and row count; any mismatch yields misaligned or NaN-padded
# rows. Confirm against how cleaned_fight_data.csv was produced.
side_by_side_frames = [
    cleaned_fight_data.reset_index(drop=True),
    df_fight_data_final_with_features.reset_index(drop=True),
]
merged_final_data = pd.concat(side_by_side_frames, axis='columns')

# When a column name appears in both frames (e.g. 'winner'), keep only the
# first occurrence, i.e. the one from cleaned_fight_data.
is_first_occurrence = ~merged_final_data.columns.duplicated(keep='first')
merged_final_data = merged_final_data.loc[:, is_first_occurrence]
merged_final_data.head(n=5)

Unnamed: 0,winner,strikes_att_diff,strikes_succ_diff,sig_strikes_att_diff,sig_strikes_succ_diff,takedown_att_diff,takedown_succ_diff,submission_att_diff,ctrl_time_diff,fighter_l_name_f1,fighter_l_name_f2,num_rounds,title_fight,weight_class,height_diff,reach_diff,rank_diff,event_name,event_city,event_country
0,1,8.0,-2.0,8.0,-2.0,6.0,1.0,0.0,88.0,Fiziev,Gamrot,5,F,Lightweight,-5.08,2.54,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
1,0,17.0,20.0,0.0,7.0,9.0,4.0,1.0,365.0,Mitchell,Ige,3,F,Featherweight,7.62,-2.54,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
2,0,13.0,-5.0,15.0,-5.0,-15.0,-5.0,-1.0,-322.0,Rodriguez,Waterson-Gomez,3,F,Women's Strawweight,7.62,7.62,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
3,0,58.0,58.0,47.0,52.0,1.0,1.0,0.0,-8.0,Battle,Fletcher,3,F,Welterweight,7.62,25.4,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
4,0,-93.0,-74.0,-70.0,-57.0,5.0,0.0,0.0,46.0,Ramos,Jourdain,3,F,Featherweight,0.0,7.62,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA


Handle missing values by filling with median

In [None]:
# Numeric feature columns that get median imputation.
numerical_columns = [
    'strikes_att_diff', 'strikes_succ_diff',
    'sig_strikes_att_diff', 'sig_strikes_succ_diff',
    'takedown_att_diff', 'takedown_succ_diff',
    'submission_att_diff', 'ctrl_time_diff',
    'height_diff', 'reach_diff', 'rank_diff',
]

# Replace each column's missing values with that column's own median.
column_medians = merged_final_data[numerical_columns].median()
merged_final_data[numerical_columns] = (
    merged_final_data[numerical_columns].fillna(column_medians)
)

Drop rows where critical categorical data is missing

In [None]:
# Discard rows that lack a weight class or a round count.
# Reassigning instead of using inplace=True avoids mutating a frame that was
# derived by column selection, and keeps the cell safe to re-run.
merged_final_data = merged_final_data.dropna(subset=['weight_class', 'num_rounds'])

Encode categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Integer-encode each categorical column with its own LabelEncoder.
# Keeping one fitted encoder per column preserves every label <-> code mapping,
# so any encoded column can later be decoded with
# label_encoders[col].inverse_transform(...). Refitting a single shared encoder
# would keep only the last column's mapping.
label_encoders = {}
for column in ['num_rounds', 'title_fight', 'weight_class']:
    encoder = LabelEncoder()
    merged_final_data[column] = encoder.fit_transform(merged_final_data[column])
    label_encoders[column] = encoder

# Keep `le` bound to the last fitted encoder ('weight_class'), as before.
le = label_encoders['weight_class']
merged_final_data.head()

Unnamed: 0,winner,strikes_att_diff,strikes_succ_diff,sig_strikes_att_diff,sig_strikes_succ_diff,takedown_att_diff,takedown_succ_diff,submission_att_diff,ctrl_time_diff,fighter_l_name_f1,fighter_l_name_f2,num_rounds,title_fight,weight_class,height_diff,reach_diff,rank_diff,event_name,event_city,event_country
0,1,8.0,-2.0,8.0,-2.0,6.0,1.0,0.0,88.0,Fiziev,Gamrot,3,0,6,-5.08,2.54,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
1,0,17.0,20.0,0.0,7.0,9.0,4.0,1.0,365.0,Mitchell,Ige,2,0,2,7.62,-2.54,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
2,0,13.0,-5.0,15.0,-5.0,-15.0,-5.0,-1.0,-322.0,Rodriguez,Waterson-Gomez,2,0,13,7.62,7.62,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
3,0,58.0,58.0,47.0,52.0,1.0,1.0,0.0,-8.0,Battle,Fletcher,2,0,9,7.62,25.4,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA
4,0,-93.0,-74.0,-70.0,-57.0,5.0,0.0,0.0,46.0,Ramos,Jourdain,2,0,2,0.0,7.62,0.0,UFC Fight Night: Fiziev vs. Gamrot,Las Vegas,USA


In [None]:
# Persist the fully merged, imputed and encoded dataset without the index.
merged_final_data.to_csv(
    path_or_buf='final_data.csv',
    index=False,
)