In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

# UFC Matches Analysis

### ufc_event_details
    - EVENT, BOUT, DATE, LOCATION

### ufc_fight_results.csv
    - EVENT, BOUT, OUTCOME, WEIGHTCLASS, METHOD, ROUND, TIME FORMAT

### ufc_fight_stats.csv
    BOUT, ROUND, FIGHTER, KD, SIG.STR., SIG.STR.%, TOTAL STR., TD, TD%, SUB.ATT, REV., CTRL, HEAD, BODY, LEG, DISTANCE, CLINCH, GROUND

### ufc_fighter_details.csv
    FIRST, LAST, NICKNAME

### ufc_fighter_tott.csv
    FIGHTER, HEIGHT, WEIGHT, REACH, STANCE, DOB

In [2]:
df_fight_stats = pd.read_csv('ufc_fight_stats.csv')
df_fighter_details = pd.read_csv('ufc_fighter_details.csv')
df_ufc_event_details = pd.read_csv('ufc_event_details.csv')
df_ufc_fight_results = pd.read_csv('ufc_fight_results.csv')
df_ufc_fighter_tott = pd.read_csv('ufc_fighter_tott.csv')

In [3]:
print(df_fight_stats.info())
print(df_fighter_details.info())
print(df_ufc_event_details.info())
print(df_ufc_fight_results.info())
print(df_ufc_fighter_tott.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37302 entries, 0 to 37301
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EVENT       37302 non-null  object 
 1   BOUT        37302 non-null  object 
 2   ROUND       37260 non-null  object 
 3   FIGHTER     37260 non-null  object 
 4   KD          37260 non-null  float64
 5   SIG.STR.    37260 non-null  object 
 6   SIG.STR. %  37260 non-null  object 
 7   TOTAL STR.  37260 non-null  object 
 8   TD          37260 non-null  object 
 9   TD %        37260 non-null  object 
 10  SUB.ATT     37260 non-null  float64
 11  REV.        37260 non-null  float64
 12  CTRL        37260 non-null  object 
 13  HEAD        37260 non-null  object 
 14  BODY        37260 non-null  object 
 15  LEG         37260 non-null  object 
 16  DISTANCE    37260 non-null  object 
 17  CLINCH      37260 non-null  object 
 18  GROUND      37260 non-null  object 
dtypes: float64(3), object(16)

## Cleaning df_ufc_event_details

In [4]:
df_ufc_event_details = df_ufc_event_details.drop(columns='URL')


In [5]:
df_ufc_event_details['EVENT'] = df_ufc_event_details['EVENT'].str.strip().str.lower()

In [6]:
def cv_date(date_str):
    date_ob = datetime.strptime(date_str,"%B %d, %Y")
    return date_ob.strftime("%m-%d-%y")
df_ufc_event_details['DATE']=df_ufc_event_details['DATE'].apply(cv_date)

In [7]:
df_ufc_event_details

Unnamed: 0,EVENT,DATE,LOCATION
0,ufc fight night: covington vs. buckley,12-14-24,"Tampa, Florida, USA"
1,ufc 310: pantoja vs. asakura,12-07-24,"Las Vegas, Nevada, USA"
2,ufc fight night: yan vs. figueiredo,11-23-24,"Macau, China"
3,ufc 309: jones vs. miocic,11-16-24,"New York City, New York, USA"
4,ufc fight night: magny vs. prates,11-09-24,"Las Vegas, Nevada, USA"
...,...,...,...
710,ufc 6: clash of the titans,07-14-95,"Casper, Wyoming, USA"
711,ufc 5: the return of the beast,04-07-95,"Charlotte, North Carolina, USA"
712,ufc 4: revenge of the warriors,12-16-94,"Tulsa, Oklahoma, USA"
713,ufc 3: the american dream,09-09-94,"Charlotte, North Carolina, USA"


## Cleaning df_ufc_fight_results

In [8]:
df_ufc_fight_results = pd.read_csv('ufc_fight_results.csv')

In [9]:
df_ufc_fight_results

Unnamed: 0,EVENT,BOUT,OUTCOME,WEIGHTCLASS,METHOD,ROUND,TIME,TIME FORMAT,REFEREE,DETAILS,URL
0,UFC Fight Night: Covington vs. Buckley,Colby Covington vs. Joaquin Buckley,L/W,Welterweight Bout,TKO - Doctor's Stoppage,3,4:42,5 Rnd (5-5-5-5-5),Dan Miragliotta,Cut above eye,http://ufcstats.com/fight-details/00c6a2ef07ca...
1,UFC Fight Night: Covington vs. Buckley,Cub Swanson vs. Billy Quarantillo,W/L,Featherweight Bout,KO/TKO,3,1:36,3 Rnd (5-5-5),Jason Herzog,Punch to Head At Distance,http://ufcstats.com/fight-details/752101555408...
2,UFC Fight Night: Covington vs. Buckley,Manel Kape vs. Bruno Silva,W/L,Flyweight Bout,KO/TKO,3,1:57,3 Rnd (5-5-5),Keith Peterson,Punches to Head At Distance,http://ufcstats.com/fight-details/1a635a5e4551...
3,UFC Fight Night: Covington vs. Buckley,Vitor Petrino vs. Dustin Jacoby,L/W,Light Heavyweight Bout,KO/TKO,3,3:44,3 Rnd (5-5-5),Andrew Glenn,Punch to Head At Distance,http://ufcstats.com/fight-details/7b1bc4ff776f...
4,UFC Fight Night: Covington vs. Buckley,Adrian Yanez vs. Daniel Marcos,L/W,Bantamweight Bout,Decision - Split,3,5:00,3 Rnd (5-5-5),Keith Peterson,Derek Cleary 27 - 30.Chris Lee 29 - 28.Eric Co...,http://ufcstats.com/fight-details/5238f6470d05...
...,...,...,...,...,...,...,...,...,...,...,...
7930,UFC 2: No Way Out,Orlando Wiet vs. Robert Lucarelli,W/L,Open Weight Bout,KO/TKO,1,2:50,No Time Limit,John McCarthy,toCorner Stoppage,http://ufcstats.com/fight-details/3b020d4914b4...
7931,UFC 2: No Way Out,Frank Hamaker vs. Thaddeus Luster,W/L,Open Weight Bout,Submission,1,4:52,No Time Limit,John McCarthy,Keylock From Half Guard,http://ufcstats.com/fight-details/d917c8c7461b...
7932,UFC 2: No Way Out,Johnny Rhodes vs. David Levicki,W/L,Open Weight Bout,KO/TKO,1,12:13,No Time Limit,John McCarthy,Punches to Head From GuardSubmission to Strikes,http://ufcstats.com/fight-details/ccee020be2e8...
7933,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,W/L,Open Weight Bout,Submission,1,0:58,No Time Limit,John McCarthy,Guillotine Choke Standing,http://ufcstats.com/fight-details/4b9ae533ccb3...


In [10]:
df_ufc_fight_results = df_ufc_fight_results.drop(columns=['URL','DETAILS','REFEREE'])
df_ufc_fight_results['EVENT'] = df_ufc_fight_results['EVENT'].str.strip().str.lower()
df_ufc_fight_results['BOUT'] = df_ufc_fight_results['BOUT'].str.strip().str.lower()
df_ufc_fight_results['METHOD'] = df_ufc_fight_results['METHOD'].str.strip().str.lower()


In [11]:
def cv_time(time):
    minutes,seconds = time.split(':')
    return int(minutes)*60+int(seconds)

In [12]:
df_ufc_fight_results['TIME'] = df_ufc_fight_results['TIME'].apply(cv_time)
df_ufc_fight_results[['FIGHTER_1','FIGHTER_2']] = df_ufc_fight_results['BOUT'].str.split(' vs. ', expand=True)

In [13]:
df_ufc_fight_results['TIME FORMAT'].unique()

array(['5 Rnd (5-5-5-5-5)', '3 Rnd (5-5-5)', '3 Rnd + OT (5-5-5-5)',
       '2 Rnd (5-5)', '1 Rnd + 2OT (15-3-3)', '1 Rnd + OT (12-3)',
       '1 Rnd (12)', '1 Rnd + OT (15-3)', '1 Rnd (15)',
       '1 Rnd + 2OT (24-3-3)', '1 Rnd (10)', '1 Rnd + OT (27-3)',
       '1 Rnd (18)', '1 Rnd + OT (30-5)', '1 Rnd + OT (30-3)',
       '1 Rnd (20)', '1 Rnd (30)', '1 Rnd + OT (31-5)', 'No Time Limit'],
      dtype=object)

In [14]:
def cv_time_format(time_fm):
    if 'No Time Limit' in time_fm:
        return None, None, None, None
    rounds_info = time_fm.split(' Rnd')
    round_num = int(rounds_info[0])

    if '+' in rounds_info[1]:
        main, overtime = rounds_info[1].split('OT')
        main_round_time = overtime.strip(' ( )').split('-')[0]
        ot_round_time = overtime.strip(' ()').split('-')[1]
        num_ot_rounds = main.split(' ')[2].strip()
        return round_num, main_round_time, num_ot_rounds, ot_round_time
    else:
        main_round_time = rounds_info[1].strip(' ( )').split('-')[0]

        return round_num, main_round_time, 0, 0
df_ufc_fight_results[['Total_Rounds_Format', 'Main_round_time_Format', 'Num_overtime_rounds_Format', 'Overtime_round_time_Format']] = df_ufc_fight_results['TIME FORMAT'].apply(cv_time_format).apply(pd.Series)

In [15]:
df_ufc_fight_results

Unnamed: 0,EVENT,BOUT,OUTCOME,WEIGHTCLASS,METHOD,ROUND,TIME,TIME FORMAT,FIGHTER_1,FIGHTER_2,Total_Rounds_Format,Main_round_time_Format,Num_overtime_rounds_Format,Overtime_round_time_Format
0,ufc fight night: covington vs. buckley,colby covington vs. joaquin buckley,L/W,Welterweight Bout,tko - doctor's stoppage,3,282,5 Rnd (5-5-5-5-5),colby covington,joaquin buckley,5.0,5,0,0
1,ufc fight night: covington vs. buckley,cub swanson vs. billy quarantillo,W/L,Featherweight Bout,ko/tko,3,96,3 Rnd (5-5-5),cub swanson,billy quarantillo,3.0,5,0,0
2,ufc fight night: covington vs. buckley,manel kape vs. bruno silva,W/L,Flyweight Bout,ko/tko,3,117,3 Rnd (5-5-5),manel kape,bruno silva,3.0,5,0,0
3,ufc fight night: covington vs. buckley,vitor petrino vs. dustin jacoby,L/W,Light Heavyweight Bout,ko/tko,3,224,3 Rnd (5-5-5),vitor petrino,dustin jacoby,3.0,5,0,0
4,ufc fight night: covington vs. buckley,adrian yanez vs. daniel marcos,L/W,Bantamweight Bout,decision - split,3,300,3 Rnd (5-5-5),adrian yanez,daniel marcos,3.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7930,ufc 2: no way out,orlando wiet vs. robert lucarelli,W/L,Open Weight Bout,ko/tko,1,170,No Time Limit,orlando wiet,robert lucarelli,,,,
7931,ufc 2: no way out,frank hamaker vs. thaddeus luster,W/L,Open Weight Bout,submission,1,292,No Time Limit,frank hamaker,thaddeus luster,,,,
7932,ufc 2: no way out,johnny rhodes vs. david levicki,W/L,Open Weight Bout,ko/tko,1,733,No Time Limit,johnny rhodes,david levicki,,,,
7933,ufc 2: no way out,patrick smith vs. ray wizard,W/L,Open Weight Bout,submission,1,58,No Time Limit,patrick smith,ray wizard,,,,


In [16]:
df_ufc_fight_results['METHOD'].unique()

array(["tko - doctor's stoppage", 'ko/tko', 'decision - split',
       'decision - unanimous', 'submission', 'could not continue',
       'decision - majority', 'dq', 'overturned', 'other'], dtype=object)

In [17]:
df_ufc_fight_results['WEIGHTCLASS'].unique()

array(['Welterweight Bout', 'Featherweight Bout', 'Flyweight Bout',
       'Light Heavyweight Bout', 'Bantamweight Bout', 'Lightweight Bout',
       "Women's Flyweight Bout", "Women's Strawweight Bout",
       'UFC Flyweight Title Bout', 'Heavyweight Bout',
       'Catch Weight Bout', 'UFC Heavyweight Title Bout',
       'Middleweight Bout', "Women's Bantamweight Bout",
       'UFC Light Heavyweight Title Bout',
       "UFC Women's Bantamweight Title Bout",
       'UFC Bantamweight Title Bout', "UFC Women's Flyweight Title Bout",
       'UFC Middleweight Title Bout', 'UFC Welterweight Title Bout',
       'UFC Interim Heavyweight Title Bout', 'UFC Lightweight Title Bout',
       "UFC Women's Strawweight Title Bout",
       'UFC Featherweight Title Bout', "Women's Featherweight Bout",
       'UFC Interim Featherweight Title Bout',
       'UFC Interim Flyweight Title Bout',
       'UFC Interim Bantamweight Title Bout',
       "UFC Women's Featherweight Title Bout",
       'UFC Interim Lig

In [18]:
df_ufc_fight_results['gender'] = df_ufc_fight_results['WEIGHTCLASS'].apply(lambda x: 1 if 'Women' in x else 0)
df_ufc_fight_results['is_title_bout'] = df_ufc_fight_results['WEIGHTCLASS'].apply(lambda x: 1 if 'Title' in x else 0)

In [19]:
weight_class_list = [
    'Lightweight', 'Strawweight', 'Featherweight', 'Flyweight', 'Bantamweight', 
    'Welterweight', 'Middleweight', 'Light Heavyweight', 'Heavyweight', 
    'Super Heavyweight', 'Catch Weight', 'Open Weight'
]
def cv_weight_class(weight_class):
    for wc in weight_class_list:
        if wc in weight_class:
            return wc
    return None

df_ufc_fight_results['weight_class'] = df_ufc_fight_results['WEIGHTCLASS'].apply(cv_weight_class)


In [20]:
check = df_ufc_fight_results[df_ufc_fight_results['weight_class'].isna()]
check['EVENT']

7821       ufc - ultimate ultimate '96
7839            ufc 10: the tournament
7848         ufc 9: motor city madness
7855           ufc 8: david vs goliath
7856           ufc 8: david vs goliath
7864       ufc - ultimate ultimate '95
7873       ufc 7: the brawl in buffalo
7874       ufc 7: the brawl in buffalo
7884        ufc 6: clash of the titans
7885        ufc 6: clash of the titans
7894    ufc 5: the return of the beast
7895    ufc 5: the return of the beast
7904    ufc 4: revenge of the warriors
7914         ufc 3: the american dream
7920                 ufc 2: no way out
Name: EVENT, dtype: object

#### Drop all the superfight championship since there was no weightclass being introduced back in the day these matches took place

In [21]:
df_ufc_fight_results = df_ufc_fight_results[~df_ufc_fight_results['WEIGHTCLASS'].str.contains('Superfight Championship Bout', na=False)]

In [25]:
df_ufc_fight_results['fighter_1_result'] = df_ufc_fight_results['OUTCOME'].apply(lambda x: 'Win' if x.startswith('W') else 'L')
df_ufc_fight_results['fighter_2_results'] = df_ufc_fight_results['OUTCOME'].apply(lambda x: 'Win' if x.startswith('W') else 'L')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ufc_fight_results['fighter_1_result'] = df_ufc_fight_results['OUTCOME'].apply(lambda x: 'Win' if x.startswith('W') else 'L')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ufc_fight_results['fighter_2_results'] = df_ufc_fight_results['OUTCOME'].apply(lambda x: 'Win' if x.startswith('W') else 'L')


In [27]:
df_ufc_fight_results.isnull().sum()

EVENT                          0
BOUT                           0
OUTCOME                        0
WEIGHTCLASS                    0
METHOD                         0
ROUND                          0
TIME                           0
TIME FORMAT                    0
FIGHTER_1                      0
FIGHTER_2                      0
Total_Rounds_Format           31
Main_round_time_Format        31
Num_overtime_rounds_Format    31
Overtime_round_time_Format    31
gender                         0
is_title_bout                  0
weight_class                  10
fighter_1_result               0
fighter_2_results              0
dtype: int64

In [28]:
df_ufc_fight_results.to_csv('ufc_fight_results_processed.csv')