<a href="https://colab.research.google.com/github/ayanga1998/UFC_Dashboard/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# UFC Dataset Cleaning

The purpose of this notebook is to define a set of functions that clean the data we collected from the UFC Stats website and convert the raw data into a form we can use for further processing.

In [97]:
import pandas as pd
import numpy as np

In [98]:
# Load the sample of collected fight data; the first CSV column is used as the row index.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive mount — confirm);
# the notebook will not run in another environment without editing it.
dataset = pd.read_csv('/content/drive/MyDrive/Data Science/Github/UFC_Project/sample_ufc_dataset.csv', index_col=[0])

# Summarise column names, non-null counts and dtypes of the raw frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 19
Data columns (total 36 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   red_fighter     20 non-null     object
 1   blue_fighter    20 non-null     object
 2   red_kd          20 non-null     int64 
 3   blue_kd         20 non-null     int64 
 4   red_ss          20 non-null     object
 5   blue_ss         20 non-null     object
 6   red_ss_pct      20 non-null     object
 7   blue_ss_pct     20 non-null     object
 8   red_ts          20 non-null     object
 9   blue_ts         20 non-null     object
 10  red_td          20 non-null     object
 11  blue_td         20 non-null     object
 12  red_td_pct      20 non-null     object
 13  blue_td_pct     20 non-null     object
 14  red_sub_att     20 non-null     int64 
 15  blue_sub_att    20 non-null     int64 
 16  red_rev         20 non-null     int64 
 17  blue_rev        20 non-null     int64 
 18  red_ctrl_tim

**Key**

kd: knockdowns \
ss: significant strikes \
td: takedowns \
rev: reversals \
ts: total strikes \
sub_att: submission attempts \
ctrl_time: control time

In [99]:
# Preview the first rows of the raw data before any cleaning.
dataset.head()

Unnamed: 0,red_fighter,blue_fighter,red_kd,blue_kd,red_ss,blue_ss,red_ss_pct,blue_ss_pct,red_ts,blue_ts,red_td,blue_td,red_td_pct,blue_td_pct,red_sub_att,blue_sub_att,red_rev,blue_rev,red_ctrl_time,blue_ctrl_time,red_head,blue_head,red_body,blue_body,red_leg,blue_leg,red_dist,blue_dist,red_clinch,blue_clinch,red_grnd,blue_grnd,result,method,round,time
0,Ken Shamrock,Felix Lee Mitchell,0,0,4 of 4,3 of 3,100%,100%,7 of 7,21 of 22,1 of 2,0 of 0,50%,---,1,0,0,0,--,--,1 of 1,0 of 0,2 of 2,1 of 1,1 of 1,2 of 2,0 of 0,0 of 0,3 of 3,3 of 3,1 of 1,0 of 0,Ken Shamrock,Submission,\n\n Round:\n \n 1\n,\n\n Time:\n \n 4:34\n\...
1,Royce Gracie,Kimo Leopoldo,0,0,2 of 6,6 of 9,33%,66%,17 of 21,6 of 10,0 of 2,1 of 1,0%,100%,1,0,1,1,--,--,1 of 1,3 of 6,0 of 2,2 of 2,1 of 3,1 of 1,0 of 0,0 of 1,2 of 6,3 of 3,0 of 0,3 of 5,Royce Gracie,Submission,\n\n Round:\n \n 1\n,\n\n Time:\n \n 4:40\n\...
2,Harold Howard,Roland Payne,1,0,9 of 12,3 of 4,75%,75%,12 of 15,3 of 4,0 of 1,1 of 2,0%,50%,0,0,0,0,--,--,9 of 12,0 of 0,0 of 0,2 of 2,0 of 0,1 of 2,1 of 4,2 of 2,4 of 4,1 of 2,4 of 4,0 of 0,Harold Howard,KO/TKO,\n\n Round:\n \n 1\n,\n\n Time:\n \n 0:46\n\...
3,Ken Shamrock,Christophe Leninger,0,0,8 of 9,1 of 2,88%,50%,16 of 20,21 of 23,1 of 1,0 of 0,100%,---,0,0,0,0,--,--,5 of 6,1 of 2,3 of 3,0 of 0,0 of 0,0 of 0,0 of 1,1 of 2,0 of 0,0 of 0,8 of 8,0 of 0,Ken Shamrock,KO/TKO,\n\n Round:\n \n 1\n,\n\n Time:\n \n 4:49\n\...
4,Keith Hackney,Emmanuel Yarborough,1,0,34 of 50,1 of 5,68%,20%,36 of 52,4 of 9,0 of 0,0 of 1,---,0%,0,0,0,0,--,--,29 of 45,1 of 5,0 of 0,0 of 0,5 of 5,0 of 0,6 of 8,0 of 2,2 of 3,0 of 0,26 of 39,1 of 3,Keith Hackney,KO/TKO,\n\n Round:\n \n 1\n,\n\n Time:\n \n 1:59\n\...


In [100]:
def string_to_num(data, colname):
    '''
    Split an "X of Y" count column into two integer columns.

    Each value in ``data[colname]`` is expected to be a string such as
    ``'4 of 15'``: the first number is stored as "landed" and the second
    as "thrown".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to split. It is NOT modified.
    colname : str
        Name of the "X of Y" column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``colname`` and with ``<colname>_thrown`` and
        ``<colname>_landed`` appended, in that order.

    Raises
    ------
    ValueError, IndexError
        If a value does not contain two integer tokens.
    '''

    col_thrown = colname + '_thrown'
    col_landed = colname + '_landed'

    # Tokenise each string once: '4 of 15' -> ['4', '15'].
    tokens = data[colname].map(lambda text: text.replace('of', '').split())
    thrown = tokens.map(lambda pair: int(pair[1]))
    landed = tokens.map(lambda pair: int(pair[0]))

    # drop() and assign() both return new frames, so the caller's frame is
    # left untouched (assigning into `data` directly would add the new
    # columns to the caller's frame as a side effect).
    return data.drop(colname, axis=1).assign(**{col_thrown: thrown, col_landed: landed})

def clean_pct(data, col):
    '''
    Convert a percentage-string column to fractions.

    Values such as ``'50%'`` become ``0.5``; the placeholder ``'---'`` is
    treated as ``0``. The column is overwritten on ``data`` itself and the
    same frame is returned.
    '''

    def _to_fraction(raw):
        # Remove the percent sign, map the placeholder to zero, trim spaces.
        digits = raw.replace('%', '').replace('---', '0').strip()
        return int(digits) / 100

    data[col] = data[col].apply(_to_fraction)
    return data

def clean_ctrl_time(data):
    '''
    Convert a single control-time string to an integer number of seconds.

    A value containing ``'--'`` has that marker replaced by ``'0'`` and is
    parsed as an integer; any other value is parsed as ``'M:SS'``.
    '''
    if '--' not in data:
        parts = data.strip().split(':')
        return int(parts[0]) * 60 + int(parts[1])

    # Placeholder for "no control time recorded".
    return int(data.strip().replace('--', '0'))

def get_seconds(value):
    '''
    Convert a raw "Time:" cell to seconds.

    The ``'Time:'`` label and newline characters are removed, surrounding
    whitespace is trimmed, and the remaining ``'M:SS'`` text is converted
    to an integer number of seconds.
    '''
    clock = value.replace('Time:', '').replace('\n', '').strip()
    parts = clock.split(':')
    minutes = int(parts[0])
    seconds = int(parts[1])
    return minutes * 60 + seconds

In [101]:
# Work on a copy so the raw `dataset` frame is never modified by the cleaning
# steps below and this cell can be re-run from the same starting point.
df = dataset.copy()

# Extract strikes thrown and landed and store into columns
cols = ['red_ss', 'blue_ss', 'red_ts', 'blue_ts', 'red_td', 'blue_td', 
        'red_head', 'blue_head', 'red_body', 'blue_body', 'red_leg', 'blue_leg', 'red_dist', 'blue_dist',
        'red_clinch', 'blue_clinch', 'red_grnd', 'blue_grnd']

for col in cols:
    df = string_to_num(df, col)


# Clean percentage columns
pct_cols = ['red_ss_pct', 'blue_ss_pct', 'red_td_pct', 'blue_td_pct']

for col in pct_cols:
    df = clean_pct(df, col)


# Clean round data: strip the "Round:" label and newlines, keep the round number
df['round'] = df['round'].apply(lambda x: int(x.replace('\n','').replace('Round:', '').strip()))


# Clean control time columns (convert to seconds)
ctrl_cols = ['red_ctrl_time', 'blue_ctrl_time']

for col in ctrl_cols:
    df[col] = df[col].apply(clean_ctrl_time)


# Clean time column (convert to seconds)
df['time'] = df['time'].apply(get_seconds)


# Calculate total fight time in seconds: completed rounds plus the time
# elapsed in the final round. A UFC round lasts five minutes (300 s), not 60 s.
# NOTE(review): very early events used other round formats — confirm whether
# those fights need special handling.
ROUND_SECONDS = 5 * 60
df['fight_time'] = (df['round'] - 1) * ROUND_SECONDS + df['time']

In [102]:
# Check the column names, non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 19
Data columns (total 55 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   red_fighter         20 non-null     object 
 1   blue_fighter        20 non-null     object 
 2   red_kd              20 non-null     int64  
 3   blue_kd             20 non-null     int64  
 4   red_ss_pct          20 non-null     float64
 5   blue_ss_pct         20 non-null     float64
 6   red_td_pct          20 non-null     float64
 7   blue_td_pct         20 non-null     float64
 8   red_sub_att         20 non-null     int64  
 9   blue_sub_att        20 non-null     int64  
 10  red_rev             20 non-null     int64  
 11  blue_rev            20 non-null     int64  
 12  red_ctrl_time       20 non-null     int64  
 13  blue_ctrl_time      20 non-null     int64  
 14  result              20 non-null     object 
 15  method              20 non-null     object 
 16  round     