In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Read the three raw CSV exports from the working directory, in the same
# order as the target names on the left-hand side.
df_fight_stats, df_fighter_details, df_ufc_fighter_tott = (
    pd.read_csv(csv_name)
    for csv_name in (
        'ufc_fight_stats.csv',
        'ufc_fighter_details.csv',
        'ufc_fighter_tott.csv',
    )
)

In [3]:
# Normalise the free-text key columns: trim surrounding whitespace and
# lower-case them so equal names compare equal regardless of formatting.
for text_col in ('EVENT', 'BOUT', 'FIGHTER'):
    trimmed = df_fight_stats[text_col].str.strip()
    df_fight_stats[text_col] = trimmed.str.lower()

In [4]:
# Preview the frame after text normalisation; the rendered output below is
# abbreviated to the first and last rows.
df_fight_stats

Unnamed: 0,EVENT,BOUT,ROUND,FIGHTER,KD,SIG.STR.,SIG.STR. %,TOTAL STR.,TD,TD %,SUB.ATT,REV.,CTRL,HEAD,BODY,LEG,DISTANCE,CLINCH,GROUND
0,ufc fight night: covington vs. buckley,colby covington vs. joaquin buckley,Round 1,colby covington,0.0,10 of 51,19%,10 of 51,0 of 2,0%,0.0,0.0,0:22,5 of 42,4 of 6,1 of 3,10 of 51,0 of 0,0 of 0
1,ufc fight night: covington vs. buckley,colby covington vs. joaquin buckley,Round 2,colby covington,0.0,19 of 50,38%,25 of 56,1 of 3,33%,0.0,0.0,1:28,13 of 43,4 of 5,2 of 2,17 of 48,0 of 0,2 of 2
2,ufc fight night: covington vs. buckley,colby covington vs. joaquin buckley,Round 3,colby covington,0.0,8 of 23,34%,36 of 54,0 of 3,0%,0.0,0.0,1:50,8 of 23,0 of 0,0 of 0,6 of 21,0 of 0,2 of 2
3,ufc fight night: covington vs. buckley,colby covington vs. joaquin buckley,Round 1,joaquin buckley,0.0,23 of 55,41%,23 of 55,0 of 0,---,0.0,0.0,0:09,18 of 47,4 of 7,1 of 1,20 of 49,1 of 2,2 of 4
4,ufc fight night: covington vs. buckley,colby covington vs. joaquin buckley,Round 2,joaquin buckley,0.0,28 of 55,50%,29 of 56,0 of 1,0%,0.0,0.0,0:01,20 of 46,6 of 7,2 of 2,24 of 50,4 of 5,0 of 0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37297,ufc 2: no way out,johnny rhodes vs. david levicki,Round 1,david levicki,0.0,4 of 5,80%,95 of 102,0 of 0,---,0.0,0.0,--,4 of 5,0 of 0,0 of 0,1 of 2,2 of 2,1 of 1
37298,ufc 2: no way out,patrick smith vs. ray wizard,Round 1,patrick smith,0.0,1 of 1,100%,1 of 1,0 of 1,0%,1.0,0.0,--,0 of 0,1 of 1,0 of 0,0 of 0,1 of 1,0 of 0
37299,ufc 2: no way out,patrick smith vs. ray wizard,Round 1,ray wizard,0.0,1 of 1,100%,2 of 2,0 of 0,---,0.0,0.0,--,0 of 0,0 of 0,1 of 1,1 of 1,0 of 0,0 of 0
37300,ufc 2: no way out,scott morris vs. sean daugherty,Round 1,scott morris,0.0,1 of 1,100%,2 of 2,1 of 1,100%,1.0,0.0,--,1 of 1,0 of 0,0 of 0,0 of 0,1 of 1,0 of 0


In [5]:
def split_function(df, col_name, attempt_col, land_col):
    """Split a ``"<landed> of <attempted>"`` text column into two count columns.

    The frame is modified in place and also returned.

    Note the positional order of the output-name parameters: ``attempt_col``
    comes *before* ``land_col``. Passing them by keyword avoids mixing them up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str
        Name of the text column holding values such as ``"10 of 51"``.
    attempt_col : str
        Name of the column that receives the number after ``" of "``.
    land_col : str
        Name of the column that receives the number before ``" of "``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``land_col`` and ``attempt_col`` added.
        Rows whose value does not match ``"<digits> of <digits>"`` (including
        missing values) keep ``None`` in both new columns.

    Side effects
    ------------
    Prints the number of non-matching rows when there is at least one.
    """
    # Start from all-missing columns so non-matching rows stay empty.
    df[land_col] = None
    df[attempt_col] = None

    valid_rows = df[col_name].str.contains(r"^\d+ of \d+$", na=False)

    # Guard the split: with no matching rows, ``str.split(expand=True)`` on the
    # empty selection yields a frame with no columns and ``valid_data[0]``
    # would raise KeyError.
    if valid_rows.any():
        valid_data = df.loc[valid_rows, col_name].str.split(" of ", expand=True)
        df.loc[valid_rows, land_col] = pd.to_numeric(valid_data[0], errors='raise')
        df.loc[valid_rows, attempt_col] = pd.to_numeric(valid_data[1], errors='raise')

    invalid_rows = ~valid_rows
    if invalid_rows.any():
        print(invalid_rows.sum())
    return df

def cv_time(time):
    """Convert an ``"M:SS"`` clock string into a total number of seconds.

    Parameters
    ----------
    time : object
        Value to convert. Only ``str`` values of the form
        ``"<digits>:<digits>"`` are converted.

    Returns
    -------
    int or None
        ``minutes * 60 + seconds`` for a well-formed string; ``None`` for
        anything else (non-strings, placeholders such as ``"--"``, strings
        with a different number of ``:`` separators, or non-numeric parts).
    """
    if not isinstance(time, str):
        return None

    parts = time.split(':')
    if len(parts) != 2:
        return None

    minutes, seconds = parts
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as superscript digits that int() cannot parse, which would raise
    # ValueError instead of yielding None.
    if minutes.isdecimal() and seconds.isdecimal():
        return int(minutes) * 60 + int(seconds)
    return None

In [6]:
# Split every "<landed> of <attempted>" text column into two count columns.
#
# split_function's signature is (df, col_name, attempt_col, land_col): the
# attempt name comes BEFORE the landed name. Passing the names positionally in
# "land, attempt" order stored landed counts under *_attempt and attempted
# counts under *_land, so the output names are bound by keyword here.
#
# NOTE(review): "touchdown_attempt" looks like a typo for "takedown_attempt";
# the name is kept so consumers of the exported CSV keep working - confirm
# before renaming.
count_columns = {
    # raw column: (landed column, attempted column)
    "SIG.STR.": ("sig_str_land", "sig_str_attempt"),
    "TOTAL STR.": ("total_str_land", "total_str_attempt"),
    "TD": ("takedown_land", "touchdown_attempt"),
    "LEG": ("total_leg_land", "total_leg_attempt"),
    "DISTANCE": ("total_distance_strike_land", "total_distance_strike_attempt"),
    "BODY": ("total_body_land", "total_body_attempt"),
    "CLINCH": ("total_clinch_land", "total_clinch_attempt"),
    "GROUND": ("total_ground_land", "total_ground_attempt"),
    "HEAD": ("total_head_land", "total_head_attempt"),
}

for raw_col, (landed_name, attempted_name) in count_columns.items():
    df_fight_stats = split_function(
        df_fight_stats,
        raw_col,
        attempt_col=attempted_name,
        land_col=landed_name,
    )



42
42
42
42
42
42
42
42
42


In [7]:
# Remove the original "<landed> of <attempted>" text columns that were passed
# to split_function above.
raw_count_columns = [
    'SIG.STR.',
    'TOTAL STR.',
    'TD',
    'LEG',
    'DISTANCE',
    'BODY',
    'CLINCH',
    'GROUND',
    'HEAD',
]
df_fight_stats = df_fight_stats.drop(columns=raw_count_columns)

In [8]:
# Convert the "M:SS" control-time strings to seconds; values cv_time cannot
# parse (for example "--") come back as missing.
ctrl_clock = df_fight_stats['CTRL']
df_fight_stats['CTRL'] = ctrl_clock.apply(cv_time)


In [9]:
# Convert "NN%" strings to fractions in [0, 1].
#
# The "---" placeholder is mapped to the *string* "0%" so it survives the
# string step and becomes 0.0. Replacing it with the integer 0 (as before)
# made the following .str.replace() turn that entry into NaN, because the
# .str accessor yields NaN for non-string elements.
# regex=False: "%" is a literal character, no pattern matching is needed.
for pct_col in ('TD %', 'SIG.STR. %'):
    pct_text = df_fight_stats[pct_col].replace('---', '0%')
    df_fight_stats[pct_col] = (
        pct_text.str.replace('%', '', regex=False).astype(float) / 100
    )

In [10]:
# Treat missing percentages and missing control time as zero.
for sparse_col in ('TD %', 'SIG.STR. %', 'CTRL'):
    filled = df_fight_stats[sparse_col].fillna(0)
    df_fight_stats[sparse_col] = filled


In [11]:
# Write the processed frame to the working directory, without the index column.
processed_csv = 'ufc_fight_stats_processed.csv'
df_fight_stats.to_csv(processed_csv, index=False)