## Data Cleaning

In [1]:
import os
import sys
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [2]:
sys.path.append("../src/")

In [3]:
from utilities import *

### Events

In [38]:
def clean_events(df):
    res = df["location"].str.extract(r"(?:(?P<city>.*), )?(?P<state>.*), (?P<country>.*)")
    df = pd.concat([df, res], axis=1).drop("location", axis=1)
    
    df["city"] = df["city"].fillna("Unspecified")
    
    df["name"] = df["name"].astype("string")
    df["date"] = pd.to_datetime(df["date"], format="%B %d, %Y", errors="coerce")\
                .fillna(pd.to_datetime(df["date"], format="%b %d, %Y", errors="coerce"))
    df["url"] = df["url"].astype("string")
    df["city"] = df["city"].astype("category")
    df["state"] = df["state"].astype("category")
    df["country"] = df["country"].astype("category")
    
    return df

In [39]:
filepath = os.path.join(dir_dict["raw_csv"], "completed_events.csv")
completed_events = clean_events(pd.read_csv(filepath))

In [40]:
filepath = os.path.join(dir_dict["raw_csv"], "upcoming_events.csv")
upcoming_events = clean_events(pd.read_csv(filepath))

### Fights

In [104]:
filepath = os.path.join(dir_dict["raw_csv"], "completed_fights_data.csv")
df = pd.read_csv(filepath)
to_drop = [col for col in df.columns if "Details:" in col]
df = df.drop(to_drop, axis=1)

  df = pd.read_csv(filepath)


In [105]:
fighter1_cols = [col for col in df.columns if "Fighter1" in col]
fighter2_cols = [col for col in df.columns if "Fighter2" in col]
general_cols = [col for col in df.columns \
                        if col not in set(fighter1_cols).union(fighter2_cols)]

In [106]:
df2 = df.copy(deep=True)

In [107]:
df = df.drop(fighter2_cols, axis=1)
df2 = df2.drop(fighter1_cols, axis=1)

In [108]:
df.columns = lmap(lambda col: col.replace("Fighter1_",""), df.columns)
df.columns = lmap(lambda col: col.replace("Fighter1","Fighter"), df.columns)

df2.columns = lmap(lambda col: col.replace("Fighter2_",""), df2.columns)
df2.columns = lmap(lambda col: col.replace("Fighter2","Fighter"), df2.columns)

In [109]:
df = pd.concat([df, df2], axis=0)

In [116]:
df.shape

(13750, 116)

In [112]:
df.columns = lmap(lambda col: col.lower().replace(" ","_").replace(".",""), df.columns)

In [113]:
df["fighter_won"] = (df["fighter_status"] == "W").astype("uint8")

In [114]:
df

Unnamed: 0,event_name,event_url,fight_id,fighter_status,fighter_name,fighter_url,bout,method,round,time,...,round5_ctrl,round5_sig_str_ss,round5_sig_str_%_ss,round5_head_ss,round5_body_ss,round5_leg_ss,round5_distance_ss,round5_clinch_ss,round5_ground_ss,fighter_won
0,UFC Fight Night: Hermansson vs. Strickland,http://ufcstats.com/event-details/883922e5cd6d...,0730c4e6fb247fa1,L,Sam Alvey,http://ufcstats.com/fighter-details/d156513a19...,Light Heavyweight Bout,Submission,2,2:10,...,,,,,,,,,,0
1,UFC 100,http://ufcstats.com/event-details/0ee783aa00e4...,55833bf5054126e2,W,Jon Fitch,http://ufcstats.com/fighter-details/6f018c039b...,Welterweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,,,1
2,UFC 137: Penn vs Diaz,http://ufcstats.com/event-details/8788beb52889...,f19a56ecc6c150e3,L,Chris Camozzi,http://ufcstats.com/fighter-details/6c2030e0a1...,Middleweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,,,0
3,UFC 196: McGregor vs Diaz,http://ufcstats.com/event-details/db1f2ed63b54...,0bdea30d7a7322f7,W,Corey Anderson,http://ufcstats.com/fighter-details/5e4eec0889...,Light Heavyweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,,,1
4,UFC Fight Night: Costa vs. Vettori,http://ufcstats.com/event-details/8a9c6c4301f6...,acec5f4c6fee1ae2,W,Jeff Molina,http://ufcstats.com/fighter-details/008ea71027...,Flyweight Bout,KO/TKO,2,0:46,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6870,UFC Fight Night: Teixeira vs Saint Preux,http://ufcstats.com/event-details/b71667c778b6...,bd6d144f5c0b6cab,W,Amanda Nunes,http://ufcstats.com/fighter-details/80fa8218c9...,Women's Bantamweight Bout,Submission,1,2:53,...,,,,,,,,,,1
6871,UFC Fight Night: Figueiredo vs. Benavidez 2,http://ufcstats.com/event-details/ddbd0d6259ce...,147e31abbe3927d0,W,Serghei Spivac,http://ufcstats.com/fighter-details/e2f6b2769a...,Heavyweight Bout,Decision - Majority,3,5:00,...,,,,,,,,,,1
6872,UFC Fight Night: Boetsch vs Henderson,http://ufcstats.com/event-details/06dc1a586635...,5b0fcecc55bebd5c,L,Derrick Lewis,http://ufcstats.com/fighter-details/d3df1add9d...,Heavyweight Bout,KO/TKO,2,0:48,...,,,,,,,,,,0
6873,UFC Fight Night: Cowboy vs. Gaethje,http://ufcstats.com/event-details/4834ff149dc9...,dadaee9624256e07,W,Tristan Connelly,http://ufcstats.com/fighter-details/2ebfbe72ed...,Welterweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,,,1


In [119]:
df.groupby("fighter_name")["fighter_won"].sum().sort_values(ascending=False)

fighter_name
Jim Miller          24
Donald Cerrone      23
Andrei Arlovski     23
Demian Maia         22
Rafael Dos Anjos    21
                    ..
John Marsh           0
John Polakowski      0
John Teixeira        0
Johnny Rees          0
Zviad Lazishvili     0
Name: fighter_won, Length: 2278, dtype: uint8