In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

In [2]:
def lmap(funcion, iterable):
    """Apply ``funcion`` to every item of ``iterable`` and return the results as a list."""
    return list(map(funcion, iterable))


def lfilter(funcion, iterable):
    """Return a list of the items of ``iterable`` for which ``funcion`` is truthy.

    As with the built-in ``filter``, passing ``None`` as ``funcion`` keeps the
    items that are themselves truthy.
    """
    return list(filter(funcion, iterable))

In [3]:
# Project folder layout, keyed by a short label.
# Each row is (label, label of the parent folder, sub-folder name); rows are
# ordered so that a parent is always registered before any of its children.
dir_dict = {"data": "../data/"}
for _label, _parent, _leaf in (
    ("raw", "data", "raw"),
    ("mid", "data", "mid"),
    ("raw_csv", "raw", "csv"),
    ("raw_json", "raw", "json"),
    ("html", "raw", "html"),
    ("completed_html", "html", "completed"),
    ("upcoming_html", "html", "upcoming"),
    ("fighterlist_html", "html", "fighterlist"),
    ("fighters_html", "html", "fighters"),
    ("completed_eventlist_html", "completed_html", "eventlist"),
    ("completed_events_html", "completed_html", "events"),
    ("completed_fights_html", "completed_html", "fights"),
    ("upcoming_eventlist_html", "upcoming_html", "eventlist"),
    ("upcoming_events_html", "upcoming_html", "events"),
    ("upcoming_fights_html", "upcoming_html", "fights"),
):
    dir_dict[_label] = os.path.join(dir_dict[_parent], _leaf)
# Do not leave the loop helpers behind in the notebook namespace.
del _label, _parent, _leaf

In [4]:
# Create every configured folder (including missing parents); exist_ok=True
# makes this a no-op for folders that are already there, so the cell is re-runnable.
for folderpath in dir_dict.values():
    os.makedirs(folderpath, exist_ok=True)

In [5]:
# Load the scraped records of completed fights from the raw JSON dump.
# The encoding is stated explicitly so the result does not depend on the
# platform's locale default (which is not UTF-8 everywhere).
filepath = os.path.join(dir_dict["raw_json"], "completed_fights_data.json")
with open(filepath, "r", encoding="utf-8") as f:
    completed_fights_lod = json.load(f)

In [6]:
# Number of fight records loaded from the JSON file.
len(completed_fights_lod)

6875

In [7]:
def process_fight_dict(fight_dict):
    """Flatten the nested stat tables of one fight record into top-level keys.

    For each of the "Totals" and "Significant Strikes" tables present in
    ``fight_dict``, the table is removed from the record and each of its stat
    values is stored back under a key of the form
    ``"<stat>_<round>_<Fighter1|Fighter2>"``. Stats coming from the
    "Significant Strikes" table get ``"_SS"`` appended to the stat name.

    Args:
        fight_dict: Record of a single fight. Must contain "Fighter1 Name" and
            "Fighter2 Name" whenever one of the tables is present. Each table
            is a dict with a "Fighter" entry (the first two items name the
            fighters in the order the table lists them) and a "Round" entry
            (one label per stat value).

    Returns:
        The same ``fight_dict`` object, modified in place.

    Raises:
        Exception: If the first two names in a table's "Fighter" entry are not
            exactly the record's two fighter names (in either order).
    """

    def merge_additional_data(data_header):
        """Move the table stored under ``data_header`` into flat keys of ``fight_dict``."""
        additional_dict = fight_dict.pop(data_header)
        fighter_names = additional_dict["Fighter"]
        # "Round 1" -> "Round1", so the label can be embedded in a key name.
        round_order = [label.replace(" ", "") for label in additional_dict["Round"]]

        name1 = fight_dict["Fighter1 Name"]
        name2 = fight_dict["Fighter2 Name"]
        if fighter_names[0] == name1 and fighter_names[1] == name2:
            fighter_order = ["Fighter1", "Fighter2"]
        elif fighter_names[0] == name2 and fighter_names[1] == name1:
            # The table lists the fighters in the opposite order to the record,
            # so the slots must be swapped; otherwise each fighter's stats
            # would be filed under the other fighter's keys.
            fighter_order = ["Fighter2", "Fighter1"]
        else:
            raise Exception(f"Fighter names not congurent in {data_header} table")

        suffix = "_SS" if data_header == "Significant Strikes" else ""
        # The first and last entries of the table are skipped; presumably these
        # are the "Fighter" and "Round" entries read above — TODO confirm the
        # scraper always emits them in those positions.
        for key, lov in list(additional_dict.items())[1:-1]:
            var = f"{key}{suffix}"
            # Values alternate between the two fighters, hence ``i % 2``.
            for i, v in enumerate(lov):
                fight_dict[f"{var}_{round_order[i]}_{fighter_order[i % 2]}"] = v

    for data_header in ("Totals", "Significant Strikes"):
        if data_header in fight_dict:
            merge_additional_data(data_header)

    return fight_dict

In [8]:
# Flatten every loaded fight record; process_fight_dict returns the record it
# was given, so the entries here are the same objects as in completed_fights_lod.
res_fight_dict_list = [process_fight_dict(fight) for fight in completed_fights_lod]

In [9]:
# Sanity check: one processed record per loaded fight.
len(res_fight_dict_list)

6875

In [10]:
# One row per fight; keys missing from a record become NaN in that row.
fights_df = pd.DataFrame(res_fight_dict_list)

In [11]:
# Total memory footprint of the frame in megabytes (bytes / 10**6);
# deep=True also counts the contents of object (string) columns.
fights_df.memory_usage(deep=True).sum() / 10**6

77.408445

In [12]:
# Persist the flattened fights as CSV; index=False leaves the row index out of the file.
filepath = os.path.join(dir_dict["raw_csv"], "completed_fights_data.csv")
fights_df.to_csv(filepath, index=False)

In [13]:
# Read the CSV written above back in (presumably so dtypes are re-inferred by
# the CSV parser — TODO confirm intent).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that this cell otherwise emits.
fights_df = pd.read_csv(filepath, low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# The event ID is the last path segment of the event URL.
fights_df["Event ID"] = fights_df["Event Url"].map(os.path.basename)

In [15]:
# Each fighter ID is the last path segment of that fighter's URL.
fights_df["Fighter1 ID"] = fights_df["Fighter1 Url"].map(os.path.basename)
fights_df["Fighter2 ID"] = fights_df["Fighter2 Url"].map(os.path.basename)

In [16]:
# The URL columns are no longer needed once the IDs have been extracted.
fights_df = fights_df.drop(
    ["Event Url", "Fighter1 Url", "Fighter2 Url"],
    axis="columns",
)

In [17]:
# Preview the first rows of the fights frame.
fights_df.head()

Unnamed: 0,Event Name,Fight ID,Fighter1 Status,Fighter1 Name,Fighter2 Status,Fighter2 Name,Bout,Method,Round,Time,...,Ground_SS_Round4_Fighter2,Ground_SS_Round5_Fighter1,Ground_SS_Round5_Fighter2,Details: Point Deducted,Details: Two Points Deducted,Details: Technical Decision,Details: Points Deducted,Event ID,Fighter1 ID,Fighter2 ID
0,UFC Fight Night: Covington vs. Woodley,859ca56770ac7672,W,Mackenzie Dern,L,Randa Markos,Women's Strawweight Bout,Submission,1,3:44,...,,,,,,,,831b937811804dad,7447e9f28508106a,4a57ebb14315b251
1,UFC Fight Night: Barboza vs. Lee,6d8ce1abde550bc6,W,Frankie Edgar,L,Cub Swanson,Featherweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,ad99fa5325519169,f2688492b9a525a3,d247691a6c0e9034
2,UFC on FX: Sotiropoulos vs Pearson,d4c85298623130de,W,Mike Pierce,L,Seth Baczynski,Welterweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,e8c170a64dc920ac,236a37d96d476164,fd5b6598a3b70c0a
3,UFC Fight Night: Cejudo vs. Dillashaw,0261adcfc6e49150,W,Gregor Gillespie,L,Yancy Medeiros,Lightweight Bout,KO/TKO,2,4:59,...,,,,,,,,d4da8995fc91e7ef,84ff027394f7e470,813550bc53b15fb0
4,UFC Fight Night: Barboza vs. Chikadze,089a17e450cc98aa,L,Kevin Lee,W,Daniel Rodriguez,Welterweight Bout,Decision - Unanimous,3,5:00,...,,,,,,,,0db9d2486d564a3c,ee9ebceabfd16fa7,8a1f3b5c526cd6e6


In [18]:
# Load the fight-URL listing; the following cells take each fight's weight class from it.
filepath = os.path.join(dir_dict["raw_csv"], "completed_fight_urls.csv")
fights_df2 = pd.read_csv(filepath)

In [19]:
# Derive the join key (last path segment of the fight URL), then discard the URL itself.
fights_df2["Fight ID"] = fights_df2["Fight Url"].map(os.path.basename)
fights_df2 = fights_df2.drop(["Fight Url"], axis="columns")

In [20]:
# Preview the first rows of the weight-class lookup frame.
fights_df2.head()

Unnamed: 0,Weight Class,Fight ID
0,Lightweight,81f287d035190506
1,Heavyweight,1f0254eeb2025e4f
2,Lightweight,4942323a5d5a6594
3,Women's Strawweight,95ead23dc0c73285
4,Featherweight,164f56eb25d6a9b4


In [21]:
# Attach the columns of fights_df2 to each fight; the left join keeps every row of fights_df.
# NOTE(review): this assumes "Fight ID" is unique in fights_df2 — duplicates there would
# silently multiply fight rows. Consider validate="many_to_one" or a row-count check — TODO confirm.
fights_df = fights_df.merge(fights_df2, on="Fight ID", how="left")

In [22]:
# Count the fights whose "Weight Class" is missing after the merge.
fights_df["Weight Class"].isna().sum()

0

In [23]:
# Load the list of completed events from the raw CSV folder.
filepath = os.path.join(dir_dict["raw_csv"], "completed_events.csv")
events_df = pd.read_csv(filepath)

In [24]:
# Preview the first rows of the events frame.
events_df.head()

Unnamed: 0,name,date,location,url
0,UFC Fight Night: Cannonier vs. Strickland,"December 17, 2022","Las Vegas, Nevada, USA",http://ufcstats.com/event-details/56ec58954158...
1,UFC 282: Blachowicz vs. Ankalaev,"December 10, 2022","Las Vegas, Nevada, USA",http://ufcstats.com/event-details/f65a0eb902f9...
2,UFC Fight Night: Thompson vs. Holland,"December 03, 2022","Orlando, Florida, USA",http://ufcstats.com/event-details/b23388ff8ac6...
3,UFC Fight Night: Nzechukwu vs. Cutelaba,"November 19, 2022","Las Vegas, Nevada, USA",http://ufcstats.com/event-details/012fc7cd0779...
4,UFC 281: Adesanya vs. Pereira,"November 12, 2022","New York City, New York, USA",http://ufcstats.com/event-details/b3b6e80b7d5f...


In [25]:
# Derive the join key (last path segment of the event URL), then drop the
# "name" and "url" columns so they are not carried into the later merge.
events_df["Event ID"] = events_df["url"].map(os.path.basename)
events_df = events_df.drop(["name", "url"], axis="columns")

In [26]:
# Preview the first rows of the events frame.
events_df.head()

Unnamed: 0,date,location,Event ID
0,"December 17, 2022","Las Vegas, Nevada, USA",56ec58954158966a
1,"December 10, 2022","Las Vegas, Nevada, USA",f65a0eb902f9476b
2,"December 03, 2022","Orlando, Florida, USA",b23388ff8ac6637b
3,"November 19, 2022","Las Vegas, Nevada, USA",012fc7cd0779c09a
4,"November 12, 2022","New York City, New York, USA",b3b6e80b7d5f8f0d


In [27]:
# Attach the columns of events_df to each fight; the left join keeps every row of fights_df.
# NOTE(review): this assumes "Event ID" is unique in events_df — duplicates there would
# silently multiply fight rows. Consider validate="many_to_one" or a row-count check — TODO confirm.
fights_df = fights_df.merge(events_df, on="Event ID", how="left")

In [28]:
# Count missing values in the last two columns of the frame.
fights_df.iloc[:, -2:].isna().sum()

date        0
location    0
dtype: int64

In [30]:
# Preview the first rows of the fights frame.
fights_df.head()

Unnamed: 0,Event Name,Fight ID,Fighter1 Status,Fighter1 Name,Fighter2 Status,Fighter2 Name,Bout,Method,Round,Time,...,Details: Point Deducted,Details: Two Points Deducted,Details: Technical Decision,Details: Points Deducted,Event ID,Fighter1 ID,Fighter2 ID,Weight Class,date,location
0,UFC Fight Night: Covington vs. Woodley,859ca56770ac7672,W,Mackenzie Dern,L,Randa Markos,Women's Strawweight Bout,Submission,1,3:44,...,,,,,831b937811804dad,7447e9f28508106a,4a57ebb14315b251,Women's Strawweight,"September 19, 2020","Las Vegas, Nevada, USA"
1,UFC Fight Night: Barboza vs. Lee,6d8ce1abde550bc6,W,Frankie Edgar,L,Cub Swanson,Featherweight Bout,Decision - Unanimous,3,5:00,...,,,,,ad99fa5325519169,f2688492b9a525a3,d247691a6c0e9034,Featherweight,"April 21, 2018","Atlantic City, New Jersey, USA"
2,UFC on FX: Sotiropoulos vs Pearson,d4c85298623130de,W,Mike Pierce,L,Seth Baczynski,Welterweight Bout,Decision - Unanimous,3,5:00,...,,,,,e8c170a64dc920ac,236a37d96d476164,fd5b6598a3b70c0a,Welterweight,"December 14, 2012","Gold Coast, Queensland, Australia"
3,UFC Fight Night: Cejudo vs. Dillashaw,0261adcfc6e49150,W,Gregor Gillespie,L,Yancy Medeiros,Lightweight Bout,KO/TKO,2,4:59,...,,,,,d4da8995fc91e7ef,84ff027394f7e470,813550bc53b15fb0,Lightweight,"January 19, 2019","Brooklyn, New York, USA"
4,UFC Fight Night: Barboza vs. Chikadze,089a17e450cc98aa,L,Kevin Lee,W,Daniel Rodriguez,Welterweight Bout,Decision - Unanimous,3,5:00,...,,,,,0db9d2486d564a3c,ee9ebceabfd16fa7,8a1f3b5c526cd6e6,Welterweight,"August 28, 2021","Las Vegas, Nevada, USA"


In [32]:
# Write the fights frame to the "mid" data folder; index=False leaves the row index out of the file.
filepath = os.path.join(dir_dict["mid"], "completed_fights.csv")
fights_df.to_csv(filepath, index=False)