## Data Parsing

In [148]:
import os
import re
import time
import json
import httpx
import random
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from multiprocessing import Pool

In [138]:
def lmap(funcion, iterable):
    """Apply ``funcion`` to every item of ``iterable`` and return the results as a list."""
    return list(map(funcion, iterable))


def lfilter(funcion, iterable):
    """Return a list of the items of ``iterable`` for which ``funcion`` is truthy.

    As with the built-in ``filter``, passing ``None`` as ``funcion`` keeps the
    items that are themselves truthy.
    """
    return list(filter(funcion, iterable))

In [139]:
# Directory layout used by this notebook. Only path strings are built here;
# nothing is created on disk by this cell.
data_dir = "../data/"
raw_dir = os.path.join(data_dir, "raw")

# Sibling sub-directories of the raw folder, one per file format.
raw_csv_dir, raw_json_dir, html_dir = (
    os.path.join(raw_dir, fmt) for fmt in ("csv", "json", "html")
)

# Fighter-related HTML folders sit directly under the HTML root.
fighterlist_html_dir = os.path.join(html_dir, "fighterlist")
fighters_html_dir = os.path.join(html_dir, "fighters")

# "completed" and "upcoming" share the same three-folder layout.
completed_html_dir = os.path.join(html_dir, "completed")
(
    completed_eventlist_html_dir,
    completed_events_html_dir,
    completed_fights_html_dir,
) = (os.path.join(completed_html_dir, page) for page in ("eventlist", "events", "fights"))

upcoming_html_dir = os.path.join(html_dir, "upcoming")
(
    upcoming_eventlist_html_dir,
    upcoming_events_html_dir,
    upcoming_fights_html_dir,
) = (os.path.join(upcoming_html_dir, page) for page in ("eventlist", "events", "fights"))

In [140]:
# Leaf directories to create. os.makedirs also creates any missing parent
# folders, so the intermediate directories do not need to be listed.
dirs = [
    raw_csv_dir,
    raw_json_dir,
    fighters_html_dir,
    fighterlist_html_dir,
    completed_eventlist_html_dir,
    completed_events_html_dir,
    upcoming_eventlist_html_dir,
    upcoming_events_html_dir,
    completed_fights_html_dir,
    upcoming_fights_html_dir,
]

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for folderpath in dirs:
    os.makedirs(folderpath, exist_ok=True)

### Parse Fight page

In [149]:
# Matches "<label>: <value>" text; compiled once because it is applied to
# several elements of every page.
_LABEL_VALUE_RE = re.compile(r"(.*):\s+(.*)")


def _split_label_value(text):
    """Split ``"<label>: <value>"`` text into a ``(label, value)`` tuple.

    Newlines are removed and surrounding whitespace is stripped before matching.
    Returns ``None`` when the text does not contain a colon followed by whitespace.
    """
    m = _LABEL_VALUE_RE.search(text.replace("\n", "").strip())
    return (m.group(1), m.group(2)) if m else None


def _extract_stat_table(tables):
    """Flatten up to two stat tables into a ``{column name: [values, ...]}`` dict.

    Column names come from the ``<thead>`` of ``tables[0]``; an extra "Round"
    column is added that holds "Overall" for rows of the first table and
    "Round <j>" for the j-th row of the second table.

    Raises:
        IndexError: if ``tables`` is empty.
    """
    headers = [th.text.strip() for th in tables[0].find("thead").find_all("th")]
    data_dict = {header: [] for header in headers}
    data_dict["Round"] = []

    for table_idx, table in enumerate(tables[:2]):
        rows = table.find("tbody").find_all("tr")

        for round_no, row in enumerate(rows, start=1):
            # zip() stops at the shorter sequence, so cells are matched to
            # column names positionally.
            for col, cell in zip(data_dict.keys(), row.find_all("td")):
                if col == "Fighter":
                    data_dict[col].extend(a.text.strip() for a in cell.find_all("a"))
                else:
                    for p_elem in cell.find_all("p"):
                        val = p_elem.text.strip()
                        # "---" is the page's placeholder for a missing value.
                        data_dict[col].append(None if val == "---" else val)

            # Two labels per row -- presumably one per fighter, matching the
            # two <a>/<p> entries in each cell; TODO confirm against the markup.
            round_label = "Overall" if table_idx == 0 else f"Round {round_no}"
            data_dict["Round"].extend([round_label, round_label])

    return data_dict


def indiv_fight_data_extractor(fight_id_html):
    """Extract the data of a single fight page into a dict.

    Args:
        fight_id_html: a ``(fight_id, fight_html)`` tuple. It is a single
            argument so the function can be passed directly to ``Pool.map``.

    Returns:
        A dict with, in insertion order: "Event Name", "Event Url", "Fight ID",
        "Fighter<n> Status" / "Fighter<n> Name" / "Fighter<n> Url" for each
        fighter found, "Bout", one entry per "<label>: <value>" detail on the
        page, and -- when the stat tables are present -- "Totals" and
        "Significant Strikes" (each a column -> values dict).

    Raises:
        AttributeError / TypeError / IndexError: if an expected element
        (title, fighter link, details block, ...) is missing from the page.
        Only a missing stat table is tolerated; see below.
    """
    fight_id, fight_html = fight_id_html

    # Parse once, with an explicit parser: omitting `features` makes bs4 guess
    # a parser (with a warning), which can differ between environments.
    soup = BeautifulSoup(fight_html, features="lxml")

    fight_dict = {}

    title_a_elem = soup.find("h2", class_="b-content__title").find("a")
    fight_dict["Event Name"] = title_a_elem.text.strip()
    fight_dict["Event Url"] = title_a_elem["href"]
    fight_dict["Fight ID"] = fight_id

    fighters = soup.find_all("div", class_="b-fight-details__person")
    for idx, fighter in enumerate(fighters, start=1):
        status_elem = fighter.find("i", class_="b-fight-details__person-status")
        fight_dict[f"Fighter{idx} Status"] = status_elem.text.strip()

        a_elem = fighter.find("a", class_="b-link b-fight-details__person-link")
        fight_dict[f"Fighter{idx} Name"] = a_elem.text.strip()
        fight_dict[f"Fighter{idx} Url"] = a_elem["href"]

    fight_dict["Bout"] = soup.find("i", class_="b-fight-details__fight-title").text.strip()

    fight_details_div = soup.find("div", class_="b-fight-details__content")

    # Each label element's parent holds the full "<label>: <value>" text.
    for label_elem in fight_details_div.find_all("i", class_="b-fight-details__label"):
        pair = _split_label_value(label_elem.parent.text)
        if pair is not None:
            label, text = pair
            fight_dict[label] = text

    # The second <p> of the details block is parsed as a whole, as one more
    # "<label>: <value>" pair.
    pair = _split_label_value(fight_details_div.find_all("p")[1].text)
    if pair is not None:
        fight_dict[pair[0]] = pair[1]

    tables = soup.find_all("table")
    try:
        # First two tables -> totals, remaining tables -> significant strikes.
        totals = _extract_stat_table(tables[:2])
        significant_strikes = _extract_stat_table(tables[2:])
    except IndexError:
        # Pages without the expected tables are kept, just without the two
        # stat entries (neither key is set unless both tables parsed).
        print("Table IndexError")
    else:
        fight_dict["Totals"] = totals
        fight_dict["Significant Strikes"] = significant_strikes

    return fight_dict

In [152]:
def all_fight_data_extractor(fights_html_dir, processes=16):
    """Parse every ``.html`` file in ``fights_html_dir`` into a list of fight dicts.

    Args:
        fights_html_dir: directory containing one ``<fight_id>.html`` file per fight.
        processes: number of worker processes used for parsing (default 16).

    Returns:
        A list with one ``indiv_fight_data_extractor`` result per input file,
        ordered by file name.
    """
    suffix = ".html"

    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of the returned list reproducible between runs and machines.
    fight_files = sorted(
        name for name in os.listdir(fights_html_dir) if name.endswith(suffix)
    )

    fights_html_dict = {}
    for filename in tqdm(fight_files):
        filepath = os.path.join(fights_html_dir, filename)
        with open(filepath, "r") as f:
            # Strip only the trailing extension: str.replace() would also
            # remove any ".html" occurring elsewhere in the file name.
            fight_id = filename[: -len(suffix)]
            fights_html_dict[fight_id] = f.read()

    # Each (fight_id, html) pair is parsed in a worker process; Pool.map
    # returns the results in input order.
    with Pool(processes) as p:
        fights_dict_list = p.map(indiv_fight_data_extractor, fights_html_dict.items())

    return fights_dict_list

In [157]:
# Parse every saved .html page under the completed-fights folder into a list of
# per-fight dicts. "Table IndexError" lines in the output mark pages whose stat
# tables could not be extracted; those fights are still included in the list.
completed_fights_dict_list = all_fight_data_extractor(completed_fights_html_dir)

100%|██████████| 6875/6875 [00:00<00:00, 26644.24it/s]


Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError
Table IndexError


In [158]:
# Sanity check: one parsed dict per input file is expected.
len(completed_fights_dict_list)

6875

In [159]:
# Persist the parsed fights as pretty-printed JSON in the raw JSON folder.
filepath = os.path.join(raw_json_dir, "completed_fights_data.json")
with open(filepath, mode="w") as json_file:
    json.dump(obj=completed_fights_dict_list, fp=json_file, indent=4)