### Scrape Fighters Data

In [8]:
import requests
import bs4
import helpers
import string
import pandas as pd

In [9]:
# Fighters listing page; the scraping loop appends "?char=<letter>" and
# "&page=<n>" query parameters to this URL.
BASE_URL = "http://ufcstats.com/statistics/fighters"

In [10]:
# The listing is organized by the first letter of the fighter's name
# (?char=<letter>) and every letter is paginated (&page=<n>), so walk each
# letter of the alphabet and each page of that letter.
alphabets = list(string.ascii_lowercase)

# Both are filled in from the first listing page that is fetched.
headers = None
fighters_data = None

for letter in alphabets:
    fighters_DOM_per_letter = helpers.cached_request(
        f"{BASE_URL}?char={letter}")
    soup = bs4.BeautifulSoup(fighters_DOM_per_letter, 'html.parser')

    # Initialize the fighters dict: one list per (title-cased) table column
    if fighters_data is None:
        headers = [th.get_text(strip=True)
                   for th in soup.select("table thead th")]
        columns = [header.title() for header in headers]
        fighters_data = {column: [] for column in columns}

    # The paginator holds one <li> per numbered page plus one for the "all"
    # link, hence the -1. find() returns None when no paginator is rendered,
    # which used to crash on `pagination.find_all`.
    # NOTE(review): a letter without a paginator is assumed to fit on a
    # single page - confirm against the site.
    pagination = soup.find('ul', class_='b-statistics__paginate')
    if pagination is None:
        n_pages = 1
    else:
        n_pages = max(len(pagination.find_all('li')) - 1, 1)

    for i in range(1, n_pages + 1):
        current_page = helpers.cached_request(
            f"{BASE_URL}?char={letter}&page={i}")
        soup_1 = bs4.BeautifulSoup(current_page, 'html.parser')

        # Scraping fighters tabular data
        for row in soup_1.select("table tbody tr"):
            cells = row.find_all('td', class_='b-statistics__table-col')

            # Rows without data cells carry no fighter
            if not cells:
                continue

            # Pad short rows with None so every column list grows by exactly
            # one entry per fighter and the lists stay aligned.
            values = [cell.get_text(strip=True) for cell in cells]
            values += [None] * (len(columns) - len(values))

            for column, value in zip(columns, values):
                fighters_data[column].append(value)

In [None]:
# Turn the column -> list-of-values dict collected by the scraping loop into
# a DataFrame with one row per fighter.
fighters_df = pd.DataFrame(fighters_data)

In [12]:
# Preview the first rows as a sanity check on the scrape.
fighters_df.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,
3,Darion,Abbey,,"6' 2""",265 lbs.,"80.0""",Orthodox,9,5,0,
4,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,


In [None]:
# Persist the raw fighters table; the index is written as the first CSV column.
# NOTE(review): assumes the raw_data/ directory already exists - confirm.
fighters_df.to_csv('raw_data/raw_fighters.csv')

### Scrape Events

In [69]:
# Listing of completed events, requested as a single page (page=all).
# NOTE(review): this rebinds BASE_URL, which the fighters scraping cell also
# reads; re-running that cell after this one would query the events URL.
BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"

In [70]:
# Fetch the completed-events listing (the URL is spelled out literally here
# rather than read from BASE_URL).
# NOTE(review): presumably helpers.cached_request returns the page HTML and
# caches it between runs - confirm in helpers.
events_html = helpers.cached_request("http://ufcstats.com/statistics/events/completed?page=all")

In [71]:
def get_event_data(event):
    """Extract id, name, date and location from one row of the events table.

    Parameters
    ----------
    event
        A table row exposing ``select_one`` (a BeautifulSoup ``<tr>`` tag in
        this notebook). Its first ``td.b-statistics__table-col`` cell must
        contain the event link and the date span.

    Returns
    -------
    list
        ``[event_id, event_name, event_date, location]`` where ``event_id``
        is the last path segment of the event link.
    """
    name_cell = event.select_one('td.b-statistics__table-col')
    anchor = name_cell.select_one('a.b-link')

    name = anchor.get_text(strip=True)
    # The id is whatever follows the last "/" of the link target
    identifier = anchor['href'].rsplit('/', 1)[-1]
    date = name_cell.select_one('span.b-statistics__date').get_text(strip=True)

    location_cell = event.select_one(
        'td.b-statistics__table-col_style_big-top-padding')
    place = location_cell.get_text(strip=True)

    return [identifier, name, date, place]

In [72]:
events_soup = bs4.BeautifulSoup(events_html, 'html.parser')

# Column names come from the table header. Its first entry is the combined
# "Name/Date" header, which is split into two separate columns, and an extra
# leading column is added for the event id.
header_texts = [th.get_text(strip=True)
                for th in events_soup.select('table thead th')]
events_table_headers = [
    'event_id', *header_texts[0].split('/'), *header_texts[1:]]

events_columns = [name.title() for name in events_table_headers]
events_dict = {column: [] for column in events_columns}

events_rows = events_soup.select('table tbody tr.b-statistics__table-row')
# The first element is an empty row, so drop it
del events_rows[0]

# Append every event's values to the matching column lists
for event in events_rows:
    for column, value in zip(events_columns, get_event_data(event)):
        events_dict[column].append(value)

In [101]:
# Build the events DataFrame and key it by the event id
events_df = pd.DataFrame(events_dict).set_index('Event_Id')
events_df.head()

Unnamed: 0_level_0,Name,Date,Location
Event_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
754968e325d6f60d,UFC Fight Night: Walker vs. Zhang,"August 23, 2025","Shanghai, Hebei, China"
421ccfc6ddb17958,UFC 319: Du Plessis vs. Chimaev,"August 16, 2025","Chicago, Illinois, USA"
6cd3dfc54f01287f,UFC Fight Night: Dolidze vs. Hernandez,"August 09, 2025","Las Vegas, Nevada, USA"
f2c934689243fe4e,UFC Fight Night: Taira vs. Park,"August 02, 2025","Las Vegas, Nevada, USA"
28d8638ea0a71908,UFC Fight Night: Whittaker vs. De Ridder,"July 26, 2025","Abu Dhabi, Abu Dhabi, United Arab Emirates"


In [None]:
# Persist the raw events table; the Event_Id index becomes the first CSV column.
# NOTE(review): assumes the raw_data/ directory already exists - confirm.
events_df.to_csv('raw_data/raw_events.csv')

### Scrape Events Details (Fights)

In [105]:
def get_fights_data(cells):
    """Flatten the cells of one fight row into a list of strings.

    Parameters
    ----------
    cells
        Indexable sequence of at least ten cells (``<td>`` tags in this
        notebook): 0 result flag, 1 fighter links, 2-5 per-fighter KD /
        strikes / takedowns / submissions paragraphs, 6 weight class,
        7 method paragraphs, 8 round, 9 time.

    Returns
    -------
    list
        ``[result_flag, *fighters, *kd, *strikes, *td, *sub, weight_class,
        method, round_num, fight_time]``. The per-fighter groups contribute
        as many entries as the row has links / paragraphs.
    """
    def own_text(cell):
        return cell.get_text(strip=True)

    def child_texts(cell, selector):
        return [node.get_text(strip=True) for node in cell.select(selector)]

    outcome = own_text(cells[0])
    names = child_texts(cells[1], "a.b-link")
    knockdowns, strikes, takedowns, submissions = (
        child_texts(cells[position], "p") for position in (2, 3, 4, 5))
    division = own_text(cells[6])
    # Empty paragraphs are skipped so the method reads e.g. "KO/TKO Punches"
    how = " ".join(part for part in child_texts(cells[7], "p") if part)
    round_number = own_text(cells[8])
    clock = own_text(cells[9])

    flattened = [outcome]
    for group in (names, knockdowns, strikes, takedowns, submissions):
        flattened.extend(group)
    flattened.extend([division, how, round_number, clock])
    return flattened

In [106]:
def extract_fights_from_event(event):
    """Fetch an event's page and return its id together with its fight rows.

    Parameters
    ----------
    event
        A row of the events table whose first ``td.b-statistics__table-col``
        cell contains the ``a.b-link`` pointing at the event page.

    Returns
    -------
    tuple
        ``(event_id, fights)``: the last path segment of the event link and
        the fight rows selected from the event page.
    """
    event_url = (
        event.select_one('td.b-statistics__table-col')
        .select_one('a.b-link')
        .get('href')
    )

    # Visit the event page; its fights are laid out as rows of a table
    event_page = bs4.BeautifulSoup(
        helpers.cached_request(event_url), "html.parser")
    fight_rows = event_page.select(
        "table.b-fight-details__table tbody tr.b-fight-details__table-row")

    # All fights of an event share the same event id
    return (event_url.split('/')[-1], fight_rows)

In [111]:
# Column layout of the fights table: the fight id, the values produced by
# get_fights_data (in order), and finally the id of the event.
fights_headers = [
    "Fight_Id",
    "Win/No Contest/Draw",
    "Fighter_1", "Fighter_2",
    "KD_1", "KD_2",
    "STR_1", "STR_2",
    "TD_1", "TD_2",
    "SUB_1", "SUB_2",
    "Weight_Class", "Method", "Round", "Fight_Time",
    "Event_Id",
]

# One independent, initially empty list per column
fights_dict = dict((column, []) for column in fights_headers)

In [112]:
# events_rows is already available from the events section: visit every
# event and record one entry per fight in the column lists.
for event in events_rows:
    event_id, fight_rows = extract_fights_from_event(event)
    for fight_row in fight_rows:
        stats = get_fights_data(fight_row.select('td'))
        # The fight id is the last path segment of the row's details link
        fight_id = fight_row['data-link'].split('/')[-1]
        fight_info = [fight_id, *stats, event_id]
        for key, val in zip(fights_headers, fight_info):
            fights_dict[key].append(val)

In [113]:
# Build the fights DataFrame and key it by the fight id
fights_df = pd.DataFrame(fights_dict).set_index('Fight_Id')
fights_df.head()

Unnamed: 0_level_0,Win/No Contest/Draw,Fighter_1,Fighter_2,KD_1,KD_2,STR_1,STR_2,TD_1,TD_2,SUB_1,SUB_2,Weight_Class,Method,Round,Fight_Time,Event_Id
Fight_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2eecf0c36192e40c,win,Johnny Walker,Zhang Mingyang,1,0,50,20,0,0,0,0,Light Heavyweight,KO/TKO Punches,2,2:37,754968e325d6f60d
58080e8989927500,win,Aljamain Sterling,Brian Ortega,0,0,124,55,3,0,0,0,Catch Weight,U-DEC,5,5:00,754968e325d6f60d
5538d15c4eeac3b3,win,Sergei Pavlovich,Waldo Cortes-Acosta,0,0,61,45,0,0,0,0,Heavyweight,U-DEC,3,5:00,754968e325d6f60d
ce6bd165bef82bd7,win,Sumudaerji,Kevin Borjas,0,0,73,16,0,0,0,0,Flyweight,U-DEC,3,5:00,754968e325d6f60d
1452c20d964c0a63,win,Taiyilake Nueraji,Kiefer Crosbie,0,0,27,2,1,0,0,0,Welterweight,KO/TKO Elbows,1,3:33,754968e325d6f60d


In [None]:
# Persist the raw fights table; the Fight_Id index becomes the first CSV column.
# NOTE(review): assumes the raw_data/ directory already exists - confirm.
fights_df.to_csv('raw_data/raw_fights.csv')

More details about each fight are available on its "/fight-details" page, which the next section scrapes.

### Scrape Fights Details

In [None]:
def extract_cells(table, fight_dict=None, cols_to_ignore=0):
    """Flatten a two-fighter stats table into ``fight_dict``.

    Every ``<th>`` of ``table`` yields two title-cased keys, ``"<Header>_1"``
    and ``"<Header>_2"``. The keys are paired, in order, with the stripped
    strings found in the table's ``<td>`` cells.

    Parameters
    ----------
    table
        Object exposing ``select('th')`` and ``select('td')`` (a
        BeautifulSoup table tag in this notebook).
    fight_dict : dict, optional
        Dictionary updated in place. A fresh dict is created when omitted;
        the previous ``{}`` default was a single object shared by every call.
    cols_to_ignore : int
        Number of leading columns to skip: that many ``<td>`` cells and twice
        as many keys are dropped.

    Returns
    -------
    dict
        ``fight_dict`` itself, so the result is also reachable when no
        dictionary was passed in.
    """
    if fight_dict is None:
        fight_dict = {}

    cells = table.select('td')[cols_to_ignore:]

    # Format table headers: two keys per column, one for each fighter
    headers = []
    for th in table.select('th'):
        label = th.get_text(strip=True)
        headers.extend([f"{label}_1".title(), f"{label}_2".title()])
    headers = headers[cols_to_ignore * 2:]

    # NOTE(review): keys and values are paired purely by position, which
    # assumes every cell holds exactly one string per fighter - confirm.
    values = [text for cell in cells for text in cell.stripped_strings]

    fight_dict.update(zip(headers, values))
    return fight_dict

In [None]:
def parse_fight_details(fight_html):
    """Parse one fight-details page into a flat dictionary.

    Parameters
    ----------
    fight_html
        Markup of the fight-details page, handed to BeautifulSoup.

    Returns
    -------
    dict
        ``Result_1`` / ``Result_2`` (status of each fighter), and, when the
        details block has paragraphs, ``Time Format``, ``Referee`` and
        ``Method Details``. When the page has tables, the cells of the first
        table (totals) and of the third table (significant strikes) are
        added through ``extract_cells``.
    """
    soup = bs4.BeautifulSoup(fight_html, 'html.parser')
    fight_dict = {}

    # Get Result for each fighter W/L
    persons = soup.select('.b-fight-details__person')
    for slot, key in enumerate(('Result_1', 'Result_2')):
        status = persons[slot].select_one('.b-fight-details__person-status')
        fight_dict[key] = status.get_text(strip=True)

    paras = soup.select_one('.b-fight-details__content').find_all('p')
    if paras:
        # Time format and referee are the 4th and 5th direct <i> children of
        # the first paragraph; each reads "<label>:<value>", keep the value.
        labelled = paras[0].find_all('i', recursive=False)
        for key, position in (('Time Format', 3), ('Referee', 4)):
            text = labelled[position].get_text(strip=True)
            fight_dict[key] = text.split(':')[1]

        # Drop the leading <i> tag so only the method details text remains
        method_para = paras[1]
        method_para.select_one('i').decompose()
        fight_dict['Method Details'] = method_para.get_text(strip=True)

    tables = soup.select('table')
    # Some pages have no tables at all, so this guard is required
    if tables:
        # Index 0 is the totals table, index 2 the significant strikes table
        for position in (0, 2):
            extract_cells(tables[position], fight_dict)

    return fight_dict

In [None]:
# I am printing dicts too much. I need to do it in a prettier way
from pprint import pprint
from tqdm.auto import tqdm

In [None]:
# Scrape the details page of every fight of every event into a list of dicts
# (one dict per fight).
fights_list = []
events_total = len(events_rows)
# Adding tqdm to this loop made the waiting process less boring
for event in tqdm(
    events_rows,
    total=events_total,
    desc="Events",
    unit="event",
    position=0,
    leave=True,
    dynamic_ncols=True,
):
    # extract_fights_from_event returns (event_id, fight_rows). Iterating the
    # returned tuple directly would yield the id string and then the whole
    # list of rows, so unpack it and loop over the rows only.
    _event_id, fights = extract_fights_from_event(event)
    fights_total = len(fights)
    for fight in tqdm(
        fights,
        total=fights_total,
        desc="Fights",
        unit="fight",
        position=1,
        leave=False,
        dynamic_ncols=True,
    ):
        fight_details_link = fight['data-link']
        fight_details_html = helpers.cached_request(fight_details_link)
        fight_dict = parse_fight_details(fight_details_html)
        # The fight id is the last path segment of the details link
        fight_dict['fight_id'] = fight_details_link.split('/')[-1]
        fights_list.append(fight_dict)

Events: 100%|██████████| 744/744 [15:23<00:00,  1.24s/event]


In [None]:
# One row per scraped fight-details page, keyed by the fight id, then
# persisted as the raw details table.
details_df = pd.DataFrame(fights_list).set_index('fight_id')
details_df.to_csv('raw_data/raw_details.csv')

### Join Fights with their details

In [None]:
# Sanity check before joining: both tables should hold one row per fight.
len(details_df) == len(fights_df)

True

In [None]:
# Join fights with their details on the fight id, which is the index of both
# frames. fights_df is indexed by 'Fight_Id' and details_df by 'fight_id', so
# there is no shared 'fight_id' key to pass to `on=`; join index-to-index
# instead and name the resulting index 'fight_id'.
# Columns present in both frames get the default '_x' / '_y' suffixes.
combined_df = (
    fights_df
    .merge(details_df, left_index=True, right_index=True, how='left')
    .rename_axis('fight_id')
)
combined_df.head()

Unnamed: 0_level_0,Win/No Contest/Draw,Fighter_1_x,Fighter_2_x,KD_1,KD_2,STR_1,STR_2,TD_1,TD_2,SUB_1,...,Body_1,Body_2,Leg_1,Leg_2,Distance_1,Distance_2,Clinch_1,Clinch_2,Ground_1,Ground_2
fight_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2eecf0c36192e40c,win,Johnny Walker,Zhang Mingyang,1,0,50,20,0,0,0,...,1 of 5,6 of 6,13 of 13,6 of 6,15 of 26,15 of 39,2 of 4,4 of 6,33 of 37,1 of 2
58080e8989927500,win,Aljamain Sterling,Brian Ortega,0,0,124,55,3,0,0,...,20 of 29,21 of 26,5 of 7,26 of 26,54 of 173,100 of 198,0 of 0,7 of 7,1 of 1,17 of 22
5538d15c4eeac3b3,win,Sergei Pavlovich,Waldo Cortes-Acosta,0,0,61,45,0,0,0,...,12 of 14,4 of 10,14 of 15,14 of 19,60 of 131,45 of 114,1 of 2,0 of 3,0 of 0,0 of 0
ce6bd165bef82bd7,win,Sumudaerji,Kevin Borjas,0,0,73,16,0,0,0,...,17 of 25,5 of 7,23 of 29,4 of 7,67 of 121,10 of 37,6 of 8,6 of 7,0 of 0,0 of 0
1452c20d964c0a63,win,Taiyilake Nueraji,Kiefer Crosbie,0,0,27,2,1,0,0,...,2 of 3,1 of 1,0 of 0,0 of 1,2 of 3,2 of 3,0 of 0,0 of 0,25 of 35,0 of 0


In [None]:
# Persist the fights joined with their details; the index becomes the first CSV column.
# NOTE(review): assumes the raw_data/ directory already exists - confirm.
combined_df.to_csv('raw_data/raw_fights_detailed.csv')