### Scrape Fighters Data

In [5]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
scripts_path = os.path.join(parent_dir, "scripts")

if scripts_path not in sys.path:
  sys.path.insert(0, scripts_path)

In [6]:
import requests
import bs4
from helpers import cached_request,base_path
import string
import pandas as pd
# Set up autoreload for modules
%load_ext autoreload
%autoreload 2

In [7]:
BASE_URL = "http://ufcstats.com/statistics/fighters"

In [8]:
# Remove BeautifulSoup warnings
from bs4 import MarkupResemblesLocatorWarning
import warnings

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [None]:
# The page is organized by fighters names so I decided to
# Loop through different pages using an alphabets list
alphabets = list(string.ascii_lowercase)
for letter in alphabets:
    fighters_DOM_per_letter = cached_request(
        f"{BASE_URL}?char={letter}")
    soup = bs4.BeautifulSoup(fighters_DOM_per_letter, 'html.parser')
    pagination = soup.find('ul', class_='b-statistics__paginate')
    n_pasges = 1
    if pagination:
        n_pages = len(pagination.find_all('li')) if pagination else 1
    # Initialize the fighters dict
    if letter == 'a':
        headers = ['Fighter_Id']+[th.get_text(strip=True)
                                  for th in soup.select("table thead th")]
        fighters_data = {header.title(): [] for header in headers}

    # Not adding 1 to the number of list items because
    # there is a link for "all"
    # for i in range(1, n_pages+1):
    # The enumerated pages does not have all the fighters listed. I noticed that when I looked for Khabib & didn't find him
    current_page = cached_request(
        f"{BASE_URL}?char={letter}&page=all")
    soup_1 = bs4.BeautifulSoup(current_page, 'html.parser')

    # Scraping fighters tabular data
    for row in soup_1.select("table tbody tr"):
        cells = row.find_all('td', class_='b-statistics__table-col')

        if len(cells) == 0:
            continue

        while len(cells) < len(headers):
            cells.append(None)

        fighter_id = cells[0].select_one('a')['href'].split('/')[-1]
        fighters_data['Fighter_Id'].append(fighter_id)
        for header, cell in zip(headers[1:], cells):
            header = header.title()
            if cell:
                if header == 'Belt':
                    if len(cell.find_all()):
                        fighters_data[header].append(True)
                    else:
                        fighters_data[header].append(False)
                    continue
                fighters_data[header].append(cell.get_text(strip=True))
            else:
                fighters_data[header].append(None)

In [10]:
fighters_df = pd.DataFrame(fighters_data)

In [11]:
fighters_df.head()

Unnamed: 0,Fighter_Id,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,93fe7332d16c6ad9,Tom,Aaron,,--,155 lbs.,--,,5,3,0,False
1,15df64c02b6b0fde,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,False
2,59a9d6dac61c2540,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,False
3,4961467134abd8be,Darion,Abbey,,"6' 2""",265 lbs.,"80.0""",Orthodox,9,5,0,False
4,b361180739bed4b0,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,False


In [12]:
fighters_df.to_csv(base_path('raw_data/raw_fighters.csv'),index=False)

### Scrape Events

In [13]:
BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"

In [14]:
# This should not be a cashed request because the page is always updating but the url is the same 
events_html = requests.get(BASE_URL).text

In [15]:
def get_event_data(event):
    first_td = event.select_one('td.b-statistics__table-col')
    event_link = first_td.select_one('a.b-link')
    event_name = event_link.get_text(strip=True)
    event_id = event_link['href'].split('/')[-1]
    event_date = first_td.select_one('span.b-statistics__date').get_text(strip=True)

    location = event.select_one('td.b-statistics__table-col_style_big-top-padding').get_text(strip=True)

    return [event_id,event_name,event_date,location]

In [16]:
events_soup = bs4.BeautifulSoup(events_html, 'html.parser')

# Get table headers
events_table_headers = [th.get_text(strip=True)
                        for th in events_soup.select('table thead th')]
# The first header is Name/Date so it is better to split it into separate name & date
events_table_headers = ['event_id']+events_table_headers[0].split(
    '/')+events_table_headers[1:]
events_dict = {header.title(): [] for header in events_table_headers}
events_rows = events_soup.select('table tbody tr.b-statistics__table-row')
# The first element is an empty row
events_rows.pop(0)
for event in events_rows:
    event_details = get_event_data(event)
    for header, cell in zip(events_table_headers, event_details):
        header = header.title()
        events_dict[header].append(cell)

In [17]:
# Storing events in a dataframe 
events_df = pd.DataFrame(events_dict)
events_df.set_index('Event_Id',inplace=True)
events_df.head()

Unnamed: 0_level_0,Name,Date,Location
Event_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,"December 06, 2025","Las Vegas, Nevada, USA"
92c96df8bdab5fea,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025","Doha, Qatar"
8db1b36dde268ef6,UFC 322: Della Maddalena vs. Makhachev,"November 15, 2025","New York City, New York, USA"
6436029b50a9c255,UFC Fight Night: Bonfim vs. Brown,"November 08, 2025","Las Vegas, Nevada, USA"
0e2c2daf11b5d8f2,UFC Fight Night: Garcia vs. Onama,"November 01, 2025","Las Vegas, Nevada, USA"


In [18]:
events_df.to_csv(base_path('raw_data/raw_events.csv'))

### Scrape Events Details (Fights)

In [19]:
def get_fights_data(cells):
    result_flag = cells[0].get_text(strip=True)
    fighters_ids = [a['href'].split('/')[-1]
                    for a in cells[1].select("a.b-link")]
    fighters = [a.get_text(strip=True)
                for a in cells[1].select("a.b-link")]
    kd = [p.get_text(strip=True) for p in cells[2].select("p")]
    strikes = [p.get_text(strip=True) for p in cells[3].select("p")]
    td = [p.get_text(strip=True) for p in cells[4].select("p")]
    sub = [p.get_text(strip=True) for p in cells[5].select("p")]
    weight_class = cells[6].get_text(strip=True)
    method = " ".join([p.get_text(strip=True)
                       for p in cells[7].select("p") if p.get_text(strip=True)])
    round_num = cells[8].get_text(strip=True)
    fight_time = cells[9].get_text(strip=True)

    return [result_flag, *fighters_ids, *fighters, *kd, *strikes, *
            td, *sub, weight_class, method, round_num, fight_time]

In [20]:
def extract_fights_from_event(event):
    first_td = event.select_one('td.b-statistics__table-col')
    event_link = first_td.select_one('a.b-link').get('href')
    # Visit each event link
    event_html = cached_request(event_link)
    event_soup = bs4.BeautifulSoup(event_html, "html.parser")
    # Fights in each event share the same fight_id
    event_id = event_link.split('/')[-1]
    # For each event, fights are arranged in tables
    fights = event_soup.select(
        "table.b-fight-details__table tbody tr.b-fight-details__table-row")
    return (event_id,fights)

In [None]:
# Initializing the dataframe dict
fights_headers = ["Fight_Id",
                  "Win/No Contest/Draw",
                  "Fighter_Id_1",
                  "Fighter_Id_2",
                  "Fighter_1",
                  "Fighter_2",
                  "KD_1",
                  "KD_2",
                  "STR_1",
                  "STR_2",
                  "TD_1",
                  "TD_2",
                  "SUB_1",
                  "SUB_2",
                  "Weight_Class",
                  "Method",
                  "Round",
                  "Fight_Time",
                  "Event_Id"
                  ]
fights_dict = {header: [] for header in fights_headers}

In [22]:
# I already have events_rows
# Loop through them
for event in events_rows:
    event_id,fight_rows = extract_fights_from_event(event)
    for fight_row in fight_rows:
        cells = fight_row.select('td')
        fight_info = get_fights_data(cells)
        fight_id = fight_row['data-link'].split('/')[-1]
        fight_info.insert(0, fight_id)
        fight_info.append(event_id)
        for key, val in zip(fights_headers, fight_info):
            fights_dict[key].append(val)

In [23]:
fights_df = pd.DataFrame(fights_dict)
fights_df.set_index('Fight_Id',inplace=True)
fights_df.head()

Unnamed: 0_level_0,Win/No Contest/Draw,Fighter_1_id,Fighter_2_id,Fighter_1,Fighter_2,KD_1,KD_2,STR_1,STR_2,TD_1,TD_2,SUB_1,SUB_2,Weight_Class,Method,Round,Fight_Time,Event_Id
Fight_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4a0db214d9721d6e,win,d661ce4da776fc20,c03520b5c88ed6b4,Petr Yan,Merab Dvalishvili,0,0,139,134,5,2,0,2,Bantamweight,U-DEC,5,5:00,bd92cf5da5413d2a
dfa692db6d39330c,win,17e97649403ba428,a0f0004aadf10b71,Joshua Van,Alexandre Pantoja,0,0,2,6,0,0,0,0,Flyweight,KO/TKO,1,0:26,bd92cf5da5413d2a
fbbb9e72900b71f5,win,4461d7e47375a895,792be9a24df82ed6,Tatsuro Taira,Brandon Moreno,0,0,28,9,1,0,0,1,Flyweight,KO/TKO Punches,2,2:24,bd92cf5da5413d2a
1dc29f4c6fcdd356,win,6e743a33d56bdaa4,056c493bbd76a918,Payton Talbott,Henry Cejudo,1,0,134,60,3,1,0,0,Bantamweight,U-DEC,3,5:00,bd92cf5da5413d2a
6d6ab10cbaa45e8c,drawdraw,99df7d0a2a08a8a8,ef5dcb10d2bd4b0f,Jan Blachowicz,Bogdan Guskov,1,1,84,80,0,0,1,0,Light Heavyweight,M-DEC,3,5:00,bd92cf5da5413d2a


In [24]:
fights_df.to_csv(base_path('raw_data/raw_fights.csv'))

There are more details about fights at the "/fight-details" route

### Scrape Fights Details

In [25]:
def extract_cells(table,fight_dict={},cols_to_ignore=0):
    cells = table.select('td')
    del cells[:cols_to_ignore]
    
    # Format table headers
    headers = [formatted_header.title() for header in table.select('th') for formatted_header in (
        f"{header.get_text(strip=True)}_1", f"{header.get_text(strip=True)}_2")]
    del headers[:cols_to_ignore*2]

    cells = [s for cell in cells for s in cell.stripped_strings]

    for header, cell in zip(headers, cells):
        fight_dict[header] = cell

In [26]:
def parse_fight_details(fight_html):
    fight_dict = {}
    
    soup = bs4.BeautifulSoup(fight_html, 'html.parser')

    # Get Result for each fighter W/L
    fighters_result_div = soup.select('.b-fight-details__person')
    fight_dict['Result_1'] = fighters_result_div[0].select_one(
        '.b-fight-details__person-status').get_text(strip=True)
    fight_dict['Result_2'] = fighters_result_div[1].select_one(
        '.b-fight-details__person-status').get_text(strip=True)

    fight_details_text = soup.select_one('.b-fight-details__content')
    paras = fight_details_text.find_all('p')

    if paras:
        # Get referee name and fight's time format
        first_para = paras[0]
        first_para_i_tags = first_para.find_all('i', recursive=False)
        fight_dict['Time Format'] = first_para_i_tags[3].get_text(
            strip=True).split(':')[1]
        fight_dict['Referee'] = first_para_i_tags[4].get_text(strip=True).split(':')[
            1]

        second_para = paras[1]
        # Remove the i tag in order to get only the text of the method details
        second_para.select_one('i').decompose()
        fight_dict['Method Details'] = second_para.get_text(strip=True)

    tables_soup = soup.select('table')
    # Ignoring this condition cost me waiting for 47m 28.6s then getting an error  :(
    if tables_soup:

        # Totals table
        totals_table = tables_soup[0]
        extract_cells(totals_table,fight_dict)
        
        # Significant strikes table
        sig_str_table = tables_soup[2]
        extract_cells(sig_str_table,fight_dict,cols_to_ignore=3)
            
    return fight_dict

In [27]:
# I am printing dicts too much. I need to do it in a prettier way
from pprint import pprint
from tqdm.autonotebook import tqdm

In [28]:
fights_list = []
events_total = len(events_rows)
# Adding tqdm to this loop made the waiting process less boring
for event in tqdm(
    events_rows,
    total=events_total,
    desc="Events",
    unit="event",
    position=0,
    leave=True,
    dynamic_ncols=True,
):
    event_id,fights = extract_fights_from_event(event)
    fights_total = len(fights)
    for fight in tqdm(
        fights,
        total=fights_total,
        desc="Fights",
        unit="fight",
        position=1,
        leave=False,
        dynamic_ncols=True,
    ):
        fight_details_link = fight['data-link']
        fight_details_html = cached_request(fight_details_link)
        fight_dict = parse_fight_details(fight_details_html)
        fight_dict['Fight_Id'] = fight_details_link.split('/')[-1]
        fight_dict['Event_Id'] = event_id
        fights_list.append(fight_dict)

Events: 100%|██████████| 756/756 [16:18<00:00,  1.29s/event]  


In [29]:
details_df = pd.DataFrame(fights_list)
details_df.set_index('Fight_Id',inplace=True)
details_df.to_csv(base_path('raw_data/raw_details.csv'))

### Join Fights with their details

In [30]:
len(details_df) == len(fights_df)

True

In [31]:

# fights_df[['Fighter_1', 'Fighter_2']] = details_df[['Fighter_1', 'Fighter_2']]
# details_df.drop(columns=['Fighter_1', 'Fighter_2'],inplace=True)
# Join fights with their details on fight_id and keep it as the index
combined_df = fights_df.merge(details_df, on='Fight_Id', how='left')
combined_df.head()

Unnamed: 0_level_0,Win/No Contest/Draw,Fighter_1_id,Fighter_2_id,Fighter_1_x,Fighter_2_x,KD_1,KD_2,STR_1,STR_2,TD_1,...,Body_2,Leg_1,Leg_2,Distance_1,Distance_2,Clinch_1,Clinch_2,Ground_1,Ground_2,Event_Id_y
Fight_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4a0db214d9721d6e,win,d661ce4da776fc20,c03520b5c88ed6b4,Petr Yan,Merab Dvalishvili,0,0,139,134,5,...,17 of 19,7 of 8,13 of 16,116 of 353,119 of 204,18 of 30,18 of 24,0 of 0,2 of 2,bd92cf5da5413d2a
dfa692db6d39330c,win,17e97649403ba428,a0f0004aadf10b71,Joshua Van,Alexandre Pantoja,0,0,2,6,0,...,0 of 0,1 of 2,0 of 0,5 of 10,2 of 3,1 of 1,0 of 1,0 of 0,0 of 0,bd92cf5da5413d2a
fbbb9e72900b71f5,win,4461d7e47375a895,792be9a24df82ed6,Tatsuro Taira,Brandon Moreno,0,0,28,9,1,...,3 of 3,1 of 1,2 of 3,7 of 15,7 of 19,0 of 0,2 of 4,2 of 2,19 of 25,bd92cf5da5413d2a
1dc29f4c6fcdd356,win,6e743a33d56bdaa4,056c493bbd76a918,Payton Talbott,Henry Cejudo,1,0,134,60,3,...,34 of 39,17 of 21,11 of 11,51 of 104,90 of 168,7 of 10,28 of 31,2 of 2,16 of 18,bd92cf5da5413d2a
6d6ab10cbaa45e8c,drawdraw,99df7d0a2a08a8a8,ef5dcb10d2bd4b0f,Jan Blachowicz,Bogdan Guskov,1,1,84,80,0,...,2 of 3,17 of 20,9 of 10,74 of 135,52 of 96,2 of 3,0 of 0,8 of 11,28 of 44,bd92cf5da5413d2a


In [32]:
# Using details_df fighter names because they are compatible with the stats order
# If fighter_1_x == fighter_1_y keep the stats as they are and just remove fighter_1_y and keep the fighter_1_x with renaming it to fighter_1
# If fighter_1_x == fighter_2_y . swap only the fights_df columns and keep the details_df columns:
#   col_1,col_2 =  col_2,col_1
cols_to_swap = ['KD_', 'STR_', 'TD_', 'SUB_']

for idx in combined_df.index:
    fighter_1_x = combined_df.loc[idx, 'Fighter_1_x']
    fighter_2_x = combined_df.loc[idx, 'Fighter_2_x']
    fighter_1_y = combined_df.loc[idx, 'Fighter_1_y']
    fighter_2_y = combined_df.loc[idx, 'Fighter_2_y']

    # If fighter_1_x matches fighter_2_y, we need to swap all _y columns
    if fighter_1_x == fighter_2_y and fighter_2_x == fighter_1_y:
        combined_df.loc[idx, ['Fighter_1_x', 'Fighter_2_x']] = combined_df.loc[idx, [
            'Fighter_2_x', 'Fighter_1_x']].values

        # Swap stats columns
        for col in cols_to_swap:
            col1 = f'{col}1'
            col2 = f'{col}2'
            combined_df.loc[idx, [col1, col2]
                            ] = combined_df.loc[idx, [col2, col1]].values

combined_df.drop(columns=['Fighter_1_y', 'Fighter_2_y'], inplace=True)
combined_df.rename(columns={
    'Fighter_1_x': 'Fighter_1',
    'Fighter_2_x': 'Fighter_2'
}, inplace=True)

In [33]:
combined_df.to_csv(base_path('raw_data/raw_fights_detailed.csv'))