### Scraping Fighters Data

In [None]:
import requests
import bs4
import helpers
import string
import pandas as pd

In [7]:
# Root URL of the fighters listing; the scraping loop appends the
# ?char=<letter> and &page=<n> query parameters to it.
BASE_URL = "http://ufcstats.com/statistics/fighters"

In [8]:
# The listing is organized alphabetically by fighter name, so walk it one
# letter at a time and, within each letter, one results page at a time.
alphabets = list(string.ascii_lowercase)
headers = []        # raw header texts of the fighters table
columns = []        # title-cased headers, used as the fighters_data keys
fighters_data = {}  # column name -> list of cell values (one per fighter)

for letter in alphabets:
    fighters_DOM_per_letter = helpers.cached_request(
        f"{BASE_URL}?char={letter}")
    soup = bs4.BeautifulSoup(fighters_DOM_per_letter, 'html.parser')

    # Initialize the fighters dict from the first listing that exposes a
    # header row (normally the one for "a").
    if not headers:
        headers = [th.get_text(strip=True)
                   for th in soup.select("table thead th")]
        columns = [header.title() for header in headers]
        fighters_data = {column: [] for column in columns}

    # find() returns None when the listing has no pager at all
    pagination = soup.find('ul', class_='b-statistics__paginate')
    n_pages = len(pagination.find_all('li')) if pagination is not None else 0

    if n_pages > 1:
        # Not adding 1 to the number of list items because
        # there is a link for "all"
        page_soups = (
            bs4.BeautifulSoup(
                helpers.cached_request(f"{BASE_URL}?char={letter}&page={i}"),
                'html.parser')
            for i in range(1, n_pages)
        )
    else:
        # No usable pager: the listing downloaded above is the only page,
        # so parse it instead of silently skipping (or crashing on) the letter.
        page_soups = [soup]

    for page_soup in page_soups:
        # Scraping fighters tabular data
        for row in page_soup.select("table tbody tr"):
            cells = row.find_all('td', class_='b-statistics__table-col')

            # Rows without data cells carry no fighter
            if not cells:
                continue

            values = [cell.get_text(strip=True) for cell in cells]
            # Pad short rows with None so every column gets exactly one
            # value per fighter and the lists stay the same length.
            values += [None] * (len(columns) - len(values))

            for column, value in zip(columns, values):
                fighters_data[column].append(value)

In [9]:
# Assemble the scraped columns into a DataFrame (one row per scraped table row)
fighters_df = pd.DataFrame(fighters_data)

In [10]:
# Preview the first rows to sanity-check the scrape
fighters_df.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,
3,Darion,Abbey,,"6' 2""",265 lbs.,"80.0""",Orthodox,9,5,0,
4,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,


In [11]:
# Persist the fighters table; index=False leaves the positional index out of the file
fighters_df.to_csv('fighters.csv',index=False)

### Scraping Events

In [12]:
# URL of the completed-events listing.
# NOTE(review): this rebinds BASE_URL, which the fighters scraping loop also
# reads; after running this cell, re-run the fighters BASE_URL cell before
# re-running that loop, otherwise it will build URLs from this value.
BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"

In [13]:
# Fetch the HTML of the completed-events listing
# (page=all presumably returns every event in a single response — TODO confirm)
events_html = helpers.cached_request("http://ufcstats.com/statistics/events/completed?page=all")

In [14]:
def get_event_data(event):
    """Pull the name, date and location out of one row of the events table.

    Parameters
    ----------
    event
        Row element exposing ``select_one`` (a BeautifulSoup tag in this
        notebook).

    Returns
    -------
    list of str
        ``[event_name, event_date, location]``, each whitespace-stripped.

    Raises
    ------
    AttributeError
        If any of the expected elements is missing from the row.
    """
    def stripped_text(node, selector):
        # Text of the first descendant of `node` matching `selector`
        return node.select_one(selector).get_text(strip=True)

    # The first column holds both the event link (name) and its date
    name_date_cell = event.select_one('td.b-statistics__table-col')

    return [
        stripped_text(name_date_cell, 'a.b-link'),
        stripped_text(name_date_cell, 'span.b-statistics__date'),
        stripped_text(event, 'td.b-statistics__table-col_style_big-top-padding'),
    ]

In [15]:
events_soup = bs4.BeautifulSoup(events_html, 'html.parser')

# Column names come from the table header. Its first cell combines name and
# date separated by "/", so split that one into two separate columns.
header_texts = [th.get_text(strip=True)
                for th in events_soup.select('table thead th')]
events_table_headers = header_texts[0].split('/') + header_texts[1:]

# Title-cased once up front; these are the keys of events_dict
events_columns = [header.title() for header in events_table_headers]
events_dict = {column: [] for column in events_columns}

events_rows = events_soup.select('table tbody tr.b-statistics__table-row')
# The first element is an empty row, so drop it
del events_rows[0]

for event_row in events_rows:
    row_values = get_event_data(event_row)
    for column, value in zip(events_columns, row_values):
        events_dict[column].append(value)

In [16]:
# Store the scraped events in a DataFrame and preview the first rows
events_df = pd.DataFrame(events_dict)
events_df.head()

Unnamed: 0,Name,Date,Location
0,UFC Fight Night: Walker vs. Zhang,"August 23, 2025","Shanghai, Hebei, China"
1,UFC 319: Du Plessis vs. Chimaev,"August 16, 2025","Chicago, Illinois, USA"
2,UFC Fight Night: Dolidze vs. Hernandez,"August 09, 2025","Las Vegas, Nevada, USA"
3,UFC Fight Night: Taira vs. Park,"August 02, 2025","Las Vegas, Nevada, USA"
4,UFC Fight Night: Whittaker vs. De Ridder,"July 26, 2025","Abu Dhabi, Abu Dhabi, United Arab Emirates"


In [17]:
# Persist the events table; index=False leaves the positional index out of the file
events_df.to_csv('events.csv',index=False)

### Scraping Events Details (Fights)

In [18]:
def get_fights_data(cells):
    """Flatten the cells of one fight row into a fixed-width list of 15 values.

    Parameters
    ----------
    cells
        Sequence of at least ten cell elements (BeautifulSoup tags in this
        notebook), read by position:
        0 result flag, 1 fighter links (``a.b-link``), 2-5 per-fighter
        statistics stored as ``<p>`` elements (the KD, STR, TD and SUB
        columns), 6 weight class, 7 method (``<p>`` elements), 8 round,
        9 fight time.

    Returns
    -------
    list
        ``[result_flag, fighter_1, fighter_2, kd_1, kd_2, str_1, str_2,
        td_1, td_2, sub_1, sub_2, weight_class, method, round, fight_time]``.
        Every per-fighter pair always occupies exactly two slots: missing
        entries are filled with ``None`` and surplus entries are dropped, so
        the values after the pairs can never shift into the wrong column.

    Raises
    ------
    IndexError
        If fewer than ten cells are supplied.
    """
    def _texts(cell, selector):
        # Stripped text of every element in `cell` matching `selector`
        return [node.get_text(strip=True) for node in cell.select(selector)]

    def _pair(values):
        # Force exactly two entries (pad with None / drop extras)
        return (values + [None, None])[:2]

    result_flag = cells[0].get_text(strip=True)
    fighters = _pair(_texts(cells[1], "a.b-link"))
    kd = _pair(_texts(cells[2], "p"))
    strikes = _pair(_texts(cells[3], "p"))
    td = _pair(_texts(cells[4], "p"))
    sub = _pair(_texts(cells[5], "p"))
    weight_class = cells[6].get_text(strip=True)
    # The method is spread over several <p> tags; join the non-empty ones
    method = " ".join(text for text in _texts(cells[7], "p") if text)
    round_num = cells[8].get_text(strip=True)
    fight_time = cells[9].get_text(strip=True)

    return [result_flag, *fighters, *kd, *strikes, *td, *sub,
            weight_class, method, round_num, fight_time]


In [19]:
def extract_fights_from_event(event):
    """Download an event's page and return the rows of its fights table.

    Parameters
    ----------
    event
        Row of the events table (a BeautifulSoup tag); the link in its
        first column points to the event page.

    Returns
    -------
    list
        The ``tr.b-fight-details__table-row`` elements found in the body of
        the event page's ``table.b-fight-details__table``.
    """
    # The event link lives in the first column of the row
    link_cell = event.select_one('td.b-statistics__table-col')
    event_link = link_cell.select_one('a.b-link').get('href')

    # Fetch the event page and parse it
    event_soup = bs4.BeautifulSoup(
        helpers.cached_request(event_link), "html.parser")

    # Each row of the fights table describes one fight
    fight_rows = event_soup.select(
        "table.b-fight-details__table tbody tr.b-fight-details__table-row")
    return fight_rows

In [20]:
# Initializing the dataframe dict
fights_headers = ["Win/No Contest/Draw",
                    "Fighter_1",
                    "Fighter_2",
                    "KD_1",
                    "KD_2",
                    "STR_1",
                    "STR_2",
                    "TD_1",
                    "TD_2",
                    "SUB_1",
                    "SUB_2",
                    "Weight_Class",
                    "Method",
                    "Round",
                    "Fight_Time"
                    ]
fights_dict = {header: [] for header in fights_headers}

In [21]:
# events_rows already holds one row per event: visit every event page and
# collect the values of each of its fights, column by column.

# Start from empty columns so that re-running this cell does not append
# the same fights a second time.
fights_dict = {header: [] for header in fights_headers}

for event in events_rows:
    fight_rows = extract_fights_from_event(event)
    for fight_row in fight_rows:
        cells = fight_row.select('td')
        fight_info = get_fights_data(cells)
        # fight_info is ordered like fights_headers
        for key, val in zip(fights_headers, fight_info):
            fights_dict[key].append(val)
            

In [22]:
# Build the fights table from the collected columns and preview the first rows
fights_df = pd.DataFrame(fights_dict)
fights_df.head()

Unnamed: 0,Win/No Contest/Draw,Fighter_1,Fighter_2,KD_1,KD_2,STR_1,STR_2,TD_1,TD_2,SUB_1,SUB_2,Weight_Class,Method,Round,Fight_Time
0,win,Johnny Walker,Zhang Mingyang,1,0,50,20,0,0,0,0,Light Heavyweight,KO/TKO Punches,2,2:37
1,win,Aljamain Sterling,Brian Ortega,0,0,124,55,3,0,0,0,Catch Weight,U-DEC,5,5:00
2,win,Sergei Pavlovich,Waldo Cortes-Acosta,0,0,61,45,0,0,0,0,Heavyweight,U-DEC,3,5:00
3,win,Sumudaerji,Kevin Borjas,0,0,73,16,0,0,0,0,Flyweight,U-DEC,3,5:00
4,win,Taiyilake Nueraji,Kiefer Crosbie,0,0,27,2,1,0,0,0,Welterweight,KO/TKO Elbows,1,3:33


In [23]:
# Persist the fights table; index=False leaves the positional index out of the file
fights_df.to_csv('fights.csv',index=False)

More details about each fight are available at the "/fight-details" route.

### Scrape Fights Details

In [None]:
def parse_fight_details(fight_html):
    """Parse the HTML of one fight-details page into a flat dict.

    Parameters
    ----------
    fight_html : str
        Markup of a fight-details page.

    Returns
    -------
    dict
        Keys that can be present:

        * ``'Result_1'`` / ``'Result_2'`` - status text of the first two
          ``.b-fight-details__person`` blocks.
        * ``'Time Format'`` / ``'Referee'`` - value part (after the colon) of
          the 4th and 5th direct ``<i>`` children of the first paragraph of
          ``.b-fight-details__content``.
        * ``'Method Details'`` - text of the second paragraph without its
          leading ``<i>`` label.
        * One entry per value of the first table (totals) and of the third
          table (significant strikes), keyed ``"<Header>_1"`` /
          ``"<Header>_2"`` in title case.

        A key is simply left out when the element it comes from is missing,
        so one incomplete page cannot abort a long scraping run.
    """
    def _label_value(tag):
        # "Label:value" -> "value" ('' when there is no colon)
        return tag.get_text(strip=True).partition(':')[2]

    def _paired_headers(table):
        # Every <th> yields two title-cased names, one per fighter
        return [f"{th.get_text(strip=True)}_{n}".title()
                for th in table.select('th') for n in (1, 2)]

    def _flat_strings(cells):
        # All non-empty stripped strings of the cells, in document order
        return [s for cell in cells for s in cell.stripped_strings]

    fight_dict = {}

    soup = bs4.BeautifulSoup(fight_html, 'html.parser')

    # Get Result for each fighter W/L
    persons = soup.select('.b-fight-details__person')
    for n, person in enumerate(persons[:2], start=1):
        status = person.select_one('.b-fight-details__person-status')
        if status is not None:
            fight_dict[f'Result_{n}'] = status.get_text(strip=True)

    content = soup.select_one('.b-fight-details__content')
    paras = content.find_all('p') if content is not None else []

    if paras:
        # Get referee name and fight's time format
        i_tags = paras[0].find_all('i', recursive=False)
        if len(i_tags) > 3:
            fight_dict['Time Format'] = _label_value(i_tags[3])
        if len(i_tags) > 4:
            fight_dict['Referee'] = _label_value(i_tags[4])

    if len(paras) > 1:
        second_para = paras[1]
        # Remove the i tag in order to get only the text of the method details
        label = second_para.select_one('i')
        if label is not None:
            label.decompose()
        fight_dict['Method Details'] = second_para.get_text(strip=True)

    tables_soup = soup.select('table')

    # Pages without statistics tables exist; skipping this check once cost a
    # 47-minute run that ended in an error.
    if tables_soup:
        # Totals table
        totals_table = tables_soup[0]
        # The first header pair and the first cell hold the fighters' names,
        # which were already collected from the event page.
        headers = _paired_headers(totals_table)[2:]
        cells = totals_table.select('td')[1:]
        fight_dict.update(zip(headers, _flat_strings(cells)))

    if len(tables_soup) > 2:
        # Significant strikes table
        sig_str_table = tables_soup[2]
        # Skip the first three columns (six paired headers / three cells);
        # their values are not added to the result again here.
        headers = _paired_headers(sig_str_table)[6:]
        cells = sig_str_table.select('td')[3:]
        fight_dict.update(zip(headers, _flat_strings(cells)))

    return fight_dict


In [25]:
# fights_data = []
# for event in events_rows:
#     fight_rows = extract_fights_from_event(event)
#     # Loop through fights
#     for fight in fight_rows :
#         # Send a GET request to the link in data-link attribute
#         fight_details_link = fight['data-link']
#         fight_details_html = helpers.cached_request(fight_details_link)
#         soup = bs4.BeautifulSoup(fight_details_html, 'html.parser')
#         # Get W/L status for each fighter
#         # Get fight bonuses if they exist
#         # Get referee name
#         # Get more details about how the fight ended (method)
#         # More fight statistics (strikes,ground,control time,...)
#         fights_data.append(parse_fight_details(fight_details_html))

In [None]:
# I am printing dicts too much. I need to do it in a prettier way
from pprint import pprint
from tqdm.auto import tqdm

In [60]:
fights_list = []

# Two nested progress bars make the long wait easier to follow:
# the outer one tracks events, the inner one the fights of the current event.
events_progress = tqdm(
    events_rows,
    total=len(events_rows),
    desc="Events",
    unit="event",
    position=0,
    leave=True,
    dynamic_ncols=True,
)
for event in events_progress:
    fights = extract_fights_from_event(event)
    fights_progress = tqdm(
        fights,
        total=len(fights),
        desc="Fights",
        unit="fight",
        position=1,
        leave=False,
        dynamic_ncols=True,
    )
    for fight in fights_progress:
        # Each fight row carries the URL of its details page in data-link
        fight_details_html = helpers.cached_request(fight['data-link'])
        fights_list.append(parse_fight_details(fight_details_html))

Events: 100%|██████████| 744/744 [15:58<00:00,  1.29s/event]


In [53]:
# One row per parsed fight; keys missing from a fight's dict become NaN
details_df = pd.DataFrame(fights_list)
# index=False keeps this export consistent with fighters.csv, events.csv and
# fights.csv and avoids a stray "Unnamed: 0" column when the file is read back
details_df.to_csv('details_df.csv', index=False)