In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os # Added import for os module

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Configuration for scraping --- #
# True  -> only events/fights that are not already stored in the CSVs are scraped.
# False -> every event is re-scraped and the stored data is rebuilt from scratch.
scrape_new_data_only = False
events_csv_path = '/content/drive/MyDrive/df_all_events.csv'

# --- Load existing data to check for new events --- #
existing_df_all_events = pd.DataFrame()
existing_event_urls = set() # Using a set for efficient lookup

if os.path.exists(events_csv_path):
    try:
        existing_df_all_events = pd.read_csv(events_csv_path)
        existing_event_urls = set(existing_df_all_events['event_url'])
        print(f"Loaded {len(existing_df_all_events)} existing events from {events_csv_path}")
    except Exception as e:
        # Best effort: an unreadable or malformed CSV must not stop the notebook.
        # Reset BOTH variables so a partially successful load (file read, but no
        # 'event_url' column) cannot leave the frame populated while the URL set is empty.
        existing_df_all_events = pd.DataFrame()
        existing_event_urls = set()
        print(f"Error loading existing events CSV: {e}. Starting with empty existing events.")
else:
    print("No existing events CSV found. Starting with empty existing events.")

Loaded 755 existing events from /content/drive/MyDrive/df_all_events.csv


# Define Functions

In [None]:
def scrape_events_from_page(soup_obj):
    """
    Extracts event names, URLs, and dates from a BeautifulSoup object representing a UFC events page.

    Rows that carry the "next" icon image are skipped, as are rows whose first cell
    lacks a link (with an href) or a date span.

    Args:
        soup_obj (BeautifulSoup): The BeautifulSoup object of the page. A falsy value yields [].
    Returns:
        list: A list of dictionaries, each containing 'event_name', 'event_url', and 'event_date'.
    """
    next_icon_src = 'http://1e49bc5171d173577ecd-1323f4090557a33db01577564f60846c.r80.cf1.rackcdn.com/next.png'

    if not soup_obj:
        return []

    container = soup_obj.find('div', class_='b-statistics__sub-entry')
    events_table = container.find('table', class_='b-statistics__table-events') if container else None
    if not events_table:
        return []

    # The markup may expose <tr> directly under <table>; fall back to the table
    # itself instead of calling .find_all on None.
    tbody = events_table.find('tbody')
    rows = (tbody if tbody is not None else events_table).find_all('tr')

    event_data = []
    for row in rows:
        # Check if the row contains the specified image. If it does, skip this row.
        if row.find('img', src=next_icon_src, class_='b-statistics__icon'):
            continue

        first_td = row.find('td')
        if not first_td:
            continue

        event_link_tag = first_td.find('a')
        event_date_tag = first_td.find('span', class_='b-statistics__date')
        if not (event_link_tag and event_date_tag):
            continue

        # The URL is the key used downstream, so an anchor without href is unusable.
        event_url = event_link_tag.get('href')
        if not event_url:
            continue

        event_data.append({
            'event_name': event_link_tag.text.strip(),
            'event_url': event_url,
            'event_date': event_date_tag.text.strip(),
        })
    return event_data

def clean_and_split_numeric(text_content):
    """
    Reads the first two whitespace-separated tokens of a cell's text as integers.

    Args:
        text_content (str): Raw cell text; any mix of spaces and newlines separates tokens.
    Returns:
        tuple: (first, second). A token that is absent or not purely digits becomes 0.
    """
    tokens = text_content.split()

    def _digit_token_at(position):
        # Missing tokens and non-digit tokens (e.g. '--') both collapse to 0.
        if position < len(tokens) and tokens[position].isdigit():
            return int(tokens[position])
        return 0

    return _digit_token_at(0), _digit_token_at(1)

def scrape_fight_details(soup_event_detail):
    """
    Extracts fight details from a BeautifulSoup object of a UFC event detail page.

    One dictionary is produced per row of the 'js-fight-table' table that has at
    least 10 cells and does not carry the "next" icon image.

    Args:
        soup_event_detail (BeautifulSoup): The BeautifulSoup object of the event detail page.
    Returns:
        list: A list of dictionaries, each containing details for one fight:
            fighter names/URLs, per-fighter kd/str/td/sub counts, weight_class,
            method, round, time, bonus, is_championship and fight_details_url.
            Text fields are only stripped at both ends; inner whitespace and
            newlines from the page are kept as-is.
    """
    next_icon_src = 'http://1e49bc5171d173577ecd-1323f4090557a33db01577564f60846c.r80.cf1.rackcdn.com/next.png'

    # Bonus icon file name -> label. Checked in this order; only the first match
    # is recorded, so a fight with several bonus icons keeps a single label.
    bonus_mapping = {
        'perf.png': 'Performance of the Night',
        'fight.png': 'Fight of the Night',
        'sub.png': 'Submission of the Night',
        'ko.png': 'Knockout of the Night'
    }

    def _name_and_url(paragraph):
        """Returns (text, href) of the first <a> in `paragraph`, or ('', '') if there is none."""
        anchor = paragraph.find('a')
        if not anchor:
            return '', ''
        # .get avoids a KeyError on an anchor without href.
        return anchor.text.strip(), anchor.get('href', '')

    def _has_image_ending_with(row, suffix):
        """True when `row` contains an <img> whose src ends with `suffix`."""
        return bool(row.find('img', src=lambda s: s and s.endswith(suffix)))

    fights_table = soup_event_detail.find('table', class_='js-fight-table')
    if not fights_table:
        return []

    # Fall back to the table itself when there is no <tbody>, instead of
    # calling .find_all on None.
    tbody = fights_table.find('tbody')
    rows = (tbody if tbody is not None else fights_table).find_all('tr', class_='b-fight-details__table-row')

    fight_details = []
    for row in rows:
        # Check if the row contains the 'next fight' image, which indicates it's not a fight row
        if row.find('img', src=next_icon_src, class_='b-statistics__icon'):
            continue

        cols = row.find_all('td', class_='b-fight-details__table-col')
        if len(cols) < 10: # Ensure we have enough columns for expected data
            continue

        fight_info = {}

        # Fighter names and URLs - cols[1], one <p> per fighter
        fighter_paragraphs = cols[1].find_all('p', class_='b-fight-details__table-text')
        if len(fighter_paragraphs) == 2:
            fight_info['fighter1_name'], fight_info['fighter1_url'] = _name_and_url(fighter_paragraphs[0])
            fight_info['fighter2_name'], fight_info['fighter2_url'] = _name_and_url(fighter_paragraphs[1])
        else:
            fight_info['fighter1_name'], fight_info['fighter1_url'] = '', ''
            fight_info['fighter2_name'], fight_info['fighter2_url'] = '', ''

        # Kd, Str, Td, Sub - cols[2] .. cols[5]; each cell holds both fighters' values
        fight_info['fighter1_kd'], fight_info['fighter2_kd'] = clean_and_split_numeric(cols[2].text)
        fight_info['fighter1_str'], fight_info['fighter2_str'] = clean_and_split_numeric(cols[3].text)
        fight_info['fighter1_td'], fight_info['fighter2_td'] = clean_and_split_numeric(cols[4].text)
        fight_info['fighter1_sub'], fight_info['fighter2_sub'] = clean_and_split_numeric(cols[5].text)

        # Weight class, method, round, time - cols[6] .. cols[9]
        fight_info['weight_class'] = cols[6].text.strip()
        fight_info['method'] = cols[7].text.strip()
        fight_info['round'] = cols[8].text.strip()
        fight_info['time'] = cols[9].text.strip()

        # Bonus: first mapping entry whose icon appears anywhere in the row
        fight_info['bonus'] = None
        for icon_suffix, bonus_label in bonus_mapping.items():
            if _has_image_ending_with(row, icon_suffix):
                fight_info['bonus'] = bonus_label
                break

        # Championship detection via the belt icon
        fight_info['is_championship'] = 'Championship' if _has_image_ending_with(row, 'belt.png') else None

        # Extract fight_details_url from the 'data-link' attribute of the row
        fight_info['fight_details_url'] = row.get('data-link')

        fight_details.append(fight_info)
    return fight_details

def scrape_general_fight_details(soup_fight_detail):
    """
    Collects the general (non-statistical) details shown on a UFC fight detail page.

    Only 'event_name', 'decision_method' and 'referee' are read from the page.
    'event_date' and 'event_location' are always returned as '' because this
    function never looks them up; they have to come from the events data.

    Args:
        soup_fight_detail (BeautifulSoup): The BeautifulSoup object of the fight detail page.
    Returns:
        dict: Keys 'event_name', 'event_date', 'event_location', 'decision_method'
            and 'referee'; values that could not be found stay ''.
    """
    details = dict.fromkeys(
        ('event_name', 'event_date', 'event_location', 'decision_method', 'referee'),
        ''
    )

    def _find_label(paragraph, caption):
        # Substring match so surrounding whitespace in the label does not matter.
        return paragraph.find('i', class_='b-fight-details__label', string=lambda s: s and caption in s)

    def _text_after(label):
        # Value stored as a bare text node right after the label, or None.
        sibling = label.next_sibling
        if sibling and isinstance(sibling, str):
            return sibling.strip()
        return None

    # Event name: link inside the page title
    title_tag = soup_fight_detail.find('h2', class_='b-content__title')
    title_link = title_tag.find('a', class_='b-link') if title_tag else None
    if title_link:
        details['event_name'] = title_link.text.strip()

    # Method and referee live in the fight details section
    fight_section = soup_fight_detail.find('div', class_='b-fight-details__fight')
    if not fight_section:
        return details

    for paragraph in fight_section.find_all('p', class_='b-fight-details__text'):
        method_label = _find_label(paragraph, 'Method:')
        if method_label:
            # Prefer the <i> styled 'font-style: normal'; otherwise take any following <i>.
            method_tag = (
                method_label.find_next_sibling('i', style='font-style: normal')
                or method_label.find_next_sibling('i')
            )
            if method_tag:
                details['decision_method'] = method_tag.text.strip()
            else:
                method_text = _text_after(method_label)
                if method_text is not None:
                    details['decision_method'] = method_text

        referee_label = _find_label(paragraph, 'Referee:')
        if referee_label:
            referee_tag = referee_label.find_next_sibling('span')
            if referee_tag:
                details['referee'] = referee_tag.text.strip()
            else:
                referee_text = _text_after(referee_label)
                if referee_text is not None:
                    details['referee'] = referee_text

    return details

def parse_strike_attempt_data(text):
    """
    Converts a 'landed of attempted' string such as '4 of 18' into two integers.

    Returns:
        tuple: (landed, attempted), or (0, 0) when the text is empty, does not
            contain 'of', or its parts are not integers.
    """
    unparsed = (0, 0)
    if not text or 'of' not in text:
        return unparsed

    pieces = text.split(' of ')
    try:
        return int(pieces[0].strip()), int(pieces[1].strip())
    except ValueError:
        return unparsed

def parse_percentage(text):
    """
    Converts a percentage string such as '22%' into the integer 22.

    Returns 0 when the text is empty, has no '%' sign, or is not an integer
    once the '%' characters are removed.
    """
    if text and '%' in text:
        try:
            return int(text.replace('%', '').strip())
        except ValueError:
            pass
    return 0

def parse_control_time(text):
    """
    Converts a control time written as 'MM:SS' into a number of seconds.

    Returns 0 for empty text, the '---' placeholder, or anything that is not
    exactly two integer fields separated by ':'.
    """
    if not text or text == '---':
        return 0
    try:
        minutes, seconds = (int(field) for field in text.split(':'))
    except ValueError:
        return 0
    return minutes * 60 + seconds

def scrape_round_by_round_stats(soup_fight_detail):
    """
    Extracts round-by-round fight statistics from a BeautifulSoup object of a UFC fight detail page.

    Only the first table whose class attribute is 'b-fight-details__table js-fight-table'
    is read. Rows tagged 'b-fight-details__table-row_type_head' update the current
    round number; every data row yields two records, one per fighter. Rows that do
    not have the expected layout are skipped instead of raising, so one odd page
    cannot abort a long scraping loop.

    Args:
        soup_fight_detail (BeautifulSoup): The BeautifulSoup object of the fight detail page.
    Returns:
        list: A list of dictionaries, each containing detailed stats for one fighter in one round
            (round, fighter_name, kd, sig_str_landed/attempted/percent,
            total_str_landed/attempted, td_landed/attempted/percent, sub_att, rev,
            control_time_seconds).
    """
    text_class = 'b-fight-details__table-text'

    def _to_int(text):
        # Non-numeric placeholders (e.g. '--') count as 0.
        return int(text) if text.isdigit() else 0

    # Find the first table with this class
    fight_details_table = soup_fight_detail.find('table', class_='b-fight-details__table js-fight-table')
    if not fight_details_table:
        return []

    # Fall back to the table itself when there is no <tbody>.
    tbody = fight_details_table.find('tbody')
    rows = (tbody if tbody is not None else fight_details_table).find_all(['thead', 'tr'])

    all_round_stats = []
    current_round = 0

    for row in rows:
        row_classes = row.get('class', [])

        if 'b-fight-details__table-row_type_head' in row_classes:
            # Round header row, e.g. "Round 1"
            header_cell = row.find('th')
            header_text = header_cell.text.strip() if header_cell else ''
            if 'Round' in header_text:
                header_tokens = header_text.split()
                if len(header_tokens) > 1 and header_tokens[1].isdigit():
                    current_round = int(header_tokens[1])
            continue

        if 'b-fight-details__table-row' not in row_classes:
            continue

        cols = row.find_all('td', class_='b-fight-details__table-col')
        if len(cols) < 10: # Ensure enough columns
            continue

        # Every cell carries one <p> per fighter (index 0 and 1).
        paragraphs = [col.find_all('p', class_=text_class) for col in cols[:10]]
        if len(paragraphs[0]) != 2 or any(len(cell) < 2 for cell in paragraphs[1:]):
            continue # Skip rows that can't be parsed correctly

        for i in range(2):
            name_paragraph = paragraphs[0][i]
            name_anchor = name_paragraph.find('a')
            # Names are normally links; use the paragraph text if the link is missing.
            fighter_name = (name_anchor if name_anchor is not None else name_paragraph).text.strip()

            # Stripped text of this fighter's entry in each of the 10 columns:
            # 1 KD, 2 Sig. str., 3 Sig. str. %, 4 Total str., 5 Td, 6 Td %,
            # 7 Sub. att, 8 Rev., 9 Ctrl
            texts = [cell[i].text.strip() for cell in paragraphs]

            sig_str_landed, sig_str_attempted = parse_strike_attempt_data(texts[2])
            total_str_landed, total_str_attempted = parse_strike_attempt_data(texts[4])
            td_landed, td_attempted = parse_strike_attempt_data(texts[5])

            all_round_stats.append({
                'round': current_round,
                'fighter_name': fighter_name,
                'kd': _to_int(texts[1]),
                'sig_str_landed': sig_str_landed,
                'sig_str_attempted': sig_str_attempted,
                'sig_str_percent': parse_percentage(texts[3]),
                'total_str_landed': total_str_landed,
                'total_str_attempted': total_str_attempted,
                'td_landed': td_landed,
                'td_attempted': td_attempted,
                'td_percent': parse_percentage(texts[6]),
                'sub_att': _to_int(texts[7]),
                'rev': _to_int(texts[8]),
                'control_time_seconds': parse_control_time(texts[9]),
            })

    return all_round_stats

def scrape_detailed_strike_stats(soup_fight_detail):
    """
    Extracts detailed strike location statistics (Head, Body, Leg, Distance, Clinch, Ground)
    from the 'b-fight-details__table js-fight-table' table whose round header contains 'Head'.

    Rows that do not have the expected layout are skipped instead of raising.

    Args:
        soup_fight_detail (BeautifulSoup): The BeautifulSoup object of the fight detail page.
    Returns:
        list: A list of dictionaries, each containing detailed strike stats for one fighter in one round
            (round, fighter_name and sig_str_<zone>_landed / sig_str_<zone>_attempted
            for head, body, leg, distance, clinch and ground).
    """
    text_class = 'b-fight-details__table-text'
    # Zone name -> column index: Fighter, Sig. str, Sig. str. % occupy columns 0-2.
    zone_columns = {'head': 3, 'body': 4, 'leg': 5, 'distance': 6, 'clinch': 7, 'ground': 8}

    all_tables = soup_fight_detail.find_all('table', class_='b-fight-details__table js-fight-table')

    # Identify the table that contains the strike breakdown by its 'Head' header.
    strike_detail_table = None
    for table in all_tables:
        header_row = table.find('thead', class_='b-fight-details__table-head_rnd')
        # `string=` is the supported keyword; `text=` is its deprecated old name.
        if header_row and header_row.find('th', string=lambda t: t and 'Head' in t.strip()):
            strike_detail_table = table
            break

    if not strike_detail_table:
        return []

    # Fall back to the table itself when there is no <tbody>.
    tbody = strike_detail_table.find('tbody')
    rows = (tbody if tbody is not None else strike_detail_table).find_all(['thead', 'tr'])

    all_strike_stats = []
    current_round = 0

    for row in rows:
        row_classes = row.get('class', [])

        if 'b-fight-details__table-row_type_head' in row_classes:
            # Round header row, e.g. "Round 1"
            header_cell = row.find('th')
            header_text = header_cell.text.strip() if header_cell else ''
            if 'Round' in header_text:
                header_tokens = header_text.split()
                if len(header_tokens) > 1 and header_tokens[1].isdigit():
                    current_round = int(header_tokens[1])
            continue

        if 'b-fight-details__table-row' not in row_classes:
            continue

        cols = row.find_all('td', class_='b-fight-details__table-col')
        # We expect 9 columns for this table:
        # Fighter, Sig. str, Sig. str. %, Head, Body, Leg, Distance, Clinch, Ground
        if len(cols) < 9:
            continue

        fighter_paragraphs = cols[0].find_all('p', class_=text_class)
        if len(fighter_paragraphs) != 2:
            continue

        # One <p> per fighter in each zone cell; skip the row if any is missing.
        zone_paragraphs = {
            zone: cols[col_index].find_all('p', class_=text_class)
            for zone, col_index in zone_columns.items()
        }
        if any(len(cell) < 2 for cell in zone_paragraphs.values()):
            continue

        for i in range(2):
            name_paragraph = fighter_paragraphs[i]
            name_anchor = name_paragraph.find('a')
            # Names are normally links; use the paragraph text if the link is missing.
            fighter_name = (name_anchor if name_anchor is not None else name_paragraph).text.strip()

            fighter_stats = {'round': current_round, 'fighter_name': fighter_name}
            for zone, cell in zone_paragraphs.items():
                landed, attempted = parse_strike_attempt_data(cell[i].text.strip())
                fighter_stats[f'sig_str_{zone}_landed'] = landed
                fighter_stats[f'sig_str_{zone}_attempted'] = attempted

            all_strike_stats.append(fighter_stats)
    return all_strike_stats

# Scrape Events

In [None]:
all_page_url = "http://www.ufcstats.com/statistics/events/completed?page=all"

print(f"Fetching events from the 'all' page: {all_page_url}")

try:
    # Without a timeout requests waits indefinitely on an unresponsive server;
    # a Timeout is a RequestException, so it is handled by the except below.
    response_all_page = requests.get(all_page_url, timeout=30)
    response_all_page.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    soup_all_page = BeautifulSoup(response_all_page.content, 'html.parser')

    scraped_events_data_from_all_page = scrape_events_from_page(soup_all_page)
    print(f"Successfully collected {len(scraped_events_data_from_all_page)} events from the 'all' page.")

    # Filter for new events if scrape_new_data_only is True
    if scrape_new_data_only:
        new_events_data = [event for event in scraped_events_data_from_all_page if event['event_url'] not in existing_event_urls]
        events_to_scrape_fights_from = pd.DataFrame(new_events_data)
        print(f"Found {len(events_to_scrape_fights_from)} new events to add.")
    else:
        events_to_scrape_fights_from = pd.DataFrame(scraped_events_data_from_all_page)
        print(f"All {len(events_to_scrape_fights_from)} events will be used.")

except requests.exceptions.RequestException as e:
    print(f"Error fetching the 'all' page ({all_page_url}): {e}")
    events_to_scrape_fights_from = pd.DataFrame() # Initialize as empty DataFrame in case of error

Fetching events from the 'all' page: http://www.ufcstats.com/statistics/events/completed?page=all
Successfully collected 755 events from the 'all' page.
All 755 events will be used.


In [None]:
# Build df_all_events: either the scraped events on their own, or the stored
# events extended with the newly scraped ones.
if not scrape_new_data_only or existing_df_all_events.empty:
    updated_df_all_events = events_to_scrape_fights_from
    print(f"Created df_all_events with {len(updated_df_all_events)} events.")
else:
    # Stored rows come first, so on a duplicate 'event_url' the stored row is kept.
    updated_df_all_events = (
        pd.concat([existing_df_all_events, events_to_scrape_fights_from])
        .drop_duplicates(subset=['event_url'])
        .reset_index(drop=True)
    )
    print(f"Combined existing ({len(existing_df_all_events)}) and new events ({len(events_to_scrape_fights_from)}) into {len(updated_df_all_events)} events.")

df_all_events = updated_df_all_events
print("Successfully created/updated Pandas DataFrame for all events.")
print(f"Total events in df_all_events: {len(df_all_events)}")
display(df_all_events.head())

Created df_all_events with 755 events.
Successfully created/updated Pandas DataFrame for all events.
Total events in df_all_events: 755


Unnamed: 0,event_name,event_url,event_date
0,UFC Fight Night: Tsarukyan vs. Hooker,http://www.ufcstats.com/event-details/92c96df8...,"November 22, 2025"
1,UFC 322: Della Maddalena vs. Makhachev,http://www.ufcstats.com/event-details/8db1b36d...,"November 15, 2025"
2,UFC Fight Night: Bonfim vs. Brown,http://www.ufcstats.com/event-details/6436029b...,"November 08, 2025"
3,UFC Fight Night: Garcia vs. Onama,http://www.ufcstats.com/event-details/0e2c2daf...,"November 01, 2025"
4,UFC 321: Aspinall vs. Gane,http://www.ufcstats.com/event-details/7956f026...,"October 25, 2025"


In [None]:
# Persist the events table. An empty frame (e.g. after a failed fetch) is not
# written, so it cannot wipe the events already stored in the CSV.
if df_all_events.empty:
    print(f"df_all_events is empty; leaving {events_csv_path} untouched.")
else:
    df_all_events.to_csv(events_csv_path, index=False)

# Scrape Fights


In [None]:
all_fights_data = [] # Initialize an empty list to store all fight details
failed_event_urls = [] # Event pages that could not be fetched (reported after the loop)

# --- Configuration for scraping --- #
fights_csv_path = '/content/drive/MyDrive/df_fights.csv'

# --- Load existing data to check for new fights --- #
existing_df_fights = pd.DataFrame()
existing_fights_urls = set() # Using a set for efficient lookup

if os.path.exists(fights_csv_path):
    try:
        existing_df_fights = pd.read_csv(fights_csv_path)
        existing_fights_urls = set(existing_df_fights['fight_details_url'])
        print(f"Loaded {len(existing_df_fights)} existing fights from {fights_csv_path}")
    except Exception as e:
        # Best effort; reset both so a partial load cannot leave inconsistent state.
        existing_df_fights = pd.DataFrame()
        existing_fights_urls = set()
        print(f"Error loading existing fights CSV: {e}. Starting with empty existing fights.")
else:
    print("No existing fights CSV found. Starting with empty existing fights.")

# Determine which events need fights scraped
events_df_to_iterate = pd.DataFrame()
if scrape_new_data_only:
    events_df_to_iterate = events_to_scrape_fights_from
    print(f"Starting to scrape fight details for {len(events_df_to_iterate)} newly added events...")
else:
    events_df_to_iterate = df_all_events
    print(f"Starting to scrape fight details for {len(events_df_to_iterate)} all events...")

# Iterate through each event_url in the selected DataFrame
for index, row in events_df_to_iterate.iterrows():
    event_name = row['event_name']
    event_url = row['event_url']

    try:
        # Make an HTTP GET request to the event_url; the timeout keeps one
        # unresponsive request from stalling the whole loop.
        response = requests.get(event_url, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        # Parse the response content using BeautifulSoup
        soup_event_detail = BeautifulSoup(response.content, 'html.parser')

        # Call the scrape_fight_details() function
        current_event_fights = scrape_fight_details(soup_event_detail)

        # Add event_name and event_url to each fight detail for context
        for fight in current_event_fights:
            fight['event_name'] = event_name
            fight['event_url'] = event_url

            # Filter for new fights if scrape_new_data_only is True
            if scrape_new_data_only and fight['fight_details_url'] in existing_fights_urls:
                continue # Skip existing fight

            all_fights_data.append(fight)

    except requests.exceptions.RequestException as e:
        # Keep going, but remember the failure so missing events are visible
        # instead of silently absent from the result.
        failed_event_urls.append(event_url)
        continue # Continue to the next URL if an error occurs

if failed_event_urls:
    print(f"Warning: {len(failed_event_urls)} event pages could not be fetched; their fights are missing (see failed_event_urls).")

newly_scraped_df_fights = pd.DataFrame(all_fights_data)

# Combine existing fights with newly scraped fights if applicable
if scrape_new_data_only and not existing_df_fights.empty:
    # Concatenate new fights with existing fights, avoiding duplicates based on 'fight_details_url'
    updated_df_fights = pd.concat([existing_df_fights, newly_scraped_df_fights]).drop_duplicates(subset=['fight_details_url']).reset_index(drop=True)
    print(f"Combined existing ({len(existing_df_fights)}) and new fights ({len(newly_scraped_df_fights)}) into {len(updated_df_fights)} fights.")
else:
    updated_df_fights = newly_scraped_df_fights
    print(f"Created df_fights with {len(updated_df_fights)} fights.")

df_fights = updated_df_fights

print(f"\nFinished scraping all events. Total fights collected: {len(df_fights)}.")

print("Number of fights: ", len(df_fights))
display(df_fights.head())

Loaded 8468 existing fights from /content/drive/MyDrive/df_fights.csv
Starting to scrape fight details for 755 all events...
Created df_fights with 8468 fights.

Finished scraping all events. Total fights collected: 8468.
Number of fights:  8468


Unnamed: 0,fighter1_name,fighter1_url,fighter2_name,fighter2_url,fighter1_kd,fighter2_kd,fighter1_str,fighter2_str,fighter1_td,fighter2_td,...,fighter2_sub,weight_class,method,round,time,bonus,is_championship,fight_details_url,event_name,event_url
0,Arman Tsarukyan,http://www.ufcstats.com/fighter-details/eae48f...,Dan Hooker,http://www.ufcstats.com/fighter-details/193b9d...,0,0,42,10,2,0,...,2,Lightweight,SUB\n\n \n\n Arm Triangle,2,3:34,Performance of the Night,,http://www.ufcstats.com/fight-details/5f5b626e...,UFC Fight Night: Tsarukyan vs. Hooker,http://www.ufcstats.com/event-details/92c96df8...
1,Ian Machado Garry,http://www.ufcstats.com/fighter-details/442c90...,Belal Muhammad,http://www.ufcstats.com/fighter-details/b1b072...,0,0,72,56,0,0,...,0,Welterweight,U-DEC,3,5:00,,,http://www.ufcstats.com/fight-details/b2218930...,UFC Fight Night: Tsarukyan vs. Hooker,http://www.ufcstats.com/event-details/92c96df8...
2,Volkan Oezdemir,http://www.ufcstats.com/fighter-details/0845c8...,Alonzo Menifield,http://www.ufcstats.com/fighter-details/a495f5...,1,0,13,2,0,0,...,0,Light Heavyweight,KO/TKO\n\n \n\n Punches,1,1:27,,,http://www.ufcstats.com/fight-details/870d374f...,UFC Fight Night: Tsarukyan vs. Hooker,http://www.ufcstats.com/event-details/92c96df8...
3,Myktybek Orolbai,http://www.ufcstats.com/fighter-details/bf2c8e...,Jack Hermansson,http://www.ufcstats.com/fighter-details/0a1942...,1,0,15,19,0,0,...,0,Welterweight,KO/TKO\n\n \n\n Punch,1,2:46,,,http://www.ufcstats.com/fight-details/8ed609d8...,UFC Fight Night: Tsarukyan vs. Hooker,http://www.ufcstats.com/event-details/92c96df8...
4,Waldo Cortes Acosta,http://www.ufcstats.com/fighter-details/fc0809...,Shamil Gaziev,http://www.ufcstats.com/fighter-details/6747cc...,2,0,10,6,0,0,...,0,Heavyweight,KO/TKO\n\n \n\n Punch,1,1:22,Performance of the Night,,http://www.ufcstats.com/fight-details/e8307c76...,UFC Fight Night: Tsarukyan vs. Hooker,http://www.ufcstats.com/event-details/92c96df8...


In [None]:
# Persist the fights table. An empty frame (e.g. every request failed) is not
# written, so it cannot wipe the fights already stored in the CSV.
if df_fights.empty:
    print(f"df_fights is empty; leaving {fights_csv_path} untouched.")
else:
    df_fights.to_csv(fights_csv_path, index=False)

# Scrape Stats

In [None]:
all_detailed_fight_stats = [] # Initialize an empty list to store all general fight details
all_detailed_strike_stats = [] # Initialize an empty list to store detailed strike stats

# --- Configuration for scraping --- #
merged_stats_csv_path = '/content/drive/MyDrive/df_merged_fight_stats.csv'

# --- Load existing data to check for new events --- #
existing_df_merged_fight_stats = pd.DataFrame()
existing_merged_fight_urls = set() # Using a set for efficient lookup

if os.path.exists(merged_stats_csv_path):
    try:
        existing_df_merged_fight_stats = pd.read_csv(merged_stats_csv_path)
        existing_merged_fight_urls = set(existing_df_merged_fight_stats['fight_details_url'])
        print(f"Loaded {len(existing_df_merged_fight_stats)} existing merged fight stats from {merged_stats_csv_path}")
    except Exception as e:
        print(f"Error loading existing merged fight stats CSV: {e}. Starting with empty existing merged fight stats.")
else:
    print("No existing merged fight stats CSV found. Starting with empty existing merged fight stats.")

# Determine which fights need stats scraped
fights_to_scrape_stats_for = pd.DataFrame()
if scrape_new_data_only:
    # Filter the newly scraped fights (from the previous section) against existing merged stats
    # Ensure newly_scraped_df_fights is not empty before attempting iteration
    if not newly_scraped_df_fights.empty:
        new_stats_fights = [fight for index, fight in newly_scraped_df_fights.iterrows() if fight['fight_details_url'] not in existing_merged_fight_urls]
        fights_to_scrape_stats_for = pd.DataFrame(new_stats_fights)
    print(f"Found {len(fights_to_scrape_stats_for)} newly scraped fights (from previous section) to scrape stats for.")
else:
    fights_to_scrape_stats_for = df_fights
    print(f"All {len(fights_to_scrape_stats_for)} fights will be scraped for stats.")

print(f"Starting to scrape round-by-round and detailed strike stats for {len(fights_to_scrape_stats_for)} fights...")

# Upper bound (seconds) on each HTTP request so a single stalled connection cannot hang the whole run.
STATS_REQUEST_TIMEOUT_SECONDS = 30
# URLs whose page could not be fetched; summarised after the loop instead of being dropped silently.
failed_stats_urls = []


def _add_fight_context(stat_rows, fight_details_url, fighter1_name, fighter2_name):
    """
    Tags each per-fighter stat row with its fight URL and the opposing fighter's name.
    Args:
        stat_rows (list): Dict-like rows that each carry a 'fighter_name' key; modified in place.
        fight_details_url (str): URL of the fight page the rows were scraped from.
        fighter1_name (str): Name of the first fighter listed for the fight.
        fighter2_name (str): Name of the second fighter listed for the fight.
    Returns:
        list: The same list, with 'fight_details_url' and 'opponent_name' set on every row.
              'opponent_name' is 'Unknown' when the row's fighter matches neither name.
    """
    for stat_row in stat_rows:
        stat_row['fight_details_url'] = fight_details_url
        if stat_row['fighter_name'] == fighter1_name:
            stat_row['opponent_name'] = fighter2_name
        elif stat_row['fighter_name'] == fighter2_name:
            stat_row['opponent_name'] = fighter1_name
        else:
            stat_row['opponent_name'] = 'Unknown'
    return stat_rows


# Iterate through each fight selected above (fights_to_scrape_stats_for)
for index, row in fights_to_scrape_stats_for.iterrows():
    fight_details_url = row['fight_details_url']
    fighter1_name = row['fighter1_name']
    fighter2_name = row['fighter2_name']

    try:
        # Make an HTTP GET request to the fight_details_url; a timeout is raised as a
        # RequestException subclass, so it is handled by the except clause below.
        response = requests.get(fight_details_url, timeout=STATS_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        # Parse the response content using BeautifulSoup
        soup_event_detail = BeautifulSoup(response.content, 'html.parser')

        # General round-by-round stats, tagged with fight URL and opponent
        current_general_stats = scrape_round_by_round_stats(soup_event_detail)
        all_detailed_fight_stats.extend(
            _add_fight_context(current_general_stats, fight_details_url, fighter1_name, fighter2_name)
        )

        # Detailed strike stats, tagged the same way so both sets share merge keys
        current_strike_details = scrape_detailed_strike_stats(soup_event_detail)
        all_detailed_strike_stats.extend(
            _add_fight_context(current_strike_details, fight_details_url, fighter1_name, fighter2_name)
        )

    except requests.exceptions.RequestException:
        # Best effort: remember the failure and move on to the next fight.
        failed_stats_urls.append(fight_details_url)
        continue

if failed_stats_urls:
    print(f"Warning: could not fetch {len(failed_stats_urls)} of {len(fights_to_scrape_stats_for)} fight pages; "
          "their stats are missing from this run (see failed_stats_urls).")

# Turn the two lists of per-fighter, per-round stat dicts into frames.
newly_scraped_df_fight_details = pd.DataFrame(all_detailed_fight_stats)
newly_scraped_df_strike_details = pd.DataFrame(all_detailed_strike_stats)

general_stats_present = not newly_scraped_df_fight_details.empty
strike_stats_present = not newly_scraped_df_strike_details.empty

if general_stats_present and strike_stats_present:
    # Left join: every general-stats row is kept and gains strike columns where the keys match.
    newly_merged_fight_stats = newly_scraped_df_fight_details.merge(
        newly_scraped_df_strike_details,
        how='left',
        on=['fight_details_url', 'round', 'fighter_name', 'opponent_name'],
    )
elif general_stats_present:
    # Only general stats were collected (e.g., strike details table was missing)
    newly_merged_fight_stats = newly_scraped_df_fight_details
elif strike_stats_present:
    # Only strike details were collected; kept for completeness
    newly_merged_fight_stats = newly_scraped_df_strike_details
else:
    # Nothing was collected: fall back to an empty frame that still carries the
    # column names a successful merge would have produced.
    merge_key_columns = ['fight_details_url', 'round', 'fighter_name', 'opponent_name']
    general_stat_columns = [
        'kd', 'sig_str_landed', 'sig_str_attempted', 'sig_str_percent',
        'total_str_landed', 'total_str_attempted', 'td_landed', 'td_attempted',
        'td_percent', 'sub_att', 'rev', 'control_time_seconds',
    ]
    strike_stat_columns = [
        'sig_str_head_landed', 'sig_str_head_attempted',
        'sig_str_body_landed', 'sig_str_body_attempted',
        'sig_str_leg_landed', 'sig_str_leg_attempted',
        'sig_str_distance_landed', 'sig_str_distance_attempted',
        'sig_str_clinch_landed', 'sig_str_clinch_attempted',
        'sig_str_ground_landed', 'sig_str_ground_attempted',
    ]
    columns_if_merged = merge_key_columns + general_stat_columns + strike_stat_columns
    newly_merged_fight_stats = pd.DataFrame(columns=columns_if_merged)

# In incremental mode with saved stats on hand, append the fresh rows to the saved ones;
# otherwise the fresh rows are the whole dataset.
append_to_existing_stats = scrape_new_data_only and not existing_df_merged_fight_stats.empty

if not append_to_existing_stats:
    updated_df_merged_fight_stats = newly_merged_fight_stats
    print(f"Created df_merged_fight_stats with {len(updated_df_merged_fight_stats)} entries.")
else:
    stacked_fight_stats = pd.concat([existing_df_merged_fight_stats, newly_merged_fight_stats])
    # One row per (fight, round, fighter). Saved rows come first in the concat, so with
    # drop_duplicates' default keep='first' a saved row wins over a re-scraped one.
    deduplicated_fight_stats = stacked_fight_stats.drop_duplicates(
        subset=['fight_details_url', 'round', 'fighter_name']
    )
    updated_df_merged_fight_stats = deduplicated_fight_stats.reset_index(drop=True)
    print(f"Combined existing ({len(existing_df_merged_fight_stats)}) and new merged fight stats ({len(newly_merged_fight_stats)}) into {len(updated_df_merged_fight_stats)} entries.")

df_merged_fight_stats = updated_df_merged_fight_stats

# Summary of the run followed by a preview of the merged table.
print(f"\nFinished scraping all fight details. Total general round stats collected: {len(df_merged_fight_stats)}.")

print("\nSuccessfully converted collected detailed fight stats into a Pandas DataFrame.")
print(f"Number of merged fight stats entries: {len(df_merged_fight_stats)}")
display(df_merged_fight_stats.head())

Loaded 39612 existing merged fight stats from /content/drive/MyDrive/df_merged_fight_stats.csv
All 8468 fights will be scraped for stats.
Starting to scrape round-by-round and detailed strike stats for 8468 fights...


  if header_row and header_row.find('th', text=lambda t: t and 'Head' in t.strip()):


Created df_merged_fight_stats with 39810 entries.

Finished scraping all fight details. Total general round stats collected: 39810.

Successfully converted collected detailed fight stats into a Pandas DataFrame.
Number of merged fight stats entries: 39810


Unnamed: 0,round,fighter_name,kd,sig_str_landed,sig_str_attempted,sig_str_percent,total_str_landed,total_str_attempted,td_landed,td_attempted,...,sig_str_body_landed,sig_str_body_attempted,sig_str_leg_landed,sig_str_leg_attempted,sig_str_distance_landed,sig_str_distance_attempted,sig_str_clinch_landed,sig_str_clinch_attempted,sig_str_ground_landed,sig_str_ground_attempted
0,1,Arman Tsarukyan,0,19,30,63,32,45,1,3,...,2,3,5,5,15,26,2,2,2,2
1,1,Dan Hooker,0,5,20,25,9,25,0,0,...,2,6,1,1,4,17,1,3,0,0
2,2,Arman Tsarukyan,0,23,30,76,37,44,1,1,...,1,2,1,1,3,5,0,0,20,25
3,2,Dan Hooker,0,5,13,38,8,16,0,0,...,4,6,0,0,5,13,0,0,0,0
4,1,Belal Muhammad,0,24,57,42,30,69,0,1,...,5,5,2,3,23,55,1,2,0,0


In [None]:
# Persist the merged stats, but never replace the saved CSV with an empty frame:
# fetch failures are skipped during scraping, so a run where every request failed
# yields zero rows and would otherwise wipe the previously saved data.
if df_merged_fight_stats.empty:
    print(f"No merged fight stats to save; leaving {merged_stats_csv_path} untouched.")
else:
    df_merged_fight_stats.to_csv(merged_stats_csv_path, index=False)

# Scrape Fight Details

In [None]:
all_general_fight_details = []

# Upper bound (seconds) on each HTTP request so a single stalled connection cannot hang the whole run.
GENERAL_DETAILS_REQUEST_TIMEOUT_SECONDS = 30
# URLs whose page could not be fetched; summarised after the loop instead of being dropped silently.
failed_general_detail_urls = []

for index, row in newly_scraped_df_fights.iterrows():
    fight_details_url = row['fight_details_url']

    try:
        # Make an HTTP GET request to the fight_details_url; a timeout is raised as a
        # RequestException subclass, so it is handled by the except clause below.
        response = requests.get(fight_details_url, timeout=GENERAL_DETAILS_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        # Parse the response content using BeautifulSoup
        soup_fight_detail = BeautifulSoup(response.content, 'html.parser')

        # Call the scrape_general_fight_details() function
        current_general_details = scrape_general_fight_details(soup_fight_detail)

        # Add the fight_details_url to the dictionary so it can later be joined back onto df_fights
        current_general_details['fight_details_url'] = fight_details_url

        all_general_fight_details.append(current_general_details)

    except requests.exceptions.RequestException:
        # Best effort: remember the failure and move on to the next fight.
        failed_general_detail_urls.append(fight_details_url)
        continue

print(f"Finished scraping general fight details for {len(all_general_fight_details)} entries.")
if failed_general_detail_urls:
    print(f"Warning: could not fetch {len(failed_general_detail_urls)} fight pages; "
          "their general details are missing from this run (see failed_general_detail_urls).")

Finished scraping general fight details for 8468 entries.


In [None]:
# One row per scraped fight page.
newly_scraped_df_general_fight_details = pd.DataFrame(all_general_fight_details)

# In incremental mode with saved details on hand, append the fresh rows to the saved ones;
# otherwise the fresh rows are the whole dataset.
append_to_existing_details = scrape_new_data_only and not existing_df_general_fight_details.empty

if not append_to_existing_details:
    updated_df_general_fight_details = newly_scraped_df_general_fight_details
    print(f"Created df_general_fight_details with {len(updated_df_general_fight_details)} entries.")
else:
    stacked_general_details = pd.concat(
        [existing_df_general_fight_details, newly_scraped_df_general_fight_details]
    )
    # One row per fight URL. Saved rows come first in the concat, so with
    # drop_duplicates' default keep='first' a saved row wins over a re-scraped one.
    deduplicated_general_details = stacked_general_details.drop_duplicates(subset=['fight_details_url'])
    updated_df_general_fight_details = deduplicated_general_details.reset_index(drop=True)
    print(f"Combined existing ({len(existing_df_general_fight_details)}) and new general fight details ({len(newly_scraped_df_general_fight_details)}) into {len(updated_df_general_fight_details)} entries.")

df_general_fight_details = updated_df_general_fight_details

# Summary followed by a preview of the table.
print("\nSuccessfully converted collected general fight details into a Pandas DataFrame.")
print(f"Number of general fight details entries: {len(df_general_fight_details)}")
display(df_general_fight_details.head())

Created df_general_fight_details with 8468 entries.

Successfully converted collected general fight details into a Pandas DataFrame.
Number of general fight details entries: 8468


Unnamed: 0,event_name,event_date,event_location,decision_method,referee,fight_details_url
0,UFC Fight Night: Tsarukyan vs. Hooker,,,Submission,Marc Goddard,http://www.ufcstats.com/fight-details/5f5b626e...
1,UFC Fight Night: Tsarukyan vs. Hooker,,,Decision - Unanimous,Rich Mitchell,http://www.ufcstats.com/fight-details/b2218930...
2,UFC Fight Night: Tsarukyan vs. Hooker,,,KO/TKO,Marc Goddard,http://www.ufcstats.com/fight-details/870d374f...
3,UFC Fight Night: Tsarukyan vs. Hooker,,,KO/TKO,Daniel Movahedi,http://www.ufcstats.com/fight-details/8ed609d8...
4,UFC Fight Night: Tsarukyan vs. Hooker,,,KO/TKO,Lukasz Bosacki,http://www.ufcstats.com/fight-details/e8307c76...


In [None]:
# 0. Clean up existing columns from df_fights that will be replaced or clarified.
#    Every column the merges below add back ('decision_method', 'referee', 'event_name',
#    'event_date') plus 'event_location' is dropped first, so re-running this cell -- or
#    starting from a df_fights that already carries them -- never yields '_x'/'_y' suffixed
#    duplicates. The suffixed names stay in the list to clean up frames left by earlier runs.
columns_to_drop_if_exist = [
    'event_name', 'method', 'event_date', 'event_location', 'referee',
    'decision_method',
    'event_date_x', 'event_date_y', 'decision_method_x', 'decision_method_y',
    'referee_x', 'referee_y'
]
# errors='ignore' skips names that are not present, replacing the per-column existence check.
df_fights = df_fights.drop(columns=columns_to_drop_if_exist, errors='ignore')

# 1. Merge df_fights with df_general_fight_details to get decision_method and referee.
#    The lookup is de-duplicated on its key so the left join cannot multiply fight rows.
fight_detail_lookup = (
    df_general_fight_details[['fight_details_url', 'decision_method', 'referee']]
    .drop_duplicates(subset='fight_details_url')
)
df_fights = pd.merge(
    df_fights,
    fight_detail_lookup,
    on='fight_details_url',
    how='left'
)

# 2. Merge df_fights with df_all_events to obtain the correct event_name and event_date.
#    'event_location' is NOT available from df_all_events and is handled separately.
event_lookup = (
    df_all_events[['event_url', 'event_name', 'event_date']]
    .drop_duplicates(subset='event_url')
)
df_fights = pd.merge(
    df_fights,
    event_lookup,
    on='event_url',
    how='left'
)

# 3. Add an empty 'event_location' column as it's not available from current scraping sources.
df_fights['event_location'] = ''

print("Successfully cleaned up df_fights and merged event details.")
print(f"Number of fights after cleanup and re-merge: {len(df_fights)}")
display(df_fights.head())
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in display(), which would render a stray "None".
df_fights.info()

Successfully cleaned up df_fights and merged event details.
Number of fights after cleanup and re-merge: 8468


Unnamed: 0,fighter1_name,fighter1_url,fighter2_name,fighter2_url,fighter1_kd,fighter2_kd,fighter1_str,fighter2_str,fighter1_td,fighter2_td,...,time,bonus,is_championship,fight_details_url,event_url,decision_method,referee,event_name,event_date,event_location
0,Arman Tsarukyan,http://www.ufcstats.com/fighter-details/eae48f...,Dan Hooker,http://www.ufcstats.com/fighter-details/193b9d...,0,0,42,10,2,0,...,3:34,Performance of the Night,,http://www.ufcstats.com/fight-details/5f5b626e...,http://www.ufcstats.com/event-details/92c96df8...,Submission,Marc Goddard,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025",
1,Ian Machado Garry,http://www.ufcstats.com/fighter-details/442c90...,Belal Muhammad,http://www.ufcstats.com/fighter-details/b1b072...,0,0,72,56,0,0,...,5:00,,,http://www.ufcstats.com/fight-details/b2218930...,http://www.ufcstats.com/event-details/92c96df8...,Decision - Unanimous,Rich Mitchell,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025",
2,Volkan Oezdemir,http://www.ufcstats.com/fighter-details/0845c8...,Alonzo Menifield,http://www.ufcstats.com/fighter-details/a495f5...,1,0,13,2,0,0,...,1:27,,,http://www.ufcstats.com/fight-details/870d374f...,http://www.ufcstats.com/event-details/92c96df8...,KO/TKO,Marc Goddard,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025",
3,Myktybek Orolbai,http://www.ufcstats.com/fighter-details/bf2c8e...,Jack Hermansson,http://www.ufcstats.com/fighter-details/0a1942...,1,0,15,19,0,0,...,2:46,,,http://www.ufcstats.com/fight-details/8ed609d8...,http://www.ufcstats.com/event-details/92c96df8...,KO/TKO,Daniel Movahedi,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025",
4,Waldo Cortes Acosta,http://www.ufcstats.com/fighter-details/fc0809...,Shamil Gaziev,http://www.ufcstats.com/fighter-details/6747cc...,2,0,10,6,0,0,...,1:22,Performance of the Night,,http://www.ufcstats.com/fight-details/e8307c76...,http://www.ufcstats.com/event-details/92c96df8...,KO/TKO,Lukasz Bosacki,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025",


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8468 entries, 0 to 8467
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   fighter1_name      8468 non-null   object
 1   fighter1_url       8468 non-null   object
 2   fighter2_name      8468 non-null   object
 3   fighter2_url       8468 non-null   object
 4   fighter1_kd        8468 non-null   int64 
 5   fighter2_kd        8468 non-null   int64 
 6   fighter1_str       8468 non-null   int64 
 7   fighter2_str       8468 non-null   int64 
 8   fighter1_td        8468 non-null   int64 
 9   fighter2_td        8468 non-null   int64 
 10  fighter1_sub       8468 non-null   int64 
 11  fighter2_sub       8468 non-null   int64 
 12  weight_class       8468 non-null   object
 13  round              8468 non-null   int64 
 14  time               8468 non-null   object
 15  bonus              2176 non-null   object
 16  is_championship    472 non-null    object


None

In [None]:
# Write df_fights to fights_csv_path; index=False keeps the row index out of the file.
df_fights.to_csv(fights_csv_path, index=False)
print(f"Updated df_fights saved to {fights_csv_path}")

Updated df_fights saved to /content/drive/MyDrive/df_fights.csv
