In [2]:
# script for scraping and cleaning ufc data

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import random

base_url = "http://www.ufcstats.com/statistics/events/completed?page=all"
headers = {'User-Agent': 'Mozilla/5.0'}

response = requests.get(base_url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

event_links = []
for a in soup.select('td.b-statistics__table-col a'):
    event_links.append(a['href'])

In [3]:
def parse_event_page(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Event name
    event_name = soup.find('h2').text

    # Extracting the table with all the fights
    table = soup.find('table', {'class': 'b-fight-details__table'})

    fight_data = []

    # Parsing each row in the table
    for index, row in enumerate(table.find_all('tr')[1:]):  # skipping the header row
        cols = row.find_all('td')

        # Split names using the HTML 'p' tag
        names = cols[1].find_all('p')
        a_fighter = names[0].get_text(strip=True) if len(names) > 1 else ""
        b_fighter = names[1].get_text(strip=True) if len(names) > 1 else ""

        # Similarly for other data columns, we need to correctly identify the HTML tags
        # and extract data from them
        fight_link = row.get('data-link')
        a_KD = cols[2].find_all('p')[0].get_text(strip=True) if len(cols[2].find_all('p')) > 1 else ""
        b_KD = cols[2].find_all('p')[1].get_text(strip=True) if len(cols[2].find_all('p')) > 1 else ""
        a_STR = cols[3].find_all('p')[0].get_text(strip=True) if len(cols[3].find_all('p')) > 1 else ""
        b_STR = cols[3].find_all('p')[1].get_text(strip=True) if len(cols[3].find_all('p')) > 1 else ""
        a_TD = cols[4].find_all('p')[0].get_text(strip=True) if len(cols[4].find_all('p')) > 1 else ""
        b_TD = cols[4].find_all('p')[1].get_text(strip=True) if len(cols[4].find_all('p')) > 1 else ""
        a_SUB = cols[5].find_all('p')[0].get_text(strip=True) if len(cols[5].find_all('p')) > 1 else ""
        b_SUB = cols[5].find_all('p')[1].get_text(strip=True) if len(cols[5].find_all('p')) > 1 else ""

        # fighter links
        links = cols[1].find_all('a')
        a_fighter_link = links[0]['href'] if links else None
        b_fighter_link = links[1]['href'] if len(links) > 1 else None

        details = {
            'fight_link': fight_link,
            'event_name': event_name,
            # 'winner': cols[0].get_text(strip=True), # Does not make sense for this data collection
            'a_fighter': a_fighter,
            'b_fighter': b_fighter,
            'a_KD': a_KD,
            'b_KD': b_KD,
            'a_STR': a_STR,
            'b_STR':b_STR,
            'a_TD': a_TD,
            'b_TD':b_TD,
            'a_SUB': a_SUB,
            'b_SUB':b_SUB,
            'a_fighter_link': a_fighter_link,
            'b_fighter_link': b_fighter_link
        }

        fight_data.append(details)

    return fight_data

all_fights = []

for link in event_links[1:3]:
    all_fights.extend(parse_event_page(link))
    print('parsed: '+ link)
    time.sleep(random.uniform(1, 7))

df = pd.DataFrame(all_fights)

df.head()

parsed: http://www.ufcstats.com/event-details/b6c6d1731ff00eeb
parsed: http://www.ufcstats.com/event-details/7abe471b61725980


Unnamed: 0,fight_link,event_name,a_fighter,b_fighter,a_KD,b_KD,a_STR,b_STR,a_TD,b_TD,a_SUB,b_SUB,a_fighter_link,b_fighter_link
0,http://www.ufcstats.com/fight-details/69bc7ca8...,\n\n UFC 289: Nunes vs. Aldana\n ...,Amanda Nunes,Irene Aldana,0,0,142,41,6,0,0,0,http://www.ufcstats.com/fighter-details/80fa82...,http://www.ufcstats.com/fighter-details/578ef1...
1,http://www.ufcstats.com/fight-details/40e8bf8c...,\n\n UFC 289: Nunes vs. Aldana\n ...,Charles Oliveira,Beneil Dariush,1,0,26,12,0,0,0,0,http://www.ufcstats.com/fighter-details/07225b...,http://www.ufcstats.com/fighter-details/08af93...
2,http://www.ufcstats.com/fight-details/e9d5ffca...,\n\n UFC 289: Nunes vs. Aldana\n ...,Mike Malott,Adam Fugitt,1,0,19,9,2,0,1,0,http://www.ufcstats.com/fighter-details/dd6103...,http://www.ufcstats.com/fighter-details/a01a62...
3,http://www.ufcstats.com/fight-details/46c67efd...,\n\n UFC 289: Nunes vs. Aldana\n ...,Dan Ige,Nate Landwehr,1,0,88,74,0,0,0,0,http://www.ufcstats.com/fighter-details/82a515...,http://www.ufcstats.com/fighter-details/583ee1...
4,http://www.ufcstats.com/fight-details/5311298f...,\n\n UFC 289: Nunes vs. Aldana\n ...,Marc-Andre Barriault,Eryk Anders,1,0,95,83,0,1,0,0,http://www.ufcstats.com/fighter-details/8e9eb3...,http://www.ufcstats.com/fighter-details/cad244...


In [4]:
df['fight_link'][1]

'http://www.ufcstats.com/fight-details/40e8bf8ce508c436'

In [54]:
def scrape_fight_page(url):
    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table', {'class': 'b-fight-details__table js-fight-table'})

    # TOTALS table
    totals_data = []
    table1 = tables[0]
    # Find the rows within the table
    rows = table1.find_all('tr', {'class': 'b-fight-details__table-row'})
    # Iterate over each row
    for index, row in enumerate(rows[1:]):
        # Find all the columns in each row
        cols = row.find_all('td', {'class': 'b-fight-details__table-col'})

        round_number = index + 1
        a_KD = cols[1].find_all('p')[0].get_text(strip=True) if len(cols[1].find_all('p')) > 1 else ""
        b_KD = cols[1].find_all('p')[1].get_text(strip=True) if len(cols[1].find_all('p')) > 1 else ""
        a_SIG_STR = cols[2].find_all('p')[0].get_text(strip=True) if len(cols[2].find_all('p')) > 1 else ""
        b_SIG_STR = cols[2].find_all('p')[1].get_text(strip=True) if len(cols[2].find_all('p')) > 1 else ""
        a_TOT_STR = cols[4].find_all('p')[0].get_text(strip=True) if len(cols[4].find_all('p')) > 1 else ""
        b_TOT_STR = cols[4].find_all('p')[1].get_text(strip=True) if len(cols[4].find_all('p')) > 1 else ""
        a_TD = cols[5].find_all('p')[0].get_text(strip=True) if len(cols[5].find_all('p')) > 1 else ""
        b_TD = cols[5].find_all('p')[1].get_text(strip=True) if len(cols[5].find_all('p')) > 1 else ""
        a_SUB = cols[7].find_all('p')[0].get_text(strip=True) if len(cols[7].find_all('p')) > 1 else ""
        b_SUB = cols[7].find_all('p')[1].get_text(strip=True) if len(cols[7].find_all('p')) > 1 else ""
        a_REV = cols[8].find_all('p')[0].get_text(strip=True) if len(cols[8].find_all('p')) > 1 else ""
        b_REV = cols[8].find_all('p')[1].get_text(strip=True) if len(cols[8].find_all('p')) > 1 else ""
        a_CTRL = cols[9].find_all('p')[0].get_text(strip=True) if len(cols[9].find_all('p')) > 1 else ""
        b_CTRL = cols[9].find_all('p')[1].get_text(strip=True) if len(cols[9].find_all('p')) > 1 else ""
        
        details = {
            'round_number': round_number,
            'a_KD':a_KD,
            'b_KD':b_KD,
            'a_SIG_STR':a_SIG_STR,
            'b_SIG_STR':b_SIG_STR,
            'a_TOT_STR':a_TOT_STR,
            'b_TOT_STR':b_TOT_STR,
            'a_TD':a_TD,
            'b_TD':b_TD,
            'a_SUB':a_SUB,
            'b_SUB':b_SUB,
            'a_REV':a_REV,
            'b_REV':b_REV,
            'a_CTRL':a_CTRL,
            'b_CTRL':b_CTRL,
        }

        totals_data.append(details)


    # SIGNIFICANT STRIKES table
    sig_strikes_data = []
    table2 = tables[1]
    # Find the rows within the table
    rows = table2.find_all('tr', {'class': 'b-fight-details__table-row'})
    # Iterate over each row
    for index, row in enumerate(rows[1:]):
        # Find all the columns in each row
        cols = row.find_all('td', {'class': 'b-fight-details__table-col'})

        round_number = index + 1
        a_HEAD = cols[3].find_all('p')[0].get_text(strip=True) if len(cols[3].find_all('p')) > 1 else ""
        b_HEAD = cols[3].find_all('p')[1].get_text(strip=True) if len(cols[3].find_all('p')) > 1 else ""
        a_BODY = cols[4].find_all('p')[0].get_text(strip=True) if len(cols[4].find_all('p')) > 1 else ""
        b_BODY = cols[4].find_all('p')[1].get_text(strip=True) if len(cols[4].find_all('p')) > 1 else ""
        a_LEG = cols[5].find_all('p')[0].get_text(strip=True) if len(cols[5].find_all('p')) > 1 else ""
        b_LEG = cols[5].find_all('p')[1].get_text(strip=True) if len(cols[5].find_all('p')) > 1 else ""
        a_DISTANCE = cols[6].find_all('p')[0].get_text(strip=True) if len(cols[6].find_all('p')) > 1 else ""
        b_DISTANCE = cols[6].find_all('p')[1].get_text(strip=True) if len(cols[6].find_all('p')) > 1 else ""
        a_CLINCH = cols[7].find_all('p')[0].get_text(strip=True) if len(cols[7].find_all('p')) > 1 else ""
        b_CLINCH = cols[7].find_all('p')[1].get_text(strip=True) if len(cols[7].find_all('p')) > 1 else ""
        a_GROUND = cols[8].find_all('p')[0].get_text(strip=True) if len(cols[8].find_all('p')) > 1 else ""
        b_GROUND = cols[8].find_all('p')[1].get_text(strip=True) if len(cols[8].find_all('p')) > 1 else ""

        details = {
            'round_number': round_number,
            'a_HEAD':a_HEAD,
            'b_HEAD':b_HEAD,
            'a_BODY':a_BODY,
            'b_BODY':b_BODY,
            'a_LEG':a_LEG,
            'b_LEG':b_LEG,
            'a_DISTANCE':a_DISTANCE,
            'b_DISTANCE':b_DISTANCE,
            'a_CLINCH':a_CLINCH,
            'b_CLINCH':b_CLINCH,
            'a_GROUND':a_GROUND,
            'b_GROUND':b_GROUND,
        }

        sig_strikes_data.append(details)

    # join tables
    fight_data = pd.merge(pd.DataFrame(totals_data), pd.DataFrame(sig_strikes_data), on='round_number', how='inner')

    # make a single row
    df_stacked = fight_data.set_index('round_number').stack()

    # Convert the multi-level index to a single-level index
    df_stacked.index = df_stacked.index.map('{0[0]}_{0[1]}'.format)  

    # Transpose the DataFrame to get a single row
    df_single_row = df_stacked.to_frame().T

    return df_single_row

In [55]:
test_lists = scrape_fight_page('http://ufcstats.com/fight-details/256894b49303537b')
test_lists.head()

Unnamed: 0,1_a_KD,1_b_KD,1_a_SIG_STR,1_b_SIG_STR,1_a_TOT_STR,1_b_TOT_STR,1_a_TD,1_b_TD,1_a_SUB,1_b_SUB,...,5_a_BODY,5_b_BODY,5_a_LEG,5_b_LEG,5_a_DISTANCE,5_b_DISTANCE,5_a_CLINCH,5_b_CLINCH,5_a_GROUND,5_b_GROUND
0,0,0,10 of 16,11 of 23,18 of 25,14 of 26,1 of 1,0 of 0,0,0,...,2 of 2,4 of 7,0 of 0,2 of 2,4 of 12,12 of 25,4 of 4,3 of 4,0 of 0,5 of 7
