# **1 - Scraping**

## Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Lift pandas' display truncation limits so rendered frames show every
# column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Scraping Fighters
**Example Fighters**: http://www.ufcstats.com/statistics/fighters?char=a&page=all

**Example Fighter**: http://www.ufcstats.com/fighter-details/93fe7332d16c6ad9

### Fetching

In [2]:
# Download and parse the 26 fighter listing pages (one per last-name initial).
# `fighters_soups` ends up holding one parsed page per letter, in a-z order.
fighters_soups = []
# Fighters' Pages Sorted by Last Initial
for letter in 'abcdefghijklmnopqrstuvwxyz':
    fighters_url = f'http://www.ufcstats.com/statistics/fighters?char={letter}&page=all'
    response = requests.get(fighters_url)
    # raise_for_status must be *called*; a bare attribute reference does nothing,
    # which would let a 4xx/5xx error page be parsed as if it were a fighter list.
    response.raise_for_status()
    page = response.text
    soup = BeautifulSoup(page, 'html.parser')
    fighters_soups.append(soup)

### Scraping

In [3]:
# Build the `fighters` table: one row per fighter profile linked from the
# listing pages held in `fighters_soups`.

# Column order of the final table; every scraped row is assembled in this order.
fighter_columns = ['Name', 'Record', 'Height', 'Weight', 'Reach', 'Stance', 'Birthday', 'SLPM', 'SACC', 'SAPM', 'SD', 'TDAVG', 'TDACC', 'TDD', 'SAVG']

# Lower-case text fragments identifying the <i> label that precedes each value
# on a fighter page, in the same order as the columns after 'Name' and 'Record'.
fighter_stat_labels = ['height', 'weight', 'reach', 'stance', 'dob', 'slpm', 'str. acc', 'sapm', 'str. def', 'td avg', 'td acc', 'td def', 'sub. avg']


def _stat_after_label(page_soup, label):
    """Return the stripped text node that follows a labelled <i> tag.

    Args:
        page_soup: parsed fighter page.
        label: lower-case fragment; the first <i> whose text contains it
            (case-insensitively) is used.

    Raises:
        AttributeError: if no matching <i> tag is found on the page.
    """
    label_tag = page_soup.find('i', string=lambda text: text and label in text.lower())
    return label_tag.next_sibling.strip()


# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending with `fighters.loc[len(fighters)]` re-allocates on every row.
fighter_rows = []

# Loop Through Fighters' Pages (Last Initial)
for letter_soup in fighters_soups:
    fighters_table = letter_soup.find('table')
    fighters_hyperlinks = fighters_table.find_all('a')
    # Loop Through Fighters w/ Same Last Initial
    # Only every third link is followed -- presumably each fighter row carries
    # three links to the same profile; confirm against the listing page markup.
    for fighters_hyperlink in fighters_hyperlinks[::3]:
        profile_url = fighters_hyperlink['href']
        fighters_response = requests.get(profile_url)
        fighters_response.raise_for_status()
        fighters_soup = BeautifulSoup(fighters_response.text, 'html.parser')

        # Name
        name = fighters_soup.find('span', class_ = 'b-content__title-highlight').text.strip()
        # Record
        record = fighters_soup.find('span', class_ = 'b-content__title-record').text.strip().replace('Record: ', '')
        # Height ... SAVG, in column order
        labelled_values = [_stat_after_label(fighters_soup, label) for label in fighter_stat_labels]

        fighter_rows.append([name, record] + labelled_values)

fighters = pd.DataFrame(fighter_rows, columns = fighter_columns)

display(fighters.head())
display(fighters.tail())

Unnamed: 0,Name,Record,Height,Weight,Reach,Stance,Birthday,SLPM,SACC,SAPM,SD,TDAVG,TDACC,TDD,SAVG
0,Tom Aaron,5-3-0,--,155 lbs.,--,,"Jul 13, 1978",0.0,0%,0.0,0%,0.0,0%,0%,0.0
1,Danny Abbadi,4-6-0,"5' 11""",155 lbs.,--,Orthodox,"Jul 03, 1983",3.29,38%,4.41,57%,0.0,0%,77%,0.0
2,Nariman Abbasov,28-4-0,"5' 8""",155 lbs.,"66""",Orthodox,"Feb 01, 1994",3.0,20%,5.67,46%,0.0,0%,66%,0.0
3,David Abbott,10-15-0,"6' 0""",265 lbs.,--,Switch,--,1.35,30%,3.55,38%,1.07,33%,66%,0.0
4,Hamdy Abdelwahab,5-0-0 (1 NC),"6' 2""",264 lbs.,"72""",Southpaw,"Jan 22, 1993",3.87,52%,3.13,59%,3.0,75%,0%,0.0


Unnamed: 0,Name,Record,Height,Weight,Reach,Stance,Birthday,SLPM,SACC,SAPM,SD,TDAVG,TDACC,TDD,SAVG
4247,Dave Zitanick,5-7-0 (1 NC),--,170 lbs.,--,,"Mar 05, 1980",0.0,0%,0.0,0%,0.0,0%,0%,0.0
4248,Alex Zuniga,6-3-0,--,145 lbs.,--,,--,0.0,0%,0.0,0%,0.0,0%,0%,0.0
4249,George Zuniga,3-1-0,"5' 9""",185 lbs.,--,,--,7.64,38%,5.45,37%,0.0,0%,100%,0.0
4250,Allan Zuniga,13-1-0,"5' 7""",155 lbs.,"70""",Orthodox,"Apr 04, 1992",3.93,52%,1.8,61%,0.0,0%,57%,1.0
4251,Virgil Zwicker,15-6-1,"6' 2""",205 lbs.,"74""",,"Jun 26, 1982",3.34,48%,4.87,39%,1.31,30%,50%,0.0


### Exporting

In [4]:
# Write the scraped fighter table to disk, leaving out the positional index.
fighters.to_csv(path_or_buf='fighters.csv', index=False)

## Scraping Fights

**Example Events**: http://www.ufcstats.com/statistics/events/completed?page=all

**Example Fights**: http://www.ufcstats.com/event-details/be8ad887e4d674b0

**Example Fight**: http://www.ufcstats.com/fight-details/15805ae1eea3343e

### Fetching

In [5]:
# Download and parse the page listing every completed event; the scraping
# cell below reads the result from `soup`.
fights_url = 'http://www.ufcstats.com/statistics/events/completed?page=all'
response = requests.get(fights_url)
# raise_for_status must be *called*; a bare attribute reference does nothing,
# which would let a 4xx/5xx error page be parsed as if it were the event list.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())

### Scraping

In [6]:
# Build the `fights` table: one row per fight, found by following every event
# linked from the completed-events page held in `soup`.

# Column order of the final table: 12 fight-level fields, then fighter A's
# 15 statistics, then fighter B's 15 statistics.
fight_columns = ['A', 'B', 'Winner', 'Event', 'Date', 'Location', 'Division', 'Method', 'Round', 'Time', 'Format', 'Referee',
'A_KD', 'A_SS', 'A_SSP', 'A_TS', 'A_TD', 'A_TDP', 'A_SA', 'A_REV', 'A_CTRL', 'A_H', 'A_B', 'A_L', 'A_D', 'A_C', 'A_G',
'B_KD', 'B_SS', 'B_SSP', 'B_TS', 'B_TD', 'B_TDP', 'B_SA', 'B_REV', 'B_CTRL', 'B_H', 'B_B', 'B_L', 'B_D', 'B_C', 'B_G'
]


def _fetch_soup(url):
    """Download `url` and return it parsed.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so an error page is never
            scraped as if it were real content.
    """
    page_response = requests.get(url)
    page_response.raise_for_status()
    # NOTE(review): 'html' lets Beautiful Soup choose among the installed HTML
    # parsers, so results can vary between environments. It is kept unchanged
    # here to avoid altering parse results; consider pinning 'html.parser'.
    return BeautifulSoup(page_response.text, 'html')


def _find_label(page_soup, label):
    """Return the first <i> tag whose text contains `label` (case-insensitive; `label` is lower-case)."""
    return page_soup.find('i', string = lambda text: text and label in text.lower())


def _paired_stats(table_body, skip, n_stats):
    """Read alternating fighter-A / fighter-B cells from a stats table body.

    Args:
        table_body: the <tbody> holding the statistic cells.
        skip: number of leading cells that are not stored.
        n_stats: number of statistics to read per fighter.

    Returns:
        (a_values, b_values): two lists of `n_stats` stripped strings.

    Raises:
        IndexError: if fewer than `skip + 2 * n_stats` cells are present.
    """
    cells = table_body.find_all('p', class_ = 'b-fight-details__table-text')[skip:]
    texts = [cells[i].text.strip() for i in range(2 * n_stats)]
    return texts[0::2], texts[1::2]


events_table = soup.find('table')
events_table_rows = events_table.find_all('tr', class_ = 'b-statistics__table-row')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending with `fights.loc[len(fights)]` re-allocates on every row.
fight_rows = []

# Loop Through Events
# The first two matched rows are skipped -- presumably header/spacer rows
# without an event link; confirm against the page markup.
for events_table_row in events_table_rows[2:]:
    event_anchor = events_table_row.find('a')
    event_name = event_anchor.text.strip() # Event
    event_date = events_table_row.find('span').text.strip() # Date
    event_location = events_table_row.find('td', class_ = 'b-statistics__table-col b-statistics__table-col_style_big-top-padding').text.strip() # Location

    # Follow Event Hyperlink (e.g., http://www.ufcstats.com/event-details/be8ad887e4d674b0)
    event_soup = _fetch_soup(event_anchor['href'])

    # Loop Through Fights in Event
    fight_table_rows = event_soup.find_all('tr', class_ = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click')
    for fight_table_row in fight_table_rows:
        # Follow Fight Hyperlink (e.g., http://www.ufcstats.com/fight-details/15805ae1eea3343e)
        fight_soup = _fetch_soup(fight_table_row['data-link'])

        # Header
        person_links = fight_soup.find_all('a', class_ = 'b-link b-fight-details__person-link')
        a = person_links[0].text.strip() # A
        b = person_links[1].text.strip() # B

        # Winner
        # Each fighter block starts with a status marker ('W' marks the winner, 'D' a draw).
        # B is only credited when B's own marker says 'W'; any other outcome
        # (e.g. a no contest) leaves the winner empty instead of defaulting to B.
        person_blocks = fight_soup.find_all('div', class_ = 'b-fight-details__person')
        a_status = person_blocks[0].find().text.strip()
        b_status = person_blocks[1].find().text.strip()
        if a_status == 'W':
            winner = a
        elif a_status == 'D':
            winner = 'Draw'
        elif b_status == 'W':
            winner = b
        else:
            winner = None

        division = fight_soup.find('i', class_ = 'b-fight-details__fight-title').text.strip() # Division
        method = _find_label(fight_soup, 'method').find_next('i').text.strip() # Method
        # Named fight_* so the builtins `round` and `format` (and a future
        # `import time`) are not shadowed for the rest of the notebook.
        fight_round = _find_label(fight_soup, 'round').next_sibling.strip() # Round
        fight_time = _find_label(fight_soup, 'time').next_sibling.strip() # Time
        fight_format = _find_label(fight_soup, 'format').next_sibling.strip() # Format
        referee = _find_label(fight_soup, 'referee').find_next('span').text.strip() # Referee

        stats_section = fight_soup.find('section', class_ = 'b-fight-details__section js-fight-section')
        if stats_section.text.strip() == 'Round-by-round stats not currently available.':
            # Fights w/o Table 1 and 2: keep the fight, leave all 30 statistics empty
            stat_values = [None] * 30
        else:
            stat_bodies = fight_soup.find_all('tbody', class_ = 'b-fight-details__table-body')
            # Table 1: KD, SS, SSP, TS, TD, TDP, SA, REV, CTRL
            a_totals, b_totals = _paired_stats(stat_bodies[0], skip = 2, n_stats = 9)
            # Table 2: H, B, L, D, C, G
            a_targets, b_targets = _paired_stats(stat_bodies[2], skip = 6, n_stats = 6)
            stat_values = a_totals + a_targets + b_totals + b_targets

        # Add to rows (same order as fight_columns)
        fight_rows.append([a, b, winner, event_name, event_date, event_location, division, method, fight_round, fight_time, fight_format, referee] + stat_values)

fights = pd.DataFrame(fight_rows, columns = fight_columns)

print('Head:')
display(fights.head())
print('Tail:')
display(fights.tail())

Head:


Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Method,Round,Time,Format,Referee,A_KD,A_SS,A_SSP,A_TS,A_TD,A_TDP,A_SA,A_REV,A_CTRL,A_H,A_B,A_L,A_D,A_C,A_G,B_KD,B_SS,B_SSP,B_TS,B_TD,B_TDP,B_SA,B_REV,B_CTRL,B_H,B_B,B_L,B_D,B_C,B_G
0,Sean O'Malley,Merab Dvalishvili,Merab Dvalishvili,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Herb Dean,0,47 of 89,52%,49 of 91,0 of 0,---,0,0,0:00,22 of 48,23 of 39,2 of 2,45 of 85,0 of 1,2 of 3,0,82 of 164,50%,214 of 310,6 of 15,40%,0,0,10:03,55 of 130,4 of 6,23 of 28,29 of 85,8 of 14,45 of 65
1,Alexa Grasso,Valentina Shevchenko,Valentina Shevchenko,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Women's Flyweight Title Bout,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Mark Smith,0,18 of 100,18%,153 of 245,1 of 4,25%,2,0,0:49,8 of 74,3 of 10,7 of 16,16 of 98,1 of 1,1 of 1,0,45 of 80,56%,193 of 233,8 of 12,66%,0,0,16:04,38 of 71,3 of 5,4 of 4,39 of 72,2 of 2,4 of 6
2,Brian Ortega,Diego Lopes,Diego Lopes,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Featherweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Marc Goddard,0,63 of 195,32%,69 of 201,0 of 2,0%,0,0,0:06,52 of 180,4 of 8,7 of 7,61 of 192,1 of 1,1 of 2,0,106 of 206,51%,113 of 214,1 of 1,100%,0,0,2:45,75 of 172,13 of 15,18 of 19,75 of 161,6 of 7,25 of 38
3,Daniel Zellhuber,Esteban Ribovics,Esteban Ribovics,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Lightweight Bout,Decision - Split,3,5:00,3 Rnd (5-5-5),Jason Herzog,1,121 of 308,39%,121 of 308,0 of 1,0%,0,0,0:15,83 of 255,33 of 48,5 of 5,117 of 304,4 of 4,0 of 0,0,156 of 345,45%,156 of 345,0 of 0,---,0,0,0:00,103 of 276,32 of 48,21 of 21,154 of 342,2 of 3,0 of 0
4,Ronaldo Rodriguez,Ode Osbourne,Ronaldo Rodriguez,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Flyweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Chris Tognoni,0,35 of 69,50%,68 of 117,1 of 2,50%,0,1,9:14,29 of 61,1 of 2,5 of 6,10 of 31,0 of 0,25 of 38,1,37 of 68,54%,55 of 93,1 of 3,33%,2,1,1:07,28 of 59,4 of 4,5 of 5,22 of 46,0 of 0,15 of 22


Tail:


Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Method,Round,Time,Format,Referee,A_KD,A_SS,A_SSP,A_TS,A_TD,A_TDP,A_SA,A_REV,A_CTRL,A_H,A_B,A_L,A_D,A_C,A_G,B_KD,B_SS,B_SSP,B_TS,B_TD,B_TDP,B_SA,B_REV,B_CTRL,B_H,B_B,B_L,B_D,B_C,B_G
7831,Orlando Wiet,Robert Lucarelli,Orlando Wiet,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA",Open Weight Bout,KO/TKO,1,2:50,No Time Limit,John McCarthy,0,8 of 12,66%,11 of 15,0 of 0,---,0,0,--,7 of 11,1 of 1,0 of 0,1 of 3,0 of 0,7 of 9,0,2 of 6,33%,2 of 6,1 of 1,100%,1,0,--,1 of 2,0 of 1,1 of 3,2 of 6,0 of 0,0 of 0
7832,Frank Hamaker,Thaddeus Luster,Frank Hamaker,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA",Open Weight Bout,Submission,1,4:52,No Time Limit,John McCarthy,0,2 of 3,66%,14 of 15,1 of 1,100%,3,1,--,2 of 3,0 of 0,0 of 0,1 of 1,0 of 0,1 of 2,0,0 of 0,---,0 of 0,0 of 1,0%,0,0,--,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0
7833,Johnny Rhodes,David Levicki,Johnny Rhodes,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA",Open Weight Bout,KO/TKO,1,12:13,No Time Limit,John McCarthy,0,11 of 17,64%,74 of 86,1 of 1,100%,0,0,--,9 of 15,1 of 1,1 of 1,1 of 1,1 of 1,9 of 15,0,4 of 5,80%,95 of 102,0 of 0,---,0,0,--,4 of 5,0 of 0,0 of 0,1 of 2,2 of 2,1 of 1
7834,Patrick Smith,Ray Wizard,Patrick Smith,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA",Open Weight Bout,Submission,1,0:58,No Time Limit,John McCarthy,0,1 of 1,100%,1 of 1,0 of 1,0%,1,0,--,0 of 0,1 of 1,0 of 0,0 of 0,1 of 1,0 of 0,0,1 of 1,100%,2 of 2,0 of 0,---,0,0,--,0 of 0,0 of 0,1 of 1,1 of 1,0 of 0,0 of 0
7835,Scott Morris,Sean Daugherty,Scott Morris,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA",Open Weight Bout,Submission,1,0:20,No Time Limit,John McCarthy,0,1 of 1,100%,2 of 2,1 of 1,100%,1,0,--,1 of 1,0 of 0,0 of 0,0 of 0,1 of 1,0 of 0,0,0 of 4,0%,1 of 5,0 of 0,---,0,0,--,0 of 2,0 of 0,0 of 2,0 of 3,0 of 1,0 of 0


### Exporting

In [7]:
# Write the scraped fight table to disk, leaving out the positional index.
fights.to_csv(path_or_buf='fights.csv', index=False)