In [44]:
#Import libraries for web-scraping and saving to CSV file.
import requests
import bs4
import re
import csv
import os
import pandas as pd
from Scraper import get_urls, events, fights, fightstats, fighters, upcoming_events, normalise_tables
from datetime import datetime
import time
import inspect
import logging
from functools import wraps

### Get fight URLs from event URLs

In [13]:
# Download the index page that lists every completed event.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as if it
# were the real listing (which would silently yield an empty URL list).
main_url = requests.get('http://ufcstats.com/statistics/events/completed?page=all', timeout=30)
main_url.raise_for_status()
main_event_soup = bs4.BeautifulSoup(main_url.text, 'lxml')


# Keep the href of every anchor whose href is a string containing 'event-details'.
all_event_urls = [
    href
    for href in (anchor.get('href') for anchor in main_event_soup.find_all('a'))
    if isinstance(href, str) and 'event-details' in href
]

# Only the first two event URLs are carried forward from this cell.
all_event_urls = all_event_urls[:2]


In [14]:
#Iterates through each event URL
# Visit every event page and collect the fight links found on it.
all_fight_urls = []
for url in all_event_urls:
    # Bounded wait + explicit status check: a stalled or failed request should
    # surface here instead of hanging or silently contributing zero fights.
    event_url = requests.get(url, timeout=30)
    event_url.raise_for_status()
    event_soup = bs4.BeautifulSoup(event_url.text,'lxml')

    #Scrapes fight URLs from event pages and adds to list
    for item in event_soup.find_all('a', class_='b-flag b-flag_style_green'):
        all_fight_urls.append(item.get('href'))

### Get fight info from fight URLs

In [62]:
def safe_fight_info_get(default_return=None):
    """Build a decorator that replaces any raised ``Exception`` with a fallback value.

    Args:
        default_return: Value returned to the caller whenever the wrapped
            function raises an ``Exception``.

    Returns:
        A decorator. The decorated function logs the failure through the
        module-level ``logging`` functions (WARNING for ``IndexError`` and
        ``AttributeError``, ERROR for every other ``Exception``) and then
        returns ``default_return`` instead of propagating the error.
    """
    expected_failures = (IndexError, AttributeError)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                outcome = func(*args, **kwargs)
            except Exception as e:
                # Same fallback either way; only the log level and wording differ.
                if isinstance(e, expected_failures):
                    logging.warning(f"Known error for {func.__name__}: {type(e).__name__}: {e}")
                else:
                    logging.error(f"Unexpected error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
            return outcome
        return wrapper
    return decorator

@safe_fight_info_get(default_return='NULL')
def get_referee(overview):
    """Return the text after the first ':' in the fourth overview item.

    Args:
        overview: Indexable sequence of elements exposing ``.text``.

    Returns:
        The second ':'-separated piece of ``overview[3].text`` (not stripped).
        Failures are turned into ``'NULL'`` by the decorator.
    """
    pieces = overview[3].text.split(':')
    return pieces[1]

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_fighters(fight_details,fight_soup):
    """Return the text of the two fighter entries for a fight page.

    Args:
        fight_details: Indexable sequence of elements exposing ``.text``; the
            first two entries are read.
        fight_soup: Parsed page exposing ``.select``; used only as a fallback.

    Returns:
        A 2-tuple ``(first_text, second_text)`` (texts are not stripped). If
        the fallback fails as well, the decorator returns ``('NULL', 'NULL')``.
    """
    try:
        return fight_details[0].text, fight_details[1].text
    except Exception:
        # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
        # SystemExit. Ordinary errors still fall back to the person links.
        links = fight_soup.select('a.b-fight-details__person-link')
        return links[0].text, links[1].text

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_aliases(nicknames):
    """Return the text of the first two nickname elements.

    Args:
        nicknames: Indexable sequence of elements exposing ``.text``.

    Returns:
        A 2-tuple; each entry is the element's raw text, or ``'NULL'`` when
        that text is empty or whitespace-only.
    """
    def _text_or_null(raw):
        # A blank nickname is reported as the 'NULL' sentinel.
        return raw if raw.strip() else 'NULL'

    first_alias = _text_or_null(nicknames[0].text)
    second_alias = _text_or_null(nicknames[1].text)
    return first_alias, second_alias
    
        
@safe_fight_info_get(default_return='NULL')
def get_winner(win_lose,f_1,f_2):
    """Pick the winner from the two status markers.

    Args:
        win_lose: Indexable sequence of elements exposing ``.text``; the first
            two entries are read.
        f_1: Value returned when the first marker is ``'W'``.
        f_2: Value returned when only the second marker is ``'W'``.

    Returns:
        ``f_1``, ``f_2``, or ``'NULL'`` when neither marker is ``'W'``.
    """
    # Read both markers up front so a missing second marker still raises
    # (and is handled by the decorator) even when the first one is 'W'.
    first_status = win_lose[0].text.strip()
    second_status = win_lose[1].text.strip()

    if first_status == 'W':
        return f_1
    if second_status == 'W':
        return f_2
    return 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_title_fight(fight_type):
    """Flag whether the bout title mentions 'Title'.

    Args:
        fight_type: Indexable sequence of elements exposing ``.text``; only the
            first entry is read.

    Returns:
        ``'T'`` when the text contains ``'Title'``, otherwise ``'F'``.
    """
    bout_title = fight_type[0].text
    return 'T' if 'Title' in bout_title else 'F'

@safe_fight_info_get(default_return='NULL')
def get_weight_class(fight_type):
    """Derive the weight class from the bout title text.

    Args:
        fight_type: Indexable sequence of elements exposing ``.text``; only the
            first entry is read.

    Returns:
        ``'Light Heavyweight'``, ``"Women's <class>"``, ``'Catch Weight'``,
        ``'Open Weight'``, the first word ending in ``weight``, or ``'NULL'``
        when no such word is present. A women's title with no ``...weight``
        word raises ``IndexError``, which the decorator turns into ``'NULL'``.
    """
    bout_title = fight_type[0].text.strip()

    # Checked before the generic pattern, which would only capture 'Heavyweight'.
    if 'Light Heavyweight' in bout_title:
        return 'Light Heavyweight'

    if 'Women' in bout_title:
        # Raw string: '\w' in a plain literal is an invalid escape sequence
        # and makes Python emit a SyntaxWarning when the cell is compiled.
        return "Women's " + re.findall(r'\w*weight', bout_title)[0]

    if 'Catch Weight' in bout_title:
        return 'Catch Weight'

    if 'Open Weight' in bout_title:
        return 'Open Weight'

    matches = re.findall(r'\w*weight', bout_title)
    return matches[0] if matches else 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_gender(fight_type):
    """Infer the division's gender flag from the bout title text.

    Args:
        fight_type: Indexable sequence of elements exposing ``.text``; only the
            first entry is read.

    Returns:
        ``'F'`` when the lower-cased text contains ``'women'``, else ``'M'``.
    """
    lowered_title = fight_type[0].text.lower()
    return 'F' if 'women' in lowered_title else 'M'

# The fallback must be a 2-tuple: this function returns a pair and the caller
# unpacks it into two names, so the former 'NULL' string fallback (4 characters)
# raised "too many values to unpack" exactly when the scrape had already failed.
@safe_fight_info_get(default_return=('NULL', 'NULL'))
def get_result(select_result,select_result_details):
    """Return the fight result and its detail text.

    Args:
        select_result: Indexable sequence of elements exposing ``.text``; the
            text after the first ':' of the first entry is the method.
        select_result_details: Indexable sequence of elements exposing
            ``.text``; the second entry is read for non-decision results.

    Returns:
        A 2-tuple. For a method containing ``'Decision'``: its first and last
        whitespace-separated words. Otherwise: the raw method text and the
        text after the last ':' of ``select_result_details[1]``.
        On failure the decorator returns ``('NULL', 'NULL')``.
    """
    method = select_result[0].text.split(':')[1]

    if 'Decision' in method:
        words = method.split()
        return words[0], words[-1]

    return method, select_result_details[1].text.split(':')[-1]

  return "Women's " + re.findall('\w*weight',fight_type[0].text.strip())[0]
  return re.findall('\w*weight',fight_type[0].text.strip())[0]


In [15]:
# Display the fight URLs collected from the event pages.
all_fight_urls

['http://ufcstats.com/fight-details/e733f148060bef2a',
 'http://ufcstats.com/fight-details/d05cb4c4135ce402',
 'http://ufcstats.com/fight-details/d3be5a4e0ec273e2',
 'http://ufcstats.com/fight-details/8c540eb4afe8c43e',
 'http://ufcstats.com/fight-details/b2d731415bd367df',
 'http://ufcstats.com/fight-details/fc43d60cbd6b0e6a',
 'http://ufcstats.com/fight-details/0eccebee160137b1',
 'http://ufcstats.com/fight-details/11eb27a1ac74d225',
 'http://ufcstats.com/fight-details/08e7a39eb7482ebf',
 'http://ufcstats.com/fight-details/4bbda6c1f6cf9d4e',
 'http://ufcstats.com/fight-details/fd98843926965cbd',
 'http://ufcstats.com/fight-details/b87d6f71d901355b',
 'http://ufcstats.com/fight-details/0e71e69359db4d1e']

In [49]:
# Fetch the detail page of the first collected fight and pre-select the
# elements that the scraping cells read. Nothing is written to disk here.

url = all_fight_urls[0]

# Bounded wait + explicit status check so a stalled or failed request surfaces
# here rather than hanging or producing empty selections.
fight_url = requests.get(url, timeout=30)
fight_url.raise_for_status()
fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

#Define key select statements
overview = fight_soup.select('i.b-fight-details__text-item')
select_result = fight_soup.select('i.b-fight-details__text-item_first')
select_result_details = fight_soup.select('p.b-fight-details__text')
fight_details = fight_soup.select('p.b-fight-details__table-text')
fight_type = fight_soup.select('i.b-fight-details__fight-title')
win_lose = fight_soup.select('i.b-fight-details__person-status')
nicknames = fight_soup.select('p.b-fight-details__person-title')


In [61]:
# Spot-check: whitespace-trimmed text of the first nickname element.
nicknames[0].text.strip()

'"The Great"'

In [63]:
#Scrape fight details
event_name = fight_soup.h2.text
referee = get_referee(overview)
f_1,f_2 = get_fighters(fight_details,fight_soup)
f_1_alias,f_2_alias= get_aliases(nicknames)
# First character of the stripped text after ':' in the third overview item.
num_rounds = overview[2].text.split(':')[1].strip()[0]
title_fight = get_title_fight(fight_type)
weight_class = get_weight_class(fight_type)
gender = get_gender(fight_type)
result,result_details = get_result(select_result,select_result_details)
finish_round = overview[0].text.split(':')[1]
# Raw string: '\d' in a plain literal is an invalid escape sequence and makes
# Python emit a SyntaxWarning when this cell is compiled.
finish_time = re.findall(r'\d:\d\d',overview[1].text)[0]
winner = get_winner(win_lose,f_1,f_2)

  finish_time = re.findall('\d:\d\d',overview[1].text)[0]


In [66]:
# Gather the scraped values of the fight into one flat list, in a fixed order.
field_list = [
    # event and officials
    event_name, referee,
    # fighters and their nicknames
    f_1, f_2,
    f_1_alias, f_2_alias,
    # bout description
    num_rounds, title_fight, weight_class, gender,
    # outcome
    result, result_details,
    finish_round, finish_time,
    winner,
]

In [68]:
# Show every field with surrounding whitespace removed.
[field.strip() for field in field_list]

['UFC 314: Volkanovski vs. Lopes',
 'Marc Goddard',
 'Alexander Volkanovski',
 'Diego Lopes',
 '"The Great"',
 'NULL',
 '5',
 'T',
 'Featherweight',
 'M',
 'Decision',
 'Unanimous',
 '5',
 '5:00',
 'Alexander Volkanovski']

### Get fight stat info from fight URLs

In [69]:
def get_fighter_id(fight_soup,fight_stats,fighter):
    """Return the name text identifying fighter 1 or fighter 2.

    Args:
        fight_soup: Parsed page exposing ``.select``; used only as a fallback.
        fight_stats: Indexable sequence of elements exposing ``.text``; entry
            0 is read for fighter 1 and entry 1 for fighter 2.
        fighter: ``1`` or ``2``.

    Returns:
        The (unstripped) text of the matching entry, or of the matching
        ``a.b-fight-details__person-link`` element when the primary lookup
        fails. ``None`` when ``fighter`` is neither 1 nor 2.

    Raises:
        Whatever the fallback lookup raises (e.g. ``IndexError``) when it
        fails too.
    """
    if fighter == 1:
        position = 0
    elif fighter == 2:
        position = 1
    else:
        return None

    try:
        return fight_stats[position].text
    except Exception:
        # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
        # SystemExit. Ordinary errors still fall back to the person links.
        return fight_soup.select('a.b-fight-details__person-link')[position].text
        
def get_striking_stats(fight_stats,fighter):
    """Extract knockdown and strike counts for one fighter.

    Args:
        fight_stats: Indexable sequence of elements exposing ``.text``. The
            two fighters occupy adjacent positions: 2/3 for knockdowns, 4/5
            for significant strikes and 8/9 for total strikes.
        fighter: ``1`` or ``2``.

    Returns:
        A 5-tuple of strings ``(knockdowns, total attempted, total
        successful, significant attempted, significant successful)``. The
        attempted/successful values are the pieces around ``' of '`` and are
        not stripped. If anything cannot be read, all five are ``'NULL'``.
        ``None`` when ``fighter`` is neither 1 nor 2.
    """
    if fighter == 1:
        offset = 0
    elif fighter == 2:
        offset = 1
    else:
        return None

    try:
        knockdowns = fight_stats[2 + offset].text
        total = fight_stats[8 + offset].text.split(' of ')
        significant = fight_stats[4 + offset].text.split(' of ')
        # "<successful> of <attempted>": index 1 is attempted, index 0 successful.
        return (knockdowns, total[1], total[0], significant[1], significant[0])
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit. A partial row is never returned: all values or all 'NULL'.
        return ('NULL',) * 5

def get_grappling_stats(fight_stats,fighter):
    """Extract takedown, submission, reversal and control-time values for one fighter.

    Args:
        fight_stats: Indexable sequence of elements exposing ``.text``. The
            two fighters occupy adjacent positions: 10/11 for takedowns, 14/15
            for submission attempts, 16/17 for reversals and 18/19 for control
            time.
        fighter: ``1`` or ``2``.

    Returns:
        A 5-tuple of strings ``(takedowns attempted, takedowns successful,
        submissions attempted, reversals, control time)``; texts are not
        stripped. If anything cannot be read, all five are ``'NULL'``.
        ``None`` when ``fighter`` is neither 1 nor 2.
    """
    if fighter == 1:
        offset = 0
    elif fighter == 2:
        offset = 1
    else:
        return None

    try:
        takedowns = fight_stats[10 + offset].text.split(' of ')
        # "<successful> of <attempted>": index 1 is attempted, index 0 successful.
        return (
            takedowns[1],
            takedowns[0],
            fight_stats[14 + offset].text,
            fight_stats[16 + offset].text,
            fight_stats[18 + offset].text,
        )
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit. A partial row is never returned: all values or all 'NULL'.
        return ('NULL',) * 5
        
def get_strike_stats_distributed(fight_stats,fighter):
    """Extract the per-target and per-position strike counts for one fighter.

    Args:
        fight_stats: Indexable sequence of elements exposing ``.text``. The
            two fighters occupy adjacent positions: 6/7 head, 8/9 body,
            10/11 leg, 12/13 distance, 14/15 clinch and 16/17 ground.
        fighter: ``1`` or ``2``.

    Returns:
        A 12-tuple of strings: for head, body, leg, distance, clinch and
        ground strikes in that order, the attempted value followed by the
        successful/landed value (the pieces around ``' of '``, not stripped).
        If anything cannot be read, all twelve are ``'NULL'``.
        ``None`` when ``fighter`` is neither 1 nor 2.
    """
    if fighter == 1:
        offset = 0
    elif fighter == 2:
        offset = 1
    else:
        return None

    # Fighter-1 positions of: head, body, leg, distance, clinch, ground.
    category_positions = (6, 8, 10, 12, 14, 16)

    try:
        values = []
        for position in category_positions:
            pieces = fight_stats[position + offset].text.split(' of ')
            # "<landed> of <attempted>": emit attempted first, then landed.
            values.append(pieces[1])
            values.append(pieces[0])
        return tuple(values)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit. A partial row is never returned: all values or all 'NULL'.
        return ('NULL',) * 12

In [76]:

# Every <p class="b-fight-details__table-text"> element on the page, in the
# order select() returns them; the stats helpers index this list by position.
fight_stats_table_totals = fight_soup.select('p.b-fight-details__table-text')
if not fight_stats_table_totals:
    # TODO: decide how to handle a page without stats cells. The original notes
    # here were "catch some sort of exception" and "continue".
    # NOTE(review): the print below is a leftover debug marker.
    print('here')

# Find the <p> with class "b-fight-details__collapse-link_tot" whose string is
# exactly the whitespace-padded "Significant Strikes" literal below.
# NOTE(review): find() returns None when nothing matches, in which case the
# .find_next() call two lines down raises AttributeError.
significant_strikes_start = fight_soup.find("p", {"class": "b-fight-details__collapse-link_tot"},string = '\n        Significant Strikes\n\n      ')

# The next table body after that heading, and the stats cells inside it.
section_tag = significant_strikes_start.find_next('tbody', class_='b-fight-details__table-body')
fight_stats_table2 = section_tag.select('p.b-fight-details__table-text')

# Scrape fight stats for the first fighter.
fighter_name = get_fighter_id(fight_soup,fight_stats_table_totals,1)
(knockdowns,
    total_strikes_att,
    total_strikes_succ,
    sig_strikes_att,
    sig_strikes_succ) = get_striking_stats(fight_stats_table_totals,1)
(takedown_att,
    takedown_succ,
    submission_att,
    reversals,
    ctrl_time) = get_grappling_stats(fight_stats_table_totals,1)
(head_strikes_att,
    head_strikes_succ,
    body_strikes_att,
    body_strikes_succ,
    leg_strikes_att,
    leg_strikes_succ,
    distance_strikes_att,
    distance_strikes_succ,
    clinch_strikes_att,
    clinch_strikes_succ,
    ground_strikes_att,
    ground_strikes_succ
) = get_strike_stats_distributed(fight_stats_table2,1)



# Scrape fight stats for the second fighter.
# NOTE(review): these assignments reuse the exact names assigned for the first
# fighter above, so after this cell runs only the second fighter's values
# remain. Confirm whether the first fighter's row should be stored first.
fighter_name = get_fighter_id(fight_soup,fight_stats_table_totals,2)
(knockdowns,
    total_strikes_att,
    total_strikes_succ,
    sig_strikes_att,
    sig_strikes_succ) = get_striking_stats(fight_stats_table_totals,2)
(takedown_att,
    takedown_succ,
    submission_att,
    reversals,
    ctrl_time) = get_grappling_stats(fight_stats_table_totals,2)
(head_strikes_att,
    head_strikes_succ,
    body_strikes_att,
    body_strikes_succ,
    leg_strikes_att,
    leg_strikes_succ,
    distance_strikes_att,
    distance_strikes_succ,
    clinch_strikes_att,
    clinch_strikes_succ,
    ground_strikes_att,
    ground_strikes_succ
) = get_strike_stats_distributed(fight_stats_table2,2)

In [77]:
# Display the selected stats cells to check which position holds which value.
fight_stats_table_totals

[<p class="b-fight-details__table-text">
 <a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/e1248941344b3288">Alexander Volkanovski </a>
 </p>,
 <p class="b-fight-details__table-text">
 <a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/f166e93d04a8c274">Diego Lopes </a>
 </p>,
 <p class="b-fight-details__table-text">
       0
     </p>,
 <p class="b-fight-details__table-text">
       1
     </p>,
 <p class="b-fight-details__table-text">
       158 of 259
     </p>,
 <p class="b-fight-details__table-text">
       63 of 194
     </p>,
 <p class="b-fight-details__table-text">
       61%
     </p>,
 <p class="b-fight-details__table-text">
       32%
     </p>,
 <p class="b-fight-details__table-text">
       165 of 266
     </p>,
 <p class="b-fight-details__table-text">
       71 of 203
     </p>,
 <p class="b-fight-details__table-text">
       1 of 11
     </p>,
 <p class="b-fight-details__table-text">
       0 of 0
     </p>,
 <p c