In [2]:
#Import libraries for web-scraping and saving to CSV file.
import requests
import bs4
import re
import csv
import os
import pandas as pd
from Scraper import get_urls, events, fights, fightstats, fighters, upcoming_events, normalise_tables
from datetime import datetime
import time
import inspect
import logging
from functools import wraps

### Get fight URLs from event URLs

In [3]:
# Fetch the index page that lists every completed event.
# The timeout keeps the cell from hanging forever on a stalled connection and
# raise_for_status() surfaces an HTTP error instead of parsing an error page.
main_url = requests.get('http://ufcstats.com/statistics/events/completed?page=all', timeout=30)
main_url.raise_for_status()
main_event_soup = bs4.BeautifulSoup(main_url.text, 'lxml')

# Keep an anchor's href only when it is a string containing 'event-details'
all_event_urls = [
    href
    for href in (item.get('href') for item in main_event_soup.find_all('a'))
    if isinstance(href, str) and 'event-details' in href
]

# Only the first two event URLs are carried forward
# (presumably to keep this exploratory run short - TODO confirm)
all_event_urls = all_event_urls[:2]


In [4]:
# Visit every event page and collect the fight links found on it
all_fight_urls = []
for url in all_event_urls:
    # Bounded wait + explicit HTTP error so a bad response is not silently parsed
    event_url = requests.get(url, timeout=30)
    event_url.raise_for_status()
    event_soup = bs4.BeautifulSoup(event_url.text,'lxml')

    # Each anchor matched by class 'b-flag b-flag_style_green' contributes its href
    all_fight_urls.extend(
        item.get('href')
        for item in event_soup.find_all('a', class_='b-flag b-flag_style_green')
    )

### Get fight info from fight URLs

In [6]:
def safe_fight_info_get(default_return=None):
    """Build a decorator that turns any exception into a logged default value.

    Args:
        default_return: Value handed back by the wrapped function whenever the
            call raises an ``Exception``.

    Returns:
        A decorator. The decorated function returns its normal result on
        success. On ``IndexError``, ``AttributeError`` or ``TypeError`` a
        warning is logged; on any other ``Exception`` an error is logged. In
        both cases ``default_return`` is returned instead of raising.
    """
    known_errors = (IndexError, AttributeError, TypeError)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                outcome = func(*args, **kwargs)
            except Exception as e:
                # Same fallback for every failure; only the log level differs
                if isinstance(e, known_errors):
                    logging.warning(f"Known error for {func.__name__}: {type(e).__name__}: {e}")
                else:
                    logging.error(f"Unexpected error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
            return outcome
        return wrapper
    return decorator

@safe_fight_info_get(default_return='NULL')
def get_referee(overview):
    """Return the text after the first ':' in the fourth overview item.

    Args:
        overview: Indexable collection of elements exposing ``.text``.

    Returns:
        The second ':'-separated piece of ``overview[3].text`` (not stripped),
        or ``'NULL'`` via the decorator when the lookup fails.
    """
    referee_item = overview[3]
    pieces = referee_item.text.split(':')
    return pieces[1]

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_fighters(fight_details,fight_soup):
    """Return the text of the two fighter entries of a fight as a pair.

    Args:
        fight_details: Indexable collection of elements exposing ``.text``;
            items 0 and 1 are used when available.
        fight_soup: Parsed fight page, used as a fallback source through the
            ``a.b-fight-details__person-link`` anchors.

    Returns:
        ``(first_text, second_text)``; ``('NULL', 'NULL')`` via the decorator
        when both the primary lookup and the fallback fail.
    """
    try:
        return fight_details[0].text, fight_details[1].text
    except Exception:
        # ``except Exception`` rather than a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        links = fight_soup.select('a.b-fight-details__person-link')
        return links[0].text, links[1].text

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_aliases(nicknames):
    """Return the first two nickname texts, replacing blank ones with 'NULL'.

    Args:
        nicknames: Indexable collection of elements exposing ``.text``.

    Returns:
        A 2-tuple. Each entry is the element's original (unstripped) text when
        it contains non-whitespace characters, otherwise ``'NULL'``. The
        decorator returns ``('NULL', 'NULL')`` if an entry cannot be read.
    """
    aliases = []
    for position in (0, 1):
        alias = nicknames[position].text
        aliases.append(alias if len(alias.strip()) > 0 else 'NULL')
    return tuple(aliases)
    
        
@safe_fight_info_get(default_return='NULL')
def get_winner(win_lose,f_1,f_2):
    """Pick the winning fighter from the two status markers.

    Args:
        win_lose: Indexable collection of elements exposing ``.text``; items 0
            and 1 are both read.
        f_1: Value returned when the first status is ``'W'``.
        f_2: Value returned when only the second status is ``'W'``.

    Returns:
        ``f_1``, ``f_2``, or ``'NULL'`` when neither status is ``'W'`` (also
        ``'NULL'`` via the decorator if either status cannot be read).
    """
    # Both statuses are read up front so a missing second entry still fails
    # the lookup, exactly as when both sides of the condition are evaluated.
    first_status = win_lose[0].text.strip()
    second_status = win_lose[1].text.strip()

    if first_status == 'W':
        return f_1
    if second_status == 'W':
        return f_2
    return 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_title_fight(fight_type):
    """Flag whether the fight title text contains the word 'Title'.

    Args:
        fight_type: Indexable collection whose first element exposes ``.text``.

    Returns:
        ``'T'`` when ``'Title'`` occurs in the text, otherwise ``'F'``;
        ``'NULL'`` via the decorator when the text cannot be read.
    """
    headline = fight_type[0].text
    return 'T' if 'Title' in headline else 'F'

@safe_fight_info_get(default_return='NULL')
def get_weight_class(fight_type):
    """Derive the weight class label from the fight title text.

    Args:
        fight_type: Indexable collection whose first element exposes ``.text``.

    Returns:
        ``'Light Heavyweight'``, ``"Women's <class>"``, ``'Catch Weight'``,
        ``'Open Weight'``, or the first word ending in ``weight`` found in the
        title. ``'NULL'`` when no such word exists or, via the decorator, when
        the title cannot be read.
    """
    title = fight_type[0].text.strip()

    if 'Light Heavyweight' in title:
        return 'Light Heavyweight'

    if 'Women' in title:
        # Raw string: '\w' in a plain literal is an invalid escape sequence
        # and triggers a SyntaxWarning. A missing match raises IndexError,
        # which the decorator converts to 'NULL'.
        return "Women's " + re.findall(r'\w*weight', title)[0]

    if 'Catch Weight' in title:
        return 'Catch Weight'

    if 'Open Weight' in title:
        return 'Open Weight'

    # Explicit emptiness check replaces the previous bare ``except:``
    matches = re.findall(r'\w*weight', title)
    return matches[0] if matches else 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_gender(fight_type):
    """Classify the fight as 'F' or 'M' from the fight title text.

    Args:
        fight_type: Indexable collection whose first element exposes ``.text``.

    Returns:
        ``'F'`` when the lower-cased title contains ``'women'``, otherwise
        ``'M'``; ``'NULL'`` via the decorator when the title cannot be read.
    """
    lowered_title = str.lower(fight_type[0].text)
    return 'F' if 'women' in lowered_title else 'M'

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_result(select_result,select_result_details):
    """Return the fight result and its detail as a pair.

    The fallback is a 2-tuple so that callers unpacking
    ``result, result_details = get_result(...)`` keep working when the lookup
    fails. A plain ``'NULL'`` string would raise ``ValueError`` on unpacking.

    Args:
        select_result: Indexable collection whose first element exposes
            ``.text``; the part after the first ':' is the method text.
        select_result_details: Indexable collection whose second element
            exposes ``.text``; used when the method is not a decision.

    Returns:
        For a method containing ``'Decision'``: its first and last
        whitespace-separated words. Otherwise: the method text and the part
        of ``select_result_details[1].text`` after its last ':'.
        ``('NULL', 'NULL')`` via the decorator on failure.
    """
    method = select_result[0].text.split(':')[1]
    if 'Decision' in method:
        words = method.split()
        return words[0], words[-1]
    return method, select_result_details[1].text.split(':')[-1]

  return "Women's " + re.findall('\w*weight',fight_type[0].text.strip())[0]
  return re.findall('\w*weight',fight_type[0].text.strip())[0]


In [7]:
# Show the fight URLs gathered from the event pages
all_fight_urls

['http://ufcstats.com/fight-details/e733f148060bef2a',
 'http://ufcstats.com/fight-details/d05cb4c4135ce402',
 'http://ufcstats.com/fight-details/d3be5a4e0ec273e2',
 'http://ufcstats.com/fight-details/8c540eb4afe8c43e',
 'http://ufcstats.com/fight-details/b2d731415bd367df',
 'http://ufcstats.com/fight-details/fc43d60cbd6b0e6a',
 'http://ufcstats.com/fight-details/0eccebee160137b1',
 'http://ufcstats.com/fight-details/11eb27a1ac74d225',
 'http://ufcstats.com/fight-details/08e7a39eb7482ebf',
 'http://ufcstats.com/fight-details/4bbda6c1f6cf9d4e',
 'http://ufcstats.com/fight-details/fd98843926965cbd',
 'http://ufcstats.com/fight-details/b87d6f71d901355b',
 'http://ufcstats.com/fight-details/0e71e69359db4d1e']

In [8]:
# Fetch one fight page (the first collected URL) and pre-select the elements
# that the fight-detail helpers read from. Nothing is written to disk here.

url = all_fight_urls[0]

# Bounded wait + explicit HTTP error so a bad response is not silently parsed
fight_url = requests.get(url, timeout=30)
fight_url.raise_for_status()
fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

#Define key select statements
overview = fight_soup.select('i.b-fight-details__text-item')
select_result = fight_soup.select('i.b-fight-details__text-item_first')
select_result_details = fight_soup.select('p.b-fight-details__text')
fight_details = fight_soup.select('p.b-fight-details__table-text')
fight_type = fight_soup.select('i.b-fight-details__fight-title')
win_lose = fight_soup.select('i.b-fight-details__person-status')
# nicknames = fight_soup.select('p.b-fight-details__person-title')


In [9]:
#Scrape fight details
event_name = fight_soup.h2.text
referee = get_referee(overview)
f_1,f_2 = get_fighters(fight_details,fight_soup)
# f_1_alias,f_2_alias= get_aliases(nicknames)
# First character of the text after ':' in the third overview item
num_rounds = overview[2].text.split(':')[1].strip()[0]
title_fight = get_title_fight(fight_type)
weight_class = get_weight_class(fight_type)
gender = get_gender(fight_type)  
result,result_details = get_result(select_result,select_result_details)
finish_round = overview[0].text.split(':')[1]
# Raw string: '\d' in a plain literal is an invalid escape sequence (SyntaxWarning)
finish_time = re.findall(r'\d:\d\d',overview[1].text)[0]
winner = get_winner(win_lose,f_1,f_2)

  finish_time = re.findall('\d:\d\d',overview[1].text)[0]


In [10]:
# Fight-level record, in column order.
# The aliases are left out on purpose: they are stored at the fighter level
# as there is only 1 relevant nickname per fighter.
field_list = [
    event_name, referee,
    f_1, f_2,
    # f_1_alias, f_2_alias,
    num_rounds, title_fight,
    weight_class, gender,
    result, result_details,
    finish_round, finish_time,
    winner,
]

In [11]:
# Display the record with surrounding whitespace trimmed from every field
[field.strip() for field in field_list]

['UFC 314: Volkanovski vs. Lopes',
 'Marc Goddard',
 'Alexander Volkanovski',
 'Diego Lopes',
 '5',
 'T',
 'Featherweight',
 'M',
 'Decision',
 'Unanimous',
 '5',
 '5:00',
 'Alexander Volkanovski']

### Get fight stat info from fight URLs

In [None]:
def safe_text(stat_list, index, default = 'NULL', split = False, part = None, logger = None):
    """Read ``stat_list[index].text`` without ever raising.

    Args:
        stat_list: Indexable collection of elements exposing ``.text``.
        index: Position of the element to read.
        default: Value returned when the text cannot be produced.
        split: When true, the text is split on ``' of '`` and one piece is
            returned instead of the whole text.
        part: Index of the piece to return when ``split`` is true.
        logger: Optional object with a ``warning`` method; told about failures.

    Returns:
        The raw text, or the requested piece of a two-piece ``' of '`` split.
        ``default`` when the split does not give exactly two pieces or when
        any exception occurs.
    """
    try:
        raw = stat_list[index].text
        if not split:
            return raw
        pieces = raw.split(' of ')
        if len(pieces) != 2:
            return default
        return pieces[part]
    except Exception as e:
        if logger:
            logger.warning(f"Failed to get a stat at index {index}:{e}")
        return default


def get_fighter_id(fight_soup,fight_stats,fighter):
    """Return the text identifying fighter 1 or 2 of a fight.

    Args:
        fight_soup: Parsed fight page; only used as a fallback, through the
            ``a.b-fight-details__person-link`` anchors.
        fight_stats: Indexable collection of elements exposing ``.text``; the
            entry at ``fighter - 1`` is preferred.
        fighter: 1-based fighter number.

    Returns:
        The element text from ``fight_stats``, else from the fallback anchors,
        else ``'NULL'`` when neither source can be read.
    """
    position = fighter - 1
    try:
        return fight_stats[position].text
    except Exception:
        # Primary source unusable - fall back to the person-link anchors.
        # The selector must not contain a space: 'a.b-fight-details __person-link'
        # would look for a descendant element and never match these anchors.
        try:
            return fight_soup.select('a.b-fight-details__person-link')[position].text
        except Exception:
            return 'NULL'
        
def get_striking_stats(fight_stats,fighter):
    """Return knockdown, total-strike and significant-strike cells for a fighter.

    Args:
        fight_stats: Indexable collection of elements exposing ``.text``.
        fighter: ``1`` selects the first cell of each pair; any other value
            selects the second.

    Returns:
        A 5-tuple of strings read through ``safe_text``:
        ``(knockdowns, total part 1, total part 0, significant part 1,
        significant part 0)``, where "part" is the piece of the cell text
        split on ``' of '``. Unreadable cells come back as ``'NULL'``.
    """
    idx = 0 if fighter == 1 else 1
    kd = 2 + idx
    total = 8 + idx
    significant = 4 + idx
    # Tuple items need separating commas and the keyword is ``part`` (not
    # ``parts``); without both fixes this function cannot even be defined.
    return (
        safe_text(fight_stats, kd),
        safe_text(fight_stats, total, split = True, part = 1),
        safe_text(fight_stats, total, split = True, part = 0),
        safe_text(fight_stats, significant, split = True, part = 1),
        safe_text(fight_stats, significant, split = True, part = 0),
    )

def get_grappling_stats(fight_stats,fighter):
    """Return takedown, submission, reversal and control cells for a fighter.

    Args:
        fight_stats: Indexable collection of elements exposing ``.text``.
        fighter: 1-based fighter number; shifts every cell index by
            ``fighter - 1``.

    Returns:
        A 5-tuple of strings read through ``safe_text``:
        ``(takedown part 1, takedown part 0, submission, reversal, control)``,
        where "part" is the piece of the cell text split on ``' of '``.
    """
    offset = fighter - 1
    takedown_cell, submission_cell, reversal_cell, control_cell = (
        base + offset for base in (10, 14, 16, 18)
    )

    takedown_second = safe_text(fight_stats, takedown_cell, split = True, part = 1)
    takedown_first = safe_text(fight_stats, takedown_cell, split = True, part = 0)
    submissions = safe_text(fight_stats, submission_cell)
    reversals = safe_text(fight_stats, reversal_cell)
    control = safe_text(fight_stats, control_cell)

    return (takedown_second, takedown_first, submissions, reversals, control)
        
def get_strike_stats_distributed(fight_stats,fighter):
    """Return the per-target and per-position strike cells for a fighter.

    Args:
        fight_stats: Indexable collection of elements exposing ``.text``.
        fighter: Fighter number; added to each category's base cell index.

    Returns:
        A 12-tuple of strings read through ``safe_text``. For each category
        in order (head, body, leg, distance, clinch, ground) it holds part 1
        and then part 0 of the cell text split on ``' of '``. This matches the
        ``..._att`` / ``..._succ`` order in which the notebook unpacks the
        result, and the order the other stat helpers use.
    """
    indexes = {
        #Head strikes
        'hs_index' : 5 + fighter,
        #Body strikes
        'bs_index' : 7 + fighter,
        #Leg strikes
        'ls_index' : 9 + fighter,
        #Distance strikes
        'ds_index' : 11 + fighter,
        #Clinch strikes
        'cs_index' : 13 + fighter,
        #Ground strikes
        'gs_index' : 15 + fighter
    }

    results = []
    for val in indexes.values():
        # Part 1 first, then part 0. The cell must be read via safe_text;
        # list.append() itself accepts neither extra nor keyword arguments.
        for part in (1, 0):
            results.append(safe_text(fight_stats, val, split=True, part=part))

    return tuple(results)

In [26]:

# All 'p.b-fight-details__table-text' cells of the page; the totals helpers
# read from this list by position.
fight_stats_table_totals = fight_soup.select('p.b-fight-details__table-text')
if not fight_stats_table_totals:
    logging.error(f'Fight stats totals table not found. url {url}')

# Locate the "Significant Strikes" section header, then the table body after it
significant_strikes_start = fight_soup.find(
    "p",
    class_="b-fight-details__collapse-link_tot",
    string=lambda x: x and "Significant Strikes" in x
)

# A missing header or section is logged and leaves the dependent values empty,
# so the stat helpers fall back to 'NULL' instead of the cell crashing on None.
section_tag = None
if significant_strikes_start is None:
    logging.error(f'Significant Strikes section header not found. url {url}')
else:
    section_tag = significant_strikes_start.find_next('tbody', class_='b-fight-details__table-body')
    if section_tag is None:
        logging.error(f'Significant Strikes section tag not found. url {url}')

fight_stats_table_significant_strikes = []
if section_tag is not None:
    fight_stats_table_significant_strikes = section_tag.select('p.b-fight-details__table-text')
    if not fight_stats_table_significant_strikes:
        logging.error(f'Fight Details table not found. url {url}')

# One flat record per fight: identifiers first, then every stat of each fighter
fight_stats = {
    "url" : url,
    "fighter_1":get_fighter_id(fight_soup, fight_stats_table_totals,1),
    "fighter_2":get_fighter_id(fight_soup, fight_stats_table_totals,2)
}
for fighter in [1,2]:
    (fight_stats[f"fighter_{fighter}_knockdowns"],
        fight_stats[f"fighter_{fighter}_total_strikes_att"],
        fight_stats[f"fighter_{fighter}_total_strikes_succ"],
        fight_stats[f"fighter_{fighter}_sig_strikes_att"],
        fight_stats[f"fighter_{fighter}_sig_strikes_succ"]) = get_striking_stats(fight_stats_table_totals, fighter)

    (fight_stats[f"fighter_{fighter}_takedown_att"],
        fight_stats[f"fighter_{fighter}_takedown_succ"],
        fight_stats[f"fighter_{fighter}_submission_att"],
        fight_stats[f"fighter_{fighter}_reversals"],
        fight_stats[f"fighter_{fighter}_ctrl_time"]) = get_grappling_stats(fight_stats_table_totals, fighter)

    (fight_stats[f"fighter_{fighter}_head_strikes_att"],
        fight_stats[f"fighter_{fighter}_head_strikes_succ"],
        fight_stats[f"fighter_{fighter}_body_strikes_att"],
        fight_stats[f"fighter_{fighter}_body_strikes_succ"],
        fight_stats[f"fighter_{fighter}_leg_strikes_att"],
        fight_stats[f"fighter_{fighter}_leg_strikes_succ"],
        fight_stats[f"fighter_{fighter}_distance_strikes_att"],
        fight_stats[f"fighter_{fighter}_distance_strikes_succ"],
        fight_stats[f"fighter_{fighter}_clinch_strikes_att"],
        fight_stats[f"fighter_{fighter}_clinch_strikes_succ"],
        fight_stats[f"fighter_{fighter}_ground_strikes_att"],
        fight_stats[f"fighter_{fighter}_ground_strikes_succ"]) = get_strike_stats_distributed(fight_stats_table_significant_strikes, fighter)



In [27]:
# Trim surrounding whitespace from every scraped value, updating the dict in place
for key, value in list(fight_stats.items()):
    fight_stats[key] = value.strip()

In [28]:
# Show the assembled, whitespace-trimmed stats record for the fight
fight_stats

{'url': 'http://ufcstats.com/fight-details/e733f148060bef2a',
 'fighter_1': 'Alexander Volkanovski',
 'fighter_2': 'Diego Lopes',
 'fighter_1_knockdowns': '0',
 'fighter_1_total_strikes_att': '266',
 'fighter_1_total_strikes_succ': '165',
 'fighter_1_sig_strikes_att': '259',
 'fighter_1_sig_strikes_succ': '158',
 'fighter_1_takedown_att': '11',
 'fighter_1_takedown_succ': '1',
 'fighter_1_submission_att': '0',
 'fighter_1_reversals': '0',
 'fighter_1_ctrl_time': '1:18',
 'fighter_1_head_strikes_att': '225',
 'fighter_1_head_strikes_succ': '136',
 'fighter_1_body_strikes_att': '14',
 'fighter_1_body_strikes_succ': '8',
 'fighter_1_leg_strikes_att': '20',
 'fighter_1_leg_strikes_succ': '14',
 'fighter_1_distance_strikes_att': '237',
 'fighter_1_distance_strikes_succ': '142',
 'fighter_1_clinch_strikes_att': '11',
 'fighter_1_clinch_strikes_succ': '7',
 'fighter_1_ground_strikes_att': '11',
 'fighter_1_ground_strikes_succ': '9',
 'fighter_2_knockdowns': '1',
 'fighter_2_total_strikes_att'