In [2]:
#Import libraries for web-scraping and saving to CSV file.
import requests
import bs4
import re
import csv
import os
import pandas as pd
from Scraper import get_urls, events, fights, fightstats, fighters, upcoming_events, normalise_tables
from datetime import datetime
import time
import inspect
import logging
from functools import wraps

### get fight urls from event urls

In [3]:
main_url = requests.get('http://ufcstats.com/statistics/events/completed?page=all')
main_event_soup = bs4.BeautifulSoup(main_url.text, 'lxml')


#Adds href to list if href contains a link with keyword 'event-details'
all_event_urls = [item.get('href') for item in  main_event_soup.find_all('a') 
                    if type(item.get('href')) == str 
                    and 'event-details' in item.get('href')]

all_event_urls = all_event_urls[:2]


In [4]:
#Iterates through each event URL
all_fight_urls = []
for url in all_event_urls:
    event_url = requests.get(url)
    event_soup = bs4.BeautifulSoup(event_url.text,'lxml')

    #Scrapes fight URLs from event pages and adds to list
    for item in event_soup.find_all('a', class_='b-flag b-flag_style_green'):
        all_fight_urls.append(item.get('href'))

### get fight info from fight urls

In [5]:
def safe_fight_info_get(default_return=None):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except (IndexError, AttributeError, TypeError) as e:
                logging.warning(f"Known error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
            except Exception as e:
                logging.error(f"Unexpected error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
        return wrapper
    return decorator

@safe_fight_info_get(default_return='NULL')
def get_referee(overview):
    return overview[3].text.split(':')[1]

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_fighters(fight_details,fight_soup):
    try:
        return fight_details[0].text, fight_details[1].text
    except:
        links = fight_soup.select('a.b-fight-details__person-link')
        return links[0].text, links[1].text

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_aliases(nicknames):
    n1 = nicknames[0].text if len(nicknames[0].text.strip())>0 else 'NULL'
    n2 = nicknames[1].text if len(nicknames[1].text.strip())>0 else 'NULL'
    return n1,n2
    
        
@safe_fight_info_get(default_return='NULL')
def get_winner(win_lose,f_1,f_2):
    if (win_lose[0].text.strip()=='W') | (win_lose[1].text.strip()=='W'):
        if (win_lose[0].text.strip()=='W'):
            return f_1
        else:
            return f_2
    else:
        return 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_title_fight(fight_type):
    if 'Title' in fight_type[0].text:
        return 'T'
    else:
        return 'F'

@safe_fight_info_get(default_return='NULL')
def get_weight_class(fight_type):
    if 'Light Heavyweight' in fight_type[0].text.strip():
        return 'Light Heavyweight'
        
    elif 'Women' in fight_type[0].text.strip():
        return "Women's " + re.findall('\w*weight',fight_type[0].text.strip())[0]
        
    elif 'Catch Weight' in fight_type[0].text.strip():
        return 'Catch Weight'
            
    elif 'Open Weight' in fight_type[0].text.strip():
        return 'Open Weight' 
    else:   
        try:
            return re.findall('\w*weight',fight_type[0].text.strip())[0]
        except: 
            return 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_gender(fight_type):
    if 'women' in str.lower(fight_type[0].text):
        return 'F'
    else:
        return 'M'

@safe_fight_info_get(default_return='NULL')
def get_result(select_result,select_result_details):
    if 'Decision' in select_result[0].text.split(':')[1]:
        return select_result[0].text.split(':')[1].split()[0], select_result[0].text.split(':')[1].split()[-1]
    else:
        return select_result[0].text.split(':')[1], select_result_details[1].text.split(':')[-1]

  return "Women's " + re.findall('\w*weight',fight_type[0].text.strip())[0]
  return re.findall('\w*weight',fight_type[0].text.strip())[0]


In [6]:
all_fight_urls

['http://ufcstats.com/fight-details/633f4b5ec767769d',
 'http://ufcstats.com/fight-details/a526a72ade275f46',
 'http://ufcstats.com/fight-details/4072abf0b0c179f2',
 'http://ufcstats.com/fight-details/9bbd45774fb4a6eb',
 'http://ufcstats.com/fight-details/fbf83609dc96f32e',
 'http://ufcstats.com/fight-details/62fbdd4f0bcd3c2a',
 'http://ufcstats.com/fight-details/d8d22dcc1f6912e7',
 'http://ufcstats.com/fight-details/97770b38df67b276',
 'http://ufcstats.com/fight-details/6f3cb7ce84185dd3',
 'http://ufcstats.com/fight-details/7d51649373d42571',
 'http://ufcstats.com/fight-details/c2c20a04a78d9143',
 'http://ufcstats.com/fight-details/e7b3ca036835b7c2',
 'http://ufcstats.com/fight-details/23201d09a78d446d',
 'http://ufcstats.com/fight-details/7fd6770fe2d4f258']

In [7]:
#Scrapes details of each UFC fight and appends to file 'ufc_fight_data.csv'

url = all_fight_urls[0]

fight_url = requests.get(url)
fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

#Define key select statements
overview = fight_soup.select('i.b-fight-details__text-item')
select_result = fight_soup.select('i.b-fight-details__text-item_first')
select_result_details = fight_soup.select('p.b-fight-details__text')
fight_details = fight_soup.select('p.b-fight-details__table-text')
fight_type = fight_soup.select('i.b-fight-details__fight-title')
win_lose = fight_soup.select('i.b-fight-details__person-status')
# nicknames = fight_soup.select('p.b-fight-details__person-title')


In [8]:
#Scrape fight details
event_name = fight_soup.h2.text
referee = get_referee(overview)
f_1,f_2 = get_fighters(fight_details,fight_soup)
# f_1_alias,f_2_alias= get_aliases(nicknames)
num_rounds = overview[2].text.split(':')[1].strip()[0]
title_fight = get_title_fight(fight_type)
weight_class = get_weight_class(fight_type)
gender = get_gender(fight_type)  
result,result_details = get_result(select_result,select_result_details)
finish_round = overview[0].text.split(':')[1]
finish_time = re.findall('\d:\d\d',overview[1].text)[0]
winner = get_winner(win_lose,f_1,f_2)

  finish_time = re.findall('\d:\d\d',overview[1].text)[0]


In [9]:
field_list = [
    event_name,
    referee,
    f_1,
    f_2,
    # f_1_alias, - this is stored at the fighter level as there is only 1 relevant nickname per fighter
    # f_2_alias,
    num_rounds,
    title_fight,
    weight_class,
    gender,
    result,
    result_details,
    finish_round,
    finish_time,
    winner
]

In [10]:
[i.strip() for i in field_list]

['UFC Fight Night: Machado Garry vs. Prates',
 'Dan Miragliotta',
 'Ian Machado Garry',
 'Carlos Prates',
 '5',
 'F',
 'Welterweight',
 'M',
 'Decision',
 'Unanimous',
 '5',
 '5:00',
 'Ian Machado Garry']

### get fight stat info from fight urls

In [16]:
def safe_text(stat_list, index, default = 'NULL', split = False, part = None, logger = None):
    try:
        text = stat_list[index].text
        if split:
            parts = text.split(' of ')
            return parts[part] if len(parts) == 2 else default
        return text
    except Exception as e:
        if logger:
            logger.warning(f"Failed to get a stat at index {index}:{e}")
        return default


def get_fighter_id(fight_soup,fight_stats,fighter):
    try:
        return fight_stats[fighter - 1].text
    except:
        try:
            return fight_soup.select('a.b-fight-details __person-link')[fighter - 1].text
        except:
            return 'NULL'
        
def get_striking_stats(fight_stats,fighter):
    idx = 0 if fighter == 1 else 1
    kd = 2 + idx
    total = 8 + idx
    significant = 4 + idx
    return(
        safe_text(fight_stats, kd),
        safe_text(fight_stats, total, split = True, part = 1),
        safe_text(fight_stats, total, split = True, part = 0),
        safe_text(fight_stats, significant, split = True, part = 1),
        safe_text(fight_stats, significant, split = True, part = 0)
    )

def get_grappling_stats(fight_stats,fighter):
    td_index = 10 + fighter - 1
    submission_index = 14 + fighter - 1
    reversal_index = 16 + fighter - 1
    control_index = 18 + fighter - 1

    return (
        safe_text(fight_stats, td_index, split = True, part = 1),
        safe_text(fight_stats, td_index, split = True, part = 0),
        safe_text(fight_stats, submission_index),
        safe_text(fight_stats, reversal_index),
        safe_text(fight_stats, control_index),
    )
        
def get_strike_stats_distributed(fight_stats,fighter):
    
    indexes = {
    #Head strikes
    'hs_index' : 5 + fighter,
    #Body strikes
    'bs_index' : 7 + fighter,
    #Leg strikes
    'ls_index' : 9 + fighter,
    #Distance strikes
    'ds_index' : 11 + fighter,
    #Clinch strikes
    'cs_index' : 13 + fighter,
    #Ground strikes
    'gs_index' : 15 + fighter
    }

    results = []
    for key, val in indexes.items():
        for i in range(2):
            results.append(safe_text(fight_stats,val,split=True,part=i))
 
    return tuple(results)

In [17]:

fight_stats_table_totals = fight_soup.select('p.b-fight-details__table-text')
if not fight_stats_table_totals:
    logging.error(f'Fight stats totals table not found. url {url}')
significant_strikes_start = fight_soup.find(
    "p",
    class_="b-fight-details__collapse-link_tot",
    string=lambda x: x and "Significant Strikes" in x
)
if not significant_strikes_start:
    logging.error(f'Significant Strikes section header not found. url {url}')
section_tag = significant_strikes_start.find_next('tbody', class_='b-fight-details__table-body')
if not section_tag:
    logging.error(f'Significant Strikes section tag not found. url {url}')
fight_stats_table_significant_strikes = section_tag.select('p.b-fight-details__table-text')
if not fight_stats_table_significant_strikes:
    logging.error(f'Fight Details table not found. url {url}')

fight_stats = {
    "url" : url,
    "fighter_1":get_fighter_id(fight_soup, fight_stats_table_totals,1),
    "fighter_2":get_fighter_id(fight_soup, fight_stats_table_totals,2)
}
for fighter in [1,2]:
    (fight_stats[f"fighter_{fighter}_knockdowns"],
        fight_stats[f"fighter_{fighter}_total_strikes_att"],
        fight_stats[f"fighter_{fighter}_total_strikes_succ"],
        fight_stats[f"fighter_{fighter}_sig_strikes_att"],
        fight_stats[f"fighter_{fighter}_sig_strikes_succ"]) = get_striking_stats(fight_stats_table_totals, fighter)

    (fight_stats[f"fighter_{fighter}_takedown_att"],
        fight_stats[f"fighter_{fighter}_takedown_succ"],
        fight_stats[f"fighter_{fighter}_submission_att"],
        fight_stats[f"fighter_{fighter}_reversals"],
        fight_stats[f"fighter_{fighter}_ctrl_time"]) = get_grappling_stats(fight_stats_table_totals, fighter)

    (fight_stats[f"fighter_{fighter}_head_strikes_att"],
        fight_stats[f"fighter_{fighter}_head_strikes_succ"],
        fight_stats[f"fighter_{fighter}_body_strikes_att"],
        fight_stats[f"fighter_{fighter}_body_strikes_succ"],
        fight_stats[f"fighter_{fighter}_leg_strikes_att"],
        fight_stats[f"fighter_{fighter}_leg_strikes_succ"],
        fight_stats[f"fighter_{fighter}_distance_strikes_att"],
        fight_stats[f"fighter_{fighter}_distance_strikes_succ"],
        fight_stats[f"fighter_{fighter}_clinch_strikes_att"],
        fight_stats[f"fighter_{fighter}_clinch_strikes_succ"],
        fight_stats[f"fighter_{fighter}_ground_strikes_att"],
        fight_stats[f"fighter_{fighter}_ground_strikes_succ"]) = get_strike_stats_distributed(fight_stats_table_significant_strikes, fighter)



In [None]:
for key,value in fight_stats.items():
    fight_stats[key] = value.strip()

In [24]:
for index,value in enumerate(fight_stats_table_totals):
    print(index,value)

0 <p class="b-fight-details__table-text">
<a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/442c9011034ae1fd">Ian Machado Garry </a>
</p>
1 <p class="b-fight-details__table-text">
<a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/7ee0fd831c0fe7c3">Carlos Prates </a>
</p>
2 <p class="b-fight-details__table-text">
      0
    </p>
3 <p class="b-fight-details__table-text">
      0
    </p>
4 <p class="b-fight-details__table-text">
      126 of 242
    </p>
5 <p class="b-fight-details__table-text">
      63 of 129
    </p>
6 <p class="b-fight-details__table-text">
      52%
    </p>
7 <p class="b-fight-details__table-text">
      48%
    </p>
8 <p class="b-fight-details__table-text">
      141 of 259
    </p>
9 <p class="b-fight-details__table-text">
      64 of 132
    </p>
10 <p class="b-fight-details__table-text">
      4 of 19
    </p>
11 <p class="b-fight-details__table-text">
      0 of 0
    </p>
12 <p class="b-fight-details_

In [25]:
fight_soup

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--><!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]--><!--[if IE 8]>         <html class="no-js ie8 lt-ie9"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>
    Stats | UFC
  </title>
<meta content="" name="description"/>
<meta content="" name="viewport"/>
<link href="/blocks/main.css?ver=732859" rel="stylesheet"/>
<script src="/js/vendor/modernizr-2.6.2.min.js"></script>
<script>
    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-2855164-1', 'auto');
    ga('send', 'p

In [None]:
# per round totals
print(fight_soup.find(
    "i",
    class_="b-fight-details__collapse-left"
).find_next("tbody","b-fight-details__table-body").select("p.b-fight-details__table-text"))

[<p class="b-fight-details__table-text">
<a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/442c9011034ae1fd">Ian Machado Garry </a>
</p>, <p class="b-fight-details__table-text">
<a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/7ee0fd831c0fe7c3">Carlos Prates </a>
</p>, <p class="b-fight-details__table-text">
      0
    </p>, <p class="b-fight-details__table-text">
      0
    </p>, <p class="b-fight-details__table-text">
      15 of 39
    </p>, <p class="b-fight-details__table-text">
      7 of 16
    </p>, <p class="b-fight-details__table-text">
      38%
    </p>, <p class="b-fight-details__table-text">
      43%
    </p>, <p class="b-fight-details__table-text">
      15 of 39
    </p>, <p class="b-fight-details__table-text">
      7 of 16
    </p>, <p class="b-fight-details__table-text">
      0 of 1
    </p>, <p class="b-fight-details__table-text">
      0 of 0
    </p>, <p class="b-fight-details__table-text">
      0%
  

In [None]:
# per round distributed
print(fight_soup.find(
    "i",
    class_="b-fight-details__collapse-left"
).find_next(
    "i",
    class_="b-fight-details__collapse-left"
).find_next("tbody","b-fight-details__table-body").select("p.b-fight-details__table-text"))

[<p class="b-fight-details__table-text">
<a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/442c9011034ae1fd">Ian Machado Garry </a>
</p>, <p class="b-fight-details__table-text">
<a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/7ee0fd831c0fe7c3">Carlos Prates </a>
</p>, <p class="b-fight-details__table-text">
      15 of 39
    </p>, <p class="b-fight-details__table-text">
      7 of 16
    </p>, <p class="b-fight-details__table-text">
      38%
    </p>, <p class="b-fight-details__table-text">
      43%
    </p>, <p class="b-fight-details__table-text">
      8 of 25
    </p>, <p class="b-fight-details__table-text">
      2 of 8
    </p>, <p class="b-fight-details__table-text">
      3 of 7
    </p>, <p class="b-fight-details__table-text">
      2 of 3
    </p>, <p class="b-fight-details__table-text">
      4 of 7
    </p>, <p class="b-fight-details__table-text">
      3 of 5
    </p>, <p class="b-fight-details__table-text">
   

In [19]:
fight_stats

{'url': 'http://ufcstats.com/fight-details/633f4b5ec767769d',
 'fighter_1': 'Ian Machado Garry',
 'fighter_2': 'Carlos Prates',
 'fighter_1_knockdowns': '0',
 'fighter_1_total_strikes_att': '259',
 'fighter_1_total_strikes_succ': '141',
 'fighter_1_sig_strikes_att': '242',
 'fighter_1_sig_strikes_succ': '126',
 'fighter_1_takedown_att': '19',
 'fighter_1_takedown_succ': '4',
 'fighter_1_submission_att': '0',
 'fighter_1_reversals': '0',
 'fighter_1_ctrl_time': '3:09',
 'fighter_1_head_strikes_att': '86',
 'fighter_1_head_strikes_succ': '185',
 'fighter_1_body_strikes_att': '17',
 'fighter_1_body_strikes_succ': '28',
 'fighter_1_leg_strikes_att': '23',
 'fighter_1_leg_strikes_succ': '29',
 'fighter_1_distance_strikes_att': '120',
 'fighter_1_distance_strikes_succ': '231',
 'fighter_1_clinch_strikes_att': '2',
 'fighter_1_clinch_strikes_succ': '6',
 'fighter_1_ground_strikes_att': '4',
 'fighter_1_ground_strikes_succ': '5',
 'fighter_2_knockdowns': '0',
 'fighter_2_total_strikes_att': '1