In [44]:
#Import libraries for web-scraping and saving to CSV file.
import requests
import bs4
import re
import csv
import os
import pandas as pd
from Scraper import get_urls, events, fights, fightstats, fighters, upcoming_events, normalise_tables
from datetime import datetime
import time
import inspect
import logging
from functools import wraps

### Get fight URLs from event URLs

In [13]:
# Fetch the index page that lists every completed event.
# The timeout (seconds) stops the cell from hanging forever on a stalled
# connection; raise_for_status() fails loudly instead of parsing an error page.
main_url = requests.get('http://ufcstats.com/statistics/events/completed?page=all', timeout=30)
main_url.raise_for_status()
main_event_soup = bs4.BeautifulSoup(main_url.text, 'lxml')


#Adds href to list if href contains a link with keyword 'event-details'
all_event_urls = [href for href in (anchor.get('href') for anchor in main_event_soup.find_all('a'))
                    if isinstance(href, str)
                    and 'event-details' in href]

# Keep only the first two event URLs found on the page.
all_event_urls = all_event_urls[:2]


In [14]:
#Iterates through each event URL
all_fight_urls = []
for url in all_event_urls:
    # Timeout (seconds) avoids an indefinite hang; HTTP errors abort the run
    # rather than silently yielding an event with no fights.
    event_url = requests.get(url, timeout=30)
    event_url.raise_for_status()
    event_soup = bs4.BeautifulSoup(event_url.text,'lxml')

    #Scrapes fight URLs from event pages and adds to list
    # Anchors without an href are skipped so the list never contains None.
    all_fight_urls.extend(
        link.get('href')
        for link in event_soup.find_all('a', class_='b-flag b-flag_style_green')
        if link.get('href')
    )

### Get fight info from fight URLs

In [None]:
def safe_fight_info_get(default_return=None):
    """Build a decorator that shields a scraping helper from exceptions.

    The wrapped function is called normally. If it raises, the failure is
    logged and ``default_return`` is returned instead of the exception
    propagating: ``IndexError`` and ``AttributeError`` are logged at WARNING
    level, any other ``Exception`` at ERROR level.

    Args:
        default_return: Value handed back whenever the wrapped call raises.

    Returns:
        A decorator that applies this behaviour to a function.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                outcome = func(*args, **kwargs)
            except Exception as e:
                # Lookup failures are the anticipated case; anything else is unexpected.
                if isinstance(e, (IndexError, AttributeError)):
                    logging.warning(f"Known error for {func.__name__}: {type(e).__name__}: {e}")
                else:
                    logging.error(f"Unexpected error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
            return outcome
        return wrapper
    return decorator

@safe_fight_info_get(default_return='NULL')
def get_referee(overview):
    """Return the text following the first ':' in the fourth overview item.

    Any lookup failure is handled by the decorator, which returns 'NULL'.
    """
    referee_item = overview[3]
    label_and_value = referee_item.text.split(':')
    return label_and_value[1]

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_fighters(fight_details,fight_soup):
    """Return the text of the two fighter entries as a ``(first, second)`` tuple.

    The first two elements of ``fight_details`` are tried first. If that fails,
    the names are read from the ``a.b-fight-details__person-link`` anchors in
    ``fight_soup``. If the fallback fails as well, the decorator returns
    ``('NULL','NULL')``.
    """
    try:
        return fight_details[0].text, fight_details[1].text
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit are not swallowed while still falling back on any
        # ordinary lookup failure.
        links = fight_soup.select('a.b-fight-details__person-link')
        return links[0].text, links[1].text

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_aliases(nicknames):
    """Return the two nickname texts as a tuple, using 'NULL' for blank ones.

    A nickname counts as blank when its text is empty after stripping
    whitespace; otherwise the original (unstripped) text is returned.
    """
    # The comparison must sit outside len(): the previous form evaluated
    # `str > int` inside len(), which raised TypeError on every call, so the
    # decorator always returned ('NULL','NULL').
    n1 = nicknames[0].text if len(nicknames[0].text.strip()) > 0 else 'NULL'
    n2 = nicknames[1].text if len(nicknames[1].text.strip()) > 0 else 'NULL'
    return n1,n2
    
        
@safe_fight_info_get(default_return='NULL')
def get_winner(win_lose,f_1,f_2):
    """Return whichever of ``f_1``/``f_2`` has status 'W', or 'NULL' if neither.

    Both status entries are read before deciding, so a missing second entry
    is handled by the decorator even when the first fighter won.
    """
    first_status = win_lose[0].text.strip()
    second_status = win_lose[1].text.strip()

    if first_status == 'W':
        return f_1
    if second_status == 'W':
        return f_2
    return 'NULL'

@safe_fight_info_get(default_return='NULL')
def get_title_fight(fight_type):
    """Return 'T' when the first fight-type text contains 'Title', else 'F'."""
    bout_title = fight_type[0].text
    return 'T' if 'Title' in bout_title else 'F'

@safe_fight_info_get(default_return='NULL')
def get_weight_class(fight_type):
    """Derive the weight class from the first fight-type element's text.

    Checks run in this order: 'Light Heavyweight', women's divisions
    (prefixed with "Women's "), 'Catch Weight', 'Open Weight', then the first
    word ending in lowercase 'weight'. Returns 'NULL' when nothing matches.
    """
    bout_title = fight_type[0].text.strip()

    # Must be tested before the generic pattern, which would yield 'Heavyweight'.
    if 'Light Heavyweight' in bout_title:
        return 'Light Heavyweight'

    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    matches = re.findall(r'\w*weight', bout_title)

    if 'Women' in bout_title:
        if not matches:
            return 'NULL'
        return "Women's " + matches[0]

    if 'Catch Weight' in bout_title:
        return 'Catch Weight'

    if 'Open Weight' in bout_title:
        return 'Open Weight'

    # An explicit emptiness check replaces the former bare `except:`.
    if not matches:
        return 'NULL'
    return matches[0]

@safe_fight_info_get(default_return='NULL')
def get_gender(fight_type):
    """Return 'F' if the first fight-type text mentions 'women' (any case), else 'M'."""
    lowered_title = fight_type[0].text.lower()
    return 'F' if 'women' in lowered_title else 'M'

@safe_fight_info_get(default_return=('NULL','NULL'))
def get_result(select_result,select_result_details):
    """Return ``(result, result_details)`` parsed from the result elements.

    The result text is the part of ``select_result[0]`` after the first ':'.
    For decisions, its first word is the result and its last word the detail.
    Otherwise the whole text is the result, and the detail is the part of
    ``select_result_details[1]`` after its last ':'.

    On failure the decorator returns ``('NULL','NULL')``. The fallback has to
    be a pair because callers unpack two values; the earlier 'NULL' string
    raised ValueError when unpacked.
    """
    method_text = select_result[0].text.split(':')[1]

    if 'Decision' in method_text:
        words = method_text.split()
        return words[0], words[-1]

    return method_text, select_result_details[1].text.split(':')[-1]

  return "Women's " + re.findall('\w*weight',fight_type[0].text.strip())[0]
  return re.findall('\w*weight',fight_type[0].text.strip())[0]


In [15]:
# Echo the fight URLs collected from the event pages so they can be inspected.
all_fight_urls

['http://ufcstats.com/fight-details/e733f148060bef2a',
 'http://ufcstats.com/fight-details/d05cb4c4135ce402',
 'http://ufcstats.com/fight-details/d3be5a4e0ec273e2',
 'http://ufcstats.com/fight-details/8c540eb4afe8c43e',
 'http://ufcstats.com/fight-details/b2d731415bd367df',
 'http://ufcstats.com/fight-details/fc43d60cbd6b0e6a',
 'http://ufcstats.com/fight-details/0eccebee160137b1',
 'http://ufcstats.com/fight-details/11eb27a1ac74d225',
 'http://ufcstats.com/fight-details/08e7a39eb7482ebf',
 'http://ufcstats.com/fight-details/4bbda6c1f6cf9d4e',
 'http://ufcstats.com/fight-details/fd98843926965cbd',
 'http://ufcstats.com/fight-details/b87d6f71d901355b',
 'http://ufcstats.com/fight-details/0e71e69359db4d1e']

In [49]:
#Fetches the first fight page and prepares the element selections used to scrape its details

url = all_fight_urls[0]

# Timeout (seconds) avoids an indefinite hang; HTTP errors are raised instead
# of an error page being parsed as if it were a fight page.
fight_url = requests.get(url, timeout=30)
fight_url.raise_for_status()
fight_soup = bs4.BeautifulSoup(fight_url.text,'lxml')

#Define key select statements
overview = fight_soup.select('i.b-fight-details__text-item')
select_result = fight_soup.select('i.b-fight-details__text-item_first')
select_result_details = fight_soup.select('p.b-fight-details__text')
fight_details = fight_soup.select('p.b-fight-details__table-text')
fight_type = fight_soup.select('i.b-fight-details__fight-title')
win_lose = fight_soup.select('i.b-fight-details__person-status')
nicknames = fight_soup.select('p.b-fight-details__person-title')


In [None]:
#Scrape fight details
event_name = fight_soup.h2.text
referee = get_referee(overview)
f_1,f_2 = get_fighters(fight_details,fight_soup)
f_1_alias,f_2_alias= get_aliases(nicknames)
# First character of the value after ':' in the third overview item.
num_rounds = overview[2].text.split(':')[1].strip()[0]
title_fight = get_title_fight(fight_type)
weight_class = get_weight_class(fight_type)
gender = get_gender(fight_type)  
result,result_details = get_result(select_result,select_result_details)
finish_round = overview[0].text.split(':')[1]
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
finish_time = re.findall(r'\d:\d\d',overview[1].text)[0]
winner = get_winner(win_lose,f_1,f_2)

  finish_time = re.findall('\d:\d\d',overview[1].text)[0]
