In [2]:
#Import libraries for web-scraping and saving to CSV file.
import requests
import bs4
import re
import csv
import os
import pandas as pd
from Scraper import get_urls, events, fights, fightstats, fighters, upcoming_events, normalise_tables
from datetime import datetime
import time
import inspect
import logging
from functools import wraps
from string import ascii_lowercase

### Get fighter URLs from scratch

In [3]:

def get_fighter_urls(letters='a', delay=1, timeout=30):
    """Collect fighter-detail links from the ufcstats.com fighter index pages.

    One index page is requested per entry in ``letters``; each page is parsed
    and the selected ``a.b-link`` hrefs are appended to the result in page
    order.

    Args:
        letters: Iterable of index letters to fetch. Defaults to ``'a'``,
            which is what this function fetched before it took parameters;
            pass ``string.ascii_lowercase`` to crawl the whole alphabet.
        delay: Seconds to sleep after every request, to avoid response(429).
        timeout: Seconds to wait on a request before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of href values (an empty list when ``letters`` is empty).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    fighter_urls = []
    for letter in letters:
        response = requests.get(
            f'http://ufcstats.com/statistics/fighters?char={letter}&page=all',
            timeout=timeout,
        )
        # Adds a delay after each request to avoid response(429)
        time.sleep(delay)
        # An error page would otherwise be parsed as if it were a (link-less)
        # index page and silently shrink the result.
        response.raise_for_status()
        page_soup = bs4.BeautifulSoup(response.text, 'lxml')
        # Keep every third link starting from the second one; presumably each
        # fighter row carries three links to the same page -- TODO confirm
        # against the live markup.
        for link in page_soup.select('a.b-link')[1::3]:
            fighter_urls.append(link.get('href'))
    return fighter_urls

In [4]:
# Fetch the fighter-detail URLs once and keep them for the cells below.
urls = get_fighter_urls()

In [5]:
# Echo the collected URL list as this cell's output.
urls

['http://ufcstats.com/fighter-details/93fe7332d16c6ad9',
 'http://ufcstats.com/fighter-details/15df64c02b6b0fde',
 'http://ufcstats.com/fighter-details/59a9d6dac61c2540',
 'http://ufcstats.com/fighter-details/b361180739bed4b0',
 'http://ufcstats.com/fighter-details/3329d692aea4dc28',
 'http://ufcstats.com/fighter-details/841695e02c99a521',
 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4',
 'http://ufcstats.com/fighter-details/c0ed7b208197e8de',
 'http://ufcstats.com/fighter-details/5140122c3eecd307',
 'http://ufcstats.com/fighter-details/c9f6385af6df66d7',
 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd',
 'http://ufcstats.com/fighter-details/7279654c7674cd24',
 'http://ufcstats.com/fighter-details/f689bd7bbd14b392',
 'http://ufcstats.com/fighter-details/1c5879330d42255f',
 'http://ufcstats.com/fighter-details/989b85f6540c86b1',
 'http://ufcstats.com/fighter-details/2620f3eb21c79614',
 'http://ufcstats.com/fighter-details/83b00f7597e5ac83',
 'http://ufcstats.com/fighter-d

### Get fighter info from fighter URLs

In [None]:
def safe_fighter_info_get(default_return=None):
    """Build a decorator that makes a parsing helper return a fallback instead of raising.

    The wrapped function is called normally. If it raises, the exception is
    logged and ``default_return`` is returned in place of a result.

    Args:
        default_return: Value handed back whenever the wrapped function raises.

    Returns:
        A decorator. The wrapper it produces keeps the wrapped function's
        metadata (via ``functools.wraps``) and never propagates an
        ``Exception`` subclass.
    """
    # Failure modes expected from malformed or missing field text. ValueError
    # belongs here too: it is what int(), datetime.strptime() and tuple
    # unpacking raise on text that is not in the expected format.
    known_errors = (IndexError, AttributeError, TypeError, ValueError)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except known_errors as e:
                logging.warning(f"Known error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
            except Exception as e:
                # Anything else is still swallowed, but logged more loudly.
                logging.error(f"Unexpected error for {func.__name__}: {type(e).__name__}: {e}")
                return default_return
        return wrapper
    return decorator

@safe_fighter_info_get(default_return='NULL')
def parse_l_name(name_parts):
    """Return every name token after the first, joined with single spaces.

    Args:
        name_parts: Sequence of name tokens; the first one is skipped.

    Returns:
        The remaining tokens as one string, or an empty string when there
        are none.
    """
    trailing_tokens = name_parts[1:]
    last_name = ' '.join(trailing_tokens)
    return last_name
    
@safe_fighter_info_get(default_return='NULL')
def parse_nickname(nickname_tag):
    """Return the nickname element's text with surrounding whitespace removed.

    Args:
        nickname_tag: Object exposing a ``text`` attribute (the scraping cell
            passes an element selected from the parsed page).

    Returns:
        The stripped text; an empty string when the element has no content.
    """
    raw_text = nickname_tag.text
    return raw_text.strip()


@safe_fighter_info_get(default_return='NULL')
def parse_height(height_tag):
    """Convert a feet-and-inches height field to centimetres.

    Args:
        height_tag: Object exposing a ``text`` attribute of the form
            ``label: value``, where the value is feet, an apostrophe, then
            inches followed by a double quote.

    Returns:
        The height in centimetres, rounded to two decimals.
    """
    # Everything after the first colon is the measurement, e.g.  5' 11"
    measurement = height_tag.text.split(':')[1]
    feet_part, inch_part = measurement.strip().split("'")
    total_inches = int(feet_part) * 12 + int(inch_part.strip('"'))
    # 2.54 cm per inch
    return round(total_inches * 2.54, 2)


@safe_fighter_info_get(default_return='NULL')
def parse_reach(reach):
    """Convert a reach field given in whole inches to centimetres.

    Args:
        reach: Object exposing a ``text`` attribute of the form
            ``label: value``, where the value is an integer optionally
            wrapped in double quotes.

    Returns:
        The reach in centimetres, rounded to two decimals.
    """
    inches_text = reach.text.split(':')[1].strip().strip('"')
    centimetres = int(inches_text) * 2.54  # 2.54 cm per inch
    return round(centimetres, 2)
        

@safe_fighter_info_get(default_return='NULL')
def parse_weight(weight_element):
    """Return the first whitespace-separated token of a weight field.

    Args:
        weight_element: Object exposing a ``text`` attribute of the form
            ``label: value``.

    Returns:
        The first token after the colon, as a string (no numeric conversion
        is done).
    """
    value_tokens = weight_element.text.split(':')[1].split()
    first_token = value_tokens[0]
    return first_token.strip()

@safe_fighter_info_get(default_return='NULL')
def parse_stance(stance):
    """Return the stance field's value with surrounding whitespace removed.

    Args:
        stance: Object exposing a ``text`` attribute of the form
            ``label: value``.

    Returns:
        The text after the first colon, stripped; an empty string when
        nothing follows the colon.
    """
    after_colon = stance.text.split(':')[1]
    return after_colon.strip()


@safe_fighter_info_get(default_return='NULL')
def parse_dob(dob):
    """Convert a date-of-birth field to an ISO ``YYYY-MM-DD`` string.

    Args:
        dob: Object exposing a ``text`` attribute of the form
            ``label: value``, where the value is a date written as
            abbreviated month, day, comma, four-digit year.

    Returns:
        The date as a ten-character ``YYYY-MM-DD`` string.
    """
    date_text = dob.text.split(':')[1].strip()
    parsed = datetime.strptime(date_text, '%b %d, %Y')
    return parsed.date().isoformat()
    
@safe_fighter_info_get(default_return=('NULL', 'NULL', 'NULL', 'NULL'))   
def parse_record(record_str):
    """Split a hyphen-separated fight record into its components.

    The expected shape is ``wins-losses[-draws[ (note)]]``. Every returned
    piece is stripped of surrounding whitespace, so a record such as
    ``20-5-0 (1 NC)`` yields draws ``'0'`` rather than ``'0 '``.

    Args:
        record_str: The record text, with or without surrounding whitespace.

    Returns:
        A ``(wins, losses, draws, other)`` tuple of strings. ``draws`` is
        ``'0'`` when the record has no third part; ``other`` is the
        parenthesised note without its parentheses, or ``'NULL'`` when there
        is none.

    Raises:
        ValueError: If the text has fewer than two hyphen-separated parts
            (in this file the decorator applied to the function turns that
            into its default return value).
    """
    wins, losses, *rest = (part.strip() for part in record_str.strip().split('-'))
    draws = '0'
    other = 'NULL'
    if rest:
        # partition() splits on the first '(' only, so an extra parenthesis
        # in the note cannot break the unpacking.
        draws_text, paren, note = rest[0].partition('(')
        draws = draws_text.strip()
        if paren:
            other = note.strip().strip(')').strip()
    return wins, losses, draws, other
        

# Scrapes key details for one sample fighter: the entry at index 8 of `urls`.
url = urls[8]
# Reset first, so a failed scrape cannot leave an earlier run's result (or an
# undefined name) for the display cell that follows.
fighter_data = None
try:
    # The timeout keeps a stalled connection from hanging the notebook.
    fighter_url = requests.get(url, timeout=30)
    # Surface HTTP errors as such instead of failing later on an error page's markup.
    fighter_url.raise_for_status()
    fighter_soup = bs4.BeautifulSoup(fighter_url.text, 'lxml')

    # Raw page elements; an IndexError here means the page did not have the
    # expected structure and is logged by the handler below.
    name_parts = fighter_soup.select('span')[0].text.split()
    nickname_tag = fighter_soup.select('p.b-content__Nickname')[0]
    details = fighter_soup.select('li.b-list__box-list-item')
    record_tag = fighter_soup.select('span.b-content__title-record')[0]

    # Drop the leading label when the record text has one ("label: W-L-D").
    record_data = record_tag.text.split(':')[1] if ':' in record_tag.text else record_tag.text
    fighter_w, fighter_l, fighter_d, fighter_nc_dq = parse_record(record_data)

    # details[0..4] are read as height, weight, reach, stance and date of birth, in that order.
    fighter_data = {
            'url': url,
            'first_name': name_parts[0],
            'last_name': parse_l_name(name_parts),
            'nickname': parse_nickname(nickname_tag),
            'height_cm': parse_height(details[0]),
            'weight_lbs': parse_weight(details[1]),
            'reach_cm': parse_reach(details[2]) ,
            'stance': parse_stance(details[3]),
            'dob': parse_dob(details[4]),
            'wins': fighter_w,
            'losses': fighter_l,
            'draws': fighter_d,
            'no_contest_or_dq': fighter_nc_dq
        }
except Exception:
    # logging.exception records the traceback; fighter_data stays None.
    logging.exception(f"Error scraping fighter page: {url}")


In [11]:
# Show the dictionary built by the scraping cell above.
fighter_data

{'url': 'http://ufcstats.com/fighter-details/5140122c3eecd307',
 'first_name': 'Daichi',
 'last_name': 'Abe',
 'nickname': '',
 'height_cm': 180.34,
 'weight_lbs': '170',
 'reach_cm': 180.34,
 'stance': 'Orthodox',
 'dob': '1991-11-27',
 'wins': '6',
 'losses': '2',
 'draws': '0',
 'no_contest_or_dq': 'NULL'}