In [1]:
#import necessary libraries
import requests as r
from bs4 import BeautifulSoup as BS
import string
import pandas as pd
import numpy as np

In [2]:
"""This method takes in a url, gets the response text, 
and returns a beautiful soup object"""
def get_BS_object(url):
    response = r.get(url)
    bs_object = BS(response.text)
    return bs_object

"""This method takes in a BeautifulSoup object and finds all 'td' tags, 
converts them to a string, appends them to a list and returns it"""#need to edit this description
def get_tc(bs_object, tag, href):
    table_contents = []
    for column in bs_object.find_all(tag, href = href):
        table_contents.append(str(column.text))
    return table_contents

"""This method takes in a list of strings and compares the first 
40 characters of a string to http://www.ufcstats.com/fighter-details/ 
and returns a list of strings that satisfy this constraint"""
def get_detail_url(project_href):
    fighter_detail_url = []
    for i in range(0, len(project_href)):
        if project_href[i][0:40]=='http://www.ufcstats.com/fighter-details/':
            fighter_detail_url.append(project_href[i])
        else:
            continue
    return fighter_detail_url

"""This method takes in dirty table contents, removes newline 
character, and white spaces, returns clean table contents"""
def clean_tc(dirty_tc):
    clean_table_contents = []
    for item in dirty_tc:
        clean_table_contents.append(item.replace("\n", "").strip())
    return clean_table_contents

In [3]:
#create list of letters to iterate through
letters = list(string.ascii_lowercase)

#create a dictionary to store list of fighter detail urls for each letter
dictionary = {}
for letter in letters:#iterate through list of letters
    
    #query fighter stats page by each letter
    url = "http://www.ufcstats.com/statistics/fighters?char={0}&page=all".format(letter)
    
    #create BeautifulSoup object
    bs_object = get_BS_object(url)
    
    #get the href attribute of each 'a' tag on page
    project_href = [i['href'] for i in bs_object.find_all('a', href=True)]
    
    #get the fighter detail urls and store in dictionary by letters a-z
    dictionary[letter] = get_detail_url(project_href)

In [4]:
#ensure dictionary populated urls
dictionary

{'a': ['http://www.ufcstats.com/fighter-details/93fe7332d16c6ad9',
  'http://www.ufcstats.com/fighter-details/93fe7332d16c6ad9',
  'http://www.ufcstats.com/fighter-details/93fe7332d16c6ad9',
  'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
  'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
  'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
  'http://www.ufcstats.com/fighter-details/59a9d6dac61c2540',
  'http://www.ufcstats.com/fighter-details/59a9d6dac61c2540',
  'http://www.ufcstats.com/fighter-details/59a9d6dac61c2540',
  'http://www.ufcstats.com/fighter-details/b361180739bed4b0',
  'http://www.ufcstats.com/fighter-details/b361180739bed4b0',
  'http://www.ufcstats.com/fighter-details/b361180739bed4b0',
  'http://www.ufcstats.com/fighter-details/3329d692aea4dc28',
  'http://www.ufcstats.com/fighter-details/3329d692aea4dc28',
  'http://www.ufcstats.com/fighter-details/3329d692aea4dc28',
  'http://www.ufcstats.com/fighter-details/2f5cbecbbe18bac4',
  '

In [5]:
"""Takes in dictionary and returns a flattened list of contents of each item"""
def to_list(dictionary):
    lst = []
    for item in dictionary.keys():
        lst.append(dictionary[item])
    return [i for item in lst for i in item]    

In [6]:
#create a list of urls without duplicates
lst = list(set(to_list(dictionary)))
len(lst)

3960

In [7]:
"""This function takes in a list of urls and returns a dictionary of dictionaries 
containing the names of ufc fighters and their career stats, and the stat headers"""
def assemble_stats(url_lst):
    dictionary = {}#stores all fighter stats
    for item in url_lst:
        fighter_detail = {}#stores fighter stats
        #create BeautifulSoup object
        bs_object = get_BS_object(item)
        
        #get the first 'span' tag text item
        name = clean_tc(get_tc(bs_object,'span',False))[0]
        
        #Get all 'i' tags pos 4-15 
        headers = [i.replace(":", "")for i in clean_tc(get_tc(bs_object, 'i', False))[4:15]]
        headers.pop(1)#pop position 1
        headers.pop(5)#pop 5 blank value
        headers.append('Name')#add name to headers
        
        #get all 'li' tag text pos 7-17
        stats = clean_tc(get_tc(bs_object, 'li', False))[7:17]
        
        #replace spaces in list; pop position 5 blank value
        clean_stats = []
        for k in stats:
            clean_stats.append(k.replace(' ',''))
    
        clean_stats.pop(5)#pop position 5 blank value
        
        #replace % with ''; get values for stats, get values after ':'
        clean_stats2 = []
        for j in clean_stats:
            clean_stats2.append(j[j.index(':')+1:len(j)].replace('%',''))
            
        #zip headers and clean stats into fighter detail dictionary
        for a, b in zip(headers, clean_stats2):
            fighter_detail[a] = b
        
        dictionary[name] = fighter_detail#fighter name as key, values = dict of stats
    
    return dictionary, headers
        

In [8]:
#assemble the dictionary of stats by fighter
stats, headers = assemble_stats(lst)#300

In [9]:
stats

{'Shannon Ross': {'DOB': 'May12,1989',
  'SLpM': '6.57',
  'Str. Acc.': '37',
  'SApM': '8.99',
  'Str. Def': '52',
  'TD Avg.': '0.00',
  'TD Acc.': '0',
  'TD Def.': '100',
  'Sub. Avg.': '0.0'},
 'Maia Stevenson': {'DOB': 'Mar19,1982',
  'SLpM': '1.57',
  'Str. Acc.': '54',
  'SApM': '3.13',
  'Str. Def': '33',
  'TD Avg.': '3.91',
  'TD Acc.': '100',
  'TD Def.': '50',
  'Sub. Avg.': '0.0'},
 'Ian Butler': {'DOB': '--',
  'SLpM': '2.09',
  'Str. Acc.': '14',
  'SApM': '6.69',
  'Str. Def': '36',
  'TD Avg.': '0.00',
  'TD Acc.': '0',
  'TD Def.': '50',
  'Sub. Avg.': '0.0'},
 'Alex Hunter': {'DOB': '--',
  'SLpM': '0.00',
  'Str. Acc.': '0',
  'SApM': '0.00',
  'Str. Def': '0',
  'TD Avg.': '0.00',
  'TD Acc.': '0',
  'TD Def.': '0',
  'Sub. Avg.': '0.0'},
 'Josh Thomson': {'DOB': 'Sep21,1978',
  'SLpM': '2.59',
  'Str. Acc.': '47',
  'SApM': '2.12',
  'Str. Def': '64',
  'TD Avg.': '1.92',
  'TD Acc.': '39',
  'TD Def.': '54',
  'Sub. Avg.': '1.3'},
 'Milana Dudieva': {'DOB': 'Aug

In [10]:
len(stats)

3955

In [11]:
"""This method takes in a dictionary of stats dictionaries and column headers,converts it into a dataframe and returns it"""
def insert_df2(stats, headers):
    attributes = []#create list to store lists of attributes
    for item in stats.keys():#for each name
        row = []#list to store row
        for i in stats[item].keys():#for each attribute by name
            row.append(stats[item][i])#append it to the row
            
        row.append(item)#append the fighter name to row
        attributes.append(row)#append the row to attribute list
    df = pd.DataFrame(attributes, columns = headers)#make the dataframe with headers
            
        
    return df          

In [17]:
#Insert stats dictionary into df
df = insert_df2(stats, headers)

In [18]:
#ensure df is filled
df.tail(30)

Unnamed: 0,DOB,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,Name
3925,"Jan29,1989",0.0,0,0.0,0,0.0,0,0,0.0,Garrett Mueller
3926,"Dec08,1979",1.37,39,1.71,55,1.48,50,25,1.1,Makoto Takimoto
3927,"Aug13,1993",1.73,46,1.6,55,0.0,0,50,0.0,Gabriella Fernandes
3928,--,0.0,0,0.0,0,0.0,0,0,0.0,Eric Martin
3929,"Apr09,1985",4.01,48,3.61,52,0.0,0,44,0.0,Kelly Faszholz
3930,--,0.0,0,0.0,0,0.0,0,0,0.0,Luis Mendoza
3931,"Nov04,1992",1.33,36,3.0,47,0.0,0,100,0.0,Matheus Scheffel
3932,"Dec04,1991",4.09,55,11.73,51,0.0,0,0,0.0,Danilo Suzart
3933,"Mar05,1983",3.96,26,4.82,50,0.0,0,80,0.0,Chris Cope
3934,"Jan31,1988",1.78,80,3.11,65,0.0,0,0,0.0,Jamelle Jones


In [19]:
#write this script to csv and store in this location
#df.to_csv(r"C:\Users\Zachw\Downloads\UFC_Fighter_Detail.csv", index=False)