In [1]:
#import requests, pandas, BS
import requests as r 
import pandas as pd
from bs4 import BeautifulSoup as BS

In [2]:
"""This method takes in a url, gets the response text, 
and returns a beautiful soup object"""
def get_BS_object(url):
    response = r.get(url)
    bs_object = BS(response.text)
    return bs_object

In [3]:
"""This method takes in a BeautifulSoup object, a tag, a boolean indicating href and 
converts them to a string, appends them to a list and returns it"""
def get_tc(bs_object, tag, href):
    table_contents = []
    for column in bs_object.find_all(tag, href = href):
        table_contents.append(str(column.text))
    return table_contents

In [4]:
#request url of all events completed
url = 'http://ufcstats.com/statistics/events/completed?page=all'
response_text = get_BS_object(url)

In [5]:
#print the response
print(response_text.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   Stats | UFC
  </title>
  <meta content="" name="description"/>
  <meta content="" name="viewport"/>
  <link href="/blocks/main.css?ver=779885" rel="stylesheet"/>
  <script src="/js/vendor/modernizr-2.6.2.min.js">
  </script>
  <script>
   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-2855164-1', 'au

In [6]:
#Index response text for 'href' to get links
url_links = [i['href'] for i in response_text.find_all('a', href=True)]

In [7]:
#print all links
print(url_links)

['/', '/statistics/fighters', 'http://statleaders.ufc.com', 'http://ufcstats.com/statistics/events/completed', 'http://ufcstats.com/statistics/events/upcoming', 'http://ufcstats.com/event-details/6fb1ba67bef41b37', 'http://ufcstats.com/event-details/15b1b21cd743d652', 'http://ufcstats.com/event-details/3dc3022232b79c7a', 'http://ufcstats.com/event-details/aec273fcb765330d', 'http://ufcstats.com/event-details/e4bb7e483c4ad318', 'http://ufcstats.com/event-details/35080a7f406f9ab3', 'http://ufcstats.com/event-details/1ccff7f0cfdf85eb', 'http://ufcstats.com/event-details/806975e1b4f47b27', 'http://ufcstats.com/event-details/f21a3d68fb9df387', 'http://ufcstats.com/event-details/01dd4cdc2446f665', 'http://ufcstats.com/event-details/d26394fc0e8e880a', 'http://ufcstats.com/event-details/5717efc6f271cd52', 'http://ufcstats.com/event-details/2e2cdb6e9eb84bb9', 'http://ufcstats.com/event-details/56ec58954158966a', 'http://ufcstats.com/event-details/f65a0eb902f9476b', 'http://ufcstats.com/event-de

In [8]:
"""This method takes in a list of url strings, a length x of each url that should be checked, 
and a portion of a given url and compares the first 
x characters of all url strings to a given url of interest 
and returns a list of strings that satisfy this constraint"""
def get_detail_url(list_of_urls, x, url):
    fighter_detail_url = []
    for i in range(0, len(list_of_urls)):
        if list_of_urls[i][0:x]==url:
            fighter_detail_url.append(list_of_urls[i])
        else:
            continue
    return fighter_detail_url

In [9]:
#give me the urls where the first len(url) characters is this url
event_detail_urls = get_detail_url(url_links, len('http://ufcstats.com/event-details/'), 'http://ufcstats.com/event-details/')
print(len(event_detail_urls))

643


In [10]:
#for multithreading
import concurrent.futures

In [11]:
#Use multithreading with max of 10 threads to make 643 url requests. Append to list of future objects
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for url in event_detail_urls:
        futures.append(executor.submit(get_BS_object, url)) 

In [12]:
#ensure there are results in the list
print(futures[1].result().prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   Stats | UFC
  </title>
  <meta content="" name="description"/>
  <meta content="" name="viewport"/>
  <link href="/blocks/main.css?ver=640180" rel="stylesheet"/>
  <script src="/js/vendor/modernizr-2.6.2.min.js">
  </script>
  <script>
   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-2855164-1', 'au

In [13]:
"""This method takes in dirty table contents, removes newline 
character, and white spaces, returns list of clean table contents"""
def clean_tc(dirty_tc):
    clean_table_contents = []
    for item in dirty_tc:
        clean_table_contents.append(item.replace("\n", "").strip())
    return clean_table_contents

In [14]:
#get the headers for the stats
headers = clean_tc(get_tc(futures[1].result(), 'th', False))
print(headers)

['W/L', 'Fighter', 'Kd', 'Str', 'Td', 'Sub', 'Weight class', 'Method', 'Round', 'Time']


In [15]:
#import numpy lib for arrays
import numpy as np

In [16]:
"""This method takes in a list of futures and returns the modulus 16 of the length of 
clean table contents of the p tags for each item in the list"""
def div_by_16(futures):
    lengths = []
    for future in futures:
        lengths.append(len(clean_tc(get_tc(future.result(), 'p', False)))%16)#16 features of interest
    return lengths

In [17]:
#find out which request result is not divisible by 16
lengths = div_by_16(futures)

In [18]:
#ensure this math worked; if not divisible by 16, there is an issue
print(lengths)

[8, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 

In [19]:
"""This is a method that takes in a list of futures objects and examines the contents of 'p' 
tags to see if there is a draw or nc. If there is, it deletes the draw or nc following 
it to make 16 features for each fight """
def remove_drawnc(futures):
    new_lst = []
    for a in range(1, len(futures)):#start at 1 because 0 hasn't happend
        p_contents = clean_tc(get_tc(futures[a].result(), 'p', False))
        
        for i in range(0, len(p_contents)-4):#look through contents; use len - 4 so index isn't out of bounds
            if p_contents[i] == 'draw' or p_contents[i] == 'nc':#if 'draw', 'nc' appear
                p_contents.pop(i+1)#pop next item
        new_lst.append(p_contents)
    return new_lst
    

In [20]:
#remove 1 draw and 1 nc from any fight instance that contains it to get 16 features 
lst = remove_drawnc(futures)

In [21]:
"""This method takes in a list of futures objects and extracts the 
event name, Date, location and returns a list of lists 
containing this data for each event"""
def get_event_info(futures):
    all_events = []
    for i in range(1, len(futures)):#start at 1, 0 hasn't happend
        event = []#holds single event name, date, location
        event.append(clean_tc(get_tc(futures[i].result(), 'span', False))[0])#event name
        li_contents = clean_tc(get_tc(futures[i].result(), 'li', False))
        event.append(li_contents[3].replace('Date:','').strip())#Date
        event.append(li_contents[4].replace('Location:','').strip())#Location
        all_events.append(event)
    return all_events
        
    

In [22]:
#get name, date, location for each event
info = get_event_info(futures)

In [23]:
info

[['UFC Fight Night: Holloway vs. Allen',
  'April 15, 2023',
  'Kansas City, Missouri, USA'],
 ['UFC 287: Pereira vs. Adesanya 2', 'April 08, 2023', 'Miami, Florida, USA'],
 ['UFC Fight Night: Vera vs. Sandhagen',
  'March 25, 2023',
  'San Antonio, Texas, USA'],
 ['UFC 286: Edwards vs. Usman 3',
  'March 18, 2023',
  'London, England, United Kingdom'],
 ['UFC Fight Night: Yan vs. Dvalishvili',
  'March 11, 2023',
  'Las Vegas, Nevada, USA'],
 ['UFC 285: Jones vs. Gane', 'March 04, 2023', 'Las Vegas, Nevada, USA'],
 ['UFC Fight Night: Muniz vs. Allen',
  'February 25, 2023',
  'Las Vegas, Nevada, USA'],
 ['UFC Fight Night: Andrade vs. Blanchfield',
  'February 18, 2023',
  'Las Vegas, Nevada, USA'],
 ['UFC 284: Makhachev vs. Volkanovski',
  'February 11, 2023',
  'Perth, Western Australia, Australia'],
 ['UFC Fight Night: Lewis vs. Spivac',
  'February 04, 2023',
  'Las Vegas, Nevada, USA'],
 ['UFC 283: Teixeira vs. Hill',
  'January 21, 2023',
  'Rio de Janeiro, Rio de Janeiro, Brazil

In [24]:
"""This function takes in a list input, flattens its contents and returns it"""
def flatten(lst):
    return [i for item in lst for i in item]

In [25]:
"""This method takes in a list of futures objects and a list of event info and returns 
an array of event name, date, locations to append to fight array"""
def make_NDL_array(futures,info):
    new_lst = []
    for i, j in zip(range(1, len(futures)),info):#start at 1, 0 hasn't happend
        new_lst.append(int(len(clean_tc(get_tc(futures[i].result(), 'p', False)))/16) * j)#div by 16 to get num of fights in event, multiply by j to duplicate event name, date, location num of fight times
    
    return np.array(flatten(new_lst)).reshape(int(len(flatten(new_lst))/3),3)#return array with 3 columns for 3 features
    

In [26]:
#Make an array of name, dates, locations for each event to append to fight array
event = make_NDL_array(futures, info)

In [27]:
#ensure event array is created
event

array([['UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri, USA'],
       ['UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri, USA'],
       ['UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri, USA'],
       ...,
       ['UFC 2: No Way Out', 'March 11, 1994', 'Denver, Colorado, USA'],
       ['UFC 2: No Way Out', 'March 11, 1994', 'Denver, Colorado, USA'],
       ['UFC 2: No Way Out', 'March 11, 1994', 'Denver, Colorado, USA']],
      dtype='<U63')

In [28]:
#Ensure lst is clean of 2-nc 2-draws
print(lst)

[['win', 'Max Holloway', 'Arnold Allen', '0', '0', '147', '76', '0', '0', '0', '0', 'Featherweight', 'U-DEC', '', '5', '5:00', 'win', 'Edson Barboza', 'Billy Quarantillo', '1', '0', '21', '19', '0', '0', '0', '0', 'Featherweight', 'KO/TKO', 'Knee', '1', '2:37', 'win', 'Azamat Murzakanov', 'Dustin Jacoby', '1', '0', '67', '64', '1', '1', '0', '0', 'Light Heavyweight', 'U-DEC', '', '3', '5:00', 'win', 'Ion Cutelaba', 'Tanner Boser', '0', '0', '30', '5', '1', '0', '0', '0', 'Light Heavyweight', 'KO/TKO', 'Punches', '1', '2:05', 'win', 'Pedro Munhoz', 'Chris Gutierrez', '1', '0', '60', '77', '0', '0', '0', '0', 'Bantamweight', 'U-DEC', '', '3', '5:00', 'win', 'Rafa Garcia', 'Clay Guida', '0', '0', '141', '63', '0', '0', '0', '0', 'Lightweight', 'U-DEC', '', '3', '5:00', 'win', 'Bill Algeo', 'TJ Brown', '1', '0', '64', '46', '1', '1', '1', '0', 'Featherweight', 'SUB', 'Rear Naked Choke', '2', '1:40', 'win', 'Brandon Royval', 'Matheus Nicolau', '1', '0', '13', '2', '0', '0', '0', '0', 'Flywe

In [29]:
"""This function takes in a list input, flattens its contents and returns it"""
def flatten(lst):
    return [i for item in lst for i in item]

#create an array of shape length of list / 16 rows by 16 columns
fight = np.array(flatten(lst)).reshape(int(len(flatten(lst))/16), 16)

In [30]:
#check array is created
fight

array([['win', 'Max Holloway', 'Arnold Allen', ..., '', '5', '5:00'],
       ['win', 'Edson Barboza', 'Billy Quarantillo', ..., 'Knee', '1',
        '2:37'],
       ['win', 'Azamat Murzakanov', 'Dustin Jacoby', ..., '', '3',
        '5:00'],
       ...,
       ['win', 'Johnny Rhodes', 'David Levicki', ..., 'Punches', '1',
        '12:13'],
       ['win', 'Patrick Smith', 'Ray Wizard', ..., 'Guillotine Choke',
        '1', '0:58'],
       ['win', 'Scott Morris', 'Sean Daugherty', ..., 'Guillotine Choke',
        '1', '0:20']], dtype='<U25')

In [31]:
fight.shape

(7086, 16)

In [32]:
event.shape

(7086, 3)

In [33]:
#Append the 2 arrays
final_array = np.append(fight, event, axis = 1)

In [34]:
#ensure arrays are appended
final_array[0:15,:]

array([['win', 'Max Holloway', 'Arnold Allen', '0', '0', '147', '76',
        '0', '0', '0', '0', 'Featherweight', 'U-DEC', '', '5', '5:00',
        'UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri, USA'],
       ['win', 'Edson Barboza', 'Billy Quarantillo', '1', '0', '21',
        '19', '0', '0', '0', '0', 'Featherweight', 'KO/TKO', 'Knee', '1',
        '2:37', 'UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri, USA'],
       ['win', 'Azamat Murzakanov', 'Dustin Jacoby', '1', '0', '67',
        '64', '1', '1', '0', '0', 'Light Heavyweight', 'U-DEC', '', '3',
        '5:00', 'UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri, USA'],
       ['win', 'Ion Cutelaba', 'Tanner Boser', '0', '0', '30', '5', '1',
        '0', '0', '0', 'Light Heavyweight', 'KO/TKO', 'Punches', '1',
        '2:05', 'UFC Fight Night: Holloway vs. Allen', 'April 15, 2023',
        'Kansas City, Missouri

In [35]:
#shape of final array
final_array.shape

(7086, 19)

In [36]:
#create columns for dataframe
columns = ['Winner', 'Fighter_1', 'Fighter_2', 'KD_1', 'KD_2', 'Str_1', 'Str_2', 'TD_1', 'TD_2', 'SUB_1', 'SUB_2', 'Weight_class', 'Method', 'Method2', 'Round', 'Time', 'Event_name', 'Date', 'Location']

In [37]:
#create dataframe
df = pd.DataFrame(final_array, columns=columns)

In [38]:
#Ensure dataframe created
df.head()

Unnamed: 0,Winner,Fighter_1,Fighter_2,KD_1,KD_2,Str_1,Str_2,TD_1,TD_2,SUB_1,SUB_2,Weight_class,Method,Method2,Round,Time,Event_name,Date,Location
0,win,Max Holloway,Arnold Allen,0,0,147,76,0,0,0,0,Featherweight,U-DEC,,5,5:00,UFC Fight Night: Holloway vs. Allen,"April 15, 2023","Kansas City, Missouri, USA"
1,win,Edson Barboza,Billy Quarantillo,1,0,21,19,0,0,0,0,Featherweight,KO/TKO,Knee,1,2:37,UFC Fight Night: Holloway vs. Allen,"April 15, 2023","Kansas City, Missouri, USA"
2,win,Azamat Murzakanov,Dustin Jacoby,1,0,67,64,1,1,0,0,Light Heavyweight,U-DEC,,3,5:00,UFC Fight Night: Holloway vs. Allen,"April 15, 2023","Kansas City, Missouri, USA"
3,win,Ion Cutelaba,Tanner Boser,0,0,30,5,1,0,0,0,Light Heavyweight,KO/TKO,Punches,1,2:05,UFC Fight Night: Holloway vs. Allen,"April 15, 2023","Kansas City, Missouri, USA"
4,win,Pedro Munhoz,Chris Gutierrez,1,0,60,77,0,0,0,0,Bantamweight,U-DEC,,3,5:00,UFC Fight Night: Holloway vs. Allen,"April 15, 2023","Kansas City, Missouri, USA"


In [39]:
#value counts of Winner col
df.Winner.value_counts()

win     6960
nc        71
draw      55
Name: Winner, dtype: int64

In [40]:
"""This method takes in a ufc event dataframe and  converts 'win' to 
'Fighter_1' in the 'winner' column and returns the converted df"""
def conv_win_fighter1(df):
    for i in range(0, len(df.Winner)):
        if df.loc[i, 'Winner'] == 'win':
            df.loc[i, 'Winner'] = df.loc[i, 'Fighter_1']
    return df   

In [41]:
#Convert win to name of fighter who won
df = conv_win_fighter1(df)

In [42]:
#ensure winner column converted
df.tail()

Unnamed: 0,Winner,Fighter_1,Fighter_2,KD_1,KD_2,Str_1,Str_2,TD_1,TD_2,SUB_1,SUB_2,Weight_class,Method,Method2,Round,Time,Event_name,Date,Location
7081,Orlando Wiet,Orlando Wiet,Robert Lucarelli,0,0,8,2,0,1,0,1,Open Weight,KO/TKO,,1,2:50,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA"
7082,Frank Hamaker,Frank Hamaker,Thaddeus Luster,0,0,2,0,1,0,3,0,Open Weight,SUB,Keylock,1,4:52,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA"
7083,Johnny Rhodes,Johnny Rhodes,David Levicki,0,0,11,4,1,0,0,0,Open Weight,KO/TKO,Punches,1,12:13,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA"
7084,Patrick Smith,Patrick Smith,Ray Wizard,0,0,1,1,0,0,1,0,Open Weight,SUB,Guillotine Choke,1,0:58,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA"
7085,Scott Morris,Scott Morris,Sean Daugherty,0,0,1,0,1,0,1,0,Open Weight,SUB,Guillotine Choke,1,0:20,UFC 2: No Way Out,"March 11, 1994","Denver, Colorado, USA"


In [43]:
#write this script to csv and store in this location
#df.to_csv(r"C:\Users\Zachw\Downloads\UFC_Event_Stats.csv", index=False)