In [1]:
#import necessary libraries
import requests as r
from bs4 import BeautifulSoup as BS
import string
import pandas as pd
import numpy as np

In [2]:
"""This method takes in a url, gets the response text, 
and returns a beautiful soup object"""
def get_BS_object(url, parser="html.parser", timeout=30):
    """Fetch a url and return its body parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly keeps the
        parse independent of which optional parsers are installed and avoids
        BeautifulSoup's "no parser was explicitly specified" warning.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        request cannot hang the notebook.

    Returns
    -------
    BeautifulSoup
        Parsed document built from the response text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = r.get(url, timeout=timeout)
    #fail loudly instead of silently parsing an error page
    response.raise_for_status()
    return BS(response.text, parser)

"""This method takes in a BeautifulSoup object and finds all 'td' tags, 
converts them to a string, appends them to a list and returns it"""
def get_tc(bs_object):
    """Return the text of every 'td' tag found in bs_object.

    Each cell's text is converted with str() and the results are returned
    as a list, in the order find_all("td") yields the tags.
    """
    return [str(cell.text) for cell in bs_object.find_all("td")]

"""This method takes in dirty table contents, removes newline 
character, and white spaces, returns clean table contents"""
def clean_tc(dirty_tc):
    """Return a cleaned copy of the table contents in dirty_tc.

    Every newline character is removed from each item, then leading and
    trailing whitespace is stripped. The input list is not modified.

    Parameters
    ----------
    dirty_tc : iterable of str
        Raw cell texts.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    #iterate over the argument itself, not a global that happens to share a name
    return [item.replace("\n", "").strip() for item in dirty_tc]

"""This method takes in a row position, contents of a table and 
returns each rows contents in the row position specified"""
def extract_col_content(row_position, clean_table_contents, row_width=11):
    """Return every row_width-th item of clean_table_contents, starting at row_position.

    The table is passed in as one flat list of cells, so stepping through it
    with a fixed stride collects a single column.

    Parameters
    ----------
    row_position : int
        Index of the first cell to take, i.e. the column's offset in the flat list.
    clean_table_contents : sequence
        Flat list of cell values.
    row_width : int, optional
        Distance between two consecutive cells of the same column.
        Defaults to 11, the stride this notebook has always used.

    Returns
    -------
    list
        The selected cells; empty when row_position is past the end of the list.
    """
    return list(clean_table_contents[row_position::row_width]) if row_position >= 0 else [
        #negative start: keep range() semantics rather than slice semantics
        clean_table_contents[index]
        for index in range(row_position, len(clean_table_contents), row_width)
    ]
    

In [3]:
#create letters to iterate through
letters = list(string.ascii_lowercase)

#offset of each field's first cell in the flat list of 'td' texts
#(offsets start at 1: index 0 of the cell list is never read)
field_offsets = {
    'first_name': 1,
    'last_name': 2,
    'nickname': 3,
    'height': 4,
    'weight': 5,
    'reach': 6,
    'stance': 7,
    'wins': 8,
    'losses': 9,
    'draws': 10,
    'belt': 11,
}

#create dictionary that stores fighter data for each letter
dictionary = {}

#iterate through list of letters
for letter in letters:
    #get the url, create a BeautifulSoup object
    url = "http://www.ufcstats.com/statistics/fighters?char={0}&page=all".format(letter)
    bs_object = get_BS_object(url)

    #find all 'td' tags, store each item as a string in table_contents list
    table_contents = get_tc(bs_object)

    #remove blank spaces and newline characters from table contents
    clean_table_contents = clean_tc(table_contents)

    #build one list per field for this letter's page
    ufc_fighters = {
        field: extract_col_content(offset, clean_table_contents)
        for field, offset in field_offsets.items()
    }

    #store stats in dictionary organized by letter
    dictionary[letter] = ufc_fighters

In [4]:
#names of the columns that hold the UFC fighter data
col_names = [
    'first_name',
    'last_name',
    'nickname',
    'height',
    'weight',
    'reach',
    'stance',
    'wins',
    'losses',
    'draws',
    'belt',
]

#empty dataframe with those columns, filled in later
df = pd.DataFrame(columns=col_names)

In [5]:
"""This function takes in a list input, flattens its contents and returns it"""
def flatten(lst):
    """Flatten one level of nesting: return a list of the elements of each item of lst."""
    flat = []
    for inner in lst:
        flat.extend(inner)
    return flat

#For each column, collect the per-letter lists from dictionary, flatten them
#into one list and store it in the DataFrame.
#Columns are set with df[name] rather than attribute assignment, which only
#reaches a column when the name already exists and does not clash with a
#DataFrame attribute.
for name in col_names:
    per_letter_values = [dictionary[letter][name] for letter in letters]
    df[name] = flatten(per_letter_values)

In [6]:
#look at the first 20 rows of the dataframe
df.head(20)

Unnamed: 0,first_name,last_name,nickname,height,weight,reach,stance,wins,losses,draws,belt
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0,
5,Shamil,Abdurakhimov,Abrek,"6' 3""",235 lbs.,"76.0""",Orthodox,20,8,0,
6,Hiroyuki,Abe,Abe Ani,"5' 6""",145 lbs.,--,Orthodox,8,15,3,
7,Daichi,Abe,,"5' 11""",170 lbs.,"71.0""",Orthodox,6,2,0,
8,Papy,Abedi,Makambo,"5' 11""",185 lbs.,--,Southpaw,10,4,0,
9,Ricardo,Abreu,Demente,"5' 11""",185 lbs.,--,Orthodox,5,1,0,


In [7]:
#number of rows in the dataframe
len(df)

3953

In [8]:
#summary of the columns: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3953 entries, 0 to 3952
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  3953 non-null   object
 1   last_name   3953 non-null   object
 2   nickname    3953 non-null   object
 3   height      3953 non-null   object
 4   weight      3953 non-null   object
 5   reach       3953 non-null   object
 6   stance      3953 non-null   object
 7   wins        3953 non-null   object
 8   losses      3953 non-null   object
 9   draws       3953 non-null   object
 10  belt        3953 non-null   object
dtypes: object(11)
memory usage: 339.8+ KB


In [9]:
#convert data types
name_cols = ['first_name', 'last_name', 'nickname']
record_cols = ['wins', 'losses', 'draws']
df[name_cols] = df[name_cols].astype(str)
df[record_cols] = df[record_cols].astype(int)
df['stance'] = df['stance'].astype('category')

#drop belt column; no values
#reassigning (instead of inplace=True) and ignoring an already-missing column
#lets this cell be run again without raising a KeyError
df = df.drop(columns='belt', errors='ignore')

In [10]:
df.dtypes  #view the datatype of each column

first_name      object
last_name       object
nickname        object
height          object
weight          object
reach           object
stance        category
wins             int32
losses           int32
draws            int32
dtype: object

In [12]:
#replace empty string with Unknown
df['nickname'] = df['nickname'].replace("", 'Unknown')

#stance is categorical, so rename the "" category instead of calling replace
#(replace on categorical data is deprecated in recent pandas); a mapping key
#that is not an existing category is simply ignored, so re-running is safe
df['stance'] = df['stance'].cat.rename_categories({"": 'Unknown'})

#ensure changes worked
df.head()

In [14]:
#remove the "lbs." suffix, turn -- into nan; convert to a float
#regex=False: "lbs." is literal text; read as a regular expression the "."
#would match any character, and the default for regex differs between
#pandas versions
weight_text = df['weight'].str.replace("lbs.", "", regex=False).str.strip()
df['weight'] = weight_text.replace("--", np.nan).astype(float)

#ensure changes worked
df.head()

  df.weight = df.weight.str.replace("lbs.", "").replace("--",np.nan).str.strip().astype(float)


In [16]:
#split heights written like 5' 11" into a feet part and an inches part;
#matching on the pattern (rather than on character positions) also handles
#extra spacing, and values that do not match, such as --, become nan
height_parts = df['height'].str.extract(r"(\d+)'\s*(\d+)").astype(float)

#convert the height values to inches and replace height column
df['height'] = height_parts[0] * 12 + height_parts[1]

In [21]:
#ensure changes worked
df.head()

Unnamed: 0,first_name,last_name,nickname,height,weight,reach,stance,wins,losses,draws
0,Tom,Aaron,Unknown,,155.0,--,Unknown,5,3,0
1,Danny,Abbadi,The Assassin,71.0,155.0,--,Orthodox,4,6,0
2,Nariman,Abbasov,Bayraktar,68.0,155.0,"66.0""",Orthodox,28,4,0
3,David,Abbott,Tank,72.0,265.0,--,Switch,10,15,0
4,Hamdy,Abdelwahab,The Hammer,74.0,264.0,"72.0""",Southpaw,5,0,0


In [22]:
#strip the inch mark and the -- placeholder, then convert to float
reach_text = df['reach'].str.replace('"', '').str.replace('--', '')

#whatever is left empty has no reach value: store it as nan
df['reach'] = reach_text.replace('', np.nan).astype(float)

#ensure changes worked
df.head()

In [24]:
#view the data type of each column
df.dtypes

first_name      object
last_name       object
nickname        object
height         float64
weight         float64
reach          float64
stance        category
wins             int32
losses           int32
draws            int32
dtype: object

In [None]:
#write the cleaned dataframe to a csv file at this location (uncomment to run)
#df.to_csv(r"C:\Users\Zachw\Downloads\UFC_Fighter_Stats.csv", index=False)