### Imports

In [64]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import bs4
import pandas as pd
import tqdm

### First, access the fighters page

In [65]:
def access_page(url):
    """Start a new Chrome session and navigate it to ``url``.

    Parameters
    ----------
    url : str
        Address to load in the freshly started browser.

    Returns
    -------
    selenium.webdriver.Chrome
        The live driver after ``url`` has been requested. The caller owns
        it and is responsible for calling ``quit()`` when finished.

    Raises
    ------
    TimeoutException
        If the page load times out. The hint to rerun the cell is printed
        and the browser is shut down before the exception propagates, so a
        rerun does not leave an orphaned Chrome window behind.
    """
    # Create the driver outside the ``try`` so the name is always bound by
    # the time the timeout handler (or the ``return``) is reached.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except TimeoutException:
        print("TimeoutException: Page could not load, rerun the cell")
        # A half-loaded page is of no use to the caller; release the browser
        # and let the failure stop the cell instead of failing later.
        driver.quit()
        raise
    return driver

def click_on_all_fighters(driver):
    """Click the last pagination link on the driver's current page.

    The last element with class ``b-statistics__paginate-link`` is clicked;
    presumably this is the "All" link that lists every fighter for the
    current letter (verify against the site markup).

    Parameters
    ----------
    driver : selenium WebDriver
        Driver already positioned on a fighters listing page.

    Returns
    -------
    selenium WebDriver
        The same ``driver`` object, to allow chaining.
    """
    pagination_links = driver.find_elements(By.CLASS_NAME, "b-statistics__paginate-link")
    # A page without pagination links has nothing to expand; leave it as is
    # instead of failing with an IndexError on the empty list.
    if pagination_links:
        pagination_links[-1].click()
    return driver

def get_source(driver):
    """Parse the HTML of the driver's current page into a BeautifulSoup tree.

    Uses Python's built-in ``html.parser`` backend.
    """
    html = driver.page_source
    soup = bs4.BeautifulSoup(html, "html.parser")
    return soup

def collect_fighter_records(page_source):
    """Extract the fighter table of the current page into a DataFrame.

    Parameters
    ----------
    page_source : selenium WebDriver
        Despite its name, this is handed straight to ``get_source`` and must
        therefore be an object exposing a ``page_source`` attribute (i.e. the
        driver), not a raw HTML string.

    Returns
    -------
    pandas.DataFrame
        One row per fighter with the columns FirstName, LastName, Nickname,
        Height, Weight, Reach, Stance, Wins, Losses, Draws and Belt. Every
        value is the stripped cell text, so all columns hold strings.

    Raises
    ------
    IndexError
        If a data row contains fewer ``<td>`` cells than there are columns.

    Notes
    -----
    Prints the total number of matched ``<tr>`` rows, which includes the two
    leading rows that are skipped below.
    """
    columns = ["FirstName", "LastName", "Nickname", "Height", "Weight", "Reach", "Stance", "Wins", "Losses", "Draws", "Belt"]

    fighters = get_source(page_source)
    entries = fighters.find_all("tr", class_="b-statistics__table-row")

    number_of_entries = len(entries)
    print(f"Number of entries: {number_of_entries}")

    # Gather plain lists first and build the frame once at the end; growing a
    # DataFrame with concat on every row is quadratic in the number of rows.
    records = []
    # The first two matched rows are skipped — presumably the header row and
    # a spacer row (TODO confirm against the page markup).
    for entry in entries[2:]:
        cells = entry.find_all("td")
        records.append([cells[i].text.strip() for i in range(len(columns))])

    return pd.DataFrame(records, columns=columns)

def iterate_scrape(driver):
    """Scrape the fighter tables for every letter linked from the current page.

    The letter links are read from the first ``ul.b-statistics__nav-items``
    element of the page the driver is currently on. Each link is visited,
    expanded with ``click_on_all_fighters`` and parsed with
    ``collect_fighter_records``.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver positioned on a fighters listing page that contains the
        letter navigation.

    Returns
    -------
    pandas.DataFrame
        All per-letter tables stacked in navigation order with a fresh
        0..n-1 index. An empty frame with the expected columns is returned
        when the navigation contains no links.

    Raises
    ------
    IndexError
        If the current page contains no ``ul.b-statistics__nav-items``
        element (for example because it did not load).
    """
    columns = ["FirstName", "LastName", "Nickname", "Height", "Weight", "Reach", "Stance", "Wins", "Losses", "Draws", "Belt"]

    source = get_source(driver)
    nav_items = source.find_all("ul", class_="b-statistics__nav-items")
    alphabet = nav_items[0].find_all("a")

    # Collect the per-letter frames and concatenate once at the end rather
    # than re-copying the accumulated frame on every iteration.
    frames = []
    for letter in tqdm.tqdm(alphabet):
        # The hrefs are site-relative, hence the host prefix.
        driver.get("http://ufcstats.com" + letter.get("href"))
        click_on_all_fighters(driver)
        frames.append(collect_fighter_records(driver))

    # pd.concat refuses an empty list, so handle "no letters" explicitly.
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [66]:
# Open the fighters listing (letter "a") and scrape every letter page.
homepage = access_page("http://ufcstats.com/statistics/fighters?char=a")
try:
    dta = iterate_scrape(homepage)
finally:
    # Always release the Chrome session, even if scraping fails part-way,
    # so reruns of this cell do not pile up browser processes.
    homepage.quit()

  4%|▍         | 1/26 [00:01<00:25,  1.01s/it]

Number of entries: 225


  8%|▊         | 2/26 [00:04<01:03,  2.66s/it]

Number of entries: 309


 12%|█▏        | 3/26 [00:06<00:51,  2.23s/it]

Number of entries: 282


 15%|█▌        | 4/26 [00:07<00:42,  1.92s/it]

Number of entries: 202


 19%|█▉        | 5/26 [00:09<00:36,  1.75s/it]

Number of entries: 94


 23%|██▎       | 6/26 [00:10<00:29,  1.49s/it]

Number of entries: 141


 27%|██▋       | 7/26 [00:12<00:30,  1.60s/it]

Number of entries: 230


 31%|███       | 8/26 [00:13<00:29,  1.65s/it]

Number of entries: 224


 35%|███▍      | 9/26 [00:15<00:25,  1.50s/it]

Number of entries: 47


 38%|███▊      | 10/26 [00:16<00:23,  1.44s/it]

Number of entries: 125


 42%|████▏     | 11/26 [00:17<00:20,  1.40s/it]

Number of entries: 172


 46%|████▌     | 12/26 [00:19<00:19,  1.42s/it]

Number of entries: 204
Number of entries: 421


 54%|█████▍    | 14/26 [00:22<00:17,  1.49s/it]

Number of entries: 115


 58%|█████▊    | 15/26 [00:23<00:15,  1.38s/it]

Number of entries: 82


 62%|██████▏   | 16/26 [00:25<00:14,  1.45s/it]

Number of entries: 224


 65%|██████▌   | 17/26 [00:26<00:11,  1.30s/it]

Number of entries: 15


 69%|██████▉   | 18/26 [00:27<00:10,  1.36s/it]

Number of entries: 224
Number of entries: 459


 77%|███████▋  | 20/26 [00:32<00:11,  1.94s/it]

Number of entries: 154


 81%|████████  | 21/26 [00:34<00:08,  1.77s/it]

Number of entries: 24


 85%|████████▍ | 22/26 [00:35<00:06,  1.58s/it]

Number of entries: 95


 88%|████████▊ | 23/26 [00:36<00:04,  1.48s/it]

Number of entries: 140


 92%|█████████▏| 24/26 [00:37<00:02,  1.30s/it]

Number of entries: 5


 96%|█████████▌| 25/26 [00:38<00:01,  1.22s/it]

Number of entries: 52


100%|██████████| 26/26 [00:39<00:00,  1.52s/it]

Number of entries: 39





In [67]:
# Show the scraped fighters table (the notebook renders a truncated preview).
dta

Unnamed: 0,FirstName,LastName,Nickname,Height,Weight,Reach,Stance,Wins,Losses,Draws,Belt
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
4247,Dave,Zitanick,,--,170 lbs.,--,,5,7,0,
4248,Alex,Zuniga,,--,145 lbs.,--,,6,3,0,
4249,George,Zuniga,,"5' 9""",185 lbs.,--,,3,1,0,
4250,Allan,Zuniga,Tigre,"5' 7""",155 lbs.,"70.0""",Orthodox,13,1,0,


In [69]:
## build the 'Record' column as "Wins-Losses-Draws" from the three string columns
dta["Record"] = dta["Wins"].str.cat([dta["Losses"], dta["Draws"]], sep="-")

In [70]:
# Show the table again to check the newly added Record column.
dta

Unnamed: 0,FirstName,LastName,Nickname,Height,Weight,Reach,Stance,Wins,Losses,Draws,Belt,Record
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,,5-3-0
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,,4-6-0
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,,28-4-0
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,,10-15-0
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0,,5-0-0
...,...,...,...,...,...,...,...,...,...,...,...,...
4247,Dave,Zitanick,,--,170 lbs.,--,,5,7,0,,5-7-0
4248,Alex,Zuniga,,--,145 lbs.,--,,6,3,0,,6-3-0
4249,George,Zuniga,,"5' 9""",185 lbs.,--,,3,1,0,,3-1-0
4250,Allan,Zuniga,Tigre,"5' 7""",155 lbs.,"70.0""",Orthodox,13,1,0,,13-1-0


In [71]:
# Write the table to disk as CSV, omitting the DataFrame index.
# NOTE(review): the target file name has no ".csv" extension — confirm this is intended.
dta.to_csv("fighter-overview-stats1", index=False)