# Intelligent Asynchronous Web-Scraper - UFC Completed Events

### Data Summary

**completed_events.csv  - Features**
* EVENT: name of event
* DATE: date of event
* LOCATION: location of event
* URL: event url

**completed_event_matches.csv - Features**
* EVENT: name of event
* DATE: date of event
* LOCATION: location of event
* WEIGHTCLASS: weight class of fight
* FIGHTER_R: red fighter name
* FIGHTER_R_COLOR: red color feature for analysis
* FIGHTER_B: blue fighter name
* FIGHTER_B_COLOR: blue color feature for analysis
* EVENT_URL: event url
* FIGHT_URL: specific matchup url
* WIN_METHOD: method of victory
* OUTCOME_FIGHTER: name of winner 
* OUTCOME: red win = 0 ; blue win = 1

### Imports

In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
import asyncio
import time
from tqdm import tqdm
import aiohttp

### Global Variables

In [5]:
cwd = os.getcwd()  # current working directory
data_dir = Path(cwd) / "data"  # directory the CSV exports are read from and written to

# ufcstats.com page containing a table of all completed UFC events
completed_events_url = 'http://ufcstats.com/statistics/events/completed?page=all'

# Fetch the listing once, up front. A timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the (empty) events table.
_completed_events_response = requests.get(completed_events_url, timeout=30)
_completed_events_response.raise_for_status()
completed_events_soup = BeautifulSoup(_completed_events_response.content, 'html.parser')  # parsed page contents

# Shared by the async scrapers: at most 5 coroutines may hold it at the same time
sema = asyncio.BoundedSemaphore(5)

### Collect Completed Events

**NOTE:** Since the basic event info can be collected with a single page request, we will not use asynchronous functions until we are ready to collect individual fight details.

In [6]:
def get_event_info(soup) -> pd.DataFrame:
    '''Collect ['EVENT', 'DATE', 'LOCATION', 'URL'] for all listed events.

    Parameters:
        soup: parsed completed-events page; only ``find_all`` is called on it.

    Returns:
        DataFrame with one row per event and the DATE column converted with
        ``pd.to_datetime``. Rows that lack the name link, the date span or the
        location cell are skipped instead of raising ``AttributeError``.
    '''
    location_class = 'b-statistics__table-col b-statistics__table-col_style_big-top-padding'
    data = []  # one [name, date, location, url] list per event

    # the first two matching rows (header bar and first row) are excluded
    for tag in soup.find_all('tr', class_='b-statistics__table-row')[2:]:
        name_url = tag.find('a', class_='b-link b-link_style_black')  # anchor holding both name and url
        date_tag = tag.find('span', class_='b-statistics__date')
        location_tag = tag.find('td', class_=location_class)

        if name_url is None or date_tag is None or location_tag is None:
            continue  # spacer or malformed row: nothing to record

        data.append([
            name_url.text.strip(),
            date_tag.text.strip(),
            location_tag.text.strip(),
            name_url['href'],
        ])

    # create df to store event details
    event_info = pd.DataFrame(data, columns=['EVENT', 'DATE', 'LOCATION', 'URL'])

    # convert date col to pandas datetime object
    event_info['DATE'] = pd.to_datetime(event_info['DATE'])

    return event_info

In [7]:
def getUpdateDF(online_events) -> tuple:
    '''Collect the rows of ``online_events`` that are not stored on disk yet.

    Parameters:
        online_events: DataFrame with an 'EVENT' column (freshly scraped events).

    Returns:
        ``(update_rows, stored_events)`` where ``stored_events`` is the content of
        ``completed_events.csv`` in ``data_dir`` as read by ``pd.read_csv`` and
        ``update_rows`` are the rows of ``online_events`` whose 'EVENT' value
        does not appear in the stored 'EVENT' column.
    '''
    stored_events = pd.read_csv(f'{data_dir}/completed_events.csv')

    # events are matched by name only; True marks an event not collected before
    not_stored = ~online_events['EVENT'].isin(stored_events['EVENT'])

    return online_events.loc[not_stored], stored_events

In [8]:
def collectEvents() -> tuple:
    '''Collect event info, create the events file if needed, update it if needed.

    Returns:
        ``(update_df, all_event_info)``. ``update_df`` holds the events that were
        not in ``completed_events.csv`` before this call; it is an empty
        DataFrame when the file was just created or nothing new was found, so
        callers can branch on ``update_df.empty``. ``all_event_info`` is the
        full event table (new rows first when an update happened).
    '''
    events_csv = Path(f"{data_dir}/completed_events.csv")
    all_event_info = get_event_info(completed_events_soup)  # collect online event info

    # no file yet: export everything; there is nothing to diff against
    if not events_csv.is_file():
        events_csv.parent.mkdir(parents=True, exist_ok=True)  # first run may not have the data dir yet
        all_event_info.to_csv(events_csv, index=False)
        return pd.DataFrame(), all_event_info

    update_df, stored_events = getUpdateDF(all_event_info)

    # if the update dataframe is not empty, prepend its rows and replace the file
    if not update_df.empty:
        stored_events['DATE'] = pd.to_datetime(stored_events['DATE'])  # read_csv returns dates as strings
        all_event_info = pd.concat([update_df, stored_events]).reset_index(drop=True)
        all_event_info.to_csv(events_csv, index=False)

    return update_df, all_event_info

### Collect Matches

Asynchronously collect all match URLs for each event, then use those URLs to asynchronously collect all match details.

In [9]:
async def getMatchLinks(row: np.ndarray) -> tuple:
    '''Collect individual match urls for a specified event.

    Parameters:
        row: event record whose last item is the event page url.

    Returns:
        ``(row, urls)`` where ``urls`` lists the ``href`` of every
        ``a.b-flag.b-flag_style_green`` anchor found on the event page.

    Raises:
        ValueError: the page could not be downloaded (original error chained).
    '''
    event_url = row[-1]

    async with sema:  # limit the number of concurrent requests
        try:
            # both context managers release the connection on exit, so no
            # explicit resp.release() is needed (and `resp` may not even exist
            # if the request fails before a response is received)
            async with aiohttp.ClientSession() as session:
                async with session.get(event_url) as resp:
                    page = await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as err:
            raise ValueError(f'failed to fetch event page: {event_url}') from err

    # parse the body already downloaded above instead of requesting the page a
    # second time with a blocking call
    soup = BeautifulSoup(page, 'html.parser')
    a_href = soup.find_all("a", {"class": "b-flag b-flag_style_green"})  # find all matches from url soup

    return (row, [href.attrs['href'] for href in a_href])  # row array and list of all match urls

async def getAllEventMatchLinks(data) -> list:
    '''Collect the match links of every event row in a dataframe.

    One task running ``getMatchLinks`` is scheduled per row of ``data``; the
    gathered results are returned in row order.
    '''
    link_tasks = []
    for event_row in data.to_numpy():
        link_tasks.append(asyncio.create_task(getMatchLinks(event_row)))

    gathered = await asyncio.gather(*link_tasks)
    return gathered

In [10]:
async def getMatch(event_info, url) -> tuple:
    '''Get match details given event info and a match url.

    Parameters:
        event_info: event record indexed as [name, date, location, event url].
        url: url of the fight-details page to scrape.

    Returns:
        Tuple ``(event, date, location, weightclass, fighter_r, 'red',
        fighter_b, 'blue', event_url, url, win_method, outcome_fighter,
        outcome)``. ``outcome`` is 0 for a red win and 1 for a blue win; when
        the page has no winner marker, or the winner name matches neither
        fighter, ``outcome`` is ``None`` (and ``outcome_fighter`` is ``None``
        if the marker is missing) instead of the function crashing.

    Raises:
        ValueError: the page could not be downloaded (original error chained).
    '''
    async with sema:  # limit the number of concurrent requests
        try:
            # the context managers release the connection on exit; an explicit
            # resp.release() in `finally` would fail with an unbound `resp`
            # whenever the request itself raised
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    page = await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as err:
            raise ValueError(f'failed to fetch fight page: {url}') from err

    soup = BeautifulSoup(page, 'html.parser')  # collect url soup

    event = event_info[0]  # event name
    date = event_info[1]  # event date
    location = event_info[2]  # event location
    event_url = event_info[3]  # event url

    # the weight class is the fight title with its last word dropped
    fight_title = soup.find("i", {"class": "b-fight-details__fight-title"}).get_text().strip()
    weightclass = ' '.join(fight_title.split(' ')[0:-1])

    # exactly two fighter links are expected; the first is red, the second blue
    fighter_soup = soup.find_all("a", {"class": "b-link b-fight-details__person-link"})
    fighter_r, fighter_b = [name.get_text().strip() for name in fighter_soup]

    win_method = soup.find("i", {"class": "b-fight-details__label"}).find_next("i").get_text().strip()

    # the winner is the name in the first <div> following the green status marker
    winner_marker = soup.find("i", {"class": "b-fight-details__person-status b-fight-details__person-status_style_green"})
    if winner_marker is None:
        outcome_fighter = None  # no fighter is flagged as the winner on this page
    else:
        outcome_fighter = winner_marker.find_next("div").get_text().strip().split('\n')[0].strip()

    if outcome_fighter == fighter_r:
        outcome = 0  # define red win
    elif outcome_fighter == fighter_b:
        outcome = 1  # define blue win
    else:
        outcome = None  # previously left unbound, which raised UnboundLocalError

    return (event, date, location, weightclass, fighter_r, 'red', fighter_b, 'blue', event_url, url, win_method, outcome_fighter, outcome)  # data row

async def getEventMatchInfo(match) -> list:
    '''Collect the details of every match belonging to one event.

    ``match[0]`` is the event record carried into each match row and
    ``match[-1]`` is the list of match urls for that event.
    '''
    event_record, fight_urls = match[0], match[-1]

    match_tasks = []
    for fight_url in fight_urls:
        match_tasks.append(asyncio.create_task(getMatch(event_record, fight_url)))

    gathered = await asyncio.gather(*match_tasks)
    return gathered

async def getAllEventMatchInfo(eventMatchInfo) -> list:
    '''Collect match details for every entry of ``eventMatchInfo``.

    One task running ``getEventMatchInfo`` is scheduled per entry; the result
    is a list with one list of match rows per entry, in input order.
    '''
    event_tasks = []
    for entry in eventMatchInfo:
        event_tasks.append(asyncio.create_task(getEventMatchInfo(entry)))

    gathered = await asyncio.gather(*event_tasks)
    return gathered


In [19]:
async def _scrape_event_matches(events, cols) -> pd.DataFrame:
    '''Scrape every match of the given events into a DataFrame with columns ``cols``.'''
    event_links = await getAllEventMatchLinks(events)
    per_event_rows = await getAllEventMatchInfo(event_links)
    rows = [row for event_rows in per_event_rows for row in event_rows]  # flatten async data
    return pd.DataFrame(rows, columns=cols)


async def collectEventMatches(update_events, all_events) -> tuple:
    '''Collect event match info, create the matches file if needed, update it if needed.

    Parameters:
        update_events: DataFrame of events not collected before (may be empty).
        all_events: DataFrame of all events; only scraped when
            ``completed_event_matches.csv`` does not exist yet.

    Returns:
        ``(update_event_match_info, all_event_match_df)``: the match rows
        scraped for ``update_events`` (empty when nothing was updated) and the
        full match table (new rows first when an update happened).
    '''
    cols = ['EVENT', 'DATE', 'LOCATION', 'WEIGHTCLASS', 'FIGHTER_R', 'FIGHTER_R_COLOR', 'FIGHTER_B', 'FIGHTER_B_COLOR', 'EVENT_URL', 'FIGHT_URL', 'WIN_METHOD', 'OUTCOME_FIGHTER', 'OUTCOME']
    matches_csv = Path(f"{data_dir}/completed_event_matches.csv")

    # no file yet: collect all info from scratch
    if not matches_csv.is_file():
        # make sure the export can succeed *before* the long scrape starts
        matches_csv.parent.mkdir(parents=True, exist_ok=True)
        all_event_match_df = await _scrape_event_matches(all_events, cols)
        all_event_match_df.to_csv(matches_csv, index=False)
        return pd.DataFrame(), all_event_match_df

    # file exists: load the stored matches so updates can be prepended
    stored_events = pd.read_csv(matches_csv)
    stored_events['DATE'] = pd.to_datetime(stored_events['DATE'])  # read_csv returns dates as strings

    # nothing new: skip the scrape and leave the file on disk untouched
    if update_events.empty:
        return pd.DataFrame(columns=cols), stored_events

    # collect only the matches of the new events and prepend them
    update_event_match_info = await _scrape_event_matches(update_events, cols)
    all_event_match_df = pd.concat([update_event_match_info, stored_events]).reset_index(drop=True)
    all_event_match_df.to_csv(matches_csv, index=False)

    return update_event_match_info, all_event_match_df

### Collect All Data

In [12]:
async def CollectAll():
    '''Driver: run the event collection, then the event-match collection.

    Returns the tuple ``(update_events, all_event_info, update_event_matches,
    all_event_matches)``.
    '''
    # step 1: completed UFC events (creates or updates completed_events.csv)
    new_events, events = collectEvents()

    # step 2: matches of those events (creates or updates completed_event_matches.csv)
    new_matches, matches = await collectEventMatches(new_events, events)

    return new_events, events, new_matches, matches

In [22]:
# Run the whole collection pipeline and keep the four resulting dataframes.
# NOTE(review): top-level ``await`` presumably relies on the notebook's running event loop - confirm before reusing as a script.
update_events, all_event_info, update_event_matches, all_event_matches = await CollectAll() # collect all update and full dataframes

In [14]:
update_events

In [15]:
all_event_info

Unnamed: 0,EVENT,DATE,LOCATION,URL
0,UFC 279: Diaz vs. Ferguson,2022-09-10,"Las Vegas, Nevada, USA",http://ufcstats.com/event-details/93bf96be327f...
1,UFC Fight Night: Gane vs. Tuivasa,2022-09-03,"Paris, Ile-de-France, France",http://ufcstats.com/event-details/00a905a4a4a2...
2,UFC 278: Usman vs. Edwards,2022-08-20,"Salt Lake City, Utah, USA",http://ufcstats.com/event-details/4f853e988862...
3,UFC Fight Night: Vera vs. Cruz,2022-08-13,"San Diego, California, USA",http://ufcstats.com/event-details/a23e63184c65...
4,UFC Fight Night: Santos vs. Hill,2022-08-06,"Las Vegas, Nevada, USA",http://ufcstats.com/event-details/8f6a18831a12...
...,...,...,...,...
614,UFC 6: Clash of the Titans,1995-07-14,"Casper, Wyoming, USA",http://ufcstats.com/event-details/1c3f5e85b59e...
615,UFC 5: The Return of the Beast,1995-04-07,"Charlotte, North Carolina, USA",http://ufcstats.com/event-details/dedc3bb440d0...
616,UFC 4: Revenge of the Warriors,1994-12-16,"Tulsa, Oklahoma, USA",http://ufcstats.com/event-details/b60391da771d...
617,UFC 3: The American Dream,1994-09-09,"Charlotte, North Carolina, USA",http://ufcstats.com/event-details/1a49e0670dfa...


In [16]:
update_event_matches

In [17]:
all_event_matches

Unnamed: 0,EVENT,DATE,LOCATION,WEIGHTCLASS,FIGHTER_R,FIGHTER_R_COLOR,FIGHTER_B,FIGHTER_B_COLOR,EVENT_URL,FIGHT_URL,WIN_METHOD,OUTCOME_FIGHTER,OUTCOME
0,UFC 279: Diaz vs. Ferguson,2022-09-10,"Las Vegas, Nevada, USA",Welterweight,Nate Diaz,red,Tony Ferguson,blue,http://ufcstats.com/event-details/93bf96be327f...,http://ufcstats.com/fight-details/7e8fd03e070d...,Submission,Nate Diaz,0
1,UFC 279: Diaz vs. Ferguson,2022-09-10,"Las Vegas, Nevada, USA",Catch Weight,Khamzat Chimaev,red,Kevin Holland,blue,http://ufcstats.com/event-details/93bf96be327f...,http://ufcstats.com/fight-details/0111e6a64b51...,Submission,Khamzat Chimaev,0
2,UFC 279: Diaz vs. Ferguson,2022-09-10,"Las Vegas, Nevada, USA",Catch Weight,Li Jingliang,red,Daniel Rodriguez,blue,http://ufcstats.com/event-details/93bf96be327f...,http://ufcstats.com/fight-details/d3e23d7d207d...,Decision - Split,Daniel Rodriguez,1
3,UFC 279: Diaz vs. Ferguson,2022-09-10,"Las Vegas, Nevada, USA",Catch Weight,Irene Aldana,red,Macy Chiasson,blue,http://ufcstats.com/event-details/93bf96be327f...,http://ufcstats.com/fight-details/989b8d362742...,KO/TKO,Irene Aldana,0
4,UFC 279: Diaz vs. Ferguson,2022-09-10,"Las Vegas, Nevada, USA",Light Heavyweight,Johnny Walker,red,Ion Cutelaba,blue,http://ufcstats.com/event-details/93bf96be327f...,http://ufcstats.com/fight-details/d869eaea3842...,Submission,Johnny Walker,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,UFC 276: Adesanya vs. Cannonier,2022-07-02,"Las Vegas, Nevada, USA",Welterweight,Ian Garry,red,Gabe Green,blue,http://ufcstats.com/event-details/4a9e305633f3...,http://ufcstats.com/fight-details/4d8d1701c43c...,Decision - Unanimous,Ian Garry,0
115,UFC 276: Adesanya vs. Cannonier,2022-07-02,"Las Vegas, Nevada, USA",Middleweight,Brad Tavares,red,Dricus Du Plessis,blue,http://ufcstats.com/event-details/4a9e305633f3...,http://ufcstats.com/fight-details/57775c8af890...,Decision - Unanimous,Dricus Du Plessis,1
116,UFC 276: Adesanya vs. Cannonier,2022-07-02,"Las Vegas, Nevada, USA",Middleweight,Uriah Hall,red,Andre Muniz,blue,http://ufcstats.com/event-details/4a9e305633f3...,http://ufcstats.com/fight-details/99ac96bb67e5...,Decision - Unanimous,Andre Muniz,1
117,UFC 276: Adesanya vs. Cannonier,2022-07-02,"Las Vegas, Nevada, USA",Women's Flyweight,Jessica Eye,red,Maycee Barber,blue,http://ufcstats.com/event-details/4a9e305633f3...,http://ufcstats.com/fight-details/1c85003f6ec5...,Decision - Unanimous,Maycee Barber,1
