In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

### Intermediate functions I will use to clean the data

In [3]:
def add_events(df_list):
    '''Write an EVENT column into every dataframe of df_list, in place.

    Labels are assigned purely by position: the tables are expected in the
    fixed order below, each event appearing twice in a row (one table per
    sex) followed by the two multi-events. Nothing is returned. An
    IndexError is raised when df_list holds more tables than there are
    labels.
    '''
    # One entry per event that has a table for each sex.
    paired_events = ['100', '200', '400', '800', '1500', '5000', '10000',
                     '100H', '400H', '3000S', '4x100', '4x400',
                     'HJ', 'PV', 'LJ', 'TJ', 'SP', 'DT', 'HT', 'JT']
    # Duplicate each paired label, then append the single-table multi-events.
    events = [code for code in paired_events for _ in range(2)] + ['Dec', 'Hep']

    for position, frame in enumerate(df_list):
        frame['EVENT'] = events[position]

In [4]:
def add_sex(df_list):
    '''Write a SEX column into every dataframe of df_list, in place.

    Tables alternate Men, Women, Men, ... by position, so index parity picks
    the label: even positions (0, 2, ...) are 'Men', odd positions 'Women'.
    Nothing is returned.
    '''
    labels_by_parity = ('Men', 'Women')
    for position, frame in enumerate(df_list):
        frame['SEX'] = labels_by_parity[position % 2]

In [5]:
def add_division(df_list, division):
    '''Write a constant DIVISION column into every dataframe of df_list, in place.

    division is stored as given (the notebook passes strings such as 'D1').
    Nothing is returned.
    '''
    for frame in df_list:
        frame['DIVISION'] = division

In [6]:
def add_champ_year(df_list):
    '''Write a CHAMP_YEAR column into every dataframe of df_list, in place.

    The year is taken as the final four characters of each 'MEET DATE'
    string (e.g. 'Apr 19, 2019' -> '2019') and is kept as a string.
    Nothing is returned.
    '''
    for frame in df_list:
        years = []
        for meet_date in frame['MEET DATE']:
            years.append(meet_date[-4:])
        frame['CHAMP_YEAR'] = years

In [7]:
def clean_time(df_list):
    '''Normalise the TIME column of every running-event dataframe, in place.

    For each dataframe whose TIME column has no missing values:
      * every value is converted to str,
      * any parenthesised annotation, e.g. '(1.2)', is removed,
      * '@' markers are removed,
      * the decimal point is replaced by ':' so values read '%S:%f' or
        '%M:%S:%f' -- the form add_total_seconds parses.

    Dataframes with any missing TIME (its string form contains 'nan') are
    left untouched. Nothing is returned.
    '''
    for df in df_list:
        times = df['TIME'].astype(str)  # make sure all times are str format

        # `not` instead of `~`: bitwise-not only gives the intended answer
        # for numpy booleans; on a plain Python bool it is always truthy.
        if times.str.contains('nan').any():
            continue

        df['TIME'] = (
            times
            .str.replace(r'\([^)]*\)', '', regex=True)
            # regex=False is explicit on purpose: as a regular expression
            # '.' matches every character, and the default differs between
            # pandas versions.
            .str.replace('.', ':', regex=False)
            .str.replace('@', '', regex=False)
        )

In [8]:
def add_total_seconds(df_list):
    '''Add a TIME_SECS column (float seconds) to every running-event dataframe, in place.

    Expects TIME values already normalised by clean_time, i.e. strings of
    the form '%S:%f' (e.g. '9:94') or '%M:%S:%f' (e.g. '3:45:12').
    Dataframes with any missing TIME (its string form contains 'nan') are
    skipped and get no new values. Nothing is returned.

    Raises ValueError (from strptime) when a value matches neither format.
    '''
    # strptime fills in 1900-01-01 for the missing date fields, so subtracting
    # that instant leaves just the elapsed time.
    epoch = datetime.datetime(1900, 1, 1)

    for df in df_list:
        # `not`-style test instead of `~`: bitwise-not only gives the intended
        # answer for numpy booleans, never for a plain Python bool.
        if df['TIME'].astype(str).str.contains('nan').any():
            continue

        results = []
        for time in df['TIME']:
            text = time.strip()  # stray whitespace would make strptime fail
            # Pick the format from the number of separators rather than the
            # string length, which misclassifies short values like '1:5:2'.
            time_format = '%S:%f' if text.count(':') == 1 else '%M:%S:%f'
            parsed = datetime.datetime.strptime(text, time_format)
            results.append((parsed - epoch).total_seconds())
        df['TIME_SECS'] = results

### Combine functions into one cleaning function

In [32]:
def clean_dfs(df_list, division):
    '''Run the full cleaning pipeline over a list of scraped dataframes.

    df_list  -- list of dataframes; its entries are replaced in place.
    division -- division label (str) written to the DIVISION column.

    Steps, in order:
      1. add EVENT, SEX, DIVISION and CHAMP_YEAR columns,
      2. rename the unnamed first column to POSITION,
      3. reindex every dataframe to one common column order (columns not
         listed are dropped, listed-but-missing ones are created empty),
      4. normalise TIME and fill TIME_SECS for running events.

    Nothing is returned.
    '''
    # 1. positional / constant annotations
    add_events(df_list)
    add_sex(df_list)
    add_division(df_list, division)
    add_champ_year(df_list)

    # 2. rename column to POSITION
    for frame in df_list:
        frame.rename(columns={'Unnamed: 0' : 'POSITION'}, inplace=True)

    # 3. reorder and drop columns
    ordered_columns = [
        'POSITION', 'CHAMP_YEAR', 'DIVISION', 'EVENT', 'SEX',
        'ATHLETE', 'YEAR', 'TEAM', 'TIME', 'TIME_SECS',
        'MARK', 'CONV', 'POINTS', 'MEET', 'MEET DATE',
    ]
    for position, frame in enumerate(df_list):
        df_list[position] = frame.reindex(columns=ordered_columns)

    # 4. time handling (runs after the reindex so TIME / TIME_SECS exist everywhere)
    clean_time(df_list)
    add_total_seconds(df_list)

### Function that loops through list of urls, scrapes and then cleans them.

In [33]:
def scrape_tfrrs(url_list, division):
    '''Scrape, clean and stack the result tables of several list pages.

    url_list -- iterable of page URLs; every HTML table on each page is read
                into its own dataframe.
    division -- division label (str) forwarded to clean_dfs and written to
                the DIVISION column of every row.

    Returns a single dataframe holding all cleaned tables from all URLs,
    with a fresh 0..n-1 index.

    Raises ValueError (from pd.concat) when url_list is empty, plus whatever
    urlopen / pd.read_html raise on network or parsing problems.
    '''
    cleaned_dfs = []
    for url in url_list:
        # scrape url into list of dfs
        # NOTE(review): the browser-like User-Agent is presumably needed
        # because the site rejects urllib's default one -- confirm.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response:  # close the connection once read
            webpage = response.read()
        df_list = pd.read_html(webpage)

        # clean each df in df_list, labelling rows with the requested
        # division (this used to be hard-coded to 'D1')
        clean_dfs(df_list, division)

        # add each cleaned df to cleaned_dfs list
        cleaned_dfs.extend(df_list)

    # concatenate cleaned_dfs list into one long dataframe
    return pd.concat(cleaned_dfs, ignore_index=True)

### TEST

In [11]:
# Small test input for scrape_tfrrs: the 2019 and 2018 Division I outdoor
# final qualifying lists.
d1_urls = ['https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o', 
           'https://www.tfrrs.org/archived_lists/2279/2018_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2018/o']

In [12]:
%%time 
# Smoke test: scrape and clean the two test URLs, timing the whole cell.
d1_scraped = scrape_tfrrs(d1_urls, 'D1')

Wall time: 2min 21s


In [28]:
# Renumber rows 0..n-1. scrape_tfrrs already returns its result with such an
# index, so this only matters if d1_scraped came from somewhere else.
d1_scraped.index = range(len(d1_scraped))

In [30]:
# Preview the first rows of the combined, cleaned result.
d1_scraped.head()

Unnamed: 0,POSITION,CHAMP_YEAR,DIVISION,EVENT,SEX,ATHLETE,YEAR,TEAM,TIME,TIME_SECS,MARK,CONV,POINTS,MEET,MEET DATE
0,1,2019,D1,100,Men,"Oduduru, Divine",JR-3,Texas Tech,9:94,9.94,,,,Michael Johnson Invitational,"Apr 19, 2019"
1,2,2019,D1,100,Men,"Burke, Mario",SR-4,Houston,9:95,9.95,,,,The American Outdoor Track & Field Championships,"May 10, 2019"
2,3,2019,D1,100,Men,"Gillespie, Cravon",SR-4,Oregon,9:97,9.97,,,,2019 Pac-12 Track & Field Championships,"May 4, 2019"
3,4,2019,D1,100,Men,"Sani Brown, Hakim",SO-2,Florida,9:99,9.99,,,,SEC Outdoor Track & Field Championships,"May 9, 2019"
4,5,2019,D1,100,Men,"Ekevwo, Raymond",JR-3,Florida,10:02,10.02,,,,SEC Outdoor Track & Field Championships,"May 9, 2019"


### 1 URL AT A TIME

In [None]:
%%time
# start with 2019 Outdoor D1
# d1_top100 is the archived final qualifying list that is fetched below.
# d1_top500 is an alternative URL for list 2568 with a `limit<=500` query;
# it is defined here but not requested in this cell.
d1_top100 = 'https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o'
d1_top500 = 'https://www.tfrrs.org/lists/2568.html?limit=%3C%3D500&event_type=all&year=&gender=x'

# Download the raw page bytes, sending a browser-like User-Agent header.
req = Request(d1_top100, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# NOTE(review): `soup` is not read by any cell visible here -- confirm it is
# still needed; no parser is named, so BeautifulSoup picks one itself.
soup = BeautifulSoup(webpage)
# One dataframe per HTML table found on the page.
dfs = pd.read_html(webpage)

In [None]:
# create deep copy of dfs to work with, so the in-place cleaning steps below
# never modify the freshly scraped originals in `dfs`
dfs_copy = [i.copy(deep=True) for i in dfs]

In [None]:
# Preview the first table on the page (position 0 is labelled '100' by the
# events list used in this notebook).
dfs_copy[0].head()

In [None]:
# Preview the table at position 13 (labelled '10000' by the events list used
# in this notebook) to see what a longer-distance table looks like.
dfs_copy[13].head()

In [None]:
# add event, sex champ_year and division column to each dataframe in list of dataframes, add column for time converted to seconds

def add_variables(df_list, division):
    '''Annotate and tidy every dataframe in df_list in a single pass.

    df_list  -- list of dataframes; its entries are replaced in place.
    division -- division label (str) written to the DIVISION column.

    For each dataframe this adds EVENT (by position), SEX ('Male' for even
    positions, 'Female' for odd ones), DIVISION and CHAMP_YEAR (last four
    characters of that dataframe's own 'MEET DATE' values), renames the
    unnamed first column to POSITION and reindexes to a fixed column order.
    TIME_SECS is created by the reindex but left empty. Nothing is returned.

    NOTE(review): this looks like an earlier all-in-one version of
    clean_dfs; unlike add_sex it labels rows 'Male'/'Female' rather than
    'Men'/'Women' -- confirm which spelling downstream code expects.

    Raises IndexError when df_list holds more tables than there are event
    labels.
    '''
    # event list (alternating sexes for each event)
    events = ['100', '100', '200', '200', '400', '400', '800', '800', '1500', '1500', '5000', '5000', '10000', '10000',
              '100H', '100H', '400H', '400H', '3000S', '3000S', '4x100', '4x100', '4x400', '4x400',
              'HJ', 'HJ', 'PV', 'PV', 'LJ', 'LJ', 'TJ', 'TJ', 'SP', 'SP', 'DT', 'DT', 'HT', 'HT', 'JT', 'JT',
              'Dec', 'Hep']

    # final column order; anything not listed is dropped by the reindex
    col_names = ['POSITION', 'CHAMP_YEAR', 'DIVISION', 'EVENT', 'SEX', 'ATHLETE', 'YEAR', 'TEAM', 'TIME', 'TIME_SECS',
                 'MARK', 'CONV', 'POINTS', 'MEET', 'MEET DATE']

    for i, df in enumerate(df_list):
        df['EVENT'] = events[i]

        # male recorded first in df for each event
        df['SEX'] = 'Male' if i % 2 == 0 else 'Female'

        df['DIVISION'] = division

        # Read the year from this dataframe's own dates. The previous version
        # read them from the notebook-global dfs_copy[40], which fails on a
        # fresh kernel and mislabels or mis-sizes every other table.
        df['CHAMP_YEAR'] = [meet_date[-4:] for meet_date in df['MEET DATE']]

        # rename column to POSITION
        df.rename(columns={'Unnamed: 0' : 'POSITION'}, inplace=True)

        # TODO: convert TIME to total seconds (TIME_SECS is left empty here)

        # reorder and drop columns
        df_list[i] = df.reindex(columns=col_names)

### Lists of URLs for Division I and Division II

In [34]:
# list of urls for available years in each division
# Both lists run from 2019 back to 2010, one URL per outdoor season.
# Every Division I entry is an `archived_lists/<id>/<title>/<year>/o` URL.
D1_top100 = ['https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o',
 'https://www.tfrrs.org/archived_lists/2279/2018_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2018/o',
 'https://www.tfrrs.org/archived_lists/1912/2017_NCAA_Div._I_Outdoor_Qualifying_(FINAL)/2017/o',
 'https://www.tfrrs.org/archived_lists/1688/2016_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2016/o',
 'https://www.tfrrs.org/archived_lists/1439/2015_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2015/o',
 'https://www.tfrrs.org/archived_lists/1228/2014_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2014/o',
 'https://www.tfrrs.org/archived_lists/1029/2013_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2013/o',
 'https://www.tfrrs.org/archived_lists/840/2012_NCAA_Div._I_Outdoor_Qualifiers_(Final)/2012/o',
 'https://www.tfrrs.org/archived_lists/673/2011_NCAA_Division_I_Outdoor_POP_List_(FINAL)/2011/o',
 'https://www.tfrrs.org/archived_lists/528/2010_NCAA_Division_I_Outdoor_POP_List_(FINAL)/2010/o']

# Only the first Division II entry uses the `archived_lists` form; the rest
# are `lists/<id>.html` URLs.
# NOTE(review): presumably both URL forms serve the same table layout that
# the positional event/sex labelling relies on -- confirm before scraping.
D2_top100 = ['https://www.tfrrs.org/archived_lists/2571/2019_NCAA_Div._II_Outdoor_Qualifying_(FINAL)/2019/o',
 'https://www.tfrrs.org/lists/2282.html',
 'https://www.tfrrs.org/lists/1913.html',
 'https://www.tfrrs.org/lists/1685.html',
 'https://www.tfrrs.org/lists/1442.html',
 'https://www.tfrrs.org/lists/1231.html',
 'https://www.tfrrs.org/lists/1032.html',
 'https://www.tfrrs.org/lists/841.html',
 'https://www.tfrrs.org/lists/674.html',
 'https://www.tfrrs.org/lists/529.html']

In [None]:
%%time
d1_scraped = 

In [None]:
%%time
d2_scraped = 

In [None]:
# graph some stuff
import matplotlib.pyplot as plt
from matplotlib import dates