In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

### Intermediate functions I will use to clean the data

In [3]:
def add_events(df_list):
    '''Label each frame in df_list with an EVENT column, based on its position in the list.

    The tables are expected in the order below: two tables per event with the
    male table first (hence '110H' before '100H'), followed by the two
    multi-event tables, where the order is reversed (heptathlon, then
    decathlon) to match the sexes assigned by add_sex.

    Frames are modified in place; nothing is returned.

    Raises
    ------
    IndexError
        If df_list holds more tables than there are known event labels.
        Raised before any frame is touched.
    '''
    events = ['100', '100', '200', '200', '400', '400', '800', '800', '1500', '1500',
              '5000', '5000', '10000', '10000',
              '110H', '100H', '400H', '400H', '3000S', '3000S', '4x100', '4x100', '4x400', '4x400',
              'HJ', 'HJ', 'PV', 'PV', 'LJ', 'LJ', 'TJ', 'TJ', 'SP', 'SP', 'DT', 'DT', 'HT', 'HT', 'JT', 'JT',
              'Hep', 'Dec']

    # Fail up front with a readable message instead of half-labelling the list
    # and then dying on events[i].
    if len(df_list) > len(events):
        raise IndexError(
            'expected at most %d event tables, got %d' % (len(events), len(df_list))
        )

    for df, event in zip(df_list, events):
        df['EVENT'] = event

In [79]:
def add_sex(df_list):
    '''Add a SEX column to each frame based on its position in df_list.

    All frames except the last two alternate Male, Female, Male, ... starting
    with Male. Positions 40 and 41 are then set explicitly to Female and Male,
    because the two multi-event tables come in the opposite order.
    '''
    alternating = ('Male', 'Female')
    for position, frame in enumerate(df_list[:-2]):
        frame['SEX'] = alternating[position % 2]

    # Fixed positions of the two multi-event tables.
    df_list[40]['SEX'] = 'Female'
    df_list[41]['SEX'] = 'Male'

In [5]:
def add_division(df_list, division):
    '''Write the given division label into a DIVISION column on every frame in df_list.'''
    for frame in df_list:
        frame['DIVISION'] = division

In [6]:
def add_champ_year(df_list):
    '''Add a CHAMP_YEAR column holding the last four characters of each MEET DATE string.'''
    for frame in df_list:
        years = [meet_date[-4:] for meet_date in frame['MEET DATE']]
        frame['CHAMP_YEAR'] = years

In [38]:
def clean_time(df_list):
    '''Normalise the TIME strings of every frame in df_list, in place.

    A frame is skipped entirely if any of its TIME values is missing (i.e.
    stringifies to something containing 'nan').

    For every other frame, each TIME value is converted to str and then:
      * any parenthesised annotation such as '(1.5)' is removed,
      * '.' is replaced by ':' so that add_total_seconds can parse the value
        with the '%S:%f' / '%M:%S:%f' formats,
      * the marker characters '@' and 'h' are removed,
      * surrounding whitespace (e.g. left behind by a removed annotation) is trimmed.

    This function does not compute total seconds; see add_total_seconds.
    '''
    annotation = re.compile(r'\([^)]*\)')

    for df in df_list:
        times = df['TIME'].astype(str)  # make sure all times are str format

        # na=True: a value that is still missing after astype(str) counts as 'nan'.
        if times.str.contains('nan', na=True).any():
            continue

        without_annotation = times.map(lambda text: annotation.sub('', text))
        # regex=False: '.' must be a literal dot, not the regex wildcard,
        # regardless of the installed pandas version's default.
        df['TIME'] = (
            without_annotation
            .str.replace('.', ':', regex=False)
            .str.replace('@', '', regex=False)
            .str.replace('h', '', regex=False)
            .str.strip()
        )

In [8]:
def add_total_seconds(df_list):
    '''Add a TIME_SECS column (total seconds as float) to frames whose TIME has been cleaned.

    Expects TIME strings in the colon-separated form produced by clean_time:
    'SS:ff' (seconds and fraction) or 'MM:SS:ff' (minutes, seconds, fraction).
    The format is chosen from the number of ':' separators, not from the
    string length, so values such as '10:025' or '1:5:3' parse correctly.

    A frame is skipped (no TIME_SECS written) if any of its TIME values is
    missing. Frames are modified in place; nothing is returned.

    Raises
    ------
    ValueError
        From datetime.strptime, if a TIME value does not match the expected format.
    '''
    # strptime fills in 1900-01-01 when the format has no date part, so
    # subtracting that date leaves just the elapsed time.
    base = datetime.datetime(1900, 1, 1)

    for df in df_list:
        # na=True: a value that is still missing after astype(str) counts as 'nan'.
        if df['TIME'].astype(str).str.contains('nan', na=True).any():
            continue

        results = []
        for time in df['TIME']:
            fmt = '%M:%S:%f' if time.count(':') >= 2 else '%S:%f'
            elapsed = datetime.datetime.strptime(time, fmt) - base
            results.append(elapsed.total_seconds())
        df['TIME_SECS'] = results

### Combine functions into one cleaning function

In [32]:
def clean_dfs(df_list, division):
    '''Clean a list of scraped result tables in place.

    df_list  -- list of DataFrames, one per event table
    division -- division label (str) written to every frame

    Steps: add the EVENT, SEX, DIVISION and CHAMP_YEAR columns; rename
    'Unnamed: 0' to POSITION; put the columns into a fixed order (columns not
    in that order are dropped, absent ones are created empty); normalise TIME
    and derive TIME_SECS for the frames that have times.
    '''
    # Derived columns.
    add_events(df_list)
    add_sex(df_list)
    add_division(df_list, division)
    add_champ_year(df_list)

    ordered_columns = ['POSITION', 'CHAMP_YEAR', 'DIVISION', 'EVENT', 'SEX', 'ATHLETE', 'YEAR', 'TEAM',
                       'TIME', 'TIME_SECS', 'MARK', 'CONV', 'POINTS', 'MEET', 'MEET DATE']

    for position, frame in enumerate(df_list):
        # The rename happens on the existing frame; reindex returns a new one,
        # which replaces the entry in the list.
        frame.rename(columns={'Unnamed: 0' : 'POSITION'}, inplace=True)
        df_list[position] = frame.reindex(columns=ordered_columns)

    # Time handling runs last, on the re-ordered frames.
    clean_time(df_list)
    add_total_seconds(df_list)

### Final function that loops through list of urls, scrapes and then cleans them.

In [33]:
def scrape_tfrrs(url_list, division):
    '''Scrape every list page in url_list, clean its tables and return them as one DataFrame.

    Parameters
    ----------
    url_list : list of str
        Pages to download; every HTML table on each page is read.
    division : str
        Label written to the DIVISION column of every row (e.g. 'D1', 'D2').

    Returns
    -------
    pandas.DataFrame
        All cleaned tables stacked in scrape order with a fresh 0..n-1 index.
        An empty DataFrame if url_list is empty.
    '''
    cleaned_dfs = []
    for url in url_list:
        # Send a browser-like User-Agent; presumably the site rejects urllib's
        # default one -- TODO confirm.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response:  # close the connection once the body is read
            webpage = response.read()

        # NOTE(review): newer pandas releases may warn about literal HTML input
        # to read_html -- confirm against the installed version.
        df_list = pd.read_html(webpage)

        # Use the caller's division rather than a hardcoded 'D1'.
        clean_dfs(df_list, division)
        cleaned_dfs.extend(df_list)

    if not cleaned_dfs:
        return pd.DataFrame()

    # ignore_index renumbers the rows 0..n-1 across all stacked tables.
    return pd.concat(cleaned_dfs, ignore_index=True)

### TEST

In [11]:
# Small test input: the 2019 and 2018 Division I outdoor qualifying lists.
d1_urls = [
    'https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o',
    'https://www.tfrrs.org/archived_lists/2279/2018_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2018/o',
]

In [12]:
%%time 
# Scrape and clean both test URLs; the wall time reported below covers the whole call.
d1_scraped = scrape_tfrrs(d1_urls, 'D1')

Wall time: 2min 21s


In [28]:
# Renumber the rows 0..n-1.
# NOTE(review): scrape_tfrrs already renumbers the rows before returning, so this looks redundant.
d1_scraped.index = range(len(d1_scraped))

In [30]:
# Preview the first rows of the combined frame.
# NOTE(review): the saved output shows SEX as 'Men', a value add_sex never writes
# ('Male'/'Female') -- the output looks stale; re-run to confirm.
d1_scraped.head()

Unnamed: 0,POSITION,CHAMP_YEAR,DIVISION,EVENT,SEX,ATHLETE,YEAR,TEAM,TIME,TIME_SECS,MARK,CONV,POINTS,MEET,MEET DATE
0,1,2019,D1,100,Men,"Oduduru, Divine",JR-3,Texas Tech,9:94,9.94,,,,Michael Johnson Invitational,"Apr 19, 2019"
1,2,2019,D1,100,Men,"Burke, Mario",SR-4,Houston,9:95,9.95,,,,The American Outdoor Track & Field Championships,"May 10, 2019"
2,3,2019,D1,100,Men,"Gillespie, Cravon",SR-4,Oregon,9:97,9.97,,,,2019 Pac-12 Track & Field Championships,"May 4, 2019"
3,4,2019,D1,100,Men,"Sani Brown, Hakim",SO-2,Florida,9:99,9.99,,,,SEC Outdoor Track & Field Championships,"May 9, 2019"
4,5,2019,D1,100,Men,"Ekevwo, Raymond",JR-3,Florida,10:02,10.02,,,,SEC Outdoor Track & Field Championships,"May 9, 2019"


### 1 URL AT A TIME

In [49]:
%%time
# Candidate pages for a single-URL experiment. Only d12017 is fetched below;
# d1_top100 and d1_top500 are defined but not used in this cell.
d1_top100 = 'https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o'
d1_top500 = 'https://www.tfrrs.org/lists/2568.html?limit=%3C%3D500&event_type=all&year=&gender=x'
d12017 = 'https://www.tfrrs.org/archived_lists/1912/2017_NCAA_Div._I_Outdoor_Qualifying_(FINAL)/2017/o'

# Download the 2017 page with a browser-like User-Agent header.
req = Request(d12017, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
# NOTE(review): soup is not referenced by any cell visible in this notebook.
soup = BeautifulSoup(webpage)
# One DataFrame per HTML table found on the page.
dfs = pd.read_html(webpage)

Wall time: 1min 11s


In [66]:
# Work on independent deep copies so the freshly scraped frames in `dfs` stay untouched.
dfs_copy = [frame.copy(deep=True) for frame in dfs]

In [92]:
# Inspect the last table (index 41).
# NOTE(review): the saved output already has the POSITION/CHAMP_YEAR/DIVISION/EVENT/SEX
# columns, so this cell was run after the frames were cleaned -- i.e. out of notebook order.
dfs_copy[41]

Unnamed: 0,POSITION,CHAMP_YEAR,DIVISION,EVENT,SEX,ATHLETE,YEAR,TEAM,TIME,TIME_SECS,MARK,CONV,POINTS,MEET,MEET DATE
0,1,2017,D1,Dec,Male,"Victor, Lindon",SR-4,Texas A&M,,,,,8539,SEC Outdoor Track & Field Championships,"May 11, 2017"
1,2,2017,D1,Dec,Male,"Williams, Devon",SR-4,Georgia,,,,,8345,Spec Towns Invitational,"Apr 7, 2017"
2,3,2017,D1,Dec,Male,"Wieland, Luca",SR-4,Minnesota,,,,,8201,Mt. SAC Relays/CA Invitational Multi Events,"Apr 12, 2017"
3,4,2017,D1,Dec,Male,"Duckworth, Tim",JR-3,Kentucky,,,,,7973,Spec Towns Invitational,"Apr 7, 2017"
4,5,2017,D1,Dec,Male,"Saluri, Karl",JR-3,Georgia,,,,,7948,SEC Outdoor Track & Field Championships,"May 11, 2017"
5,6,2017,D1,Dec,Male,"Walton, Cody",JR-3,Nebraska,,,,,7937,Spec Towns Invitational,"Apr 7, 2017"
6,7,2017,D1,Dec,Male,"Filip, Scott",JR-3,Rice,,,,,7915,2017 90th Clyde Littlefield Texas Relays,"Mar 29, 2017"
7,8,2017,D1,Dec,Male,"Mahler, Wolf",SR-4,Texas,,,,,7897,2017 90th Clyde Littlefield Texas Relays,"Mar 29, 2017"
8,9,2017,D1,Dec,Male,"Price, Hunter",JR-3,Colorado St.,,,,,7801,Mt. SAC Relays/CA Invitational Multi Events,"Apr 12, 2017"
9,10,2017,D1,Dec,Male,"Leemet, Markus",JR-3,South Carolina,,,,,7720,SEC Outdoor Track & Field Championships,"May 11, 2017"


In [68]:
# Peek at table 13; the saved output shows the raw columns, with the rank still
# in 'Unnamed: 0'.
dfs_copy[13].head()

Unnamed: 0.1,Unnamed: 0,ATHLETE,YEAR,TEAM,TIME,MEET,MEET DATE
0,1,"Rohrer, Anna",FR-1,Notre Dame,31:58.99,Stanford Invitational,"Mar 31, 2017"
1,2,"Taylor, Charlotte",SR-4,San Francisco,32:11.80,Stanford Invitational,"Mar 31, 2017"
2,3,"Wright, Alice",JR-3,New Mexico,32:29.28,Payton Jordan Invitational,"May 5, 2017"
3,4,"Kempfer, Jamie",SO-2,Missouri,33:09.67,Stanford Invitational,"Mar 31, 2017"
4,5,"Blaney, Anne-Marie",SR-4,UCF,33:18.81,Stanford Cardinal Classic,"Apr 21, 2017"


In [93]:
# Add EVENT, SEX, DIVISION and CHAMP_YEAR columns to each dataframe in a list of
# dataframes, rename the rank column to POSITION and put the columns in a fixed order.

def add_variables(df_list, division):
    '''Annotate and re-order the scraped event tables in df_list, in place.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Exactly one table per entry of the event list below, in that order.
        Each table needs a 'MEET DATE' column of strings ending in the year.
    division : str
        Label written to the DIVISION column of every table.

    For every table this adds EVENT and SEX (from the table's position),
    DIVISION, and CHAMP_YEAR (last four characters of the table's own
    MEET DATE values), renames 'Unnamed: 0' to POSITION, and replaces the list
    entry with a frame re-indexed to the fixed column order. TIME_SECS is only
    created as an empty column here; it is not computed.

    Raises
    ------
    IndexError
        If df_list does not hold exactly as many tables as there are events.
        Raised before any frame is touched.
    '''
    # Two tables per event with the male table first (so '110H' precedes
    # '100H'); the two multi-event tables come last in the opposite order.
    events = ['100', '100', '200', '200', '400', '400', '800', '800', '1500', '1500',
              '5000', '5000', '10000', '10000',
              '110H', '100H', '400H', '400H', '3000S', '3000S', '4x100', '4x100', '4x400', '4x400',
              'HJ', 'HJ', 'PV', 'PV', 'LJ', 'LJ', 'TJ', 'TJ', 'SP', 'SP', 'DT', 'DT', 'HT', 'HT', 'JT', 'JT',
              'Hep', 'Dec']
    # Male/Female alternate for the 40 paired tables (indices 0-39, including
    # the last javelin table); heptathlon is Female, decathlon is Male.
    sexes = ['Male', 'Female'] * 20 + ['Female', 'Male']

    if len(df_list) != len(events):
        raise IndexError(
            'expected exactly %d event tables, got %d' % (len(events), len(df_list))
        )

    col_names = ['POSITION', 'CHAMP_YEAR', 'DIVISION', 'EVENT', 'SEX', 'ATHLETE', 'YEAR', 'TEAM', 'TIME', 'TIME_SECS',
                 'MARK', 'CONV', 'POINTS', 'MEET', 'MEET DATE']

    for i, df in enumerate(df_list):
        df['EVENT'] = events[i]
        df['SEX'] = sexes[i]
        df['DIVISION'] = division
        # Year comes from this table's own dates, not from a global frame.
        df['CHAMP_YEAR'] = [meet_date[-4:] for meet_date in df['MEET DATE']]
        df.rename(columns={'Unnamed: 0' : 'POSITION'}, inplace=True)
        # reindex returns a new frame: keeps only col_names, creates missing ones empty.
        df_list[i] = df.reindex(columns=col_names)

In [94]:
# Annotate the copies in place: add_variables replaces each entry of dfs_copy
# with its re-ordered frame and returns nothing.
add_variables(dfs_copy, 'D1')

In [96]:
# Spot-check table 12 after annotation; TIME_SECS is empty in the saved output
# because add_variables does not compute it.
dfs_copy[12]

Unnamed: 0,POSITION,CHAMP_YEAR,DIVISION,EVENT,SEX,ATHLETE,YEAR,TEAM,TIME,TIME_SECS,MARK,CONV,POINTS,MEET,MEET DATE
0,1,2017,D1,10000,Male,"Chelanga, Alfred",JR-3,Alabama,28:04.95,,,,,Stanford Invitational,"Mar 31, 2017"
1,2,2017,D1,10000,Male,"Scott, Marc",SR-4,Tulsa,28:07.97,,,,,Payton Jordan Invitational,"May 5, 2017"
2,3,2017,D1,10000,Male,"Peterson, Erik",SR-4,Butler,28:11.02,,,,,Stanford Invitational,"Mar 31, 2017"
3,4,2017,D1,10000,Male,"Mock, Jerrell",JR-3,Colorado St.,28:11.80,,,,,Stanford Invitational,"Mar 31, 2017"
4,5,2017,D1,10000,Male,"Kosgei, Antibahs",SR-4,Alabama,28:35.79,,,,,Stanford Invitational,"Mar 31, 2017"
5,6,2017,D1,10000,Male,"Kibichiy, Edwin",SR-4,Louisville,28:38.06,,,,,Raleigh Relays,"Mar 24, 2017"
6,7,2017,D1,10000,Male,"Choge, Jacob",FR-1,Mid. Tenn. State,28:42.31,,,,,Mt. SAC Relays,"Apr 13, 2017"
7,8,2017,D1,10000,Male,"Parsons, George",SR-4,North Carolina St.,28:43.74,,,,,Stanford Invitational,"Mar 31, 2017"
8,9,2017,D1,10000,Male,"Herriott, Zach",SR-4,Virginia,28:44.22,,,,,Raleigh Relays,"Mar 24, 2017"
9,10,2017,D1,10000,Male,"Young, Clayton",SO-2,BYU,28:45.36,,,,,Stanford Invitational,"Mar 31, 2017"


### Lists of URLs for Division I and Division II

In [73]:
# Final outdoor qualifying list pages, one URL per available season and division.

# Division I: seasons 2019 down to 2010.
D1_top100 = [
    'https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o',
    'https://www.tfrrs.org/archived_lists/2279/2018_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2018/o',
    'https://www.tfrrs.org/archived_lists/1912/2017_NCAA_Div._I_Outdoor_Qualifying_(FINAL)/2017/o',
    'https://www.tfrrs.org/archived_lists/1688/2016_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2016/o',
    'https://www.tfrrs.org/archived_lists/1439/2015_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2015/o',
    'https://www.tfrrs.org/archived_lists/1228/2014_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2014/o',
    'https://www.tfrrs.org/archived_lists/1029/2013_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2013/o',
    'https://www.tfrrs.org/archived_lists/840/2012_NCAA_Div._I_Outdoor_Qualifiers_(Final)/2012/o',
    'https://www.tfrrs.org/archived_lists/673/2011_NCAA_Division_I_Outdoor_POP_List_(FINAL)/2011/o',
    'https://www.tfrrs.org/archived_lists/528/2010_NCAA_Division_I_Outdoor_POP_List_(FINAL)/2010/o',
]

# Division II: the 2019 list first, then older lists referenced by numeric id
# (presumably one per earlier season -- verify).
D2_top100 = [
    'https://www.tfrrs.org/archived_lists/2571/2019_NCAA_Div._II_Outdoor_Qualifying_(FINAL)/2019/o',
    'https://www.tfrrs.org/lists/2282.html',
    'https://www.tfrrs.org/lists/1913.html',
    'https://www.tfrrs.org/lists/1685.html',
    'https://www.tfrrs.org/lists/1442.html',
    'https://www.tfrrs.org/lists/1231.html',
    'https://www.tfrrs.org/lists/1032.html',
    'https://www.tfrrs.org/lists/841.html',
    'https://www.tfrrs.org/lists/674.html',
    'https://www.tfrrs.org/lists/529.html',
]

In [85]:
%%time
# Scrape all ten Division I seasons.
# NOTE(review): the saved run of this cell ended in IndexError (see the output below),
# so d1_scraped was not reassigned by it.
d1_scraped = scrape_tfrrs(D1_top100, 'D1')

IndexError: list index out of range

In [40]:
%%time
# Scrape all ten Division II lists (about 15 minutes in the saved run).
d2_scraped = scrape_tfrrs(D2_top100, 'D2')

Wall time: 14min 57s


In [91]:
# Check which seasons are present in d1_scraped. The saved output lists only 2019 and
# 2018, consistent with the ten-season scrape above having failed with IndexError.
d1_scraped.CHAMP_YEAR.unique()

array(['2019', '2018'], dtype=object)

In [None]:
# # write to csv
# d1_scraped.to_csv('d1_scraped.csv')

In [87]:
# Write the Division II scrape to d2_scraped.csv in the working directory.
# With to_csv's defaults the row index is written too, as an unnamed first column.
d2_scraped.to_csv('d2_scraped.csv')

In [None]:
# graph some stuff
import matplotlib.pyplot as plt
from matplotlib import dates