# Scraping https://www.tfrrs.org/ Performance List Archives 
By Adam Visokay
### This scraper will gather available data for Outdoor Qualifying lists from 2010-2019 and 2021 for Divisions I, II and III.  I ignore 2020 because those performance lists are incomplete due to the COVID-19 pandemic ending the season prematurely in March.  
### Each year and division has its own URL.  
### Each url contains the top 100 performances at the end of the regular season (before championships) for each of the following NCAA Outdoor Championship events :
### 100 200 400 800 1500 5000 10000 100H 110H 400H 3000S 4x100 4x400 HJ PV LJ TJ SP DT HT JT Hep Dec

### Import Python libraries

In [None]:
import pandas as pd
import numpy as np
import re
import datetime

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup


### Intermediate functions I will use to clean the data

In [None]:
def add_events(df_list):
    '''Write an EVENT column into every dataframe of df_list, in place.

    The dataframe at position n receives the n-th label of a fixed 42-entry table.
    Each event name is listed twice in a row (one table per sex), except the
    hurdles pair ('100H' then '110H') and the multi-event pair ('Hep' then 'Dec').
    A df_list longer than the table raises IndexError.'''

    def twice(names):
        # every name repeated back to back: ['a', 'b'] -> ['a', 'a', 'b', 'b']
        return [name for name in names for _ in range(2)]

    events = (twice(['100', '200', '400', '800', '1500', '5000', '10000'])
              + ['100H', '110H']
              + twice(['400H', '3000S', '4x100', '4x400',
                       'HJ', 'PV', 'LJ', 'TJ', 'SP', 'DT', 'HT', 'JT'])
              + ['Hep', 'Dec'])

    for position, frame in enumerate(df_list):
        frame['EVENT'] = events[position]

In [None]:
def add_sex(df_list):
    '''Write a SEX column ('Men' or 'Women') into the first 42 dataframes of df_list, in place.

    Tables alternate Men, Women by position, except at positions 14/15 (100H/110H)
    and 40/41 (Hep/Dec), where the women's table comes first.
    df_list must hold at least 42 dataframes, otherwise IndexError is raised.'''

    # positions where the usual "even index = Men" rule is flipped
    women_first = {14, 15, 40, 41}

    for position in range(42):
        is_men = position % 2 == 0
        if position in women_first:
            is_men = not is_men
        df_list[position]['SEX'] = 'Men' if is_men else 'Women'
    

In [None]:
def add_division(df_list, division):
    '''Write a DIVISION column holding the given division label (e.g. D1, D2 or D3)
    into every dataframe of df_list, in place.'''

    for frame in df_list:
        frame['DIVISION'] = division

In [None]:
def add_champ_year(df_list):
    '''Write a CHAMP_YEAR column into every dataframe of df_list, in place.

    The value is the last four characters of each row's 'MEET DATE' entry,
    kept as a string.'''

    for frame in df_list:
        meet_dates = frame['MEET DATE']
        frame['CHAMP_YEAR'] = [meet_date[-4:] for meet_date in meet_dates]

In [None]:
def clean_time(df_list):
    '''Normalise the TIME column of every timed-event dataframe in df_list, in place.

    Takes times in the format %M:%S.%f or %S.%f and converts them to %M:%S:%f or %S:%f
    so they can be parsed with the datetime library. Parenthesised annotations and the
    '@' and 'h' marker characters are removed, and surrounding whitespace is stripped.

    A dataframe is left untouched when any of its TIME values renders as text
    containing 'nan' (i.e. the TIME column is missing for that event).'''

    # compiled once instead of once per dataframe
    parenthetical = re.compile(r'\([^)]*\)')

    for df in df_list:
        times = df['TIME'].astype(str)  # make sure all times are str format

        # `not` rather than `~`: bitwise-inverting a plain Python bool is always truthy
        if times.str.contains('nan').any():
            continue

        times = times.map(lambda raw: parenthetical.sub('', raw))
        # regex=False: '.' must be replaced literally, whatever this pandas version's default is
        times = (times.str.replace('.', ':', regex=False)
                      .str.replace('@', '', regex=False)
                      .str.replace('h', '', regex=False))
        # removing "(...)" can leave stray spaces that strptime would later reject
        df['TIME'] = times.str.strip()

In [None]:
def add_total_seconds(df_list):
    '''Add a TIME_SECS column (total seconds as float) to every timed-event dataframe, in place.

    Expects TIME values already cleaned to '%S:%f' or '%M:%S:%f' text.
    A dataframe is skipped when any of its TIME values renders as text containing 'nan'.
    Raises ValueError (from strptime) if a value matches neither format.'''

    # strptime fills missing date fields with 1900-01-01, so subtracting that
    # instant leaves just the elapsed time
    epoch = datetime.datetime(1900, 1, 1)

    for df in df_list:
        # `not`-style test rather than `~`: bitwise-inverting a plain Python bool is always truthy
        if df['TIME'].astype(str).str.contains('nan').any():
            continue

        results = []
        for time in df['TIME']:
            text = str(time).strip()
            # choose the format from the number of fields, not the string length,
            # so values such as '59:123' are not mistaken for minute:second:fraction
            fmt = '%M:%S:%f' if text.count(':') >= 2 else '%S:%f'
            elapsed = datetime.datetime.strptime(text, fmt) - epoch
            results.append(elapsed.total_seconds())
        df['TIME_SECS'] = results

### Combine above functions into one cleaning function.  

In [None]:
def clean_dfs(df_list, division):
    '''Run the full cleaning pipeline over df_list, modifying the list in place.

    df_list  -- list of dataframes, one per event table, in page order
    division -- division label (str) written to every dataframe

    Steps: add EVENT, SEX, DIVISION and CHAMP_YEAR columns; rename the unnamed
    first column to POSITION; keep only the known columns in a fixed order
    (columns a table lacks come out empty); normalise TIME and derive
    TIME_SECS for the running events. Returns nothing.
    '''

    # label columns first: CHAMP_YEAR needs 'MEET DATE' as scraped
    add_events(df_list)
    add_sex(df_list)
    add_division(df_list, division)
    add_champ_year(df_list)

    # final column layout; anything not listed here is dropped
    ordered_columns = ['POSITION', 'CHAMP_YEAR', 'DIVISION', 'EVENT', 'SEX',
                       'ATHLETE', 'YEAR', 'TEAM', 'TIME', 'TIME_SECS',
                       'MARK', 'CONV', 'POINTS', 'MEET', 'MEET DATE']

    for slot, frame in enumerate(df_list):
        frame.rename(columns={'Unnamed: 0' : 'POSITION'}, inplace=True)
        # reindex returns a new dataframe, so it has to be stored back into the list
        df_list[slot] = frame.reindex(columns=ordered_columns)

    # time handling runs last, on the re-laid-out dataframes
    clean_time(df_list)
    add_total_seconds(df_list)

### Final function that loops through list of urls for each division, scrapes and then cleans them.

In [None]:
def scrape_tfrrs(url_list, division):
    '''Scrape and clean every performance list in url_list and return one combined dataframe.

    url_list -- list of page URLs; each page is read into one dataframe per HTML table
    division -- division label (str) passed on to clean_dfs

    Returns the concatenation of all cleaned dataframes with rows numbered from 1.
    An empty url_list makes pd.concat raise ValueError.'''
    cleaned_dfs = []
    for url in url_list:
        # scrape url into list of dfs; the request carries a browser-like User-Agent
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # context manager closes the HTTP response once the page bytes are read
        with urlopen(req) as response:
            webpage = response.read()
        df_list = pd.read_html(webpage)

        # clean each df in df_list (clean_dfs replaces the list entries in place)
        clean_dfs(df_list, division)

        # add each cleaned df to cleaned_dfs list
        cleaned_dfs.extend(df_list)

    # concatenate cleaned_dfs list into one long dataframe
    result = pd.concat(cleaned_dfs)
    result.index = range(1, len(result) + 1)

    return result

### CLEAN ONE URL AT A TIME
Unfortunately, the D1 2010 URL includes a Men's and Women's 3000m, which throws off the scraper, so D1 2010 must be handled on its own. The function below applies the same cleaning steps as the code above.

In [10]:
def add_variables(df_list, division):
    '''Clean the dataframes in df_list in place for a page that is handled on its own.

    df_list  -- list of dataframes, one per event table, already trimmed to the
                standard 42-table layout
    division -- division label (str) written to every dataframe

    Runs the same pipeline as clean_dfs (EVENT, SEX, DIVISION and CHAMP_YEAR columns,
    POSITION rename, column reorder, TIME clean-up, TIME_SECS), so it simply delegates
    to it rather than keeping a second copy of every step. Returns nothing.'''

    clean_dfs(df_list, division)

### SCRAPE ONE URL AT A TIME
Scrape the D1 2010 url into a list of dataframes for each event. 

In [None]:
%%time

d12010 = 'https://www.tfrrs.org/lists/528.html'

# The request carries a browser-like User-Agent header.
req = Request(d12010, headers={'User-Agent': 'Mozilla/5.0'})
# Context manager closes the HTTP response once the page bytes are read.
with urlopen(req) as response:
    webpage = response.read()
soup = BeautifulSoup(webpage)
# One dataframe per HTML table found on the page.
dfs = pd.read_html(webpage)

In [None]:
# Work on independent deep copies so the scraped tables in dfs are never modified.
dfs_copy2010 = [scraped_table.copy(deep=True) for scraped_table in dfs]

### This shows the top results for the Men's and Women's 3000m in the 2010 results that we do not want in our data.

In [None]:
# Preview the table at index 10, the first of the two extra 3000m tables on this page.
dfs_copy2010[10].head()

In [None]:
# Preview the table at index 11, the second of the two extra 3000m tables on this page.
dfs_copy2010[11].head()

### Drop the 3000m results from the list of dataframes.

In [None]:
# Remove the two 3000m tables (originally at indices 10 and 11).
# After the first delete the old index 11 shifts down to 10, hence index 10 twice.
del dfs_copy2010[10]
del dfs_copy2010[10]

### Now it correctly skips from Women's 1500 to Men's 5000

In [None]:
# Index 9 should still be the last 1500 table.
dfs_copy2010[9].head()

In [None]:
# Index 10 should now be the first 5000 table instead of a 3000m table.
dfs_copy2010[10].head()

### Using the cleaning function from above, let's clean the 2010 data

In [None]:
# Cleans in place: add_variables returns nothing and replaces the entries of
# dfs_copy2010 with the cleaned, re-ordered dataframes.
add_variables(dfs_copy2010, 'D1')

### Taking a peek at the Women's 10000, it looks like we have what we want for our edge case DI 2010.

In [None]:
# Index 13 is labelled EVENT '10000' / SEX 'Women' by the cleaning step.
dfs_copy2010[13].head()

### Lists of URLs for Divisions I, II and III
Now that we have handled the edge case DI 2010, we can use the automated looping scraper to handle sets of urls at a time for each division I, II, and III. This list of urls is taken from https://www.tfrrs.org/archives.html and includes the top 100 performances per event from the available years 2010-2019 and current 2021.

In [None]:
# Division I performance-list pages, newest first. The D1 2010 page (lists/528.html)
# is not listed here because it is scraped and cleaned separately.
# NOTE(review): lists/2909.html has an id adjacent to the 2020 D2/D3 lists (2908, 2907),
# so it is presumably the 2020 D1 list; confirm whether it belongs here, since the
# notebook introduction says 2020 is ignored.
D1_top100 = ['https://www.tfrrs.org/lists/3191/2021_NCAA_Division_I_Outdoor_Qualifying/2021/o',
             'https://www.tfrrs.org/lists/2909.html',
             'https://www.tfrrs.org/archived_lists/2568/2019_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2019/o',
             'https://www.tfrrs.org/archived_lists/2279/2018_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2018/o',
             'https://www.tfrrs.org/archived_lists/1912/2017_NCAA_Div._I_Outdoor_Qualifying_(FINAL)/2017/o',
             'https://www.tfrrs.org/archived_lists/1688/2016_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2016/o',
             'https://www.tfrrs.org/archived_lists/1439/2015_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2015/o',
             'https://www.tfrrs.org/archived_lists/1228/2014_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2014/o',
             'https://www.tfrrs.org/archived_lists/1029/2013_NCAA_Division_I_Outdoor_Qualifying_(FINAL)/2013/o',
             'https://www.tfrrs.org/archived_lists/840/2012_NCAA_Div._I_Outdoor_Qualifiers_(Final)/2012/o',
             'https://www.tfrrs.org/archived_lists/673/2011_NCAA_Division_I_Outdoor_POP_List_(FINAL)/2011/o']

# Division II performance-list pages, newest first.
# NOTE(review): the second URL names a 2020 list and carries '?gender=m'; confirm it is
# wanted (the introduction says 2020 is ignored) and that the page still yields the
# 42 tables the cleaning functions expect.
# NOTE(review): lists/529.html is presumably the 2010 list (id adjacent to the D1 2010
# page, 528); verify it does not contain the extra 3000m tables seen on the D1 2010 page.
D2_top100 = ['https://www.tfrrs.org/lists/3194/2021_NCAA_Division_II_Outdoor_Qualifying',
             'https://www.tfrrs.org/lists/2908/2020_NCAA_Div._II_Outdoor_Qualifying/2020/o?gender=m',
             'https://www.tfrrs.org/archived_lists/2571/2019_NCAA_Div._II_Outdoor_Qualifying_(FINAL)/2019/o',
             'https://www.tfrrs.org/lists/2282.html',
             'https://www.tfrrs.org/lists/1913.html',
             'https://www.tfrrs.org/lists/1685.html',
             'https://www.tfrrs.org/lists/1442.html',
             'https://www.tfrrs.org/lists/1231.html',
             'https://www.tfrrs.org/lists/1032.html',
             'https://www.tfrrs.org/lists/841.html',
             'https://www.tfrrs.org/lists/674.html',
             'https://www.tfrrs.org/lists/529.html']

# Division III performance-list pages, newest first.
# NOTE(review): same questions as for Division II — the second URL names a 2020 list with
# '?gender=m', and lists/530.html is presumably the 2010 list; confirm both.
D3_top100 = ['https://www.tfrrs.org/lists/3195/2021_NCAA_Division_III_Outdoor_Qualifying/2021/o',
             'https://www.tfrrs.org/lists/2907/2020_NCAA_Div._III_Outdoor_Qualifying/2020/o?gender=m',
             'https://www.tfrrs.org/lists/2572.html',
             'https://www.tfrrs.org/lists/2283.html',
             'https://www.tfrrs.org/lists/1914.html',
             'https://www.tfrrs.org/lists/1684.html',
             'https://www.tfrrs.org/lists/1443.html',
             'https://www.tfrrs.org/lists/1232.html',
             'https://www.tfrrs.org/lists/1033.html',
             'https://www.tfrrs.org/lists/842.html',
             'https://www.tfrrs.org/lists/675.html',
             'https://www.tfrrs.org/lists/530.html']

### Time to Scrape! Apply the scrape function to the Division I, II and III lists of urls as well as the DI 2010 edge case that we already cleaned.  This will take some time. 

In [None]:
%%time
# Scrape and clean every Division I page in D1_top100.
d1_scraped = scrape_tfrrs(D1_top100, 'D1')

# add DI 2010 to d1_scraped
# (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
d1_scraped = pd.concat([d1_scraped, pd.concat(dfs_copy2010)])

# the appended 2010 rows keep their own labels, so renumber everything from 1
d1_scraped.index = range(1, len(d1_scraped) + 1)

In [None]:
%%time
# Scrape and clean every Division II page, then number the rows from 1.
# The reindex repeats what scrape_tfrrs already does to its result before returning.
d2_scraped = scrape_tfrrs(D2_top100, 'D2')
d2_scraped.index = range(1, len(d2_scraped) + 1)

In [None]:
%%time
# Scrape and clean every Division III page, then number the rows from 1.
# The reindex repeats what scrape_tfrrs already does to its result before returning.
d3_scraped = scrape_tfrrs(D3_top100, 'D3')
d3_scraped.index = range(1, len(d3_scraped) + 1)

### Time to turn the individual lists of scraped and cleaned dataframes into one big dataframe. 

In [None]:
# Stack the three per-division dataframes in D1, D2, D3 order.
big_list = [d1_scraped, d2_scraped, d3_scraped]

tfrrs_scraped = pd.concat(big_list)
# Each division's rows are numbered from 1, so renumber the combined frame 1..N.
tfrrs_scraped.index = range(1, len(tfrrs_scraped) + 1)

In [None]:
# Show the last rows of the combined dataframe as a quick sanity check.
tfrrs_scraped.tail()

### Finally we can output individual csv files for each division and the combined scrape.  

In [None]:
# Write one CSV per division plus the combined file into the current working directory.
# index=False leaves the 1..N row labels out of the files.
d1_scraped.to_csv('d1top100.csv', index=False)

d2_scraped.to_csv('d2top100.csv', index=False)

d3_scraped.to_csv('d3top100.csv', index=False)

tfrrs_scraped.to_csv('tfrrstop100.csv', index=False)

### Voila! Now we have some data to work with.  