# EDA Part II

<i>**Author:** Brendan McDonnell</i>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('manual_date_imputation.csv')

## Dealing With the Date Column 

In [3]:
# taken from stack overflow: https://stackoverflow.com/questions/3418050/month-name-to-month-number-and-vice-versa-in-python
def month_to_num(short_month):
    """Return the calendar month number (1-12) for a three-letter month abbreviation.

    The abbreviation must be lowercase ('jan', 'feb', ..., 'dec'); any other
    key raises ``KeyError``.
    """
    abbreviations = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                     'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    # position in the tuple (counted from 1) is the month number
    number_by_abbreviation = {
        abbreviation: number
        for number, abbreviation in enumerate(abbreviations, start=1)
    }
    return number_by_abbreviation[short_month]

In [4]:
def date_to_season(date):
    """Normalise a raw meet date string to ``'YYYY-M-D'`` (first day of the meet).

    Accepted shapes:
      * ``'02/22 - Feb 23, 2019'`` -> ``'2019-2-22'``  (numeric month/day before the comma)
      * ``'Jun 5-8, 2019'``        -> ``'2019-6-5'``   (month name, day range)
      * ``'Jun 5, 2019'``          -> ``'2019-6-5'``   (month name, single day)
      * ``'15-Feb-19'``            -> ``'2019-2-15'``  (day-month-two-digit-year; the
        year is prefixed with ``'20'``)

    The month is always emitted as a number so that every returned string has
    the same layout; month names are resolved with ``month_to_num`` using the
    first three letters, lowercased.

    Raises ``KeyError`` for an unknown month name and ``ValueError`` /
    ``IndexError`` for strings that fit none of the shapes above.
    """
    if ',' in date:
        pieces = date.split(',')
        day_part = pieces[0]
        year = int(pieces[1])
        if '/' in day_part:
            # 'MM/DD - Mon DD': take the leading numeric month and day
            slash_fields = day_part.split('/')
            month = int(slash_fields[0])
            day = int(slash_fields[1].split()[0])
        else:
            # 'Mon D-D' or 'Mon D': month name first, then the (first) day
            tokens = day_part.split()
            month = month_to_num(tokens[0][:3].lower())
            day = int(tokens[1].split('-')[0])
    else:
        # 'DD-Mon-YY'
        dash_fields = date.split('-')
        year = int('20' + dash_fields[2])
        month = month_to_num(dash_fields[1].lower())
        day = int(dash_fields[0])
    return str(year) + '-' + str(month) + '-' + str(day)

In [5]:
df['datetime'] = pd.to_datetime(df.Date.map(date_to_season))

In [6]:
df = df.drop(columns='Unnamed: 0')

In [7]:
df.head()

Unnamed: 0,Name,Grade,Team,Location,Date,Events,Performance,Place,event,is_relay,has_hurdles,is_field_event,is_multi_event,datetime
0,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Championships,"02/22 - Feb 23, 2019",1000,02:59.0,8th (F),1000,0,0,0,0,2019-02-22
1,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Championships,"02/22 - Feb 23, 2019",1000,02:51.0,4th (P),1000,0,0,0,0,2019-02-22
2,Abbie Hetherington,(Sr-4),Oklahoma State,Arkansas Qualifier,15-Feb-19,800,02:11.7,3rd (F),800,0,0,0,0,2019-02-15
3,Abbie Hetherington,(Sr-4),Oklahoma State,2019 Husky Classic,"02/08 - Feb 09, 2019",800,02:11.1,15th (F),800,0,0,0,0,2019-02-08
4,Abbie Hetherington,(Sr-4),Oklahoma State,2019 UW Invitational,"01/25 - Jan 26, 2019",800,02:10.0,18th (F),800,0,0,0,0,2019-01-25


In [8]:
# DNF: the one performance that is null can be dropped
# (source: https://www.tfrrs.org/athletes/5464384.html)

df = df.dropna(axis=0, subset=['Performance'])

## Updating Time Column for Purdy Points Eligible Events

### Taking Care of Performances and Places 

In [9]:
# Performance values that contain no digit at all are neither times nor marks
# and need to be removed; collect the distinct offending values here.
# The pattern is a raw string so that `\d` reaches the regex engine verbatim
# instead of being read as an (invalid) Python string escape.
list_of_non_performances = list(df[~df.Performance.str.contains(r'\d')].Performance.unique())

In [10]:
# Keep only rows with a real performance. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not write
# into a slice of the previous frame (SettingWithCopyWarning).
df = df[~df['Performance'].isin(list_of_non_performances)].copy()

In [11]:
# fill NA's in the place column w/ 999
df.Place = df.Place.fillna('999')

In [12]:
df.shape

(187176, 14)

In [13]:
# quick check
df[~df.Place.str.contains('p|P|f|F|999')].Place.unique()

array(['26th', '57th', '40th', '178th', '36th', '23rd', '63rd', '111st',
       '49th', '218th', '12th', '21st', '27th', '24th', '116th', '25th',
       '52nd', '9th', '226th', '34th', '35th', '68th', '59th', '78th',
       '16th', '45th', '3rd', '2nd', '17th', '1st', '120th', '15th',
       '31st', '38th', '10th', '6th', '79th', '163rd', '8th', '70th',
       '219th', '13th', '91st', '46th', '126th', '129th', '5th', '19th',
       '100th', '60th', '237th', '167th', '18th', '144th', '7th', '65th',
       '190th', '14th', '67th', '74th', '200th', '90th', '29th', '172nd',
       '115th', '75th', '11th', '56th', '131st', '101st', '66th', '28th',
       '4th', '146th', '183rd', '157th', '55th', '72nd', '136th', '133rd',
       '207th', '54th', '73rd', '182nd', '122nd', '58th', '170th', '22nd',
       '53rd', '95th', '102nd', '124th', '64th', '105th', '245th',
       '171st', '37th', '48th', '165th', '41st', '33rd', '162nd', '61st',
       '30th', '32nd', '140th', '77th', '224th', '76th', '

In [14]:
# mark prelims values for place
df['is_prelim'] = df.Place.str.contains('p|P')

In [15]:
def return_place(place):
    """Convert a place label such as ``'8th (F)'`` to its integer rank.

    Parentheses, spaces and ASCII letters are stripped from both ends of the
    lowercased label and what remains is parsed as an int. The placeholder
    ``'999'`` and labels with nothing left after stripping both yield 999.
    """
    if place == '999':
        return int(place)
    rank_text = place.lower().strip('() abcdefghijklmnopqrstuvwxyz')
    return int(rank_text) if rank_text != '' else 999

In [16]:
df['Place'] = df.Place.apply(return_place)

In [17]:
# Create columns indicating season type for each season in dataset
#
# Each column is a boolean mask built by comparing `df.datetime` (parsed with
# pd.to_datetime in an earlier cell) against hard-coded date strings; a few
# seasons additionally exclude specific meets by their `Location` name.
# NOTE(review): the windows are not a clean partition of the calendar -- some
# boundary dates match two season columns and some match none. The individual
# cases are called out below; confirm they are intentional for this dataset.

# 2010-11 Indoor Track Season
# (no lower bound: every row dated on or before 2011-03-01 is flagged)
df['indoor_season_2011'] = df.datetime <= '2011-03-01'

# 2011 Outdoor Track Season
# (2011-03-01 satisfies both this window and the indoor one above)
df['outdoor_season_2011'] = (('2011-03-01' <= df.datetime) &
                            (df.datetime <= '2011-07-31'))

# 2011 XC Season Has 0 Entries

# 2011-12 Indoor Track Season
df['indoor_season_2012'] = (('2011-12-01' <= df.datetime) & 
                            (df.datetime <= '2012-03-02'))

# 2012 Outdoor Track Season
df['outdoor_season_2012'] = (('2012-03-03' <= df.datetime) & 
                             (df.datetime <= '2012-07-31'))

# 2012 XC Season Has 0 Entries

# 2012-13 Indoor Track Season
df['indoor_season_2013'] = (('2012-11-20' <= df.datetime) & 
                            (df.datetime <= '2013-03-10'))

# 2013 Outdoor Track Season
# (2013-03-10 satisfies both this window and the indoor one above)
df['outdoor_season_2013'] = (('2013-03-10' <= df.datetime) & 
                             (df.datetime <= '2013-07-31'))

# 2013 XC Season
df['xc_season_2013'] = (('2013-08-01' <= df.datetime) & 
                        (df.datetime <= '2013-11-30'))

# 2013-14 Indoor Track Season
df['indoor_season_2014'] = (('2013-12-01' <= df.datetime) & 
                            (df.datetime <= '2014-03-15'))

# 2014 Outdoor Track Season
df['outdoor_season_2014'] = (('2014-03-16' <= df.datetime) & 
                             (df.datetime <= '2014-07-31'))

# 2014 XC Season
df['xc_season_2014'] = (('2014-08-01' <= df.datetime) & 
                        (df.datetime <= '2014-11-30'))

# 2014-15 Indoor Track Season
# The date window overlaps the 2015 outdoor window (2015-03-06 .. 2015-03-13);
# the listed meets are excluded by name -- presumably outdoor meets held
# inside that overlap (TODO confirm).
df['indoor_season_2015'] = (('2014-12-01' <= df.datetime) & 
                           (df.datetime <= '2015-03-13') & 
                           (df.Location != '2015 Ben Brown Invitational') & 
                           (df.Location != 'Cal State LA Invitational') & 
                           (df.Location != 'Cal Opener 2015') & 
                           (df.Location != '2015 Skyline HS / Joel Ezar Track Classic') & 
                           (df.Location != 'Jaguar Opener'))

# 2015 Outdoor Track Season
# Starts inside the indoor window above; the two indoor championship meets are
# excluded by name so they are not flagged as outdoor.
df['outdoor_season_2015'] = (('2015-03-06' <= df.datetime) & 
                             (df.datetime <= '2015-07-31') &
                             (df.Location != 'NCAA Division I Indoor Track & Field Championships') &
                             (df.Location != '2015 IC4A/ECAC Indoor Track & Field Championships'))

# 2015 XC Season
df['xc_season_2015'] = (('2015-08-01' <= df.datetime) &
                        (df.datetime <= '2015-11-21'))

# 2015-16 Indoor Track Season
df['indoor_season_2016'] = (('2015-11-22' <= df.datetime) &
                            (df.datetime <= '2016-03-11'))

# 2016 Outdoor Track Season
df['outdoor_season_2016'] = (('2016-03-12' <= df.datetime) &
                             (df.datetime <= '2016-08-21'))

# 2016 XC Season
df['xc_season_2016'] = (('2016-08-22' <= df.datetime) &
                        (df.datetime <= '2016-11-19'))

# 2016-17 Indoor Track Season
# (shares 2017-03-10 with the outdoor window below; one meet excluded by name)
df['indoor_season_2017'] = (('2016-11-20' <= df.datetime) &
                            (df.datetime <= '2017-03-10') &
                            (df.Location != 'Central Arkansas Spring Opener'))

# 2017 Outdoor Track Season
# (upper bound is strict: 2017-08-04 matches neither this window nor the XC
# window below, which starts on 2017-08-05)
df['outdoor_season_2017'] = (('2017-03-10' <= df.datetime) &
                             (df.datetime < '2017-08-04') &
                             (df.Location != 'NCAA Division I Indoor Track & Field Championships'))

# 2017 XC Season
df['xc_season_2017'] = (('2017-08-05' <= df.datetime) & 
                        (df.datetime <= '2017-11-20'))

# 2017-18 Indoor Track Season
df['indoor_season_2018'] = (('2017-11-21' <= df.datetime) &
                            (df.datetime <= '2018-03-09'))

# 2018 Outdoor Track Season
# (upper bound is strict: 2018-07-31 itself is not included)
df['outdoor_season_2018'] = (('2018-03-10' <= df.datetime) &
                             (df.datetime < '2018-7-31'))

# 2018 XC Season
# (both bounds are strict: 2018-08-01 and 2018-11-20 are not included, and the
# next indoor window starts on 2018-11-21, so 2018-11-20 matches no season)
df['xc_season_2018'] = (('2018-08-01' < df.datetime) & 
                        (df.datetime < '2018-11-20'))

# 2018-19 Indoor Track Season
df['indoor_season_2019'] = (('2018-11-21' <= df.datetime) & 
                            (df.datetime <= '2019-03-12'))

# 2019 Outdoor Track Season
# (no upper bound: every row dated on or after 2019-03-13 is flagged)
df['outdoor_season_2019'] = ('2019-03-13' <= df.datetime)

### Indicating Championships: Conference, Regional, and National

In [18]:

list(df.Location.unique())

['Big 12 Championships',
 'Arkansas Qualifier',
 '2019 Husky Classic',
 '2019 UW Invitational',
 'Arkansas Invitational',
 'Big 12 Cross Country Championships',
 'Arturo Barrios Invitational',
 'Nuttycombe Wisconsin Invitational Presented By Under Armour',
 'NCAA West Preliminary Round',
 'Arkansas Twilight',
 'National Relay Championships',
 'John McDonnell Invitational',
 'Stanford Invitational',
 'NCAA Division I Indoor Track & Field Championships',
 'Big 12 Indoor Track & Field Championships',
 '2018 Husky Classic',
 'Razorback Invitational',
 'NCAA Division I West Preliminary Round',
 'Big 12 Outdoor Track & Field Championships',
 '2017 Drake Relays presented by Hy-Vee',
 'Arkansas Spring Invitational',
 '2017 Husky Classic',
 '2017 Holiday Inn Invitational',
 'NCAA Division I Cross Country Championships',
 'NCAA Division I Midwest Region Cross Country Championships',
 '2016 Big 12 Cross Country Championships',
 'Penn State National Open',
 'Iowa State Classic',
 'Rock Chalk Class

In [19]:
# Flag championship appearances by matching substrings of the meet name
# (the `Location` column):
#   - conference, regional and national championship meets
#   - USA Indoor and Outdoor Championships
#   - Olympic and World Championship appearances
# Every match is a case-sensitive `str.contains` search.
meet_name = df.Location.str

# masks that several flags share
is_championship = meet_name.contains('Champ')
is_u20 = meet_name.contains('U20')
is_junior = meet_name.contains('Junior')
is_usatf = meet_name.contains('USATF')

# world or olympic championship appearance (IAAF meets that are not U20, or Olympic Games)
df['oly_or_world_champs'] = ((meet_name.contains('IAAF') & ~is_u20) |
                             meet_name.contains('Olympic Games'))

# USA Championship: trials / USATF meets that are neither U20 nor Junior
df['usa_championship'] = ((meet_name.contains('Trial') | is_usatf) &
                          ~(is_u20 | is_junior))

# conference championships for the power 5
names_power_five_conference = (meet_name.contains('Big 12') |
                               meet_name.contains('Big Ten') |
                               meet_name.contains('Pac') |
                               meet_name.contains('SEC') |
                               meet_name.contains('ACC '))
df['conf_championship'] = names_power_five_conference & is_championship

# Indoor track nationals for DI only
df['indoor_track_nationals'] = (
    meet_name.contains('NCAA Division I Indoor Championships') |
    meet_name.contains('NCAA Division I Indoor Track & Field Championships')
)

# West / East NCAA Prelim Rounds (Regionals for track) for DI only
# (matches any meet name containing 'Round')
df['outdoor_track_regionals'] = meet_name.contains('Round')

# NCAAs for Outdoors DI only
df['outdoor_track_nationals'] = (
    meet_name.contains('NCAA Division I Track & Field Championships') |
    meet_name.contains('NCAA Division I Championships') |
    meet_name.contains('NCAA Division I Outdoor Track & Field Championships') |
    meet_name.contains('NCAA Division I Outdoor Championships') |
    meet_name.contains('NCAA Division I National Championships')
)

# XC Regionals for DI only ('II' in the name rules a meet out)
df['xc_regionals'] = (meet_name.contains('Region') &
                      is_championship &
                      meet_name.contains('NCAA') &
                      ~meet_name.contains('II'))

# XC Nationals for DI only
df['xc_nationals'] = (
    meet_name.contains('NCAA Division I Cross Country Championships') |
    meet_name.contains('NCAA DI Cross Country Championships')
)

# IAAF World Juniors
df['world_jr_championship'] = meet_name.contains('World') & is_u20

# USATF Junior Championships
df['usa_jr_championship'] = is_usatf & (is_junior | is_u20)

### Converting Time to Seconds 

In [20]:
# fixing two incorrect values: rows whose Performance is '14m' or '2m' must be
# flagged as field events.
# The correction is OR-ed into the existing 0/1 flag rather than added to it,
# so the column stays 0/1 even if a row is already flagged or this cell is run
# twice (a sum would produce 2, which the {1: True, 0: False} mapping applied
# afterwards cannot translate).
is_mismarked_field_event = df['Performance'].isin(['14m', '2m'])
df.is_field_event = (df.is_field_event.astype(bool) | is_mismarked_field_event).astype(int)

In [21]:
# convert the 0/1 indicator columns to booleans
zero_one_to_bool = {1: True, 0: False}
for flag_column in ['is_multi_event', 'is_relay', 'has_hurdles', 'is_field_event']:
    df[flag_column] = df[flag_column].map(zero_one_to_bool)

In [22]:
df.Performance

0         02:59.0
1         02:51.0
2         02:11.7
3         02:11.1
4         02:10.0
           ...   
195141       7.53
195142      24.36
195143    03:45.5
195144       7.61
195145    03:43.0
Name: Performance, Length: 187176, dtype: object

In [23]:
df.head()

Unnamed: 0,Name,Grade,Team,Location,Date,Events,Performance,Place,event,is_relay,...,oly_or_world_champs,usa_championship,conf_championship,indoor_track_nationals,outdoor_track_regionals,outdoor_track_nationals,xc_regionals,xc_nationals,world_jr_championship,usa_jr_championship
0,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Championships,"02/22 - Feb 23, 2019",1000,02:59.0,8,1000,False,...,False,False,True,False,False,False,False,False,False,False
1,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Championships,"02/22 - Feb 23, 2019",1000,02:51.0,4,1000,False,...,False,False,True,False,False,False,False,False,False,False
2,Abbie Hetherington,(Sr-4),Oklahoma State,Arkansas Qualifier,15-Feb-19,800,02:11.7,3,800,False,...,False,False,False,False,False,False,False,False,False,False
3,Abbie Hetherington,(Sr-4),Oklahoma State,2019 Husky Classic,"02/08 - Feb 09, 2019",800,02:11.1,15,800,False,...,False,False,False,False,False,False,False,False,False,False
4,Abbie Hetherington,(Sr-4),Oklahoma State,2019 UW Invitational,"01/25 - Jan 26, 2019",800,02:10.0,18,800,False,...,False,False,False,False,False,False,False,False,False,False


In [24]:
# filter by performances that are purdy points eligible:
# a row qualifies only if it falls in none of the cross-country seasons and is
# not a field event, relay, hurdles race or multi-event.
#
# The masks are combined with `&` (element-wise boolean AND). Multiplying
# boolean Series with `*` yields the same mask, but pandas warns that `*` is
# unsupported for the bool dtype and suggests `&` instead.

df['purdy_points_eligible'] = (~df.xc_season_2013 &
                               ~df.xc_season_2014 &
                               ~df.xc_season_2015 &
                               ~df.xc_season_2016 &
                               ~df.xc_season_2017 &
                               ~df.xc_season_2018 &
                               ~df.is_field_event &
                               ~df.is_relay &
                               ~df.has_hurdles &
                               ~df.is_multi_event)

  op=op_str, alt_op=unsupported[op_str]


In [25]:
# create purdy points eligible performance times:
# keep the Performance string on eligible rows, use '' everywhere else
df.purdy_points_eligible = df.Performance.where(df.purdy_points_eligible, '')

In [26]:
df['is_wind_aided'] = df.purdy_points_eligible.str.contains('W|w')

In [27]:
# wind-aided marks are not eligible: blank them out as well
df.purdy_points_eligible = df.purdy_points_eligible.where(~df.is_wind_aided, '')

In [28]:
df[df.purdy_points_eligible.str.contains('h|H')]

Unnamed: 0,Name,Grade,Team,Location,Date,Events,Performance,Place,event,is_relay,...,conf_championship,indoor_track_nationals,outdoor_track_regionals,outdoor_track_nationals,xc_regionals,xc_nationals,world_jr_championship,usa_jr_championship,purdy_points_eligible,is_wind_aided
42178,Mckenna Smith,(Red Shirt/Unattached),West Virginia,Marty Pushkin Track Classic,2-Dec-17,500.0,1:20.2H,1,500,False,...,False,False,False,False,False,False,False,False,1:20.2H,False
42418,Hayley Jackson,(Jr-3),West Virginia,Marty Pushkin Track Classic,2-Dec-17,1609.344,5:11.1H,1,Mile,False,...,False,False,False,False,False,False,False,False,5:11.1H,False
42663,Michaela Ashford,(Jr-3),West Virginia,Marty Pushkin Track Classic,2-Dec-17,60.0,7.7H,3,60,False,...,False,False,False,False,False,False,False,False,7.7H,False
42867,Marianne Abdalah,(So-2),West Virginia,Marty Pushkin Track Classic,2-Dec-17,3000.0,11:06.4H,1,3000,False,...,False,False,False,False,False,False,False,False,11:06.4H,False
42895,Shamoya Mcneil,(Sr-4),West Virginia,Marty Pushkin Track Classic,2-Dec-17,60.0,7.7H,5,60,False,...,False,False,False,False,False,False,False,False,7.7H,False


In [29]:
# calculate seconds and make a new column from the times given
def time_to_seconds(performance):
    """Convert a performance string to a number of seconds.

    Parameters
    ----------
    performance : object
        Converted with ``str()``. Expected shapes are ``'ss.s'``, ``'m:ss.s'``
        and ``'h:mm:ss.s'``, optionally surrounded by letters (for example a
        trailing ``'H'`` as in ``'7.7H'``) and whitespace.

    Returns
    -------
    float or int
        The time in seconds, or the integer 0 when nothing numeric is left
        after stripping letters (empty string, or purely alphabetic input).

    Raises
    ------
    ValueError
        If a colon-separated field is not a valid number.
    """
    performance = str(performance)
    # drop markers glued to either end of the time, then any leftover blanks
    cleaned = performance.lower().strip('abcdefghijklmnopqrstuvwxyz').strip()
    if cleaned == '':
        return 0
    # Fold the colon-separated fields left to right so each earlier field is
    # worth 60x the next: handles ss, m:ss and h:mm:ss with the same code.
    seconds = 0.0
    for field in cleaned.split(':'):
        seconds = seconds * 60 + float(field)
    return seconds

In [30]:
# a column of the number of seconds for purdy points eligible times
df['seconds'] = df.purdy_points_eligible.apply(time_to_seconds)

In [31]:
df.seconds

0         179.00
1         171.00
2         131.70
3         131.10
4         130.00
           ...  
195141      7.53
195142     24.36
195143      0.00
195144      7.61
195145      0.00
Name: seconds, Length: 187176, dtype: float64

In [32]:
# keep the event distance only where a time was computed; multiplying the
# Events string by False leaves '' on every other row
has_time = df.seconds != 0
df['meters'] = has_time * df.Events

In [33]:
df.meters.unique()

array(['1000', '800', '', '1500', '1609.344', '400', '600', '548.64',
       '5000', '10000', '3000', '100', '200', '60', '300', '55', '1600',
       '500', '914.4'], dtype=object)

In [34]:
# 0 meters if event is not purdy points eligible
df['meters'] = df['meters'].map(lambda meters: float(meters) if meters != '' else 0)

In [35]:
df.meters.unique()

array([ 1000.   ,   800.   ,     0.   ,  1500.   ,  1609.344,   400.   ,
         600.   ,   548.64 ,  5000.   , 10000.   ,  3000.   ,   100.   ,
         200.   ,    60.   ,   300.   ,    55.   ,  1600.   ,   500.   ,
         914.4  ])

## Purdy Points

Source for the Purdy points calculation implemented below: https://www.cs.uml.edu/~phoffman/xcinfo3.html

In [36]:
# Preparing for Purdy points function
# code translated from link above by Declan McDonnell
def frac(d):
    """Return the weighted share of a race of ``d`` metres used by ``purdy``.

    Distances under 110 return 0. Otherwise every complete 400 contributes
    200, and the remaining part-lap contributes the portion of it that lies
    between 50 and 150 plus the portion between 250 and 350 (at most 100
    each); the total is divided by ``d``.
    NOTE(review): presumably this is the fraction of the race run on the bends
    of a 400 m track -- confirm against the linked reference.
    """
    if d < 110:
        return 0
    full_laps = int(d / 400)
    remainder = d - full_laps * 400
    # clamp each 100-wide stretch of the part-lap to the range 0..100
    first_stretch = min(max(remainder - 50, 0), 100)
    second_stretch = min(max(remainder - 250, 0), 100)
    return (full_laps * 200 + first_stretch + second_stretch) / d

In [37]:
# Purdy points function
# code translated from link above by Declan McDonnell
def purdy(dist, tsec):
    """Return the Purdy points for covering ``dist`` metres in ``tsec`` seconds.

    Parameters
    ----------
    dist : float
        Race distance in metres.
    tsec : float
        Performance time in seconds.

    Returns
    -------
    float or int
        The score, or 0 when no score can be computed: ``dist`` at or below
        0.1, ``dist`` beyond the last table entry (100000), or a time that is
        zero or negative (which would otherwise divide by zero).

    The reference time for ``dist`` is interpolated linearly between the two
    neighbouring table distances, adjusted with ``c1``..``c3`` and ``frac``,
    and compared with ``tsec``.
    """
    # Flat list of (distance, value) pairs; distance / value is used as the
    # reference time for that distance, i.e. the value acts as a speed.
    # The trailing (-1.0, 0.0) pair is a sentinel that ends the search.
    ptable= [40.0,11.000, 50.0,10.9960, 60.0,10.9830, 70.0,10.9620,
            80.0,10.934, 90.0,10.9000,100.0,10.8600,110.0,10.8150,
            120.0,10.765,130.0,10.7110,140.0,10.6540,150.0,10.5940,
            160.0,10.531,170.0,10.4650,180.0,10.3960,200.0,10.2500,
            220.0,10.096,240.0, 9.9350,260.0, 9.7710,280.0, 9.6100,
            300.0, 9.455,320.0, 9.3070,340.0, 9.1660,360.0, 9.0320,
            380.0, 8.905,400.0, 8.7850,450.0, 8.5130,500.0, 8.2790,
            550.0, 8.083,600.0, 7.9210,700.0, 7.6690,800.0, 7.4960,
            900.0,7.32000, 1000.0,7.18933, 1200.0,6.98066, 1500.0,6.75319,
            2000.0,6.50015, 2500.0,6.33424, 3000.0,6.21913, 3500.0,6.13510,
            4000.0,6.07040, 4500.0,6.01822, 5000.0,5.97432, 6000.0,5.90181,
            7000.0,5.84156, 8000.0,5.78889, 9000.0,5.74211,10000.0,5.70050,
           12000.0,5.62944,15000.0,5.54300,20000.0,5.43785,25000.0,5.35842,
           30000.0,5.29298,35000.0,5.23538,40000.0,5.18263,50000.0,5.08615,
            60000.0,4.99762,80000.0,4.83617,100000.0,4.68988,
                                      -1.0,0.0 ]
    c1=0.20
    c2=0.08
    c3=0.0065

    # A score is a ratio against the performance time, so a zero (or negative)
    # time has no meaningful score; treat it like the other "no score" cases.
    if tsec <= 0:
        return 0

    # Walk the table until the first distance that is >= dist (or the sentinel).
    d=0.1
    i = 0
    while dist > d and d>0:
        d=ptable[i]
        i+=2
    if (d<1):
        # either dist <= 0.1 (loop never ran) or dist is past the end of the table
        return 0
    i+= -2  # step back onto the entry the loop stopped at

    # upper bracket: table distance at or above dist, with its reference time
    d3=ptable[i]
    t3= d3/ptable[i+1]
    # lower bracket: previous table entry, or the origin before the first entries
    if d3 >= 50.0:
        d1=ptable[i-2]
        t1=d1/ptable[i-1]
    else:
        d1 = 0
        t1 = 0

    # reference time and speed for dist, by linear interpolation
    t = t1 + (t3-t1)*(dist-d1)/(d3-d1)
    v = dist/t
    t950 = t +c1+c2*v +c3*frac(dist)*v*v
    k = 0.0654 - 0.00258*v
    a = 85/k
    b = 1-950/a
    p = a*(t950/tsec - b)
    return p

In [38]:
# Score every row: row label -> one-element list holding its Purdy points.
# The one-element lists make `pd.DataFrame(purdy_dict).T` a single-column
# frame indexed like `df`. Iterating the two columns directly avoids building
# a Series per row, which is what `iterrows` does.
purdy_dict = {
    row_label: [purdy(meters, seconds)]
    for row_label, seconds, meters in zip(df.index, df['seconds'], df['meters'])
}

In [39]:
# Attach the scores (rounded to 2 decimals) as a new column, aligned on the
# row index. `DataFrame.round` replaces `pd.np.round`: the `pandas.np` alias
# is deprecated and no longer exists in current pandas releases.
df = pd.concat([df, pd.DataFrame(purdy_dict).T.round(2)], axis=1)

In [40]:
df.head(30)

Unnamed: 0,Name,Grade,Team,Location,Date,Events,Performance,Place,event,is_relay,...,outdoor_track_nationals,xc_regionals,xc_nationals,world_jr_championship,usa_jr_championship,purdy_points_eligible,is_wind_aided,seconds,meters,0
0,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Championships,"02/22 - Feb 23, 2019",1000,02:59.0,8,1000,False,...,False,False,False,False,False,02:59.0,False,179.0,1000.0,555.1
1,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Championships,"02/22 - Feb 23, 2019",1000,02:51.0,4,1000,False,...,False,False,False,False,False,02:51.0,False,171.0,1000.0,621.51
2,Abbie Hetherington,(Sr-4),Oklahoma State,Arkansas Qualifier,15-Feb-19,800,02:11.7,3,800,False,...,False,False,False,False,False,02:11.7,False,131.7,800.0,613.79
3,Abbie Hetherington,(Sr-4),Oklahoma State,2019 Husky Classic,"02/08 - Feb 09, 2019",800,02:11.1,15,800,False,...,False,False,False,False,False,02:11.1,False,131.1,800.0,620.7
4,Abbie Hetherington,(Sr-4),Oklahoma State,2019 UW Invitational,"01/25 - Jan 26, 2019",800,02:10.0,18,800,False,...,False,False,False,False,False,02:10.0,False,130.0,800.0,633.53
5,Abbie Hetherington,(Sr-4),Oklahoma State,Arkansas Invitational,11-Jan-19,800,02:11.1,1,800,False,...,False,False,False,False,False,02:11.1,False,131.1,800.0,620.7
6,Abbie Hetherington,(Sr-4),Oklahoma State,Arkansas Invitational,11-Jan-19,4x400,03:53.7,9,4x400,True,...,False,False,False,False,False,,False,0.0,0.0,0.0
7,Abbie Hetherington,(Sr-4),Oklahoma State,Big 12 Cross Country Championships,26-Oct-18,6000,21:18.4,26,6K,False,...,False,False,False,False,False,,False,0.0,0.0,0.0
8,Abbie Hetherington,(Sr-4),Oklahoma State,Arturo Barrios Invitational,13-Oct-18,6000,22:09.1,57,6K,False,...,False,False,False,False,False,,False,0.0,0.0,0.0
9,Abbie Hetherington,(Sr-4),Oklahoma State,Nuttycombe Wisconsin Invitational Presented By...,28-Sep-18,6000,21:59.7,40,6K,False,...,False,False,False,False,False,,False,0.0,0.0,0.0


In [41]:
df.rename(columns={0:'purdy_points'}, inplace=True)

In [42]:
# for fun; best and worst performances in recent years from the power 5's
df.sort_values('purdy_points', ascending=False)[['Name', 'Team', 'Location', 'Date', 'Events', 'Performance', 'Place', 'purdy_points']]

Unnamed: 0,Name,Team,Location,Date,Events,Performance,Place,purdy_points
44306,Divine Oduduru,Texas Tech,NCAA Division I Outdoor Track & Field Champion...,"06/05 - Jun 08, 2019",200,19.73,1,1076.48
44597,Divine Oduduru,Texas Tech,Michael Johnson Invitational,"04/19 - Apr 20, 2019",200,19.76,1,1072.97
101582,Cravon Gillespie,Oregon,NCAA Division I Outdoor Track & Field Champion...,"Jun 5-8, 2019",200,19.93,2,1053.31
44307,Divine Oduduru,Texas Tech,NCAA Division I Outdoor Track & Field Champion...,"06/05 - Jun 08, 2019",200,19.97,1,1048.74
87412,Quincy Hall,South Carolina,Gamecock Invitational Outdoor,13-Apr-19,400,44.53,1,1046.29
44224,Divine Oduduru,Texas Tech,NCAA Division I Outdoor Track & Field Champion...,"06/05 - Jun 08, 2019",100,9.86,1,1046.14
100111,Dwight St Hillaire,Kentucky,NCAA East Preliminary Round,"05/24 - May 26, 2018",400,44.55,1,1045.36
87407,Quincy Hall,South Carolina,SEC Outdoor Track & Field Championships,"May 9-11, 2019",400,44.6,1,1043.02
504,Wil London,Baylor,NCAA Division I Outdoor Track & Field Champion...,"06/05 - Jun 08, 2019",400,44.63,3,1041.62
9496,Jonathan Jones,Texas,NCAA Division I Outdoor Track & Field Champion...,"06/05 - Jun 08, 2019",400,44.64,4,1041.15


In [43]:
# write the final dataframe for analysis and modeling to csv
df.to_csv('final_performances_for_modeling.csv', index=False)