In [1]:
from datetime import datetime
from datetime import time
from datetime import timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('../../')

from pygolfdata.data import shotlink


plt.rcParams["figure.figsize"] = (10,5)
pd.options.display.max_rows = 200

In [2]:
data_path = '../../../golf_course_project_data'

In [3]:
weather = pd.read_csv(f'{data_path}/pga_tour_weather_data.csv')
weather.shape

(42102, 14)

In [4]:
weather[:3]

Unnamed: 0,Date,Hour,Latitude,Longitude,Summary,DegreesFahrenheit,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName
0,2008-09-18,0,34.143845,-118.174506,Clear,66.65,0.74,9.5,88.0,,4.03,0.0,,Annandale GC
1,2008-09-18,1,34.143845,-118.174506,Clear,66.53,0.75,9.34,144.0,,4.3,0.0,,Annandale GC
2,2008-09-18,2,34.143845,-118.174506,Clear,65.86,0.78,8.95,131.0,,3.94,0.0,,Annandale GC


In [5]:
weather['Hour'].value_counts(dropna=False)

1     1757
11    1755
12    1755
3     1755
4     1755
5     1755
6     1755
7     1755
8     1755
9     1755
10    1755
0     1755
13    1755
14    1755
15    1755
16    1755
22    1754
17    1754
18    1754
19    1754
20    1754
21    1754
23    1754
2     1742
Name: Hour, dtype: int64

In [6]:
weather = weather[weather['Hour'] != 24]
weather.shape

(42102, 14)

In [5]:
weather['WeatherDateAndHour'] = pd.to_datetime(weather['Date'] + ' ' + weather['Hour'].astype(str).str.zfill(2))
weather[:3]

ValueError: hour must be in 0..23

In [6]:
shots = shotlink.get_shots_augmented([2017], data_path)
shots = shotlink.prepare_shots(shots)
shots.shape

(1214437, 48)

In [7]:
shots[:3]

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,Date,LeftRight,StrokesGainedBaseline,StrokesGainedCategory,RecoveryShot,AMWindSpd,PMWindSpd,AMWindDir,PMWindDir,ShotDateAndTime
0,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,2016-10-13,R,-0.091,Off the Tee,No,C,5-10,C,DW,2016-10-13 12:41:00
1,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,2016-10-13,R,0.076,Approach the Green,No,C,5-10,C,DW,2016-10-13 12:46:00
2,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,2016-10-13,L,0.07,Putting,No,C,5-10,C,DW,2016-10-13 12:50:00


In [8]:
# shots['ShotDateAndTime'] = pd.to_datetime(shots['Date'].astype(str) + ' ' + shots['Time'].astype(str).str.zfill(4), format='%Y-%m-%d %H%M')
# shots[:3]

In [9]:
# without this, merge_asof gives me 'left keys must be sorted' (right key must be sorted too; I don't both here)
shots.sort_values(by=['ShotDateAndTime'], inplace=True)
weather.sort_values(by=['WeatherDateAndHour'], inplace=True)

In [10]:
combined = pd.merge_asof(shots, weather, left_on='ShotDateAndTime', right_on='WeatherDateAndHour', 
                         suffixes=['_shots', '_weather'], direction='nearest')
combined['TimeDifference'] = combined['ShotDateAndTime'] - combined['WeatherDateAndHour']
combined[:3]

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName_weather,WeatherDateAndHour,TimeDifference
0,R,PGA TOUR,2017,10,27644,552,464,Brian,Harman,1,...,0.87,8.94,248.0,,5.32,0.0,,Silverado Resort and Spa North,2016-10-13 07:00:00,00:20:00
1,R,PGA TOUR,2017,10,24358,552,464,Robert,Garrigus,1,...,0.87,8.94,248.0,,5.32,0.0,,Silverado Resort and Spa North,2016-10-13 07:00:00,00:20:00
2,R,PGA TOUR,2017,10,25720,552,464,Chad,Collins,1,...,0.87,8.94,248.0,,5.32,0.0,,Silverado Resort and Spa North,2016-10-13 07:00:00,00:21:00


In [11]:
combined.iloc[0]

TourCode                                                R
TourDescription                                  PGA TOUR
Year                                                 2017
TournamentNum                                          10
PlayerNum                                           27644
CourseNum                                             552
PermanentTournamentNum                                464
PlayerFirstName                                     Brian
PlayerLastName                                     Harman
Round                                                   1
TournamentName                               Safeway Open
CourseName_shots           Silverado Resort and Spa North
Hole                                                   10
HoleScore                                               4
ParValue                                                4
Yardage                                               422
Shot                                                    1
ShotType      

In [62]:
combined.dtypes

TourCode                          category
TourDescription                   category
Year                                uint16
TournamentNum                       uint16
PlayerNum                           uint16
CourseNum                           uint16
PermanentTournamentNum              uint16
PlayerFirstName                   category
PlayerLastName                    category
Round                                uint8
TournamentName                    category
CourseName_shots                  category
Hole                                 uint8
HoleScore                          float32
ParValue                             uint8
Yardage                             uint16
Shot                                 uint8
ShotType                          category
NumStrokes                           uint8
FromLocationScorer                category
FromLocationEnhanced                object
ToLocationScorer                  category
ToLocationEnhanced                category
Distance   

In [12]:
combined[['ShotDateAndTime','WeatherDateAndHour','TimeDifference']]

Unnamed: 0,ShotDateAndTime,WeatherDateAndHour,TimeDifference
0,2016-10-13 07:20:00,2016-10-13 07:00:00,00:20:00
1,2016-10-13 07:20:00,2016-10-13 07:00:00,00:20:00
2,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
3,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
4,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
5,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
6,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00
7,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00
8,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00
9,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00


In [13]:
combined['TimeDifference'].value_counts(dropna=False)

00:21:00             20694
-1 days +23:51:00    20637
-1 days +23:41:00    20627
-1 days +23:31:00    20608
00:30:00             20593
00:01:00             20585
00:11:00             20498
00:16:00             20460
00:02:00             20459
-1 days +23:50:00    20381
-1 days +23:46:00    20378
00:06:00             20371
00:26:00             20368
00:22:00             20358
-1 days +23:45:00    20354
-1 days +23:52:00    20311
-1 days +23:40:00    20309
-1 days +23:36:00    20238
-1 days +23:56:00    20234
-1 days +23:32:00    20179
-1 days +23:42:00    20148
00:12:00             20138
00:10:00             20121
00:20:00             20104
-1 days +23:35:00    20101
00:07:00             20099
-1 days +23:47:00    20082
00:17:00             20076
-1 days +23:44:00    20071
00:00:00             20052
00:27:00             20048
00:03:00             20031
00:05:00             20024
-1 days +23:57:00    20022
00:25:00             19990
00:15:00             19987
-1 days +23:53:00    19935
-

In [91]:
len(combined) - 1203650   # 1203650 is the time-of-writing within +/- 30m count

10787

In [93]:
10787 / len(combined) * 100  # pct of shots that don't have weather data w/i 30m

0.8882305133983895

In [42]:
foo = abs(combined['TimeDifference']).value_counts(dropna=False)
foo.index = foo.index / np.timedelta64(1,'m') # convert to total minutes
foo.head(10) # use head instead of slice syntax because slice here actually uses index value?

16.0    40531
10.0    40502
21.0    40441
29.0    40424
20.0    40413
15.0    40341
1.0     40318
9.0     40306
19.0    40283
26.0    40236
Name: TimeDifference, dtype: int64

In [53]:
print(len(foo[foo.index <= 30]))
print(foo[foo.index <= 30])

31
16.0    40531
10.0    40502
21.0    40441
29.0    40424
20.0    40413
15.0    40341
1.0     40318
9.0     40306
19.0    40283
26.0    40236
2.0     40225
22.0    40149
25.0    40091
3.0     40053
7.0     40034
11.0    40028
8.0     40016
17.0    40010
28.0    39974
14.0    39964
13.0    39946
18.0    39914
6.0     39909
4.0     39898
27.0    39887
12.0    39858
24.0    39775
5.0     39774
23.0    39705
30.0    20593
0.0     20052
Name: TimeDifference, dtype: int64


In [81]:
sum(foo.index > 30), sum(foo[foo.index > 30])

(777, 10787)

In [72]:
sum(foo[foo.index <= 30]), sum(foo[foo.index > 30])

(1203650, 10787)

In [84]:
bar = combined[combined['TimeDifference'] > timedelta(minutes=30)]
bar.shape

(10571, 64)

In [85]:
bar[:3]

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName_weather,WeatherDateAndHour,TimeDifference
96061,R,PGA TOUR,2017,60,30191,733,457,Julián,Etulain,1,...,0.66,7.78,78.0,,7.75,0.0,,,2016-11-10 13:00:00,00:31:00
96062,R,PGA TOUR,2017,60,25818,733,457,Scott,Piercy,1,...,0.66,7.78,78.0,,7.75,0.0,,,2016-11-10 13:00:00,00:31:00
96063,R,PGA TOUR,2017,60,12510,733,457,Chad,Campbell,1,...,0.66,7.78,78.0,,7.75,0.0,,,2016-11-10 13:00:00,00:31:00


In [87]:
len(bar['TimeDifference'].value_counts(dropna=False))

777

In [88]:
sum(bar['TimeDifference'].value_counts(dropna=False))

10571

In [90]:
bar.groupby(['TournamentName', 'Year']).size()

TournamentName                                   Year
OHL Classic at Mayakoba                          2017    3252
SBS Tournament of Champions                      2017    2294
Sony Open in Hawaii                              2017    5020
the Memorial Tournament presented by Nationwide  2017       5
dtype: int64

In [91]:
combined[combined['TournamentName'] == 'OHL Classic at Mayakoba']['Year'].value_counts()

2017    28562
Name: Year, dtype: int64

In [92]:
combined[combined['TournamentName'] == 'SBS Tournament of Champions']['Year'].value_counts()

2017    9063
Name: Year, dtype: int64

In [95]:
combined[combined['TournamentName'] == 'Sony Open in Hawaii']['Year'].value_counts()

2017    30430
Name: Year, dtype: int64

In [96]:
combined[combined['TournamentName'] == 'the Memorial Tournament presented by Nationwide']['Year'].value_counts()

2017    28697
Name: Year, dtype: int64