In [65]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('../../')

from pygolfdata.data import shotlink


plt.rcParams["figure.figsize"] = (10,5)
pd.options.display.max_rows = 100

In [9]:
data_path = '../../../golf_course_project_data'

In [76]:
weather = pd.read_csv(f'{data_path}/pga_tour_weather_data.csv')
weather.shape

(41622, 14)

In [77]:
weather[:3]

Unnamed: 0,Date,Hour,Latitude,Longitude,Summary,DegreesFahrenheit,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName
0,2008-09-18,0,34.143845,-118.174506,Clear,66.65,0.74,9.5,88.0,,4.03,0.0,,Annandale GC
1,2008-09-18,1,34.143845,-118.174506,Clear,66.53,0.75,9.34,144.0,,4.3,0.0,,Annandale GC
2,2008-09-18,2,34.143845,-118.174506,Clear,65.86,0.78,8.95,131.0,,3.94,0.0,,Annandale GC


In [78]:
weather['WeatherDateAndHour'] = pd.to_datetime(weather['Date'] + ' ' + weather['Hour'].astype(str).str.zfill(2))
weather[:3]

Unnamed: 0,Date,Hour,Latitude,Longitude,Summary,DegreesFahrenheit,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName,WeatherDateAndHour
0,2008-09-18,0,34.143845,-118.174506,Clear,66.65,0.74,9.5,88.0,,4.03,0.0,,Annandale GC,2008-09-18 00:00:00
1,2008-09-18,1,34.143845,-118.174506,Clear,66.53,0.75,9.34,144.0,,4.3,0.0,,Annandale GC,2008-09-18 01:00:00
2,2008-09-18,2,34.143845,-118.174506,Clear,65.86,0.78,8.95,131.0,,3.94,0.0,,Annandale GC,2008-09-18 02:00:00


In [79]:
shots = shotlink.get_shots_augmented([2017], data_path)
shots = shotlink.prepare_shots(shots)
shots.shape

(1214437, 47)

In [80]:
shots[:3]

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,DistanceFromEdge,Date,LeftRight,StrokesGainedBaseline,StrokesGainedCategory,RecoveryShot,AMWindSpd,PMWindSpd,AMWindDir,PMWindDir
0,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,645,2016-10-13,R,-0.091,Off the Tee,No,C,5-10,C,DW
1,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,159,2016-10-13,R,0.076,Approach the Green,No,C,5-10,C,DW
2,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,483,2016-10-13,L,0.07,Putting,No,C,5-10,C,DW


In [81]:
shots['ShotDateAndTime'] = pd.to_datetime(shots['Date'].astype(str) + ' ' + shots['Time'].astype(str).str.zfill(4), format='%Y-%m-%d %H%M')
shots[:3]

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,Date,LeftRight,StrokesGainedBaseline,StrokesGainedCategory,RecoveryShot,AMWindSpd,PMWindSpd,AMWindDir,PMWindDir,ShotDateAndTime
0,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,2016-10-13,R,-0.091,Off the Tee,No,C,5-10,C,DW,2016-10-13 12:41:00
1,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,2016-10-13,R,0.076,Approach the Green,No,C,5-10,C,DW,2016-10-13 12:46:00
2,R,PGA TOUR,2017,10,1810,552,464,Phil,Mickelson,1,...,2016-10-13,L,0.07,Putting,No,C,5-10,C,DW,2016-10-13 12:50:00


In [82]:
# without this, merge_asof gives me 'left keys must be sorted' (right key must be sorted too; I don't both here)
shots.sort_values(by=['ShotDateAndTime'], inplace=True)
weather.sort_values(by=['WeatherDateAndHour'], inplace=True)

In [83]:
combined = pd.merge_asof(shots, weather, left_on='ShotDateAndTime', right_on='WeatherDateAndHour', 
                         suffixes=['_shots', '_weather'], direction='nearest')
combined['TimeDifference'] = combined['ShotDateAndTime'] - combined['WeatherDateAndHour']
combined[:3]

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName_weather,WeatherDateAndHour,TimeDifference
0,R,PGA TOUR,2017,10,27644,552,464,Brian,Harman,1,...,0.87,8.94,248.0,,5.32,0.0,,Silverado Resort and Spa North,2016-10-13 07:00:00,00:20:00
1,R,PGA TOUR,2017,10,24358,552,464,Robert,Garrigus,1,...,0.87,8.94,248.0,,5.32,0.0,,Silverado Resort and Spa North,2016-10-13 07:00:00,00:20:00
2,R,PGA TOUR,2017,10,25720,552,464,Chad,Collins,1,...,0.87,8.94,248.0,,5.32,0.0,,Silverado Resort and Spa North,2016-10-13 07:00:00,00:21:00


In [84]:
combined.iloc[0]

TourCode                                                R
TourDescription                                  PGA TOUR
Year                                                 2017
TournamentNum                                          10
PlayerNum                                           27644
CourseNum                                             552
PermanentTournamentNum                                464
PlayerFirstName                                     Brian
PlayerLastName                                     Harman
Round                                                   1
TournamentName                               Safeway Open
CourseName_shots           Silverado Resort and Spa North
Hole                                                   10
HoleScore                                               4
ParValue                                                4
Yardage                                               422
Shot                                                    1
ShotType      

In [85]:
combined[['ShotDateAndTime','WeatherDateAndHour','TimeDifference']]

Unnamed: 0,ShotDateAndTime,WeatherDateAndHour,TimeDifference
0,2016-10-13 07:20:00,2016-10-13 07:00:00,00:20:00
1,2016-10-13 07:20:00,2016-10-13 07:00:00,00:20:00
2,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
3,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
4,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
5,2016-10-13 07:21:00,2016-10-13 07:00:00,00:21:00
6,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00
7,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00
8,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00
9,2016-10-13 07:26:00,2016-10-13 07:00:00,00:26:00


In [86]:
combined['TimeDifference'].value_counts(dropna=False)

00:21:00             20694
-1 days +23:51:00    20637
-1 days +23:41:00    20627
-1 days +23:31:00    20608
00:30:00             20593
00:01:00             20585
00:11:00             20498
00:16:00             20460
00:02:00             20459
-1 days +23:50:00    20381
-1 days +23:46:00    20378
00:06:00             20371
00:26:00             20368
00:22:00             20358
-1 days +23:45:00    20354
-1 days +23:52:00    20311
-1 days +23:40:00    20309
-1 days +23:36:00    20238
-1 days +23:56:00    20234
-1 days +23:32:00    20179
-1 days +23:42:00    20148
00:12:00             20138
00:10:00             20121
00:20:00             20104
-1 days +23:35:00    20101
00:07:00             20099
-1 days +23:47:00    20082
00:17:00             20076
-1 days +23:44:00    20071
00:00:00             20052
00:27:00             20048
00:03:00             20031
00:05:00             20024
-1 days +23:57:00    20022
00:25:00             19990
00:15:00             19987
-1 days +23:53:00    19935
-

In [None]:
pd.to_datetime(weather['Date'] + ' ' + weather['Hour'].astype(str).str.zfill(2))
weather[:3]

In [39]:
shots[:10].apply(lambda r: pd.Timestamp.combine(r['Date'], 
                                                pd.to_datetime(r['Time'].astype(str).str.zfill(4), format='%H%M')))

KeyError: ('Date', 'occurred at index TourCode')

In [33]:
pd.to_datetime(shots['Time'].astype(str).str.zfill(4), format='%H%M')

0         1900-01-01 12:41:00
1         1900-01-01 12:46:00
2         1900-01-01 12:50:00
3         1900-01-01 12:52:00
4         1900-01-01 12:52:00
5         1900-01-01 12:57:00
6         1900-01-01 13:00:00
7         1900-01-01 13:03:00
8         1900-01-01 13:03:00
9         1900-01-01 13:08:00
10        1900-01-01 13:13:00
11        1900-01-01 13:21:00
12        1900-01-01 13:24:00
13        1900-01-01 13:29:00
14        1900-01-01 13:35:00
15        1900-01-01 13:39:00
16        1900-01-01 13:49:00
17        1900-01-01 13:54:00
18        1900-01-01 13:57:00
19        1900-01-01 13:59:00
20        1900-01-01 14:04:00
21        1900-01-01 14:08:00
22        1900-01-01 14:08:00
23        1900-01-01 14:15:00
24        1900-01-01 14:21:00
25        1900-01-01 14:24:00
26        1900-01-01 14:26:00
27        1900-01-01 14:33:00
28        1900-01-01 14:37:00
29        1900-01-01 14:37:00
                  ...        
1214407   1900-01-01 15:49:00
1214408   1900-01-01 15:52:00
1214409   

In [None]:
pd.to_datetime(df.st_time, format='%H%M').dt.time)

In [19]:
shots.iloc[0]

TourCode                                                R
TourDescription                                  PGA TOUR
Year                                                 2017
TournamentNum                                          10
PlayerNum                                            1810
CourseNum                                             552
PermanentTournamentNum                                464
PlayerFirstName                                      Phil
PlayerLastName                                  Mickelson
Round                                                   1
TournamentName                               Safeway Open
CourseName                 Silverado Resort and Spa North
Hole                                                    1
HoleScore                                               5
ParValue                                                4
Yardage                                               436
Shot                                                    1
ShotType      

In [20]:
shots.dtypes

TourCode                         category
TourDescription                  category
Year                               uint16
TournamentNum                      uint16
PlayerNum                          uint16
CourseNum                          uint16
PermanentTournamentNum             uint16
PlayerFirstName                  category
PlayerLastName                   category
Round                               uint8
TournamentName                   category
CourseName                       category
Hole                                uint8
HoleScore                         float32
ParValue                            uint8
Yardage                            uint16
Shot                                uint8
ShotType                         category
NumStrokes                          uint8
FromLocationScorer               category
FromLocationEnhanced               object
ToLocationScorer                 category
ToLocationEnhanced               category
Distance                          