## Import Section

In [2]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

  from ._conv import register_converters as _register_converters


## Read Data

Read 2016 Shot Link - Weather data combination and then looking at the sample data, the metadata info, and summary.

In [6]:
df = pd.read_csv('../../golf_course_project_data/combined2012to2016.zip')
df = df.dropna(subset=['StrokesGainedBaseline'])

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
df.head()

Unnamed: 0,TourCode,TourDescription,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,PlayerFirstName,PlayerLastName,Round,...,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity,PrecipitationType,CourseName_weather,WeatherDateAndHour,TimeDifference
0,R,PGA TOUR,2012,10,23800,656,16,Bryce,Molder,1,...,0.5,10.0,41.0,,9.4,0.0,,Plantation Course at Kapalua,2012-01-06 11:00:00,-1 days +23:35:00.000000000
1,R,PGA TOUR,2012,10,1116,656,16,Michael,Bradley,1,...,0.5,10.0,41.0,,9.4,0.0,,Plantation Course at Kapalua,2012-01-06 11:00:00,-1 days +23:36:00.000000000
2,R,PGA TOUR,2012,10,23800,656,16,Bryce,Molder,1,...,0.5,10.0,41.0,,9.4,0.0,,Plantation Course at Kapalua,2012-01-06 11:00:00,-1 days +23:41:00.000000000
3,R,PGA TOUR,2012,10,1116,656,16,Michael,Bradley,1,...,0.5,10.0,41.0,,9.4,0.0,,Plantation Course at Kapalua,2012-01-06 11:00:00,-1 days +23:42:00.000000000
4,R,PGA TOUR,2012,10,1116,656,16,Michael,Bradley,1,...,0.5,10.0,41.0,,9.4,0.0,,Plantation Course at Kapalua,2012-01-06 11:00:00,-1 days +23:45:00.000000000


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5522327 entries, 0 to 5522326
Data columns (total 65 columns):
TourCode                   object
TourDescription            object
Year                       int64
TournamentNum              int64
PlayerNum                  int64
CourseNum                  int64
PermanentTournamentNum     int64
PlayerFirstName            object
PlayerLastName             object
Round                      int64
TournamentName             object
CourseName_shots           object
Hole                       int64
HoleScore                  float64
ParValue                   int64
Yardage                    int64
Shot                       int64
ShotType                   object
NumStrokes                 int64
FromLocationScorer         object
FromLocationEnhanced       object
ToLocationScorer           object
ToLocationEnhanced         object
Distance                   int64
DistanceToPin              int64
InTheHoleFlag              object
AroundTheGreenF

In [9]:
df.describe()

Unnamed: 0,Year,TournamentNum,PlayerNum,CourseNum,PermanentTournamentNum,Round,Hole,HoleScore,ParValue,Yardage,...,Hour,Latitude,Longitude,DegreesFahrenheit,Humidity,Visibility,WindBearing,WindGust,WindSpeed,PrecipitationIntensity
count,5522327.0,5522327.0,5522327.0,5522327.0,5522327.0,5522327.0,5522327.0,5521849.0,5522327.0,5522327.0,...,5522327.0,5522327.0,5522327.0,5522327.0,5522327.0,5521234.0,5520552.0,0.0,5520566.0,5522327.0
mean,2014.0,238.1,25636.8,456.6,144.0,2.2,9.5,4.1,4.0,420.1,...,12.8,33.8,-93.8,73.4,0.6,9.5,181.4,,7.5,0.0
std,1.4,132.3,8297.3,314.4,204.6,1.1,5.2,0.9,0.6,120.0,...,2.7,5.7,19.4,8.9,0.2,1.1,99.2,,3.9,0.0
min,2012.0,10.0,1014.0,4.0,2.0,1.0,1.0,1.0,3.0,104.0,...,0.0,20.7,-157.8,37.0,0.1,1.0,0.0,,0.0,0.0
25%,2013.0,130.0,22961.0,24.0,11.0,1.0,5.0,4.0,4.0,386.0,...,11.0,29.9,-98.4,67.6,0.5,9.6,99.0,,4.5,0.0
50%,2014.0,230.0,26679.0,609.0,27.0,2.0,10.0,4.0,4.0,442.0,...,13.0,33.6,-86.2,74.8,0.6,9.9,181.0,,7.0,0.0
75%,2015.0,340.0,30926.0,729.0,457.0,3.0,14.0,5.0,4.0,490.0,...,15.0,38.2,-80.3,79.8,0.7,10.0,260.0,,9.9,0.0
max,2016.0,500.0,50844.0,822.0,518.0,4.0,18.0,13.0,5.0,667.0,...,21.0,45.5,-71.2,99.8,1.0,10.0,359.0,,24.0,0.5


In [4]:
# Drop columns which have more than 1% of their values missing
# Then drop rows missing values

pct_threshold = 0.01
no = d.apply(lambda x: x.isna().sum() / d.shape[0])
drop_these_lame_columns = no[no >= pct_threshold].index

dnew = d.drop(columns=drop_these_lame_columns)

#dnew = dnew.dropna(axis=0, how='any')

In [5]:
dnew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1141966 entries, 0 to 1141965
Data columns (total 52 columns):
TourCode                   1141966 non-null category
TourDescription            1141966 non-null category
Year                       1141966 non-null uint16
TournamentNum              1141966 non-null uint16
PlayerNum                  1141966 non-null uint16
CourseNum                  1141966 non-null uint16
PermanentTournamentNum     1141966 non-null uint16
PlayerFirstName            1141966 non-null category
PlayerLastName             1141966 non-null category
Round                      1141966 non-null uint8
TournamentName             1141966 non-null category
CourseName_shots           1141966 non-null object
Hole                       1141966 non-null uint8
HoleScore                  1141601 non-null float32
ParValue                   1141966 non-null uint8
Yardage                    1141966 non-null uint16
Shot                       1141966 non-null uint8
ShotType     

In [6]:
dnew = d.dropna(axis=1, how='any')
dnew.columns

Index(['TourCode', 'TourDescription', 'Year', 'TournamentNum', 'PlayerNum',
       'CourseNum', 'PermanentTournamentNum', 'PlayerFirstName',
       'PlayerLastName', 'Round', 'TournamentName', 'CourseName_shots', 'Hole',
       'ParValue', 'Yardage', 'Shot', 'ShotType', 'NumStrokes', 'Distance',
       'DistanceToPin', 'InTheHoleFlag', 'AroundTheGreenFlag',
       'DistanceToHoleAfterShot', 'Time', 'XCoordinate', 'YCoordinate',
       'ZCoordinate', 'DistanceFromCenter', 'DistanceFromEdge', 'Date_shots',
       'StrokesGainedBaseline', 'RecoveryShot', 'ShotDateAndTime',
       'PlayerName', 'Date_weather', 'Hour', 'Latitude', 'Longitude',
       'Summary', 'DegreesFahrenheit', 'Humidity', 'Visibility',
       'PrecipitationIntensity', 'CourseName_weather', 'WeatherDateAndHour',
       'TimeDifference'],
      dtype='object')

In [7]:
col_for_model = ['Summary', 'DegreesFahrenheit', 'Humidity', 'Visibility', 'PrecipitationIntensity', 'StrokesGainedBaseline']
df = dnew[col_for_model]
df = df.dropna(axis=0, how='any')
df.describe()

Unnamed: 0,DegreesFahrenheit,Humidity,Visibility,PrecipitationIntensity,StrokesGainedBaseline
count,1141966.0,1141966.0,1141966.0,1141966.0,1141966.0
mean,74.04843,0.5882784,9.578472,0.001961086,0.001861721
std,8.995831,0.1886622,0.9515421,0.01251801,0.3120983
min,36.97,0.1,2.11,0.0,-3.206
25%,67.09,0.45,9.63,0.0,-0.11
50%,75.88,0.59,9.92,0.0,0.001
75%,80.8,0.73,10.0,0.0,0.119
max,92.75,1.0,10.0,0.5134,4.752


In [11]:
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
import copy

response = 'StrokesGainedBaseline'
X = copy.deepcopy(df)
y = X[response]
X = X.drop(columns=response)
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scalerX = preprocessing.StandardScaler().fit(X_train)
scalerY = preprocessing.StandardScaler().fit(y_train.reshape(-1, 1))

X_train_scaled = scalerX.transform(X_train)
X_test_scaled = scalerX.transform(X_test)
y_train_scaled = scalerY.transform(y_train.reshape(-1, 1))
y_test_scaled = scalerY.transform(y_test.reshape(-1, 1))

  if sys.path[0] == '':


  app.launch_new_instance()


In [13]:
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train_scaled, y_train_scaled)

# Make predictions using the testing set
y_pred = regr.predict(X_test_scaled)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test_scaled, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test_scaled, y_pred))

Coefficients: 
 [[ 8.46852448e-03  1.57730065e-02  2.93760457e-03 -3.05083937e-03
  -6.47181661e+10 -5.56604798e+10 -9.01273375e+09 -1.76958803e+10
  -1.09608250e+10 -2.61683822e+09 -9.88929694e+09 -1.24589652e+10
  -2.93865696e+10 -1.20397205e+10 -1.97873793e+10 -1.91220524e+10
  -2.69689436e+09 -4.57104139e+09 -1.16113741e+10 -4.64228339e+09
  -3.43462463e+09]]
Mean squared error: 0.10
Variance score: 0.00


In [16]:
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV

kf = KFold(n_splits=10, shuffle=True)
alphas = (0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1)
rcv = RidgeCV(alphas=alphas, cv=kf)

rcv.fit(X_train_scaled, y_train_scaled)

RidgeCV(alphas=(0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1),
    cv=KFold(n_splits=10, random_state=None, shuffle=True),
    fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [18]:
y_pred = rcv.predict(X_test_scaled)

# The coefficients
print('Coefficients: \n', rcv.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test_scaled, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test_scaled, y_pred))

Coefficients: 
 [[ 0.00842054  0.0156793   0.00293933 -0.00310603  0.00016025  0.00297598
   0.0018713  -0.00131867  0.00172717  0.00343741 -0.0016751  -0.00166654
   0.00012578 -0.00118315  0.00164353 -0.00566888  0.00013321  0.0002741
  -0.00650783 -0.00120694  0.00170048]]
Mean squared error: 0.99
Variance score: 0.00


In [None]:
type(y_test_scaled)
y_pred

In [21]:
y_test_scaled

array([[ 0.28515592],
       [ 0.48037544],
       [-0.2684995 ],
       ...,
       [-0.8221549 ],
       [-0.48292094],
       [-0.00607323]], dtype=float32)