# What is the most integral element to success on Tour?

In [18]:
# packages

import pandas as pd
import numpy as np
import requests
import altair as alt
from bs4 import BeautifulSoup

from scipy.stats import ks_2samp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Gathering the data

pgatour.com includes statistics on various aspects of the game, however does not include a public API so I will need to scrape the appropriate data.

In [2]:
def retreive_golf_stats(stat, year, tourny):
    """
    Retreives data from pgatour.com
    
    Parameters:
    -----------
    stat: str
        - 120 (scoring average)
        - 102 (driving accuracy percentage)
        - 101 (driving distance)
        - 119 (putts per round)
    year: str or int
    tourny: str
        - 026 (US Open)
        
    Returns:
    --------
    pandas.DataFrame
    
    Examples:
    ---------
    >>> retreive_golf_stats('120', '2020', '026')
    """
    
    domain = "https://www.pgatour.com/content/pgatour/stats/"
    params = f"stat.{stat}.y{year}.eon.t{tourny}.html"

    request = requests.get(domain + params)

    if request.reason != 'OK':
        raise Exception('The request was unsuccessful')

    soup = BeautifulSoup(request.text, "lxml")
    info = soup.find('div', {'class': 'details-table-wrap'})

    player_name = []
    rank = []
    statistic = []

    for player in info.find_all('tr')[1:]:
        player_name.append(player.a.text)
        rank.append(player.find('td', {'class': ''}).text)
        statistic.append(player.find_all('td')[4].text)

    df =  pd.DataFrame({'year': year,
                        'player_name': player_name,
                        'rank': rank,
                        'stat': statistic})

    df['rank'] = df['rank'].str.lstrip('\n T')
    df = df.astype({'year': int, 'rank': int, 'stat': float})
    
    return df

In [3]:
def compile_data(years):
    
    final_df = pd.DataFrame()
    
    for year in years:
        score = retreive_golf_stats('120', year, '026')\
                .rename(columns = {'rank': 'rank_score', 'stat': 'score'})
        drive_acc_pct = retreive_golf_stats('102', year, '026')\
                        .rename(columns = {'rank': 'rank_drive_acc_pct', 'stat': 'drive_acc_pct'})
        drive_dist = retreive_golf_stats('101', year, '026')\
                     .rename(columns = {'rank': 'rank_drive_dist', 'stat': 'drive_dist'})
        greens_pct = retreive_golf_stats('103', year, '026')\
                     .rename(columns = {'rank': 'rank_greens_pct', 'stat': 'greens_pct'})
        putts_pr = retreive_golf_stats('119', year, '026')\
                   .rename(columns = {'rank': 'rank_putts', 'stat': 'putts'})
    
        initial_df = score.merge(drive_acc_pct).merge(drive_dist).merge(greens_pct).merge(putts_pr)
        final_df = final_df.append(initial_df)
    
    return final_df

In [4]:
# df = compile_data(list(range(2000, 2020)))
# df.to_pickle('golf_df.pkl')

In [5]:
df = pd.read_pickle('golf_df.pkl')

In [6]:
df.describe()

Unnamed: 0,year,rank_score,score,rank_drive_acc_pct,drive_acc_pct,rank_drive_dist,drive_dist,rank_greens_pct,greens_pct,rank_putts,putts
count,1418.0,1418.0,1418.0,1418.0,1418.0,1418.0,1418.0,1418.0,1418.0,1418.0,1418.0
mean,2009.540197,34.399154,69.260011,33.985896,62.089436,36.14457,291.122779,33.948519,58.629224,34.269394,30.047426
std,5.763647,20.966453,1.507044,21.043654,11.098042,21.037644,14.46647,21.012804,7.933643,20.994758,1.468319
min,2000.0,1.0,63.695,1.0,17.86,1.0,239.8,1.0,33.33,1.0,25.5
25%,2005.0,16.0,68.292,16.0,53.57,18.0,281.825,16.0,52.78,16.0,29.0
50%,2010.0,33.0,69.153,33.0,62.5,36.0,291.5,32.0,58.33,34.0,30.0
75%,2015.0,52.0,70.266,51.0,69.64,53.0,300.6,51.0,63.89,52.0,31.0
max,2019.0,83.0,76.362,83.0,91.07,83.0,339.3,83.0,86.11,83.0,34.75


In [7]:
df = df.reset_index().drop(columns = ['index', 'player_name'])

In [19]:
plot_1 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('drive_dist', axis = alt.Axis(title = 'Drive Distance (yards)', labelAngle = 0), scale = alt.Scale(domain =(230, 350))),
    alt.Y('score', axis = alt.Axis(title = 'Score'), scale = alt.Scale(domain =(60, 80))), 
).properties(width = 400, height = 200)
plot_2 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('drive_acc_pct', axis = alt.Axis(title = 'Drive Accuracy Percentage (%)', labelAngle = 0), scale = alt.Scale(domain =(15, 95))),
    alt.Y('score', axis = alt.Axis(title = 'Score'), scale = alt.Scale(domain =(60, 80))), 
).properties(width = 400, height = 200)
plot_3 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('greens_pct', axis = alt.Axis(title = 'Greens in Regulation Percentage (%)', labelAngle = 0), scale = alt.Scale(domain =(30, 90))),
    alt.Y('score', axis = alt.Axis(title = 'Score'), scale = alt.Scale(domain =(60, 80))), 
).properties(width = 400, height = 200)
plot_4 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('putts', axis = alt.Axis(title = 'Putts Per Round'), scale = alt.Scale(domain =(25, 35))),
    alt.Y('score', axis = alt.Axis(title = 'Score'), scale = alt.Scale(domain =(60, 80))), 
).properties(width = 400, height = 200)

(plot_1 & plot_2 | plot_3 & plot_4).properties(
    title = "Metrics for Players in the US Open (2000-2019)"
).configure_axis(
    labelFontSize = 10, titleFontSize = 12
).configure_title(
    fontSize = 20,
    dx = 250, dy = -25)

In this visualization there is no clear trend present as the data is quite noisy. 

Perhaps looking at relative ranking in more important because the competition changes every year and raw score does not matter as much for placement. 

In [9]:
plot_1 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('rank_drive_dist', axis = alt.Axis(title = 'Drive Distance Rank', labelAngle = 0), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)
plot_2 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('rank_drive_acc_pct', axis = alt.Axis(title = 'Drive Accuracy Percentage Rank', labelAngle = 0), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)
plot_3 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('rank_greens_pct', axis = alt.Axis(title = 'Greens in Regulation Percentage Rank', labelAngle = 0), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)
plot_4 = alt.Chart(df).mark_point(size = 5, opacity = 0.5).encode(
    alt.X('rank_putts', axis = alt.Axis(title = 'Putts Per Round Rank'), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)

(plot_1 & plot_2 | plot_3 & plot_4).properties(
    title = "Metrics for Players in the US Open (2000-2019)"
).configure_axis(
    labelFontSize = 10, titleFontSize = 12
).configure_title(
    fontSize = 20,
    dx = 250, dy = -25)

The data is quite noisy so I will visualize the average rank of metrics for each overall rank.

In [10]:
plot_1 = alt.Chart(df.groupby('rank_score').mean().reset_index()).mark_point(size = 5, opacity = 0.75).encode(
    alt.X('rank_drive_dist', axis = alt.Axis(title = 'Average Drive Distance Rank', labelAngle = 0), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)
plot_2 = alt.Chart(df.groupby('rank_score').mean().reset_index()).mark_point(size = 5, opacity = 0.75).encode(
    alt.X('rank_drive_acc_pct', axis = alt.Axis(title = 'Average Drive Accuracy Percentage Rank', labelAngle = 0), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)
plot_3 = alt.Chart(df.groupby('rank_score').mean().reset_index()).mark_point(size = 5, opacity = 0.75).encode(
    alt.X('rank_greens_pct', axis = alt.Axis(title = 'Average Greens in Regulation Percentage Rank', labelAngle = 0), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)
plot_4 = alt.Chart(df.groupby('rank_score').mean().reset_index()).mark_point(size = 5, opacity = 0.75).encode(
    alt.X('rank_putts', axis = alt.Axis(title = 'Average Putts Per Round Rank'), scale = alt.Scale(domain =(0, 90))),
    alt.Y('rank_score', axis = alt.Axis(title = 'Overall Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 400, height = 200)

(plot_1 & plot_2 | plot_3 & plot_4).properties(
    title = "Metrics for Players in the US Open (2000-2019)"
).configure_axis(
    labelFontSize = 10, titleFontSize = 12
).configure_title(
    fontSize = 20,
    dx = 250, dy = -25)

In this visualization, we can see that there is more evident trend among the 4 metrics. A more vertical relationship indicates that there is more metric rank variation among players that are similarly ranked. For instance, for a given overall ranked player, the average rank for drive distance and drive accuracy percentage can vary a lot more than the average rankings for putts per round and greens in regulation percentage. We can also see that for higher overall ranked players, their average rankings for putts per round and greens in regulation percentage are much higher than drive distance and drive accuracy percentage.

In [12]:
features = df[['rank_drive_acc_pct', 'rank_drive_dist', 'rank_greens_pct', 'rank_putts']]
target = df['rank_score']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

print("R-squared value:", lin_reg.score(X_test, y_test))
print("Regression coefficeints:", lin_reg.coef_)

R-squared value: 0.6708297775902408
Regression coefficeints: [0.06931401 0.06109241 0.7694211  0.71454778]


As indicated by the larger regression coefficients, short game is indeed more important to ranking higher on Tour. Greens in regulation percentage rank has the largest coefficient, indicating that it may be the most integral element to success. The coefficient for drive accuracy percentage is slightly higher than drive distance so it may be more beneficial focusing on hitting the ball on target rather than further.

### Comparing Top 10 and Worst 10 Ranked Players

Another way to approach this problem is to compare the distributions of metric rankings of the top 10 and worst 10 players.

In [14]:
# top and worst 10 of each year
top_10 = df.groupby('year').head(10)
worst_10 = df.groupby('year').tail(10)

top_10.loc[:, 'rank'] = 'top_10'
worst_10.loc[:, 'rank'] = 'worst_10'
top_worst_10 = top_10.append(worst_10)

In [15]:
plot_1 = alt.Chart(top_worst_10).mark_boxplot(size = 40).encode(
    alt.X('rank:O', axis = alt.Axis(title = '', labelAngle = 0)),
    alt.Y('rank_drive_dist', axis = alt.Axis(title = 'Drive Distance Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 200, height = 400)
plot_2 = alt.Chart(top_worst_10).mark_boxplot(size = 40).encode(
    alt.X('rank:O', axis = alt.Axis(title = '', labelAngle = 0)),
    alt.Y('rank_drive_acc_pct', axis = alt.Axis(title = 'Drive Accuracy Percentage Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 200, height = 400)
plot_3 = alt.Chart(top_worst_10).mark_boxplot(size = 40).encode(
    alt.X('rank:O', axis = alt.Axis(title = '', labelAngle = 0)),
    alt.Y('rank_greens_pct', axis = alt.Axis(title = 'Greens in Regulation Percentage Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 200, height = 400)
plot_4 = alt.Chart(top_worst_10).mark_boxplot(size = 40).encode(
    alt.X('rank:O', axis = alt.Axis(title = '', labelAngle = 0)),
    alt.Y('rank_putts', axis = alt.Axis(title = 'Putts Per Round Rank'), scale = alt.Scale(domain =(0, 90))), 
).properties(width = 200, height = 400)

(plot_1 | plot_2 | plot_3 | plot_4).properties(
    title = "Metric Ranks for Top 10 and Worst 10 Ranked Players in the US Open (2000-2019)"
).configure_axis(
    labelFontSize = 10, titleFontSize = 12
).configure_title(
    fontSize = 20,
    dx = 125, dy = -25)

In this visualization, we can see that the distributions between the top 10 and worst 10 players for drive distance and drive accuracy percentage ranks are much closer together than the distributions of greens in regulation percentage and putts per round ranks. This indicates that the main factors differentiating the top 10 and worst 10 players are their short game. Furthermore, we can see that the top 10 ranked players are on average, better ranked in their short game.

In [16]:
metrics = ['rank_drive_acc_pct', 'rank_drive_dist', 'rank_greens_pct', 'rank_putts']

for metric in metrics:
    x = top_worst_10.query('rank == "top_10"')[metric]
    y = top_worst_10.query('rank == "worst_10"')[metric]
    
    print(f"KS test for {metric}: {ks_2samp(x, y)}")

KS test for rank_drive_acc_pct: Ks_2sampResult(statistic=0.3, pvalue=2.4125498977354686e-08)
KS test for rank_drive_dist: Ks_2sampResult(statistic=0.265, pvalue=1.3921047931216453e-06)
KS test for rank_greens_pct: Ks_2sampResult(statistic=0.61, pvalue=5.022256080866513e-35)
KS test for rank_putts: Ks_2sampResult(statistic=0.475, pvalue=8.832345232887228e-21)


The Kolmogorov-Smirnov test confirms that the distributions are in fact significantly different and that there is a larger difference seen in the short game metrics. 

### Future Work

- Consider more variables
- Analyze different tournaments / years
- Consider the amount of times a player has played a specific course to control for a player's ability to "learn" a course
- Control for the idiosyncrasies between players
- Fit different regression models such as a Random Forest Regressor, KNN regressor, etc.
- More rigorous feature selection