In [1]:
import pandas as pd
import numpy as np
import mord as m
import matplotlib.pyplot as plt
import warnings
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_absolute_error, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import tensorflow as tf
from tensorflow.keras import layers, models
import lightgbm as lgb

# Set some options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
np.random.seed(42)  
warnings.filterwarnings('ignore')

# Filepaths for source data files
path1 = 'Downloads/ASA All PGA Raw Data - Tourn Level.csv'
path2 = 'Downloads/OWGR Historical.csv'

2025-10-14 20:14:15.659892: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Import

In [2]:
# Import dataset 
df_main = pd.read_csv(path1)

# Clean Data and Create Features
df_main.columns = df_main.columns.str.lower()
df_main['player'] = df_main['player'].str.upper()
df_main['date'] = pd.to_datetime(df_main['date'])
df_main['year'] = df_main['date'].dt.year
df_main['Year-Week'] = df_main['date'].dt.strftime('%Y-%U')
df_main['strokes_to_par'] = df_main['strokes'] - df_main['hole_par']

# Filter Data to Recent 5 years
df_main = df_main[df_main['year'] != 2014]

# Filter Data to Players that have played more than 30 tournaments
player_tournament_counts = df_main['player'].value_counts()
players_meeting_threshold = player_tournament_counts[player_tournament_counts >= 30].index
df_main = df_main[df_main['player'].isin(players_meeting_threshold)]

# Clean up 'finish position' - convert ties to actual numbers
df_main['finish'] = df_main['finish'].str.lstrip("T")

# Clean up 'finish position' - remove rows with non-numeric entries ('WD', 'CUT', etc.)
df_main['finish'] = pd.to_numeric(df_main['finish'], downcast='integer', errors='coerce')
df_main = df_main.dropna(subset=['finish'])
df_main['finish'] = df_main['finish'].astype(int)

# Focus on the top XX finishers in every tournament
df_main = df_main[df_main['finish'] <= 80]

# Delete unused columns
df_main = df_main.drop(columns=[
    'player_initial_last',
    'player id',
    'hole_par',
    'hole_dkp',
    'hole_fdp',
    'hole_sdp',
    'streak_dkp',
    'streak_fdp',
    'streak_sdp',
    'finish_dkp',
    'finish_fdp',
    'finish_sdp',
    'total_dkp',
    'total_fdp',
    'total_sdp',
    'unnamed: 2',
    'unnamed: 3',
    'unnamed: 4',
    'purse',
    'tournament name'])


Index(['tournament id', 'strokes', 'n_rounds', 'made_cut', 'pos', 'player',
       'course', 'date', 'season', 'no_cut', 'finish', 'sg_putt', 'sg_arg',
       'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 'year', 'Year-Week',
       'strokes_to_par'],
      dtype='object')

# Add Features

In [3]:
# Calculate player rest
# "How many days since this player last played?"
df_main['player_rest'] = df_main.groupby('player')['date'].diff().dt.days
df_main['player_rest'] = df_main['player_rest'] * -1
# Assign rest of 28 days to players appearing for the first time in the table
df_main['player_rest'] = df_main['player_rest'].fillna(28) 


# Calculate measures of recent finishing performance
# Sort by player and date to calculate recency features
df_main1 = df_main.sort_values(['player', 'date'])

# Add recency measures - average finishing position
df_main1['AFP_last5'] = (
    df_main1.groupby('player')['finish']
      .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

df_main1['AFP_last10'] = (
    df_main1.groupby('player')['finish']
      .transform(lambda x: x.shift(1).rolling(window=10, min_periods=1).mean())
)

# Add recency measures - strokes gained
df_main1['SG_last5'] = (
    df_main1.groupby('player')['sg_total']
      .transform(lambda x: x.shift(1).rolling(window=3, min_periods=1).mean())
)

# Remove rows where there is no value for AFP
df_main2 = df_main1.dropna(subset=['AFP_last5', 'AFP_last10'])


# One-hot encode the course
df_main2['course_copy'] = df_main2['course']
df_main2 = pd.get_dummies(df_main2, columns=['course_copy'])

In [4]:
# Examine individual features here
new_df = df_main1[['player', 'finish','AFP_last5', 'date']]
#new_df

# Test / Train Split

In [5]:
# Train on all tournaments up to 2018 and test on all tournaments in 2019
df_train = df_main2[df_main2['date'] < '2021-01-01']
df_test1 = df_main2.loc[(df_main2['date'] >= '2021-01-01') & (df_main2['date'] <= '2021-12-31')]

# Get tournaments in test and train datasets
train_tournament_list = df_train['tournament id'].unique().tolist()
len(train_tournament_list)

# Remove courses that only show up in the test set
train_course_list = df_train['course'].unique().tolist()
test_course_list = df_test1['course'].unique().tolist()
unique_test_courses = [x for x in test_course_list if x not in train_course_list]

df_test = df_test1[~df_test1['course'].isin(unique_test_courses)]
test_tournament_list = df_test['tournament id'].unique().tolist()

# Feature Engineering

In [8]:
# Populate the train and test dataframes with features from the training dataset

# Add averge strokes to par for each player to their respective rows
# "How good is this player over their whole career?"
avg_strokes_to_par = df_train.groupby('player')['strokes_to_par'].mean()
df_train['lifetime_avg_strokes_to_par1'] = df_train['player'].map(avg_strokes_to_par)
df_test['lifetime_avg_strokes_to_par1'] = df_test['player'].map(avg_strokes_to_par)
# Create a LUT for later use
lifetime_stp_LUT = df_train[['player', 'lifetime_avg_strokes_to_par1']].drop_duplicates()
player_to_lifetime_stp = dict(zip(lifetime_stp_LUT['player'], lifetime_stp_LUT['lifetime_avg_strokes_to_par1']))
# Delete players who appear for the first time in the test data
df_test = df_test.dropna(subset=['lifetime_avg_strokes_to_par1'])


# Calculate the field strength for each tournament and add to their respective rows
# "How strong is the field at this tournament?"
#field_strength = df_train.groupby('tournament id')['lifetime_avg_strokes_to_par'].mean()
#df_train['field_strength'] = df_train['tournament id'].map(field_strength)
#df_test['field_strength'] = df_test['tournament id'].map(field_strength)

# Calculate course difficulty (player agnostic)
# "How tough is this course overall?"
course_difficulty = df_train.groupby('course')['strokes_to_par'].mean()
df_train['course_difficulty'] = df_train['course'].map(course_difficulty)
df_test['course_difficulty'] = df_test['course'].map(course_difficulty)
# Create a LUT for later use
course_difficulty_LUT = df_train[['course', 'course_difficulty']].drop_duplicates()
#course_to_difficulty = dict(zip(course_difficulty_LUT['course'], course_difficulty_LUT['course_difficulty']))


# Calculate avg strokes to par for all unique course-player combos
# "How well does this player usually play at this course?"
df_train['course_avg_strokes_to_par1'] = (df_train.groupby(['course', 'player'])['strokes_to_par'].transform('mean'))
# Create a LUT for later use
avg_stp_at_course_LUT = df_train[['course', 'player', 'course_avg_strokes_to_par1']].drop_duplicates()
mapping_dict = dict(zip(zip(avg_stp_at_course_LUT['course'], avg_stp_at_course_LUT['player']), avg_stp_at_course_LUT['course_avg_strokes_to_par1']))
df_test['course_avg_strokes_to_par1'] = df_test.apply(lambda row: mapping_dict.get((row['course'], row['player'])), axis=1)
#df_test['course_avg_strokes_to_par'] = df_test['course_avg_strokes_to_par'].fillna(df_test['course'].map(course_difficulty_LUT))

# For NaN values (players who play the course for the first time in test data), replace NaN with the course difficulty
df_test['course_avg_strokes_to_par1'] = df_test['course_avg_strokes_to_par1'].fillna(df_test['course_difficulty'])

# Delete NaN values
#df_test = df_test.dropna(subset=['course_avg_strokes_to_par1'])
df_train.shape

(12069, 96)

# Model Training & Evaluation

### Model #1: Multinomial Logistic Regression

In [9]:
# Create the actual X and y dataframes for training
course_columns = [col for col in df_main.columns if col.startswith('course_')]
feature_list = ['player_rest',
                'course_avg_strokes_to_par1',
                'lifetime_avg_strokes_to_par1',
                
                
                'AFP_last5',
                'AFP_last10', 
                'SG_last5'
            ] + course_columns 

# Limit the training data to top XX
df_train = df_train[df_train['finish'] <= 80]
print(df_train.shape)
# Create training data
X_train = df_train[feature_list]
y_train = df_train['finish']

#feature_list = ['player_rest', 
#            'lifetime_avg_strokes_to_par',
#            'course_difficulty',
#            'field_strength',
#            'course_avg_strokes_to_par',
#            'AFP_last5',
#            'AFP_last10'
#            'SG_last5'
#            ] + course_columns 

(12069, 96)


In [10]:
# Train model #1
le = LabelEncoder()
y_enc = le.fit_transform(y_train)


# Pipeline: scale -> multinomial logistic regression
model1 = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=500)
)

model1.fit(X_train, y_enc)

In [11]:
# Evaluate model #1
avg_corr1 = []
avg_mae1 = []
for tournament in test_tournament_list:
    df_test2 = df_test[df_test['tournament id'] == tournament]
    
    # Limit the training data to top XX
    df_test2 = df_test2[df_test2['finish'] <= 80]


    X_test_pre = df_test2[feature_list]
    X_test = X_test_pre#.fillna(X_test_pre.mean())
    y_test = df_test2['finish']
    
    probs = model1.predict_proba(X_test)
    predictions = model1.predict(X_test)
    expected_rank = np.sum(probs * np.arange(1, probs.shape[1]+1), axis=1)
    X_test['Expected Rank'] = expected_rank


    X_test['Predicted Finish'] = (
    X_test['Expected Rank']
        .rank(method='first')   # ensures no ties
        .astype(int)
    )

    X_test['Actual Finish'] = y_test
    y_pred = X_test['Predicted Finish']
    y_actual = X_test['Actual Finish']
    X_test.sort_values(by='Actual Finish')

    rho, _ = spearmanr(y_pred, y_actual)
    print("Spearman correlation:", rho)
    avg_corr1.append(rho)
    
    mae = mean_absolute_error(y_pred, y_actual)
    avg_mae1.append(mae)
    print("MAE:", mae)

overall_corr1 = np.nanmean(avg_corr1)
overall_mae1 = np.nanmean(avg_mae1)

print("Average Spearman correlation:", overall_corr1)
print("Average MAE:", overall_mae1)

Spearman correlation: -0.08173889521866688
MAE: 22.517857142857142
Spearman correlation: 0.391740714676453
MAE: 17.46511627906977
Spearman correlation: 0.19812545338638632
MAE: 22.45
Spearman correlation: -0.36180013839726005
MAE: 23.295454545454547
Spearman correlation: 0.27587427947461085
MAE: 17.96
Spearman correlation: 0.145306793196128
MAE: 19.448979591836736
Spearman correlation: -0.037374508461227696
MAE: 20.352941176470587
Spearman correlation: 0.23961970851838488
MAE: 19.06
Spearman correlation: 0.3447694346780586
MAE: 19.948275862068964
Spearman correlation: 0.011095379357367331
MAE: 22.017241379310345
Spearman correlation: 0.08082535196794999
MAE: 23.733333333333334
Spearman correlation: 0.1645325919673441
MAE: 19.24
Spearman correlation: 0.214852005486371
MAE: 21.076923076923077
Spearman correlation: 0.022338405722244183
MAE: 21.975609756097562
Spearman correlation: 0.17220083412006948
MAE: 23.393939393939394
Spearman correlation: 0.13591857355700426
MAE: 18.73076923076923


### Model #2: Random Forest Classifier

In [13]:
# Train model #2
rf_reg = RandomForestClassifier(
    n_estimators=500,     # number of trees
    max_depth=None,       # or specify depth, e.g. 5
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',  # number of features per split
    random_state=42,
    n_jobs=-1             # use all CPU cores
)

rf_reg.fit(X_train, y_train)

In [14]:
# Evaluate Random Forest
avg_corr2 = []
avg_mae2 = []
for tournament in test_tournament_list:
    df_test2 = df_test[df_test['tournament id'] == tournament]
    
    overall_accuracy_score = []
    overall_spearman_corr = []
    
    # Limit the training data to top XX
    df_test2 = df_test2[df_test2['finish'] <= 80]
    
    X_test_pre = df_test2[feature_list]
    X_test = X_test_pre#.fillna(X_test_pre.mean())
    y_test = df_test2['finish']
    
    y_pred = rf_reg.predict(X_test)
    #print(y_pred)
    
    df_results = pd.DataFrame({
    'true_finish': y_test.values,
    'Predicted Finish': y_pred,
    })
    #print(df_results)
    rho, _ = spearmanr(df_results['true_finish'], df_results['Predicted Finish'])
    avg_corr2.append(rho)
    print("Spearman correlation:", rho)
    
    mae = mean_absolute_error(df_results['true_finish'], df_results['Predicted Finish'])
    avg_mae2.append(mae)
    print("MAE:", mae)

overall_corr2 = np.nanmean(avg_corr2)
overall_mae2 = np.nanmean(avg_mae2)

print("Average Spearman correlation:", overall_corr2)
print("Average MAE:", overall_mae2)

Spearman correlation: 0.03839970140002214
MAE: 24.017857142857142
Spearman correlation: 0.10746177069163551
MAE: 21.627906976744185
Spearman correlation: 0.11225778991552336
MAE: 24.575
Spearman correlation: -0.18856831986472197
MAE: 24.818181818181817
Spearman correlation: 0.034734986340769755
MAE: 22.18
Spearman correlation: -0.1061502409198078
MAE: 25.755102040816325
Spearman correlation: 0.08590097967497382
MAE: 24.372549019607842
Spearman correlation: 0.06947779002455008
MAE: 25.24
Spearman correlation: 0.30023240700473297
MAE: 20.086206896551722
Spearman correlation: -0.030665137962887556
MAE: 24.620689655172413
Spearman correlation: 0.1419255376839184
MAE: 25.0
Spearman correlation: 0.11176223400381867
MAE: 23.9
Spearman correlation: 0.2398391115963117
MAE: 21.307692307692307
Spearman correlation: 0.035048775614326816
MAE: 23.073170731707318
Spearman correlation: 0.254529841684586
MAE: 21.575757575757574
Spearman correlation: 0.055293834634902145
MAE: 24.865384615384617
Spearman

### Model #3: Light Gradient Boosted Machine (GBM)

In [15]:
# Prepare LightGBM dataset

# Limit the training data to top XX
df_train3 = df_train[df_train['finish'] <= 30]


train_data = lgb.Dataset(
    df_train3[feature_list],
    label= (df_train3['finish'].max() - df_train3['finish']),
    group=df_train3.groupby('tournament id').size().to_numpy()
)

# Define model #3
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'ndcg_eval_at': [5],
    'verbosity': -1
}

# Train model #3
model = lgb.train(params, train_data, num_boost_round=200)

In [17]:
# Evaluate Light GBM
avg_corr3 = []
avg_mae3 = []

for tournament in test_tournament_list:
    # Limit the test data to top XX
    df_test = df_test[df_test['finish'] <= 80]
    
    X_test1 = df_test[df_test['tournament id'] == tournament]
    X_test2 = X_test1[feature_list]
    X_test2['pred_score'] = model.predict(X_test2)
    
    X_test2['pred_rank'] = X_test2['pred_score'].rank(ascending=False, method='first')
   
    X_test2['finish'] = X_test1['finish']
    X_test3 = X_test2.sort_values('finish')
    #print(X_test3[['finish','pred_rank']])
    
    rho, _ = spearmanr(X_test2['finish'], X_test2['pred_rank'])
    avg_corr3.append(rho)
    print("Spearman correlation:", rho)

    mae = mean_absolute_error(X_test2['finish'], X_test2['pred_score'])
    avg_mae3.append(mae)
    print("MAE:", mae)

    
overall_corr3 = np.nanmean(avg_corr3)
overall_mae3 = np.nanmean(avg_mae3)

print("Average Spearman correlation:", overall_corr3)
print("Average MAE:", overall_mae3)

Spearman correlation: -0.05656208066117569
MAE: 34.22397351176354
Spearman correlation: 0.26030139593632734
MAE: 33.47618233244043
Spearman correlation: 0.21492694490748052
MAE: 36.38094861086573
Spearman correlation: -0.4561735625395699
MAE: 32.50304009838274
Spearman correlation: 0.12231615641857528
MAE: 33.67068051084179
Spearman correlation: -0.010554724752610144
MAE: 34.00466203395779
Spearman correlation: -0.19954263691207103
MAE: 33.39023279728706
Spearman correlation: 0.09663794436829114
MAE: 34.3766079579376
Spearman correlation: 0.30818241342464253
MAE: 39.73034785973723
Spearman correlation: 0.026998756436260502
MAE: 34.34796770009601
Spearman correlation: -0.09866267102294583
MAE: 39.12305594075515
Spearman correlation: 0.1471981661235502
MAE: 33.65864176980881
Spearman correlation: 0.14029579163034347
MAE: 35.30640989978018
Spearman correlation: 0.06797883466847643
MAE: 34.83709721546122
Spearman correlation: 0.12273710864254221
MAE: 35.75551047836513
Spearman correlation:

### Model #4: Simple Neural Network

In [18]:
# Scale the data
scaler = StandardScaler()
X_train4 = scaler.fit_transform(X_train)
#X_test4 = scaler.transform(X_test)

In [19]:
model = models.Sequential([
    layers.Input(shape=(X_train4.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='linear')  
])

model.compile(
    optimizer='adam',
    loss='mean_absolute_error',  
    metrics=['mean_absolute_error']
)

history = model.fit(
    X_train4, y_train,
    #validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

Epoch 1/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 23.7991 - mean_absolute_error: 23.7991
Epoch 2/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.2088 - mean_absolute_error: 15.2088
Epoch 3/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 15.0750 - mean_absolute_error: 15.0750
Epoch 4/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.8266 - mean_absolute_error: 14.8266
Epoch 5/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.1352 - mean_absolute_error: 15.1352
Epoch 6/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.8900 - mean_absolute_error: 14.8900
Epoch 7/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.0248 - mean_absolute_error: 15.0248
Epoch 8/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [20]:
tst_tournament_list = test_tournament_list[0:1]
overall_spearman_corr2 = []
overall_mae = []

for tournament in test_tournament_list:
     # Limit the test data to top XX
    df_test = df_test[df_test['finish'] <= 80]
    
    X_test1 = df_test[df_test['tournament id'] == tournament]
    X_test2 = X_test1[feature_list]
    X_test4 = scaler.transform(X_test2)
   
    
    y_test1 = X_test1['finish']
    
    y_pred = model.predict(X_test4).flatten()
    #print(y_pred)
    
    
    # Implement forced ranking
    df_forced_rank = pd.DataFrame({
    'Actual': y_test1,
    'predicted_finish': y_pred
    })

    # Sort by predicted finish (ascending)
    df_forced_rank = df_forced_rank.sort_values('predicted_finish')

    # Assign forced rank
    df_forced_rank['predicted_rank'] = range(1, len(df_forced_rank) + 1)
    
    #print(df_forced_rank)
    
    rho, _ = spearmanr(y_test1, y_pred)
    overall_spearman_corr2.append(rho)
    #print("Spearman correlation:", rho)
    
    mae = mean_absolute_error(y_test1, y_pred)
    overall_mae.append(mae)
    #print("Test MAE:", mae)
    
overall_corr = np.nanmean(overall_spearman_corr2)
overall_mae = np.nanmean(overall_mae)

print("Spearman correlation:", overall_corr)
print("MAE:", overall_mae)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17