In [1]:
import pandas as pd
import numpy as np
import mord as m
import matplotlib.pyplot as plt
import lightgbm as lgb
import tensorflow as tf
import warnings

from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tensorflow.keras import layers, models


# Set some options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
np.random.seed(42)  
warnings.filterwarnings('ignore')

# Filepaths for source data files
path1 = 'Downloads/ASA All PGA Raw Data - Tourn Level.csv'
path2 = 'Downloads/OWGR Historical.csv'

2025-10-17 00:39:33.351403: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Import

In [2]:
# Import dataset 
df_main = pd.read_csv(path1)

# Clean Data and Create Features
df_main.columns = df_main.columns.str.lower()
df_main['player'] = df_main['player'].str.upper()
df_main['date'] = pd.to_datetime(df_main['date'])
df_main['year'] = df_main['date'].dt.year
df_main['Year-Week'] = df_main['date'].dt.strftime('%Y-%U')
df_main['strokes_to_par'] = df_main['strokes'] - df_main['hole_par']

# Filter Data to Recent 5 years
df_main = df_main[df_main['year'] != 2014]

# Filter Data to Players that have played more than 30 tournaments
player_tournament_counts = df_main['player'].value_counts()
players_meeting_threshold = player_tournament_counts[player_tournament_counts >= 30].index
df_main = df_main[df_main['player'].isin(players_meeting_threshold)]

# Clean up 'finish position' - convert ties to actual numbers
df_main['finish'] = df_main['finish'].str.lstrip("T")

# Clean up 'finish position' - remove rows with non-numeric entries ('WD', 'CUT', etc.)
df_main['finish'] = pd.to_numeric(df_main['finish'], downcast='integer', errors='coerce')
df_main = df_main.dropna(subset=['finish'])
df_main['finish'] = df_main['finish'].astype(int)

# Focus on the top XX finishers in every tournament
df_main = df_main[df_main['finish'] <= 80]

# Delete unused columns
df_main = df_main.drop(columns=[
    'player_initial_last',
    'player id',
    'hole_par',
    'hole_dkp',
    'hole_fdp',
    'hole_sdp',
    'streak_dkp',
    'streak_fdp',
    'streak_sdp',
    'finish_dkp',
    'finish_fdp',
    'finish_sdp',
    'total_dkp',
    'total_fdp',
    'total_sdp',
    'unnamed: 2',
    'unnamed: 3',
    'unnamed: 4',
    'purse',
    'tournament name'])

# Part 1: Supervised Learning

## Add Features

In [3]:
# Calculate player rest
# "How many days since this player last played?"
df_main['player_rest'] = df_main.groupby('player')['date'].diff().dt.days
df_main['player_rest'] = df_main['player_rest'] * -1
# Assign rest of 28 days to players appearing for the first time in the table
df_main['player_rest'] = df_main['player_rest'].fillna(28) 


# Calculate measures of recent finishing performance
# Sort by player and date to calculate recency features
df_main1 = df_main.sort_values(['player', 'date'])

# Add recency measures - average finishing position
df_main1['AFP_last5'] = (
    df_main1.groupby('player')['finish']
      .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

df_main1['AFP_last10'] = (
    df_main1.groupby('player')['finish']
      .transform(lambda x: x.shift(1).rolling(window=10, min_periods=1).mean())
)

# Add recency measures - strokes gained
df_main1['SG_last5'] = (
    df_main1.groupby('player')['sg_total']
      .transform(lambda x: x.shift(1).rolling(window=3, min_periods=1).mean())
)

# Remove rows where there is no value for AFP
df_main2 = df_main1.dropna(subset=['AFP_last5', 'AFP_last10'])


# One-hot encode the course
df_main2['course_copy'] = df_main2['course']
df_main2 = pd.get_dummies(df_main2, columns=['course_copy'])

In [4]:
# Examine individual features here
new_df = df_main1[['player', 'finish','AFP_last5', 'date']]
#new_df

## Test / Train Split

In [5]:
# Train on all tournaments up to 2018 and test on all tournaments in 2019
df_train = df_main2[df_main2['date'] < '2021-01-01']
df_test1 = df_main2.loc[(df_main2['date'] >= '2021-01-01') & (df_main2['date'] <= '2021-12-31')]

# Get tournaments in test and train datasets
train_tournament_list = df_train['tournament id'].unique().tolist()
len(train_tournament_list)

# Remove courses that only show up in the test set
train_course_list = df_train['course'].unique().tolist()
test_course_list = df_test1['course'].unique().tolist()
unique_test_courses = [x for x in test_course_list if x not in train_course_list]
df_test = df_test1[~df_test1['course'].isin(unique_test_courses)]
test_tournament_list = df_test['tournament id'].unique().tolist()

## Feature Engineering

In [6]:
# Populate the train and test dataframes with features from the training dataset

# Add averge strokes to par for each player to their respective rows
# "How good is this player over their whole career?"
avg_strokes_to_par = df_train.groupby('player')['strokes_to_par'].mean()
df_train['lifetime_avg_strokes_to_par1'] = df_train['player'].map(avg_strokes_to_par)
df_test['lifetime_avg_strokes_to_par1'] = df_test['player'].map(avg_strokes_to_par)
# Create a LUT for later use
lifetime_stp_LUT = df_train[['player', 'lifetime_avg_strokes_to_par1']].drop_duplicates()
player_to_lifetime_stp = dict(zip(lifetime_stp_LUT['player'], lifetime_stp_LUT['lifetime_avg_strokes_to_par1']))
# Delete players who appear for the first time in the test data
df_test = df_test.dropna(subset=['lifetime_avg_strokes_to_par1'])


# Calculate the field strength for each tournament and add to their respective rows
# "How strong is the field at this tournament?"
#field_strength = df_train.groupby('tournament id')['lifetime_avg_strokes_to_par'].mean()
#df_train['field_strength'] = df_train['tournament id'].map(field_strength)
#df_test['field_strength'] = df_test['tournament id'].map(field_strength)

# Calculate course difficulty (player agnostic)
# "How tough is this course overall?"
course_difficulty = df_train.groupby('course')['strokes_to_par'].mean()
df_train['course_difficulty'] = df_train['course'].map(course_difficulty)
df_test['course_difficulty'] = df_test['course'].map(course_difficulty)
# Create a LUT for later use
course_difficulty_LUT = df_train[['course', 'course_difficulty']].drop_duplicates()
#course_to_difficulty = dict(zip(course_difficulty_LUT['course'], course_difficulty_LUT['course_difficulty']))


# Calculate avg strokes to par for all unique course-player combos
# "How well does this player usually play at this course?"
df_train['course_avg_strokes_to_par1'] = (df_train.groupby(['course', 'player'])['strokes_to_par'].transform('mean'))
# Create a LUT for later use
avg_stp_at_course_LUT = df_train[['course', 'player', 'course_avg_strokes_to_par1']].drop_duplicates()
mapping_dict = dict(zip(zip(avg_stp_at_course_LUT['course'], avg_stp_at_course_LUT['player']), avg_stp_at_course_LUT['course_avg_strokes_to_par1']))
df_test['course_avg_strokes_to_par1'] = df_test.apply(lambda row: mapping_dict.get((row['course'], row['player'])), axis=1)
#df_test['course_avg_strokes_to_par'] = df_test['course_avg_strokes_to_par'].fillna(df_test['course'].map(course_difficulty_LUT))

# For NaN values (players who play the course for the first time in test data), replace NaN with the course difficulty
df_test['course_avg_strokes_to_par1'] = df_test['course_avg_strokes_to_par1'].fillna(df_test['course_difficulty'])

# Delete NaN values
#df_test = df_test.dropna(subset=['course_avg_strokes_to_par1'])

## Model Training & Evaluation

### Model #1: Multinomial Logistic Regression

In [7]:
# Create the actual X and y dataframes for training
course_columns = [col for col in df_main.columns if col.startswith('course_')]
feature_list = ['player_rest',
                'course_avg_strokes_to_par1',
                'lifetime_avg_strokes_to_par1',
                
                
                'AFP_last5',
                'AFP_last10'
            ] + course_columns 

# Limit the training data to top XX
df_train = df_train[df_train['finish'] <= 80]

# Create training data
X_train = df_train[feature_list]
y_train = df_train['finish']



#feature_list = ['player_rest', 
#            'lifetime_avg_strokes_to_par',
#            'course_difficulty',
#            'field_strength',
#            'course_avg_strokes_to_par',
#            'AFP_last5',
#            'AFP_last10'
#            'SG_last5'
#            ] + course_columns 

In [8]:
# Train model #1
le = LabelEncoder()
y_enc = le.fit_transform(y_train)


# Pipeline: scale -> multinomial logistic regression
model1 = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=500)
)

model1.fit(X_train, y_enc)

In [9]:
# Evaluate model #1
avg_corr1 = []
avg_mae1 = []
for tournament in test_tournament_list:
    df_test2 = df_test[df_test['tournament id'] == tournament]
    #print(df_test2['tournament id'])
    
    # Limit the training data to top XX
    df_test2 = df_test2[df_test2['finish'] <= 80]


    X_test_pre = df_test2[feature_list]
    X_test = X_test_pre#.fillna(X_test_pre.mean())
    y_test = df_test2['finish']
    
    probs = model1.predict_proba(X_test)
    predictions = model1.predict(X_test)
    expected_rank = np.sum(probs * np.arange(1, probs.shape[1]+1), axis=1)
    X_test['Expected Rank'] = expected_rank


    X_test['Predicted Finish'] = (
    X_test['Expected Rank']
        .rank(method='first')   # ensures no ties
        .astype(int)
    )

    X_test['Actual Finish'] = y_test
    y_pred = X_test['Predicted Finish']
    y_actual = X_test['Actual Finish']
    X_test.sort_values(by='Actual Finish')

    rho, _ = spearmanr(y_pred, y_actual)
    #print("Spearman correlation:", rho)
    avg_corr1.append(rho)
    
    mae = mean_absolute_error(y_pred, y_actual)
    avg_mae1.append(mae)
    #print("MAE:", mae)

overall_corr1 = np.nanmean(avg_corr1)
overall_mae1 = np.nanmean(avg_mae1)

print("Average Spearman correlation:", overall_corr1)
print("Average MAE:", overall_mae1)

Average Spearman correlation: 0.19955517045080318
Average MAE: 19.71734545506199


### Model #2: Random Forest Classifier

In [10]:
# Train model #2
rf_reg = RandomForestClassifier(
    n_estimators=500,     # number of trees
    max_depth=None,       # or specify depth, e.g. 5
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',  # number of features per split
    random_state=42,
    n_jobs=-1             # use all CPU cores
)

rf_reg.fit(X_train, y_train)

In [11]:
# Evaluate Random Forest
avg_corr2 = []
avg_mae2 = []
for tournament in test_tournament_list:
    df_test2 = df_test[df_test['tournament id'] == tournament]
    
    overall_accuracy_score = []
    overall_spearman_corr = []
    
    # Limit the training data to top XX
    df_test2 = df_test2[df_test2['finish'] <= 80]
    
    X_test_pre = df_test2[feature_list]
    X_test = X_test_pre#.fillna(X_test_pre.mean())
    y_test = df_test2['finish']
    
    y_pred = rf_reg.predict(X_test)
    #print(y_pred)
    
    df_results = pd.DataFrame({
    'true_finish': y_test.values,
    'Predicted Finish': y_pred,
    })
    #print(df_results)
    rho, _ = spearmanr(df_results['true_finish'], df_results['Predicted Finish'])
    avg_corr2.append(rho)
    #print("Spearman correlation:", rho)
    
    mae = mean_absolute_error(df_results['true_finish'], df_results['Predicted Finish'])
    avg_mae2.append(mae)
    #print("MAE:", mae)

overall_corr2 = np.nanmean(avg_corr2)
overall_mae2 = np.nanmean(avg_mae2)

print("Average Spearman correlation:", overall_corr2)
print("Average MAE:", overall_mae2)

Average Spearman correlation: 0.10863978187558648
Average MAE: 22.51837237494626


### Model #3: Light Gradient Boosted Machine (GBM)

In [12]:
# Prepare LightGBM dataset

# Limit the training data to top XX
df_train3 = df_train[df_train['finish'] <= 30]


train_data = lgb.Dataset(
    df_train3[feature_list],
    label= (df_train3['finish'].max() - df_train3['finish']),
    group=df_train3.groupby('tournament id').size().to_numpy()
)

# Define model #3
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'ndcg_eval_at': [5],
    'verbosity': -1
}

# Train model #3
model = lgb.train(params, train_data, num_boost_round=200)

In [13]:
# Evaluate Light GBM
avg_corr3 = []
avg_mae3 = []

for tournament in test_tournament_list:
    # Limit the test data to top XX
    df_test = df_test[df_test['finish'] <= 80]
    
    X_test1 = df_test[df_test['tournament id'] == tournament]
    X_test2 = X_test1[feature_list]
    X_test2['pred_score'] = model.predict(X_test2)
    
    X_test2['pred_rank'] = X_test2['pred_score'].rank(ascending=False, method='first')
   
    X_test2['finish'] = X_test1['finish']
    X_test3 = X_test2.sort_values('finish')
    #print(X_test3[['finish','pred_rank']])
    
    rho, _ = spearmanr(X_test2['finish'], X_test2['pred_rank'])
    avg_corr3.append(rho)
    #print("Spearman correlation:", rho)

    mae = mean_absolute_error(X_test2['finish'], X_test2['pred_rank'])
    avg_mae3.append(mae)
    #print("MAE:", mae)

    
overall_corr3 = np.nanmean(avg_corr3)
overall_mae3 = np.nanmean(avg_mae3)

print("Average Spearman correlation:", overall_corr3)
print("Average MAE:", overall_mae3)

Average Spearman correlation: 0.13382483995257782
Average MAE: 20.77025650806503


### Model #4: Simple Neural Network

In [14]:
# Scale the data
X_test = df_test[feature_list]
y_test = df_test['finish']

scaler = StandardScaler()
X_train4 = scaler.fit_transform(X_train)
X_test4 = scaler.transform(X_test)

In [15]:
model = models.Sequential([
    layers.Input(shape=(X_train4.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='linear')  
])

model.compile(
    optimizer='adam',
    loss='mean_absolute_error',  
    metrics=['mean_absolute_error']
)

history = model.fit(
    X_train4, y_train,
    #validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

Epoch 1/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 23.4204 - mean_absolute_error: 23.4204
Epoch 2/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 15.6254 - mean_absolute_error: 15.6254
Epoch 3/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 15.0937 - mean_absolute_error: 15.0937
Epoch 4/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 15.0019 - mean_absolute_error: 15.0019
Epoch 5/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 14.9343 - mean_absolute_error: 14.9343
Epoch 6/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 14.9176 - mean_absolute_error: 14.9176
Epoch 7/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 14.9337 - mean_absolute_error: 14.9337
Epoch 8/50
[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [16]:
# Evaluate Simple NN
avg_corr4 = []
avg_mae4 = []

for tournament in test_tournament_list:
     # Limit the test data to top XX
    df_test = df_test[df_test['finish'] <= 80]
    
    X_test1 = df_test[df_test['tournament id'] == tournament]
    X_test2 = X_test1[feature_list]
    X_test4 = scaler.transform(X_test2)
   
    
    y_test1 = X_test1['finish']
    
    y_pred = model.predict(X_test4).flatten()
    #print(y_pred)
    
    
    # Implement forced ranking
    df_forced_rank = pd.DataFrame({
    'Actual': y_test1,
    'predicted_finish': y_pred
    })

    # Sort by predicted finish (ascending)
    df_forced_rank = df_forced_rank.sort_values('predicted_finish')

    # Assign forced rank
    df_forced_rank['predicted_rank'] = range(1, len(df_forced_rank) + 1)
    
    #print(df_forced_rank)
    
    rho, _ = spearmanr(y_test1, y_pred)
    avg_corr4.append(rho)
    #print("Spearman correlation:", rho)
    
    mae = mean_absolute_error(y_test1, y_pred)
    avg_mae4.append(mae)
    #print("Test MAE:", mae)
    
overall_corr4 = np.nanmean(avg_corr4)
overall_mae4 = np.nanmean(avg_mae4)

print("Spearman correlation:", overall_corr4)
print("MAE:", overall_mae4)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10

Modeling results for ordinal placement showed performance slightly better than random assignment, so we tried a simpler approach and changed the target to examine classifier performance of cut / made cut using the 2 most successful aproaches.

### Re-import Data and Create Features (to include players who didn't make the cut)

In [17]:
# Import dataset 
df_main = pd.read_csv(path1)

# Clean Data and Create Features
df_main.columns = df_main.columns.str.lower()
df_main['player'] = df_main['player'].str.upper()
df_main['date'] = pd.to_datetime(df_main['date'])
df_main['year'] = df_main['date'].dt.year
df_main['Year-Week'] = df_main['date'].dt.strftime('%Y-%U')
df_main['strokes_to_par'] = df_main['strokes'] - df_main['hole_par']

# Filter Data to Recent 5 years
df_main = df_main[df_main['year'] != 2014]

# Filter Data to Players that have played more than 30 tournaments
player_tournament_counts = df_main['player'].value_counts()
players_meeting_threshold = player_tournament_counts[player_tournament_counts >= 30].index
df_main = df_main[df_main['player'].isin(players_meeting_threshold)]

# Clean up 'finish position' - convert ties to actual numbers
df_main['finish'] = df_main['pos']#.str.lstrip("T")

# Delete unused columns
df_main = df_main.drop(columns=[
    'player_initial_last',
    'player id',
    'hole_par',
    'hole_dkp',
    'hole_fdp',
    'hole_sdp',
    'streak_dkp',
    'streak_fdp',
    'streak_sdp',
    'finish_dkp',
    'finish_fdp',
    'finish_sdp',
    'total_dkp',
    'total_fdp',
    'total_sdp',
    'unnamed: 2',
    'unnamed: 3',
    'unnamed: 4',
    'purse',
    'tournament name'])

# ADD FEATURES
# Calculate player rest
# "How many days since this player last played?"
df_main['player_rest'] = df_main.groupby('player')['date'].diff().dt.days
df_main['player_rest'] = df_main['player_rest'] * -1
# Assign rest of 28 days to players appearing for the first time in the table
df_main['player_rest'] = df_main['player_rest'].fillna(28) 


# Calculate measures of recent finishing performance
# Sort by player and date to calculate recency features
df_main1 = df_main.sort_values(['player', 'date'])

# Add recency measures - average finishing position
df_main1['AFP_last5'] = (
    df_main1.groupby('player')['finish']
      .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

df_main1['AFP_last10'] = (
    df_main1.groupby('player')['finish']
      .transform(lambda x: x.shift(1).rolling(window=10, min_periods=1).mean())
)

# Add recency measures - strokes gained
df_main1['SG_last5'] = (
    df_main1.groupby('player')['sg_total']
      .transform(lambda x: x.shift(1).rolling(window=3, min_periods=1).mean())
)

# Remove rows where there is no value for AFP
df_main2 = df_main1.dropna(subset=['AFP_last5', 'AFP_last10'])


# One-hot encode the course
df_main2['course_copy'] = df_main2['course']
df_main2 = pd.get_dummies(df_main2, columns=['course_copy'])


# TEST TRAIN SPLIT
# Train on all tournaments up to 2018 and test on all tournaments in 2019
df_train = df_main2[df_main2['date'] < '2021-01-01']
df_test1 = df_main2.loc[(df_main2['date'] >= '2021-01-01') & (df_main2['date'] <= '2021-12-31')]

# Get tournaments in test and train datasets
train_tournament_list = df_train['tournament id'].unique().tolist()
len(train_tournament_list)

# Remove courses that only show up in the test set
train_course_list = df_train['course'].unique().tolist()
test_course_list = df_test1['course'].unique().tolist()
unique_test_courses = [x for x in test_course_list if x not in train_course_list]

df_test = df_test1[~df_test1['course'].isin(unique_test_courses)]
test_tournament_list = df_test['tournament id'].unique().tolist()


# FEATURE ENGINEERING
# Populate the train and test dataframes with features from the training dataset

# Add averge strokes to par for each player to their respective rows
# "How good is this player over their whole career?"
avg_strokes_to_par = df_train.groupby('player')['strokes_to_par'].mean()
df_train['lifetime_avg_strokes_to_par1'] = df_train['player'].map(avg_strokes_to_par)
df_test['lifetime_avg_strokes_to_par1'] = df_test['player'].map(avg_strokes_to_par)
# Create a LUT for later use
lifetime_stp_LUT = df_train[['player', 'lifetime_avg_strokes_to_par1']].drop_duplicates()
player_to_lifetime_stp = dict(zip(lifetime_stp_LUT['player'], lifetime_stp_LUT['lifetime_avg_strokes_to_par1']))
# Delete players who appear for the first time in the test data
df_test = df_test.dropna(subset=['lifetime_avg_strokes_to_par1'])


# Calculate the field strength for each tournament and add to their respective rows
# "How strong is the field at this tournament?"
#field_strength = df_train.groupby('tournament id')['lifetime_avg_strokes_to_par'].mean()
#df_train['field_strength'] = df_train['tournament id'].map(field_strength)
#df_test['field_strength'] = df_test['tournament id'].map(field_strength)

# Calculate course difficulty (player agnostic)
# "How tough is this course overall?"
course_difficulty = df_train.groupby('course')['strokes_to_par'].mean()
df_train['course_difficulty'] = df_train['course'].map(course_difficulty)
df_test['course_difficulty'] = df_test['course'].map(course_difficulty)
# Create a LUT for later use
course_difficulty_LUT = df_train[['course', 'course_difficulty']].drop_duplicates()
#course_to_difficulty = dict(zip(course_difficulty_LUT['course'], course_difficulty_LUT['course_difficulty']))


# Calculate avg strokes to par for all unique course-player combos
# "How well does this player usually play at this course?"
df_train['course_avg_strokes_to_par1'] = (df_train.groupby(['course', 'player'])['strokes_to_par'].transform('mean'))
# Create a LUT for later use
avg_stp_at_course_LUT = df_train[['course', 'player', 'course_avg_strokes_to_par1']].drop_duplicates()
mapping_dict = dict(zip(zip(avg_stp_at_course_LUT['course'], avg_stp_at_course_LUT['player']), avg_stp_at_course_LUT['course_avg_strokes_to_par1']))
df_test['course_avg_strokes_to_par1'] = df_test.apply(lambda row: mapping_dict.get((row['course'], row['player'])), axis=1)
#df_test['course_avg_strokes_to_par'] = df_test['course_avg_strokes_to_par'].fillna(df_test['course'].map(course_difficulty_LUT))

# For NaN values (players who play the course for the first time in test data), replace NaN with the course difficulty
df_test['course_avg_strokes_to_par1'] = df_test['course_avg_strokes_to_par1'].fillna(df_test['course_difficulty'])

# Delete NaN values
#df_test = df_test.dropna(subset=['course_avg_strokes_to_par1'])

### Model #5: Multinomial Logistic Regression (with new target)

In [18]:
# Create the actual X and y dataframes for training
course_columns = [col for col in df_main.columns if col.startswith('course_')]
feature_list = ['player_rest',
                'course_avg_strokes_to_par1',
                'lifetime_avg_strokes_to_par1',
                'course_difficulty',
                
                'AFP_last5',
                'AFP_last10',
            ] + course_columns 

# Create training data
X_train = df_train[feature_list]
y_train = df_train['made_cut']

#feature_list = ['player_rest', 
#            'lifetime_avg_strokes_to_par',
#            'course_difficulty',
#            'field_strength',
#            'course_avg_strokes_to_par',
#            'AFP_last5',
#            'AFP_last10'
#            'SG_last5'
#            ] + course_columns 

In [19]:
# Train model #5
le = LabelEncoder()
y_enc = le.fit_transform(y_train)


# Pipeline: scale -> multinomial logistic regression
model5 = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=500)
)

model5.fit(X_train, y_enc)

In [20]:
# Evaluate model #5
avg_precision1 = []
avg_recall1 = []
avg_f1_1 = []
for tournament in test_tournament_list:
    df_test2 = df_test[df_test['tournament id'] == tournament]

    X_test = df_test2[feature_list]
    y_test = df_test2['made_cut']
    
    probs = model5.predict_proba(X_test)
   
    predictions = model5.predict(X_test)
    #print(y_test)
    #expected_rank = np.sum(probs * np.arange(1, probs.shape[1]+1), axis=1)
    #X_test['Expected Rank'] = expected_rank


    df_results = pd.DataFrame({
    'true_finish': y_test.values,
    'Predicted Finish': predictions,
    })
    #print(df_results)

    #X_test['Actual Finish'] = y_test
    #y_pred = X_test['Predicted Finish']
    #y_actual = X_test['Actual Finish']
    #X_test.sort_values(by='Actual Finish')
    
    precision1 = precision_score(y_test.values, predictions)
    avg_precision1.append(precision1)
    #print(f"Precision: {precision1}")

    recall1 = recall_score(y_test.values, predictions)
    avg_recall1.append(recall1)
    #print(f"Recall: {recall1}")
    
    f1 = f1_score(y_test.values, predictions)
    avg_f1_1.append(f1)
    #print(f"F1 Score (binary): {f1}")

overall_precision1 = np.nanmean(avg_precision1)
overall_recall1 = np.nanmean(avg_recall1)
overall_f1_1 = np.nanmean(avg_f1_1)

print("Average Precision:", overall_precision1)
print("Average Recall:", overall_recall1)
print("Average f1 Score:", overall_f1_1)

Average Precision: 0.628998862138165
Average Recall: 0.8514979923544082
Average f1 Score: 0.7145765571400606


### Model #6: Simple Neural Network (with new target)

In [21]:
# Scale the data
scaler = StandardScaler()
X_train6 = scaler.fit_transform(X_train)
#X_test4 = scaler.transform(X_test)

In [22]:
model6 = models.Sequential([
    layers.Input(shape=(X_train6.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='linear')  
])

model6.compile(
    optimizer='adam',
    loss='mean_absolute_error',  
    metrics=['mean_absolute_error']
)

history = model6.fit(
    X_train6, y_train,
    #validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

Epoch 1/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.3905 - mean_absolute_error: 0.3905
Epoch 2/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.3067 - mean_absolute_error: 0.3067
Epoch 3/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.2911 - mean_absolute_error: 0.2911
Epoch 4/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.2846 - mean_absolute_error: 0.2846
Epoch 5/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.2847 - mean_absolute_error: 0.2847
Epoch 6/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.2804 - mean_absolute_error: 0.2804
Epoch 7/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.2806 - mean_absolute_error: 0.2806
Epoch 8/50
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [24]:
# Evaluate model #6
avg_precision4 = []
avg_recall4 = []
avg_f1_4 = []

for tournament in test_tournament_list:

    X_test1 = df_test[df_test['tournament id'] == tournament]
    X_test2 = X_test1[feature_list]
    X_test4 = scaler.transform(X_test2)
   
    
    y_test1 = X_test1['made_cut']
    
    y_pred = model6.predict(X_test4).flatten()
    y_pred1 = np.where(np.array(y_pred) >= 0.5, 1, 0)
    #print(y_pred)
    
    
    # Implement forced ranking
    df_forced_rank = pd.DataFrame({
    'Actual': y_test1,
    'predicted_finish': y_pred
    })

    # Sort by predicted finish (ascending)
    df_forced_rank = df_forced_rank.sort_values('predicted_finish')

    # Assign forced rank
    df_forced_rank['predicted_rank'] = range(1, len(df_forced_rank) + 1)
    
    #print(df_forced_rank)
    
    
    
    #print(type(y_test[0]))
    #print(type(y_pred[0]))
    
    precision4 = precision_score(y_test1, y_pred1)
    avg_precision4.append(precision4)
    #print(f"Precision: {precision4}")
    
    recall4 = recall_score(y_test1, y_pred1)
    avg_recall4.append(recall4)
    #print(f"Recall: {recall4}")
    
    f1_4 = f1_score(y_test1, y_pred1)
    avg_f1_4.append(f1_4)
    #print(f"F1 Score (binary): {f1}")
    


overall_precision4 = np.nanmean(avg_precision4)
overall_recall4 = np.nanmean(avg_recall4)
overall_f1_4 = np.nanmean(avg_f1_4)

print("Average Precision:", overall_precision4)
print("Average Recall:", overall_recall4)
print("Average f1 Score:", overall_f1_4)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6m

# Part 2: Unsupervised Learning

In [None]:
# Pete stuff goes here