In [1]:
## imports and dependencies

# standard libraries
import pandas as pd
import numpy as np

# group-aware cross-validation splitter
from sklearn.model_selection import GroupKFold

# error metric (mean absolute error)
from sklearn.metrics import mean_absolute_error

# model
from sklearn.ensemble import RandomForestRegressor

# cross validation
from sklearn.model_selection import cross_val_score

# encoding
from sklearn.preprocessing import LabelEncoder

# scaler 
from sklearn.preprocessing import StandardScaler  

#### Data

In [2]:
# Read the dataset and discard, in one step, the columns that are not kept
# for modelling.
df = pd.read_csv("region_dataset.csv").drop(
    columns=[
        'Weekly Gross',
        'Total Gross',
        'Annual Gross',
        'Weekly Net',
        'Annual Net',
        'Ln Annual Net',
        'AgeSQ',
        'Pos_AM',
        'Pos_CB',
        'Pos_CF',
        'Pos_CM',
        'Pos_DF',
        'Pos_DM',
        'Pos_FW',
        'Pos_FW,DF',
        'Pos_FW,MF',
        'Pos_LB',
        'Pos_LM',
        'Pos_LW',
        'Pos_MF',
        'Pos_MF,DF',
        'Pos_RB',
        'Pos_RM',
        'Pos_RW',
        'Pos_SS',
    ]
)

In [3]:
# Label encoder; fitted further down on the "Name" column to give each
# distinct player name an integer ID.
label_encoder = LabelEncoder()

In [4]:
# Label-encode the "Name" column into a new "Unique ID" column: identical
# names map to the same integer, so the ID can serve as a grouping key.
df["Unique ID"] = label_encoder.fit_transform(df["Name"])

In [5]:
# Grouping key (one integer per player name), kept as a pandas Series.
IDs = df['Unique ID']

# Feature matrix: every remaining column except the target and the two
# identifier columns, converted to a NumPy array.
x = df.drop(columns=['Ln Weekly Gross', 'Name', 'Unique ID']).to_numpy()

# Target vector as a NumPy array.
y = df['Ln Weekly Gross'].to_numpy()

In [6]:
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so held-out rows contribute to the mean/variance used for scaling.
# Tree ensembles are presumably insensitive to this kind of per-feature
# rescaling, so the effect on the scores is likely negligible -- confirm, or
# move the scaler into a per-fold Pipeline if a scale-sensitive model is used.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

In [7]:
# Random forest regressor with default hyperparameters; random_state is
# pinned so repeated runs build the same forest.
rf = RandomForestRegressor(random_state=42)

In [8]:
# Group label for every row (the encoded player ID) as a NumPy array, in the
# same row order as x and y.
groups = df['Unique ID'].to_numpy()

In [9]:
# Group-aware 5-fold splitter: all rows sharing a group label land in the
# same fold, so a group never appears on both the train and the test side.
group_kfold = GroupKFold(n_splits=5)

In [10]:
# Grouped cross-validation of the forest. scikit-learn reports this metric
# negated ("neg_mean_absolute_error") so that larger is better; the leading
# minus flips it back to a plain, positive MAE per fold.
mae_scores = -cross_val_score(
    rf,
    x,
    y,
    groups=groups,
    cv=group_kfold,
    scoring='neg_mean_absolute_error',
)
print("Cross-Validation Mean Absolute Errors:", mae_scores)
print("Average Mean Absolute Error:", mae_scores.mean())

Cross-Validation Mean Absolute Errors: [0.40804248 0.41582052 0.40675745 0.33659374 0.3961688 ]
Average Mean Absolute Error: 0.39267659845658887


In [11]:
# IDs in each fold, plus the mean absolute error obtained on that fold.


def _print_ids(ids, per_line=50):
    """Print ``ids`` separated by ", ", ending the line after every
    ``per_line`` values and after the final value."""
    total = len(ids)
    for count, value in enumerate(ids, 1):
        print(value, end=", ")
        if count % per_line == 0 or count == total:
            print()


for i, (train_index, test_index) in enumerate(group_kfold.split(x, y, IDs)):
    print(f"Fold {i}:")
    print()
    print("Train:")
    # split() yields row *positions*. IDs is a pandas Series, where plain
    # IDs[...] with integers is a label lookup, so use .iloc to select by
    # position regardless of what the frame's index looks like.
    _print_ids(IDs.iloc[train_index])
    print()
    print("Test:")
    _print_ids(IDs.iloc[test_index])
    print()
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Refits the shared `rf` object on this fold's training rows; after the
    # loop it holds the model trained on the last fold's training split.
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error: {mae}')
    print()

Fold 0:

Train:
66, 81, 81, 81, 81, 334, 391, 391, 391, 470, 6, 6, 6, 190, 190, 190, 190, 462, 462, 462, 462, 462, 521, 80, 80, 80, 385, 385, 106, 106, 106, 106, 172, 180, 197, 197, 197, 197, 252, 252, 252, 252, 252, 256, 256, 257, 257, 272, 272, 272, 
287, 287, 287, 297, 297, 297, 297, 298, 308, 308, 308, 308, 308, 342, 342, 342, 342, 368, 368, 369, 369, 417, 417, 417, 417, 469, 471, 482, 482, 482, 483, 483, 484, 500, 500, 500, 507, 507, 507, 507, 507, 544, 544, 544, 551, 551, 551, 550, 550, 550, 
265, 265, 265, 265, 404, 622, 30, 30, 30, 140, 140, 523, 523, 523, 523, 601, 614, 614, 162, 162, 162, 162, 162, 421, 421, 457, 457, 522, 522, 522, 574, 574, 574, 12, 29, 29, 29, 29, 29, 53, 70, 71, 71, 71, 91, 91, 91, 91, 93, 115, 
115, 115, 153, 153, 181, 204, 238, 261, 266, 267, 267, 311, 311, 311, 311, 313, 313, 313, 315, 383, 384, 372, 376, 376, 376, 382, 382, 382, 418, 458, 548, 548, 548, 552, 552, 554, 554, 554, 554, 554, 561, 561, 561, 561, 565, 565, 634, 637, 637, 637, 
637, 638, 95,