#### Set Up

In [1]:
## imports and dependencies

# standard libraries
import pandas as pd
import numpy as np

# standard libraries
from sklearn.model_selection import GroupKFold

# accuracy
from sklearn.metrics import mean_absolute_error

# model
from sklearn.ensemble import GradientBoostingRegressor

# cross validation
from sklearn.model_selection import cross_val_score

# scaler 
from sklearn.preprocessing import StandardScaler  


#### Data

In [2]:
df = pd.read_csv("C:/Users/barry/Desktop/tar_enc_dataset.csv", encoding="latin1")
df = df.drop(['Weekly Gross', 'Total Gross', 'Annual Gross', 'Weekly Net', 'Annual Net', 'Ln Annual Net', 'AgeSQ'], axis=1)

In [3]:
# target and explanatory variables and groups
IDs = df['Unique ID']
x = df.drop(['Ln Weekly Gross', 'Name', 'Unique ID'], axis=1)
y = df['Ln Weekly Gross']

# as arrays
x = x.values
y = y.values

In [4]:
# standard scaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [5]:
# random forest model
gb = GradientBoostingRegressor(random_state=42)

In [6]:
# create groups
groups = df['Unique ID'].values

In [7]:
# grid search w/ group k fold cross validation
group_kfold = GroupKFold(n_splits=5)

In [8]:
# cross validation 
mae_scores = cross_val_score(gb, x, y, cv=group_kfold, groups=groups, scoring='neg_mean_absolute_error')
mae_scores = -mae_scores  # Convert negative scores to positive
print("Cross-Validation Mean Absolute Errors:", mae_scores)
print("Average Mean Absolute Error:", np.mean(mae_scores))

Cross-Validation Mean Absolute Errors: [0.37945959 0.35906365 0.38419598 0.40231964 0.35335411]
Average Mean Absolute Error: 0.3756785933757439


In [9]:
# IDs in each fold
for i, (train_index, test_index) in enumerate(group_kfold.split(x, y, IDs)):
    print(f"Fold {i}:")
    print()
    print("Train:")
    for j, k in enumerate(IDs[train_index], 1):
        print(k, end=", ")
        if j % 50 == 0 or j == len(train_index):
            print()
    print()
    print("Test:")
    for j, k in enumerate(IDs[test_index], 1):
        print(k, end=", ")
        if j % 50 == 0 or j == len(test_index):
            print()
    print()
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error: {mae}')
    print()

Fold 0:

Train:
3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 15, 17, 18, 18, 19, 20, 21, 21, 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 24, 24, 25, 25, 
26, 26, 26, 27, 27, 27, 28, 28, 28, 28, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 34, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 40, 40, 41, 42, 42, 42, 43, 43, 43, 43, 43, 45, 45, 45, 46, 46, 
46, 47, 47, 47, 47, 49, 50, 50, 50, 51, 52, 52, 53, 53, 53, 53, 55, 55, 55, 55, 56, 56, 56, 57, 58, 58, 59, 59, 59, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 64, 64, 64, 65, 66, 66, 66, 66, 66, 67, 
67, 69, 70, 70, 70, 71, 71, 71, 71, 72, 73, 73, 73, 75, 75, 75, 76, 76, 77, 79, 80, 82, 83, 83, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 87, 89, 90, 90, 90, 90, 91, 92, 92, 92, 93, 93, 93, 95, 96, 97, 
97, 97, 98, 98, 99, 100, 100, 100, 100, 100, 102, 102, 103, 104, 104, 104, 104, 105, 107, 107, 107, 107, 108, 108, 110, 110, 110, 111, 111, 113, 113, 113, 113, 115, 115, 115, 113, 111, 117, 117, 117

NameError: name 'rf' is not defined