### Functions

In [1]:
def get_datasets(df, feat_target, feat_norm, norm_shift, flg_ma, season_min = 2023):
    """Build the feature matrix, normalized target and back-transform terms.

    The caller's frame is never modified: all working columns ('m', 's',
    'target', 'lag*', 'ma*') are created on a private copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'player_id', 'season', `feat_target`, the three
        'static_*' columns, the info columns returned below, and every column
        named in `feat_norm`.
        NOTE(review): lag features use `groupby('player_id').shift`, i.e. row
        order; this presumably expects rows sorted by season within each
        player -- confirm against the preprocessing step.
    feat_target : str
        Name of the column to predict.
    feat_norm : list of str or None
        Grouping columns for the target normalization (must include 'season',
        because the group statistics are shifted along 'season'). None means
        no group normalization: m = 0, s = 1 + eps, and the static features
        are left on their original scale.
    norm_shift : int
        Number of seasons the group statistics are moved forward, so that a
        row is normalized with statistics of an earlier season.
    flg_ma : bool
        If True, add 'ma1'..'maK' = row-wise mean of 'lag1'..'lagK'.
    season_min : int, default 2023
        Only rows with season >= season_min are returned; the number of lag
        features is season_min - 2016 - 1.

    Returns
    -------
    (x, y, m, s, info)
        x: features, y: normalized target, m / s: per-row shift and scale
        (original target = y * s + m), info: identification columns.
    """
    eps = 1e-3            # keeps the scale strictly positive when a group std is 0
    first_season = 2016   # lags are created for season_min - first_season - 1 steps
    feats_static = [
        'static_height_cm',
        'static_weight_kg',
        'static_bmi_calc',
    ]

    # Private copy: the original implementation wrote 'm', 's', 'target' and
    # the lag columns straight into the caller's frame when feat_norm was None.
    df = df.copy()

    if feat_norm is not None:
        # Left-overs from an earlier call would collide with the merged stats.
        df = df.drop(columns = ['m', 's'], errors = 'ignore')

        # String names select the pandas aggregations (sample std, ddof=1).
        df_ag = df.groupby(feat_norm)[feat_target].agg(['mean', 'std']).fillna(0)
        df_ag.columns = ['m','s']
        df_ag = df_ag.reset_index()
        # shift the statistics forward to prevent a look into the future
        df_ag['season'] = df_ag['season'] + norm_shift
        df_ag['s'] = df_ag['s'] + eps
        # Explicit keys: never merge on whatever columns happen to be shared.
        df = df.merge(df_ag, how = 'left', on = feat_norm)

        # normalize static features (statistics taken over all rows of df)
        for f in feats_static:
            df[f] = (df[f] - df[f].mean()) / df[f].std()
    else:
        df['m'] = 0
        df['s'] = 1 + eps

    # normalization of target
    df['target'] = (df[feat_target] - df['m']) / df['s']

    # create lag features of the target
    max_lag = season_min - first_season
    feats_lag = [f'lag{i}' for i in range(1, max_lag)]
    for i, f in enumerate(feats_lag, start = 1):
        df[f] = df.groupby('player_id')['target'].shift(i)

    # feats for training
    feats_used = feats_static + feats_lag

    # moving averages features: computed before the null filling so that
    # missing lags are skipped by mean() instead of being counted as -1
    if flg_ma:
        feats_ma = [f'ma{i}' for i in range(1, max_lag)]
        for i, f in enumerate(feats_ma, start = 1):
            # explicit column list instead of a label slice, so the result
            # does not depend on where the lag columns sit in the frame
            df[f] = df[feats_lag[:i]].mean(axis = 1)
        feats_used = feats_used + feats_ma

    # nulls filling (for sklearn models); only the lag columns are filled
    for f in feats_lag:
        df[f] = df[f].fillna(-1)

    # filtering
    filt = df['season'] >= season_min
    feats_info = ['player_id','display_name','season','position','fantasy_points_ppr','games','fantasy_points_per_game']
    df_sel = df[filt]
    return df_sel[feats_used], df_sel['target'], df_sel['m'], df_sel['s'], df_sel[feats_info]

In [2]:
# Helper: mean and std of the per-fold MAE for one estimator
def cv_scores(clf, kfold):
    """Cross-validate `clf` and score it on the original target scale.

    Reads the notebook-level variables `x`, `y`, `s`, `m` and `real` (and the
    `ms` / `metrics` sklearn modules); they must be set before the call.

    Returns a tuple (mean of fold MAEs, std of fold MAEs).
    """
    # Out-of-fold predictions in normalized space, mapped back with s and m.
    oof = ms.cross_val_predict(clf, x, y, cv = kfold)
    oof = (oof * s + m).values

    # Re-split to recover the test indices of each fold.
    # NOTE(review): this matches the folds used by cross_val_predict only if
    # `kfold` yields the same split on every call (e.g. fixed random_state).
    fold_mae = [
        metrics.mean_absolute_error(oof[list(test_idx)], real[list(test_idx)])
        for _, test_idx in kfold.split(real)
    ]
    return np.mean(fold_mae), np.std(fold_mae)


def model_searching(kfold1, kfold2):
    """Tune several regressors and score the tuned models.

    For every model: grid-search its hyper-parameters on `kfold1`
    (scoring: negative MAE), rebuild the estimator with the best parameters
    and evaluate it with `cv_scores` on `kfold2`. If the feature matrix
    contains moving-average columns ('ma1', 'ma2', ...), each of them is also
    scored as a model-free baseline under the key 'MA<i>'.

    Reads the notebook-level variables `x`, `y`, `s`, `m`, `real` and the
    `ms` / `metrics` sklearn modules.

    Returns
    -------
    dict_out : dict
        model name -> (mean fold MAE, std fold MAE)
    dict_params : dict
        model name -> str(best parameters)
    """
    import xgboost as xgb
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import ElasticNet
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.tree import DecisionTreeRegressor

    # (display name, estimator class, fixed constructor arguments, search grid)
    model_specs = [
        ('Decision tree', DecisionTreeRegressor, {}, {
            'max_depth': list(range(2,10)),
        }),
        ('Xgboost', xgb.XGBRegressor, {}, {
            'learning_rate': [0.05, 0.1],
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 6, 9],
            'gamma': [0, 1],
            'subsample': [0.7, 1],
            'colsample_bytree': [0.7, 1]
        }),
        ('Random forest', RandomForestRegressor, {'n_estimators': 100}, {
            'max_depth': [3, 6, None],
        }),
        ('KNN', KNeighborsRegressor, {'n_neighbors': 2}, {
            'n_neighbors': [2, 5, 10, 25, 50],
        }),
        ('Linear model', ElasticNet, {'alpha': 0.005}, {
            'alpha': [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001],
            # l1_ratio is a mixing weight and must lie in [0, 1]; the former
            # grid also contained 10 and 2, which are not valid settings.
            'l1_ratio': [1, 0.5, 0.1, 0.01],
        }),
        ('Neural network (MLP)', MLPRegressor, {}, {
            'activation' : ['identity', 'logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': [(2,), (4,), (8,), (16,), (2,2), (4,2), (8,2) ],
        }),
    ]

    dict_out = dict()
    dict_params = dict()
    for modelname, estimator_cls, fixed_kwargs, params in model_specs:
        grid = ms.GridSearchCV(estimator_cls(**fixed_kwargs), params,
                               cv = kfold1, scoring = 'neg_mean_absolute_error')
        grid.fit(x, y)
        # Evaluate the same configuration that was tuned: fixed arguments
        # plus the winning grid point (grid values take precedence).
        clf = estimator_cls(**{**fixed_kwargs, **grid.best_params_})
        dict_out[modelname] = cv_scores(clf = clf, kfold = kfold2)
        dict_params[modelname] = str(grid.best_params_)

    # Moving-average baselines: use whichever 'ma<i>' columns the feature
    # matrix actually has, instead of a hard-coded count and a global flag.
    ma_cols = [col for col in x.columns
               if str(col).startswith('ma') and str(col)[2:].isdigit()]
    for col in ma_cols:
        pred = (x[col] * s + m).values
        list_res = [
            metrics.mean_absolute_error(pred[list(test_idx)], real[list(test_idx)])
            for _, test_idx in kfold2.split(real)
        ]
        dict_out[f'MA{str(col)[2:]}'] = np.mean(list_res), np.std(list_res)

    return dict_out, dict_params

### Training

In [3]:
%%time
#####################
# TRAINING
#####################

# The targets we want to predict (comparing "total" vs. "games" * "per game")
feats_target = [
    'games',
    'fantasy_points_per_game',
    'fantasy_points_ppr',
]
# Normalization features
feats_norm = [
    None,                        # without normalization
    ['season'],                  # only season
    ['season','position'],       # by the position + season
    ['season','player_cluster'], # by the player cluster + season
    ['season','team_cluster'],   # by the team cluster + season
]
norm_shift = 1 # number of seasons to shift for normalization, 0 - for the new 2024 predictions, 1 - for training 

########## 
# read main preprocessed dataset
df = pd.read_pickle('data/df_model.pkl')

from sklearn import model_selection as ms
from sklearn import metrics
kfold1 = ms.KFold(n_splits = 5, shuffle = True, random_state = 2017) # k folds for models parameters searching
kfold2 = ms.KFold(n_splits = 5, shuffle = True, random_state = 2022) # k folds for best models evaluation

# main loop
df_out_mean = pd.DataFrame()
df_out_std  = pd.DataFrame()
df_param = pd.DataFrame()
for feat_target in feats_target:
    for feat_norm in feats_norm:
        for flg_ma in [True, False]:
            try:
                print(feat_target, feat_norm, flg_ma)
                # 1) create datasets
                x, y, m, s, df_out = get_datasets(df = df[df.season_len > 1], 
                                          feat_target = feat_target, 
                                          feat_norm = feat_norm, 
                                          norm_shift = norm_shift, 
                                          flg_ma = flg_ma)
                real = (y * s + m).values

                # 2) searching the best models parameters on kfold1 and get their scores on kfold2 
                res1, param1 = model_searching(kfold1, kfold2)

                df_out1 = pd.DataFrame(res1)
                df_out1['target'] = feat_target
                df_out1['norm'] = str(feat_norm)
                df_out1['ma'] = flg_ma
                df_out_mean = pd.concat([df_out_mean, df_out1[:1]])
                df_out_std = pd.concat([df_out_std, df_out1[1:]])
                
                df_out2 = pd.DataFrame([[k,v] for k,v in param1.items()]).set_index(0).T
                df_out2['target'] = feat_target
                df_out2['norm'] = str(feat_norm)
                df_out2['ma'] = flg_ma
                df_param = pd.concat([df_param, df_out2])
                
            except Exception as e:
                print(e)

games None True
games None False
games ['season'] True
games ['season'] False
games ['season', 'position'] True
games ['season', 'position'] False
games ['season', 'player_cluster'] True
games ['season', 'player_cluster'] False
games ['season', 'team_cluster'] True
games ['season', 'team_cluster'] False
fantasy_points_per_game None True
fantasy_points_per_game None False
fantasy_points_per_game ['season'] True
fantasy_points_per_game ['season'] False
fantasy_points_per_game ['season', 'position'] True
fantasy_points_per_game ['season', 'position'] False
fantasy_points_per_game ['season', 'player_cluster'] True
fantasy_points_per_game ['season', 'player_cluster'] False
fantasy_points_per_game ['season', 'team_cluster'] True
fantasy_points_per_game ['season', 'team_cluster'] False
fantasy_points_ppr None True
fantasy_points_ppr None False
fantasy_points_ppr ['season'] True
fantasy_points_ppr ['season'] False
fantasy_points_ppr ['season', 'position'] True
fantasy_points_ppr ['season', 'po

In [31]:
# Show the mean and the std of the fold MAEs for the total-points target,
# one column per (normalization, moving-average flag) configuration.
for title, df_scores in [('Average MAE scores', df_out_mean),
                         ('STD of MAE scores', df_out_std)]:
    print(title)
    filt = df_scores.target == 'fantasy_points_ppr'
    # drop() returns a new frame; `del` on a filtered slice edits a copy
    df_out1 = df_scores[filt].drop(columns = 'target')
    display(
        df_out1.set_index(['norm','ma'])
        .ffill()   # MA baselines exist only in the ma=True rows; carry them into the following row
        .T
        .style.background_gradient(cmap='RdYlGn_r')
        .format('{:.1f}')   # Styler.set_precision was removed from pandas
    )

Average MAE scores


norm,None,None,['season'],['season'],"['season', 'position']","['season', 'position']","['season', 'player_cluster']","['season', 'player_cluster']","['season', 'team_cluster']","['season', 'team_cluster']"
ma,True,False,True,False,True,False,True,False,True,False
Decision tree,45.6,46.5,45.7,45.3,46.9,50.0,49.2,49.6,47.9,46.3
Xgboost,41.8,42.6,42.8,43.3,44.8,46.3,44.0,43.8,43.5,44.1
Random forest,42.9,43.0,42.9,43.7,44.0,44.8,45.3,43.6,43.7,44.0
KNN,42.6,42.9,42.5,45.0,44.0,45.2,42.0,45.4,42.4,44.5
Linear model,42.4,42.0,42.4,42.6,42.9,42.9,56.7,52.9,42.5,42.7
Neural network (MLP),58.4,43.3,42.7,43.6,43.1,44.4,42.2,43.9,42.2,44.0
MA1,46.1,46.1,44.8,44.8,46.8,46.8,47.3,47.3,44.8,44.8
MA2,46.0,46.0,44.2,44.2,45.7,45.7,46.9,46.9,44.3,44.3
MA3,46.1,46.1,44.0,44.0,45.5,45.5,46.5,46.5,43.9,43.9
MA4,46.1,46.1,44.6,44.6,46.0,46.0,136.6,136.6,44.4,44.4


STD of MAE scores


norm,None,None,['season'],['season'],"['season', 'position']","['season', 'position']","['season', 'player_cluster']","['season', 'player_cluster']","['season', 'team_cluster']","['season', 'team_cluster']"
ma,True,False,True,False,True,False,True,False,True,False
Decision tree,2.7,3.5,3.1,2.7,3.4,2.1,3.9,2.5,3.1,1.3
Xgboost,2.0,2.0,1.7,1.8,2.7,2.7,2.4,3.3,1.5,2.0
Random forest,2.1,2.0,2.0,2.3,1.9,2.1,3.5,3.1,2.1,1.3
KNN,2.2,3.7,1.2,1.2,1.0,2.7,1.5,2.5,1.2,2.6
Linear model,1.5,1.7,1.8,2.0,2.2,2.0,9.8,3.3,1.7,1.5
Neural network (MLP),19.4,3.1,1.9,2.0,2.0,2.7,1.8,1.5,1.3,1.0
MA1,2.8,2.8,2.5,2.5,2.5,2.5,2.3,2.3,2.6,2.6
MA2,2.5,2.5,2.6,2.6,2.8,2.8,2.7,2.7,2.5,2.5
MA3,3.5,3.5,3.4,3.4,3.7,3.7,3.5,3.5,3.7,3.7
MA4,3.7,3.7,3.6,3.6,4.2,4.2,177.5,177.5,4.0,4.0


In [23]:
# Best parameters of the linear model for: total points, no MA features,
# no normalization ('norm' stores str(feat_norm), hence the string 'None').
f1 = df_param.target.eq('fantasy_points_ppr')
f2 = df_param.ma.eq(False)
f3 = df_param.norm.eq('None')

df_param.loc[f1 & f2 & f3, 'Linear model']

1    {'alpha': 1, 'l1_ratio': 0.5}
Name: Linear model, dtype: object

In [24]:
#####################
# PREDICTING
#####################
# 1) create datasets

# for training
x, y, m, s, df_out = get_datasets(df = df[df.season_len > 1], 
                          feat_target = 'fantasy_points_ppr', 
                          feat_norm = None, 
                          norm_shift = 1, 
                          flg_ma = False)
print(x.shape)

# for predictions
# A copy is passed so that get_datasets cannot add its working columns
# ('m', 's', 'target', 'lag*') to the shared `df`.
x1, y1, m1, s1, df_out1 = get_datasets(df = df.copy(),   # change here
                          feat_target = 'fantasy_points_ppr', 
                          feat_norm = None, 
                          norm_shift = 0,         # change here
                          flg_ma = False)
print(x1.shape)


# 2) train model with the best params
from sklearn.linear_model import ElasticNet
params = {
    'alpha': 1, 
    'l1_ratio': 0.5
}
clf = ElasticNet(**params)
clf.fit(x, y)

# 3) make predictions (mapped back to the original target scale)
pred = clf.predict(x1)
# assign() builds a new frame instead of writing into a filtered slice
df_out1 = df_out1.assign(pred = (pred * s1 + m1).values)[['display_name','pred']]


# 4) add Fullback position predictions and save to file
df_out2 = pd.read_csv('data/output_add1.csv')
# take over the header of the additional file so both parts line up in concat
df_out1.columns = df_out2.columns

df_out = pd.concat([df_out1, df_out2])
# NOTE(review): the row index is written as an extra first column -- confirm
# that this is wanted in the output file.
df_out.to_csv('data/output_nfl.csv')
print(df_out.shape)
df_out.sort_values('Projected fantasy points scored PPR for 2023', ascending = False).head(10)

(444, 9)
(560, 9)
(572, 2)


Unnamed: 0,Player Name,Projected fantasy points scored PPR for 2023
1066,Patrick Mahomes,365.05
1445,Josh Allen,359.2
1120,Cooper Kupp,343.05
972,Deshaun Watson,322.88
1856,Justin Herbert,290.39
1015,Austin Ekeler,281.96
7,Tom Brady,278.42
388,Davante Adams,267.73
1776,Jonathan Taylor,265.53
1836,Justin Jefferson,254.48


In [33]:
# Render the fitted linear model as text: "<intercept> + <coef>*<feature> + ..."
c = clf.intercept_
f'{c:.2f} + ' + ' + '.join(
    f'{coef:.3f}*{name}' for name, coef in zip(x.columns, clf.coef_)
)

'-121.72 + 0.932*static_height_cm + -0.388*static_weight_kg + -0.000*static_bmi_calc + 0.639*lag1 + 0.078*lag2 + 0.050*lag3 + 0.181*lag4 + -0.052*lag5 + -0.211*lag6'

### Check: printed formula vs. `clf.predict`

In [38]:
# Evaluate the rounded coefficients of the printed formula by hand on the
# prediction features; static_bmi_calc is left out because its printed
# coefficient is -0.000.
(
    -121.72
    + 0.932*x1.static_height_cm
    - 0.388*x1.static_weight_kg
    + 0.639*x1.lag1
    + 0.078*x1.lag2
    + 0.05*x1.lag3
    + 0.181*x1.lag4
    - 0.052*x1.lag5
    - 0.211*x1.lag6
)[:5]

7    278.31
15   240.08
23    36.77
31   176.76
39     7.59
dtype: float64

In [39]:
# The model's own predictions for the first five rows of x1, to compare with
# the hand-evaluated formula. These are in the model's target space, i.e.
# before the `* s1 + m1` back-transform.
clf.predict(x1)[:5]

array([278.13807745, 239.8787224 ,  36.65319858, 176.58283338,
         7.45200599])