In [40]:
import os
import pandas as pd
import numpy as np
import sklearn


In [161]:
stats = pd.read_csv('Stats.csv')
if 'Unnamed: 0.1' in stats.columns:
        stats.pop('Unnamed: 0.1')
if 'Unnamed: 0' in stats.columns:
        stats.pop('Unnamed: 0')
if 'Unnamed: 0.2' in stats.columns:
        stats.pop('Unnamed: 0.2')
if 'Unnamed: 0.3' in stats.columns:
        stats.pop('Unnamed: 0.3')
stats.to_csv('Stats.csv')


In [162]:
stats = stats.dropna()
stats = stats.groupby('Player-additional', group_keys=False).filter(lambda s: s.shape[0] > 3)

In [6]:
print(stats)

      Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0            Player   Tm  Age Pos  \
0                0             0           0    BrandonZylstra  CAR   28  WR   
1                2             2           2    BrandonZylstra  CAR   27  WR   
2                1             1           1    BrandonZylstra  CAR   26  WR   
3                3             3           3    BrandonZylstra  MIN   25  WR   
5                5             5           5  OlamideZaccheaus  ATL   25  WR   
...            ...           ...         ...               ...  ...  ...  ..   
2057          2055          2055        2055      DavanteAdams  GNB   26  WR   
2058          2059          2059        2059      DavanteAdams  GNB   25  WR   
2059          2057          2057        2057      DavanteAdams  GNB   24  WR   
2060          2060          2060        2060      DavanteAdams  GNB   23  WR   
2061          2061          2061        2061      DavanteAdams  GNB   22  WR   

       G    Tgt  Rec  ...  TD  Y/Tgt  R

In [163]:
stats.to_csv('Stats.csv')

In [164]:
def next_stat(player):
    player = player.sort_values('Year')
    player['Next_Yds'] = player['Yds'].shift(-1)
    return player
    
stats = stats.groupby('Player-additional', group_keys=False).apply(next_stat)

In [52]:
print(stats[['Player','Year','Yds','Next_Yds']])

                Player  Year   Yds  Next_Yds
2061      DavanteAdams  2014   446     483.0
2060      DavanteAdams  2015   483     997.0
2059      DavanteAdams  2016   997     885.0
2058      DavanteAdams  2017   885    1386.0
2057      DavanteAdams  2018  1386     997.0
...                ...   ...   ...       ...
5     OlamideZaccheaus  2022   533       NaN
3       BrandonZylstra  2018    23     106.0
2       BrandonZylstra  2019   106      35.0
1       BrandonZylstra  2020    35     250.0
0       BrandonZylstra  2021   250       NaN

[1206 rows x 4 columns]


In [165]:
stats_complete = stats.copy()
stats = stats.dropna().copy()
stats['Ctch%'] = stats['Ctch%'].str.replace('\W','',regex = True)


In [166]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

RR = Ridge(alpha = 3)
split = TimeSeriesSplit(n_splits = 4)
sfs = SequentialFeatureSelector(RR, n_features_to_select = 10, direction = 'backward', n_jobs = 4)



In [167]:
remove = ['Player', 'Tm', 'Pos', 'Player-additional', 'Next_Yds', 'Year']
params  = stats.columns[~stats.columns.isin(remove)]

In [168]:
from sklearn.preprocessing import MinMaxScaler



scale = MinMaxScaler()
stats.loc[:,params] = scale.fit_transform(stats[params])



  stats.loc[:,params] = scale.fit_transform(stats[params])


In [169]:
sfs.fit(stats[params], stats['Next_Yds'])

In [170]:
pred = list(params[sfs.get_support()])
print(pred)
print(stats)

['Age', 'Ctch%', 'Yds', 'TD', 'Y/Tgt', 'R/G', 'Y/G', 'Cmp', 'Att', 'P_Yds']
                Player   Tm       Age Pos       G       Tgt       Rec  \
1205      DavanteAdams  GNB  0.066667  WR  0.9375  0.321782  0.250000   
1204      DavanteAdams  GNB  0.133333  WR  0.7500  0.460396  0.331081   
1203      DavanteAdams  GNB  0.200000  WR  0.9375  0.594059  0.500000   
1202      DavanteAdams  GNB  0.266667  WR  0.8125  0.574257  0.493243   
1201      DavanteAdams  GNB  0.333333  WR  0.8750  0.831683  0.743243   
...                ...  ...       ...  ..     ...       ...       ...   
6     OlamideZaccheaus  ATL  0.133333  WR  0.6250  0.153465  0.128378   
5     OlamideZaccheaus  ATL  0.200000  WR  1.0000  0.257426  0.202703   
3       BrandonZylstra  MIN  0.266667  WR  0.9375  0.004950  0.000000   
2       BrandonZylstra  CAR  0.333333  WR  0.4375  0.054455  0.047297   
1       BrandonZylstra  CAR  0.400000  WR  0.9375  0.014851  0.013514   

         Ctch%       Yds       Y/R  ...     Y/T

In [171]:
def backtest(data, model, predictors, start = 4, step = 1):
    final_prediction = []
    years = sorted(data['Year'].unique())
    for i in  range(start,len(years),step):
        curr = years[i]
        train = data[data['Year'] < curr]
        test = data[data['Year'] == curr]
        model.fit(train[predictors], train['Next_Yds'])
        prediction = model.predict(test[predictors])
        prediction = pd.Series(prediction, index=test.index)
        compare = pd.concat([test['Next_Yds'], prediction], axis=1)
        compare.columns = ['Actual', 'Prediction']
        final_prediction.append(compare)
    return pd.concat(final_prediction)    
        
    

In [172]:
new_pred = backtest(stats,RR,pred)

In [173]:
print(new_pred)

      Actual   Prediction
1202  1386.0   831.665330
1194   736.0   705.101145
1185    53.0   235.039593
1179  1196.0  1046.055701
1172   303.0   331.192940
...      ...          ...
68      74.0   128.286261
62     895.0   853.527088
35     136.0   521.671878
21     527.0   633.622177
5      533.0   412.332029

[587 rows x 2 columns]


In [174]:
from sklearn.metrics import mean_squared_error

mean_squared_error(new_pred['Actual'], new_pred['Prediction'])


84485.39312816794

In [175]:
stats['Next_Yds'].describe()

count     993.000000
mean      529.619335
std       405.262875
min        -4.000000
25%       185.000000
50%       451.000000
75%       804.000000
max      1947.000000
Name: Next_Yds, dtype: float64

In [176]:
def player_past(data):
    data.sort_values('Year')

    data['Season'] = range(0,data.shape[0])
    data['Yd_corr'] = list(data[['Season','Yds']].expanding().corr().loc[(slice(None),'Season'),'Yds'])
    data['Yd_corr'].fillna(1, inplace = True)
    data['Yd_diff'] = data['Yds']/data['Yds'].shift(1)
    data['Yd_diff'].fillna(1, inplace = True)
    data['Yd_diff'][data['Yd_diff'] == np.inf] = 1

    return data
stats = stats.groupby('Player-additional', group_keys= False).apply(player_past)

    

In [177]:
def season_avg(data):
    return data['Yds']/data['Yds'].mean()

stats['Yds_season'] = stats.groupby('Year', group_keys = False).apply(season_avg)

In [178]:
print(stats['Yd_corr'])

1205    1.000000
1204    1.000000
1203    0.894495
1202    0.846364
1201    0.925874
          ...   
6       1.000000
5       0.998568
3       1.000000
2       1.000000
1       0.133752
Name: Yd_corr, Length: 993, dtype: float64


In [179]:
updated_predictors = pred + ['Yds_season','Yd_diff','Yd_corr', 'Season']
updated_predictions = backtest(stats,RR,updated_predictors)

In [180]:
print(updated_predictions)

      Actual  Prediction
1202  1386.0  781.893371
1194   736.0  694.889059
1185    53.0  182.993404
1179  1196.0  964.466995
1172   303.0  346.369640
...      ...         ...
68      74.0  117.102343
62     895.0  838.077971
35     136.0  530.900255
21     527.0  468.790447
5      533.0  395.815466

[587 rows x 2 columns]


In [181]:
mean_squared_error(updated_predictions['Actual'], updated_predictions['Prediction'])

84639.41608911063

In [183]:
stats = pd.concat([stats, updated_predictions], axis=1)

In [184]:
stats.to_csv('Results.csv')

In [185]:
84639.41608911063 ** 0.5

290.928541207477