In [233]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [235]:
stats_2024 = pd.read_csv('Data/2024PredFantasy - one.csv')
stats_2023 = pd.read_csv('Data/2023FantasyStats - one.csv')

In [237]:
stats_2024.head()

Unnamed: 0,2024Rk,Player,Tm,FantPos,2024Age,2024G,2024GS,2024Cmp,2024Att,2024PassYds,...,2024RushTD,2024Rec,2024RecYds,2024RecTD,2024FL,2024TD,2024PPR,2024AVGPPR,2024PosRank,2024OvRank
0,1,Josh Allen,BUF,QB,28,17,17,368,565,4060,...,9,0,0,0,4,37,360,377.0,1,1
1,2,Jalen Hurts,PHI,QB,26,17,17,340,520,3810,...,11,0,0,0,4,36,360,359.0,2,2
2,3,Christian McCaffrey,SF,RB,28,17,17,0,0,0,...,12,70,568,5,2,16,351,371.0,1,3
3,4,Lamar Jackson,BAL,QB,27,17,17,304,464,3567,...,5,0,0,0,4,30,339,335.0,3,4
4,5,Patrick Mahomes II,KC,QB,29,17,17,392,585,4371,...,2,0,0,0,3,35,337,308.0,4,5


In [239]:
# Merge the datasets on player name, ensuring the names align
merged_data = pd.merge(stats_2023, stats_2024, on=['Player', 'Tm'], how='right')
# merged_data = pd.merge(stats_2023, stats_2024, on=['Player', 'Tm'], how='left')

In [241]:
merged_data.head()

Unnamed: 0,2023Rk,Player,Tm,FantPos_x,2023Age,2023G,2023GS,2023Cmp,2023Att,2023PassYds,...,2024RushTD,2024Rec,2024RecYds,2024RecTD,2024FL,2024TD,2024PPR,2024AVGPPR,2024PosRank,2024OvRank
0,2.0,Josh Allen,BUF,QB,27.0,17.0,17.0,385.0,579.0,4306.0,...,9,0,0,0,4,37,360,377.0,1,1
1,5.0,Jalen Hurts,PHI,QB,25.0,17.0,17.0,352.0,538.0,3858.0,...,11,0,0,0,4,36,360,359.0,2,2
2,,Christian McCaffrey,SF,,,,,,,,...,12,70,568,5,2,16,351,371.0,1,3
3,7.0,Lamar Jackson,BAL,QB,26.0,16.0,16.0,307.0,457.0,3678.0,...,5,0,0,0,4,30,339,335.0,3,4
4,,Patrick Mahomes II,KC,,,,,,,,...,2,0,0,0,3,35,337,308.0,4,5


In [243]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 45 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   2023Rk       116 non-null    float64
 1   Player       247 non-null    object 
 2   Tm           247 non-null    object 
 3   FantPos_x    116 non-null    object 
 4   2023Age      116 non-null    float64
 5   2023G        116 non-null    float64
 6   2023GS       116 non-null    float64
 7   2023Cmp      116 non-null    float64
 8   2023Att      116 non-null    float64
 9   2023PassYds  116 non-null    object 
 10  2023PassTD   116 non-null    float64
 11  2023Int      116 non-null    float64
 12  2023RushAtt  116 non-null    float64
 13  2023RushYds  116 non-null    object 
 14  2023RushTD   116 non-null    float64
 15  2023Rec      116 non-null    float64
 16  2023RecYds   116 non-null    object 
 17  2023RecTD    116 non-null    float64
 18  2023FL       116 non-null    float64
 19  2023TD  

In [245]:
# Convert columns to numeric, handling errors
numeric_columns = [
    '2023PassYds', '2023RushYds', '2023RecYds', '2024PassYds', '2024RushYds', '2024RecYds'
]

In [247]:
for col in numeric_columns:
    merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')

## Evaluate date using Train, Test, and Split

In [255]:
# Function to train and predict for a specific position
def train_and_predict(position, features):
    # Filter data for the specific position
    position_data = merged_data[merged_data['FantPos_y'] == position].copy()
    
    if position_data.empty:
        print(f"No data found for position: {position}")
        return pd.DataFrame()

    X = position_data[features]
    y = position_data['2024AVGPPR']

 # Fill missing values with 0 for simplicity
    X = X.fillna(0)
    y = y.fillna(0)

# Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model with adjusted parameters
    rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
    rf.fit(X_train, y_train)

    # Evaluate the model
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)

    print(f"\n{position} - Training MAE: {mean_absolute_error(y_train, y_pred_train)}")
    print(f"{position} - Testing MAE: {mean_absolute_error(y_test, y_pred_test)}")

    # Feature importances
    feature_importances = rf.feature_importances_
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    print(f"\n{position} - Feature Importances:")
    print(importance_df)

    # Predict the 2024 fantasy points
    position_data.loc[:, 'Predicted Fantasy Points'] = rf.predict(X)
    
    return position_data[['Player', 'FantPos_x', 'Predicted Fantasy Points', '2024AVGPPR']]

# Define feature sets for each position, including past PPR points and new features
rb_features = ['2023Age', '2024Age', '2023RushAtt', '2023RushYds', '2023RushTD', '2023Rec', '2023RecTD', '2023G', '2023FL', '2023TD', '2024G', '2024RushAtt', '2024RushYds', '2024RushTD', '2024Rec', '2024RecYds', '2024RecTD', '2024FL', '2024TD']
wr_features = ['2023Age', '2024Age', '2023RushAtt', '2023RushYds', '2023RushTD', '2023Rec', '2023RecTD', '2023G', '2023FL', '2023TD', '2024G', '2024RushAtt', '2024RushYds', '2024RushTD', '2024Rec', '2024RecYds', '2024RecTD', '2024FL', '2024TD']
te_features = ['2023Age', '2024Age', '2023Rec', '2023RecTD', '2023G', '2023FL', '2023TD', '2024G', '2024Rec', '2024RecYds', '2024RecTD', '2024FL', '2024TD']
qb_features = ['2023Age', '2024Age', '2023PassYds', '2023PassTD', '2023RushAtt', '2023RushYds', '2023RushTD', '2023G', '2023FL', '2023TD', '2024G', '2024RushAtt', '2024RushYds', '2024RushTD', '2024FL', '2024PassYds', '2024PassTD', '2024TD']

# Train and predict for each position, excluding QBs
rb_predictions = train_and_predict('RB', rb_features)
wr_predictions = train_and_predict('WR', wr_features)
te_predictions = train_and_predict('TE', te_features)
qb_predictions = train_and_predict('QB', qb_features)

# Combine all predictions
all_predictions = pd.concat([rb_predictions, wr_predictions, te_predictions, qb_predictions])

# Save the projected data to a new CSV file in the Downloads directory, only including player name and predicted points
output_path = "Data/FantasyFootballPrediction2.csv"
all_predictions[['Player', 'FantPos_x', 'Predicted Fantasy Points', '2024AVGPPR']].to_csv(output_path, index=False)


RB - Training MAE: 5.067666666666665
RB - Testing MAE: 23.539375

RB - Feature Importances:
        Feature  Importance
18       2024TD    0.347948
11  2024RushAtt    0.242385
14      2024Rec    0.210761
15   2024RecYds    0.099269
12  2024RushYds    0.038852
13   2024RushTD    0.032737
5       2023Rec    0.006188
16    2024RecTD    0.005167
1       2024Age    0.004212
0       2023Age    0.001922
3   2023RushYds    0.001917
4    2023RushTD    0.001832
2   2023RushAtt    0.001452
9        2023TD    0.001305
6     2023RecTD    0.001299
7         2023G    0.000941
8        2023FL    0.000742
17       2024FL    0.000652
10        2024G    0.000420

WR - Training MAE: 5.482910447761195
WR - Testing MAE: 19.726764705882353

WR - Feature Importances:
        Feature  Importance
14      2024Rec    0.760076
18       2024TD    0.075484
5       2023Rec    0.057134
16    2024RecTD    0.029592
15   2024RecYds    0.022994
1       2024Age    0.010683
9        2023TD    0.009307
12  2024RushYds    0.

## Now we will use Cross Validation to determine the fantasy football projected scores.

In [253]:
# Function to train and predict for a specific position using cross-validation
def train_and_predict_cv(position, features, cv=5):
    # Filter data for the specific position
    position_data = merged_data[merged_data['FantPos_y'] == position]
    
    if position_data.empty:
        print(f"No data found for position: {position}")
        return pd.DataFrame()

    X = position_data[features]
    y = position_data['2024AVGPPR']

    # Fill missing values with 0 for simplicity
    X = X.fillna(0)
    y = y.fillna(0)

    # Initialize the model
    rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)

    # Create a cross-validation strategy
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Define the scoring method
    mae_scorer = make_scorer(mean_absolute_error)

    # Perform cross-validation
    cv_scores = cross_val_score(rf, X, y, cv=kf, scoring=mae_scorer)
    
    # Fit the model on the entire dataset after cross-validation
    rf.fit(X, y)

    # Predict the 2024 fantasy points
    position_data.loc[:, 'Predicted Fantasy Points'] = rf.predict(X)

    # Calculate the mean MAE from cross-validation
    mean_cv_score = cv_scores.mean()
    print(f"\n{position} - Mean CV MAE: {mean_cv_score}")

    # Feature importances
    feature_importances = rf.feature_importances_
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    print(f"\n{position} - Feature Importances:")
    print(importance_df)

    return position_data[['Player', 'FantPos_x', 'Predicted Fantasy Points', '2024AVGPPR']]

# Define feature sets for each position, including past PPR points and new features
rb_features = ['2023Age', '2024Age', '2023RushAtt', '2023RushYds', '2023RushTD', '2023Rec', '2023RecTD', '2023G', '2023FL', '2023TD', '2024G', '2024RushAtt', '2024RushYds', '2024RushTD', '2024Rec', '2024RecYds', '2024RecTD', '2024FL', '2024TD']
wr_features = ['2023Age', '2024Age', '2023RushAtt', '2023RushYds', '2023RushTD', '2023Rec', '2023RecTD', '2023G', '2023FL', '2023TD', '2024G', '2024RushAtt', '2024RushYds', '2024RushTD', '2024Rec', '2024RecYds', '2024RecTD', '2024FL', '2024TD']
te_features = ['2023Age', '2024Age', '2023Rec', '2023RecTD', '2023G', '2023FL', '2023TD', '2024G', '2024Rec', '2024RecYds', '2024RecTD', '2024FL', '2024TD']
qb_features = ['2023Age', '2024Age', '2023PassYds', '2023PassTD', '2023RushAtt', '2023RushYds', '2023RushTD', '2023G', '2023FL', '2023TD', '2024G', '2024RushAtt', '2024RushYds', '2024RushTD', '2024FL', '2024PassYds', '2024PassTD', '2024TD']

# Train and predict for each position, excluding QBs
rb_predictions = train_and_predict_cv('RB', rb_features)
wr_predictions = train_and_predict_cv('WR', wr_features)
te_predictions = train_and_predict_cv('TE', te_features)
qb_predictions = train_and_predict_cv('QB', qb_features)

# Combine all predictions
all_predictions = pd.concat([rb_predictions, wr_predictions, te_predictions, qb_predictions])

# Save the projected data to a new CSV file in the Downloads directory, only including player name and predicted points
output_path = "Data/FantasyFootballPrediction_CV.csv"
all_predictions[['Player', 'FantPos_x', 'Predicted Fantasy Points', '2024AVGPPR']].to_csv(output_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data.loc[:, 'Predicted Fantasy Points'] = rf.predict(X)



RB - Mean CV MAE: 17.329120833333334

RB - Feature Importances:
        Feature  Importance
11  2024RushAtt    0.530832
18       2024TD    0.224657
12  2024RushYds    0.059131
15   2024RecYds    0.050747
13   2024RushTD    0.045090
14      2024Rec    0.037986
16    2024RecTD    0.021866
5       2023Rec    0.008284
1       2024Age    0.005700
17       2024FL    0.002427
2   2023RushAtt    0.002389
3   2023RushYds    0.002360
0       2023Age    0.002028
4    2023RushTD    0.001761
7         2023G    0.001523
9        2023TD    0.001261
6     2023RecTD    0.000852
10        2024G    0.000614
8        2023FL    0.000492


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data.loc[:, 'Predicted Fantasy Points'] = rf.predict(X)



WR - Mean CV MAE: 17.120147058823527

WR - Feature Importances:
        Feature  Importance
14      2024Rec    0.795544
5       2023Rec    0.066401
18       2024TD    0.027447
15   2024RecYds    0.019609
16    2024RecTD    0.019582
9        2023TD    0.014231
1       2024Age    0.011700
12  2024RushYds    0.010665
11  2024RushAtt    0.008245
6     2023RecTD    0.007019
2   2023RushAtt    0.005392
3   2023RushYds    0.004387
0       2023Age    0.002877
7         2023G    0.002775
8        2023FL    0.001420
17       2024FL    0.001008
4    2023RushTD    0.000991
13   2024RushTD    0.000708
10        2024G    0.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data.loc[:, 'Predicted Fantasy Points'] = rf.predict(X)



TE - Mean CV MAE: 14.153144444444445

TE - Feature Importances:
       Feature  Importance
8      2024Rec    0.571541
9   2024RecYds    0.310316
12      2024TD    0.035910
10   2024RecTD    0.028658
1      2024Age    0.019602
2      2023Rec    0.011523
0      2023Age    0.010669
3    2023RecTD    0.004649
6       2023TD    0.003479
4        2023G    0.002220
5       2023FL    0.000865
11      2024FL    0.000567
7        2024G    0.000000

QB - Mean CV MAE: 42.15086111111111

QB - Feature Importances:
        Feature  Importance
17       2024TD    0.370366
16   2024PassTD    0.304806
10        2024G    0.084065
12  2024RushYds    0.062275
14       2024FL    0.033292
11  2024RushAtt    0.031696
1       2024Age    0.026724
15  2024PassYds    0.018298
13   2024RushTD    0.015233
4   2023RushAtt    0.012682
5   2023RushYds    0.010654
8        2023FL    0.008392
6    2023RushTD    0.005396
9        2023TD    0.004350
0       2023Age    0.003966
7         2023G    0.003371
3    2023PassTD  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data.loc[:, 'Predicted Fantasy Points'] = rf.predict(X)
