In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

In [119]:
# Load the data
fantasy_stats = pd.read_csv("Data/FantasyFootballTest.csv")
redzone_stats = pd.read_csv("Data/RedZoneReceptions.csv")

In [121]:
# Merge the dataframes on 'Player' and 'Team'
merged_data = pd.merge(fantasy_stats, redzone_stats, on=['Player', 'Tm'], how='inner')

In [135]:
# Convert columns to numeric, handling errors
numeric_columns = [
    'PassYds', 'PassTD', 'Int', 
    'RushYds', 'RushTD', 'RecYds', 
    'RecTD', 'RedZoneTgt', 'PPR'
]

In [137]:
for col in numeric_columns:
    merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')

In [117]:
# Function to train and predict for a specific position
def train_and_predict(position, features):
    # Filter data for the specific position
    position_data = merged_data[merged_data['FantPos'] == position]
    
    if position_data.empty:
        print(f"No data found for position: {position}")
        return pd.DataFrame()

    # Scale PPR Scoring Points to give it more weight
    position_data.loc[:, 'PPR'] = position_data['PPR'] * 2

    X = position_data[features]
    y = position_data['FantPt']

    # Fill missing values with 0 for simplicity
    X = X.fillna(0)
    y = y.fillna(0)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model with adjusted parameters
    rf = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
    rf.fit(X_train, y_train)

    # Evaluate the model
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)

    print(f"\n{position} - Training MAE: {mean_absolute_error(y_train, y_pred_train)}")
    print(f"{position} - Testing MAE: {mean_absolute_error(y_test, y_pred_test)}")

    # Feature importances
    feature_importances = rf.feature_importances_
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    print(f"\n{position} - Feature Importances:")
    print(importance_df)

    # Predict the 2023 fantasy points
    position_data['Predicted Fantasy Points'] = rf.predict(X)
    
    return position_data[['Player', 'Predicted Fantasy Points']]

In [143]:
# Define feature sets for each position, including past PPR points and new features
rb_features = ['RushYds', 'RushTD', 'RecYds', 'RecTD', 'G', 'PPR']
wr_features = ['RecYds', 'RecTD', 'RedZoneTgt', 'G', 'Tgt_x', 'PPR']
te_features = ['RecYds', 'RecTD', 'RedZoneTgt', 'G', 'Tgt_x', 'PPR']

# Train and predict for each position, excluding QBs
rb_predictions = train_and_predict('RB', rb_features)
wr_predictions = train_and_predict('WR', wr_features)
te_predictions = train_and_predict('TE', te_features)

# Combine all predictions
all_predictions = pd.concat([rb_predictions, wr_predictions, te_predictions])

# Save the projected data to a new CSV file in the Downloads directory, only including player name and predicted points
output_path = "Data/FantasyFootballPrediction.csv"
all_predictions[['Player', 'Predicted Fantasy Points']].to_csv(output_path, index=False)

# Display the projected data
print("\nProjected Fantasy Points for 2023:")
print(all_predictions[['Player', 'Predicted Fantasy Points']].head())


RB - Training MAE: 3.0810000000000004
RB - Testing MAE: 13.465000000000002

RB - Feature Importances:
   Feature  Importance
5      PPR    0.801233
0  RushYds    0.178877
1   RushTD    0.010937
2   RecYds    0.004423
4        G    0.002279
3    RecTD    0.002252


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data['Predicted Fantasy Points'] = rf.predict(X)



WR - Training MAE: 2.5483522727272727
WR - Testing MAE: 6.866956521739129

WR - Feature Importances:
      Feature  Importance
5         PPR    0.920332
0      RecYds    0.061009
1       RecTD    0.008142
2  RedZoneTgt    0.005327
4       Tgt_x    0.003877
3           G    0.001314


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data['Predicted Fantasy Points'] = rf.predict(X)



TE - Training MAE: 2.9194594594594596
TE - Testing MAE: 5.0870000000000015

TE - Feature Importances:
      Feature  Importance
5         PPR    0.801524
0      RecYds    0.122015
4       Tgt_x    0.044917
1       RecTD    0.018310
3           G    0.008141
2  RedZoneTgt    0.005093

Projected Fantasy Points for 2023:
                 Player  Predicted Fantasy Points
0   Christian McCaffrey                   220.670
3        Raheem Mostert                   233.250
7        Travis Etienne                   219.525
8        Kyren Williams                   221.225
11        Derrick Henry                   214.710


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_data['Predicted Fantasy Points'] = rf.predict(X)


In [109]:
fantasy_stats.head()

Unnamed: 0,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Att,PassYds,...,TD,2:00 PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank
0,1,Christian McCaffrey,SFO,RB,27,16,16,0,0,0,...,21,,,324.0,391.3,399.3,357.8,157.0,1.0,1.0
1,2,CeeDee Lamb,DAL,WR,24,17,17,0,0,0,...,14,1.0,,268.0,403.2,411.2,335.7,131.0,1.0,2.0
2,3,Josh Allen,BUF,QB,27,17,17,385,579,4306,...,15,,3.0,393.0,392.6,420.6,410.6,122.0,1.0,3.0
3,4,Tyreek Hill,MIA,WR,29,16,16,0,0,0,...,13,,,257.0,376.4,380.4,316.9,120.0,2.0,4.0
4,5,Jalen Hurts,PHI,QB,25,17,17,352,538,3858,...,15,,,357.0,356.8,382.8,371.8,89.0,2.0,5.0


In [131]:
fantasy_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 33 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rk       295 non-null    int64  
 1   Player   295 non-null    object 
 2   Tm       295 non-null    object 
 3   FantPos  295 non-null    object 
 4   Age      295 non-null    int64  
 5   G        295 non-null    int64  
 6   GS       295 non-null    int64  
 7   Cmp      295 non-null    int64  
 8   Att      295 non-null    int64  
 9   PassYds  295 non-null    int64  
 10  PassTD   295 non-null    int64  
 11  Int      295 non-null    int64  
 12  RushAtt  295 non-null    int64  
 13  RushYds  295 non-null    int64  
 14  Y/A      208 non-null    float64
 15  RushTD   295 non-null    int64  
 16  Tgt      295 non-null    int64  
 17  Rec      295 non-null    int64  
 18  RecYds   295 non-null    int64  
 19  Y/R      250 non-null    float64
 20  RecTD    295 non-null    int64  
 21  Fmb      295 non

In [141]:
merged_data.head()

Unnamed: 0,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Att,PassYds,...,Yds,TD_y,%Tgt,RedZoneTgt,Rec.1,Ctch%.1,Yds.1,TD.1,%Tgt.1,Link
0,1,Christian McCaffrey,SFO,RB,27,16,16,0,0,0,...,61,5,22.90%,8,6,75.00%,32,4,30.80%,All Christian McCaffrey red zone receiving plays
1,2,CeeDee Lamb,DAL,WR,24,17,17,0,0,0,...,131,8,29.80%,17,8,47.06%,31,4,33.30%,All CeeDee Lamb red zone receiving plays
2,4,Tyreek Hill,MIA,WR,29,16,16,0,0,0,...,54,4,35.80%,16,8,50.00%,17,4,36.40%,All Tyreek Hill red zone receiving plays
3,6,Raheem Mostert,MIA,RB,31,15,15,0,0,0,...,36,3,9.00%,3,3,100.00%,12,2,6.80%,All Raheem Mostert red zone receiving plays
4,7,Amon-Ra St. Brown,DET,WR,24,16,16,0,0,0,...,107,5,35.40%,12,7,58.33%,35,3,34.30%,All Amon-Ra St. Brown red zone receiving plays


In [145]:
# Example: Plotting the feature importance
feature_importance = model.feature_importances_
sns.barplot(x=feature_importance, y=X.columns)
plt.title('Feature Importance')
plt.show()

NameError: name 'model' is not defined