In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense

2024-08-21 00:04:38.534835: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 00:04:38.534862: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-21 00:04:38.536365: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-21 00:04:38.540938: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'tensorflow.keras'

In [None]:
def LoadData(positions, years, data_dir='NFL-data-Players'):
    """Load per-season CSV files and combine them into one DataFrame per position.

    For every position/year pair the file
    ``{data_dir}/{year}/{position}_season.csv`` is read with ``pd.read_csv``.
    Each yearly frame gets a ``Year`` column holding that year, the frames of
    one position are concatenated with a fresh integer index, and every
    missing value is replaced by 0.

    Args:
        positions: Iterable of position names used in the file names.
        years: Iterable of years; each one names a sub-directory of
            ``data_dir``. It is materialised once so that a one-shot iterable
            (e.g. a generator) is still available for every position.
        data_dir: Root directory containing the per-year folders. Defaults to
            the path that was previously hard-coded.

    Returns:
        dict mapping each position to its combined DataFrame.

    Raises:
        ValueError: If ``years`` is empty while at least one position is
            requested (there would be nothing to concatenate).
        Errors raised by ``pd.read_csv`` (e.g. a missing file) propagate
        unchanged.
    """
    years = list(years)
    data_dict = {}

    for position in positions:
        if not years:
            raise ValueError("years must contain at least one season to load")

        # One frame per season, each tagged with the season it came from.
        yearly_frames = [
            pd.read_csv(f'{data_dir}/{year}/{position}_season.csv').assign(Year=year)
            for year in years
        ]

        # fillna(0) is applied to the whole frame, so missing values in
        # non-numeric columns become 0 as well.
        data_dict[position] = pd.concat(yearly_frames, ignore_index=True).fillna(0)

    return data_dict


In [None]:
def prepare_sequences(data, player_id_col, target_col, feature_cols, timesteps=3):
    """Build fixed-length per-player feature windows and their next-row targets.

    The rows are sorted by ``player_id_col`` and ``Year`` and the feature
    columns are standardised with a ``StandardScaler`` fitted on all rows that
    were passed in. For each player, every run of ``timesteps`` consecutive
    rows becomes one sequence, and ``target_col`` of the row directly after
    that run becomes its target. Players with ``timesteps`` rows or fewer
    contribute nothing.

    NOTE(review): "consecutive" means consecutive rows after sorting; gaps in
    a player's ``Year`` values are not detected. Confirm this is intended.
    NOTE(review): if ``target_col`` is also listed in ``feature_cols`` the
    returned targets are the scaled values.

    Args:
        data: DataFrame containing ``player_id_col``, ``target_col``, ``Year``
            and every column in ``feature_cols``. It is not modified.
        player_id_col: Name of the column identifying a player.
        target_col: Name of the column to predict.
        feature_cols: List of feature column names.
        timesteps: Number of rows per input sequence (must be >= 1).

    Returns:
        Tuple ``(sequences, targets)`` of NumPy arrays with shapes
        ``(n, timesteps, len(feature_cols))`` and ``(n,)``. When no player has
        enough rows, ``n`` is 0 and the shapes are kept so that downstream
        code can still read ``sequences.shape[1:]``.

    Raises:
        ValueError: If ``timesteps`` is smaller than 1.
    """
    if timesteps < 1:
        raise ValueError(f"timesteps must be >= 1, got {timesteps}")

    # sort_values returns a new frame, so the scaling below never touches the
    # caller's DataFrame.
    data = data.sort_values(by=[player_id_col, 'Year'])

    # Normalize features
    scaler = StandardScaler()
    data[feature_cols] = scaler.fit_transform(data[feature_cols])

    sequences = []
    targets = []

    # A single grouped pass replaces re-filtering the whole frame once per
    # player; sort=False keeps the (already sorted) order of first appearance.
    for _, player_data in data.groupby(player_id_col, sort=False):
        n_windows = len(player_data) - timesteps
        if n_windows <= 0:
            continue

        player_features = player_data[feature_cols].to_numpy()
        player_targets = player_data[target_col].to_numpy()

        for start in range(n_windows):
            sequences.append(player_features[start:start + timesteps])
            targets.append(player_targets[start + timesteps])

    if not sequences:
        # np.array([]) would have shape (0,), losing the timestep/feature axes.
        return np.empty((0, timesteps, len(feature_cols))), np.empty((0,))

    return np.array(sequences), np.array(targets)

# Example usage
# NOTE(review): `data` is not defined in any cell visible here; it has to be a
# single DataFrame with the columns used below before this cell can run
# (presumably one entry of the dict returned by LoadData - confirm).
# The former 'etc...' entry was a placeholder, not a column name, and made the
# column selection inside prepare_sequences fail with a KeyError.
feature_columns = ['PassingYDS', 'PassingTD', 'RushingYDS', 'ReceivingYDS']  # Replace with your features
X, y = prepare_sequences(data, 'PlayerId', 'TotalPoints', feature_columns, timesteps=3)

In [None]:
def prepare_data(data, train_years, validation_years, test_year):
    """Split a frame by ``Year`` into train/validation/test features, targets and ids.

    Rows whose ``Year`` is in ``train_years`` form the training split, rows
    whose ``Year`` is in ``validation_years`` form the validation split, and
    rows whose ``Year`` equals ``test_year`` form the test split. The shapes of
    the splits and of their feature frames are printed along the way.

    Args:
        data: DataFrame with at least the columns ``PlayerName``,
            ``PlayerId``, ``TotalPoints`` and ``Year``.
        train_years: Collection of years for the training split.
        validation_years: Collection of years for the validation split.
        test_year: Single year for the test split.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test, train_ids,
        val_ids, test_ids)``. The ``X_*`` frames hold every column except the
        four listed above, the ``y_*`` series hold ``TotalPoints``, and the
        ``*_ids`` frames hold ``PlayerName`` and ``PlayerId``.
    """
    id_cols = ['PlayerName', 'PlayerId']
    non_feature_cols = ['PlayerName', 'PlayerId', 'TotalPoints', 'Year']

    def split_features_target(frame):
        # Features are everything except identifiers, target and year.
        return frame.drop(columns=non_feature_cols), frame['TotalPoints']

    # Row selection per split, driven by the Year column only.
    season = data['Year']
    train_data = data[season.isin(train_years)]
    validation_data = data[season.isin(validation_years)]
    test_data = data[season == test_year]

    print("Training Data Shape:", train_data.shape)
    print("Validation Data Shape:", validation_data.shape)
    print("Test Data Shape:", test_data.shape)

    X_train, y_train = split_features_target(train_data)
    X_val, y_val = split_features_target(validation_data)
    X_test, y_test = split_features_target(test_data)

    print("X_train Shape:", X_train.shape)
    print("X_val Shape:", X_val.shape)
    print("X_test Shape:", X_test.shape)

    # Identifier columns are returned separately so predictions can be
    # matched back to players.
    train_ids, val_ids, test_ids = (
        split[id_cols] for split in (train_data, validation_data, test_data)
    )

    return X_train, X_val, X_test, y_train, y_val, y_test, train_ids, val_ids, test_ids


In [None]:


# Network definition: the whole layer stack is handed to Sequential in one go.
# The GRU takes inputs shaped (X.shape[1], X.shape[2]) and, with
# return_sequences=False, passes on only its final output; two Dense layers
# then reduce it to a single regression value.
model = Sequential([
    GRU(units=64, input_shape=(X.shape[1], X.shape[2]), return_sequences=False),
    Dense(units=32, activation='relu'),
    Dense(units=1),  # one output unit for the regression target
])

# Train on mean squared error and additionally report mean absolute error.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)

# Show the layer-by-layer overview.
model.summary()

In [None]:
# Hold out 20% of the sequences as a test set. train_test_split is already
# imported in the notebook's first cell, so it is not re-imported here.
# NOTE(review): this is a random split over windows that can overlap for the
# same player, so neighbouring seasons may land on both sides - confirm this
# is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model; Keras sets aside a further 20% of the training data for
# validation via validation_split.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate on the held-out data; evaluate() is unpacked into the loss and the
# single metric the model reports.
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test MAE: {test_mae}')

In [None]:
predictions = model.predict(X_test)

# Compare the first few predictions with the actual values. Slicing (instead
# of a fixed range(5)) avoids an IndexError when the test set holds fewer
# than five samples.
for actual, predicted in zip(y_test[:5], predictions[:5]):
    # Each prediction row is indexed at [0] to print the single output value.
    print(f"Actual: {actual}, Predicted: {predicted[0]}")
