In [None]:
import pandas as pd
import os
import numpy as np
import wandb

# Train model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
!wandb login

In [None]:
# from google.colab import drive
# import os

# # 1. Mounting Google Drive: This allows Colab to access files in your Google Drive
# drive.mount('/content/drive')

In [None]:
# Read the merged per-gameweek CSV of each season into its own DataFrame
season_files = (
    'data/train/merged_gw_2021_22.csv',
    'data/train/merged_gw_2022_23.csv',
    'data/train/merged_gw_2023_24.csv',
)
df21, df22, df23 = (pd.read_csv(csv_path) for csv_path in season_files)

In [None]:
df23.columns

Columns to drop: name, team, position, element, fixture, kickoff_time (TBD), opponent_team, round, transfers_balance


Target data: total_points of next gameweek

Preprocessing steps:
- Convert `was_home` to 0/1

# Quick and dirty baseline model

Initial preprocessing steps:

- Drop any unnecessary columns
- Specify total points for next gameweek as ground truth

In [None]:
# Ground truth: every player's total_points, re-keyed one gameweek earlier so
# that the points scored in GW n+1 line up with the feature row of GW n
ground_truth = (
    df21[["element", "total_points", "GW"]]
    .assign(GW=lambda frame: frame["GW"] - 1)
    .rename(columns={"total_points": "y"})
)

# Drop the columns that are not used as features, attach the target with an
# inner join on (element, GW), then discard the two join keys.
# NOTE(review): if a player has several rows for one GW, the join yields one
# row per combination — confirm that this is acceptable for the data.
train_df = (
    df21
    .drop(columns=["name", "team", "position", "fixture", "kickoff_time", "opponent_team", "round", "transfers_balance"])
    .merge(ground_truth, how="inner", on=["element", "GW"])
    .drop(columns=["element", "GW"])
)
train_df

In [None]:
# Pre-processing: cast every remaining column to 32-bit floats
train_df = train_df.astype("float32")
train_df

In [None]:
class FplDataset(Dataset):
    """Map-style dataset that pairs each input row with its target.

    `inputs` and `targets` are stored exactly as passed in and are indexed
    positionally; every item is converted to a float32 tensor on access.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        # The sample count is taken from the inputs container
        return len(self.inputs)

    def __getitem__(self, idx):
        features = torch.tensor(self.inputs[idx], dtype=torch.float32)
        label = torch.tensor(self.targets[idx], dtype=torch.float32)
        return features, label

# Split the frame into a feature matrix and a single-column target array,
# then wrap both for shuffled mini-batch training
inputs = train_df.drop(columns='y').to_numpy()
targets = train_df["y"].to_numpy().reshape(-1, 1)
num_features = inputs.shape[1]
dataset = FplDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
# Baseline network
class SimpleModel(nn.Module):
    """Baseline regressor: one 64-unit hidden layer with ReLU, then a single linear output."""

    def __init__(self, num_features):
        super().__init__()
        hidden_units = 64
        self.layer1 = nn.Linear(num_features, hidden_units)  # features -> hidden
        self.layer2 = nn.Linear(hidden_units, 1)             # hidden -> one predicted value

    def forward(self, x):
        hidden = self.layer1(x).relu()
        return self.layer2(hidden)

# Baseline setup: network, mean-squared-error objective (regression task), Adam optimizer
model = SimpleModel(num_features)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
model

In [None]:
# 1. Initialize wandb, recording the settings of the objects created above
baseline_config = {
    "epochs": 10,
    # Falls back to 'unknown' when the loader exposes no batch_size attribute
    "batch_size": getattr(dataloader, 'batch_size', 'unknown'),
    "learning_rate": optimizer.param_groups[0]['lr'],
    "loss_fn": type(criterion).__name__,
    "model": type(model).__name__,
}
wandb.init(project="cs6365-fpl-model", config=baseline_config)

In [None]:
# Training loop: the epoch count comes from the wandb run configuration
num_epochs = wandb.config.epochs
model.train()  # Set model to training mode
for epoch in range(num_epochs):
    running_loss = 0.0

    for inputs_batch, targets_batch in dataloader:
        # Forward pass
        outputs = model(inputs_batch)
        loss = criterion(outputs, targets_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # 2. Log metrics
    wandb.log({"epoch": epoch+1, "loss": avg_loss})

# 3. Save a model checkpoint and attach it to the run.
# The checkpoint has to be written before wandb.save is called; previously
# "baseline.pth" was never created, so there was nothing to upload.
torch.save(model.state_dict(), "baseline.pth")
wandb.save("baseline.pth")

# Finish the run
wandb.finish()

In [None]:
# Save the trained model. torch.save does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/baseline.pth')

# Training a better model

- An additional preprocessing step here is to get the 'is_home' feature from the next gameweek, just like we get the ground truth (total_points), since that is what we need.

In [None]:
# Next-gameweek information: home/away flag and total_points, re-keyed one
# gameweek earlier so that the values of GW n+1 line up with the row of GW n
ground_truth = (
    df21[["was_home", "total_points", "element", "GW"]]
    .assign(GW=lambda frame: frame["GW"] - 1)
    .rename(columns={"total_points": "y", "was_home": "is_home"})
)

# Drop the columns that are not used as features, attach is_home / y with an
# inner join on (element, GW), then discard the two join keys
train_df = (
    df21
    .drop(columns=["name", "team", "position", "fixture", "kickoff_time", "opponent_team", "round", "transfers_balance"])
    .merge(ground_truth, how="inner", on=["element", "GW"])
    .drop(columns=["element", "GW"])
)
train_df

- This step is dropping all columns that are not present in the live API.

In [None]:
# Pre-processing: remove the columns that are not present in the API,
# then cast everything that is left to 32-bit floats
train_df = (
    train_df
    .drop(columns=['xP', 'selected', 'team_a_score', 'team_h_score', 'value', 'was_home'])
    .astype("float32")
)
train_df

In [None]:
print(train_df.shape[1])
train_df.columns

In [None]:
# Feature matrix (every column except the target) and a one-column target array
inputs = train_df.drop(columns='y').to_numpy()
targets = train_df[["y"]].to_numpy()
num_features = inputs.shape[1]
print("Num features", num_features)

# Wrap both arrays for shuffled mini-batch training
dataset = FplDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
train_df.drop('y', axis=1).columns

In [None]:
# Experiment hyperparameters
num_epochs = 70
lr = 5e-6
weight_decay = 1e-2
num_blocks = 3  # Number of residual blocks in the model

# Hyperparameters and run metadata tracked by wandb
run_config = {
    "learning_rate": lr,
    "architecture": "ResidualModel",
    "epochs": num_epochs,
    "weight_decay": weight_decay,
    "number of residual blocks": num_blocks,
}

# Start a new wandb run: `entity` is the account/team the project is logged
# under, `project` is the project that receives this run.
run = wandb.init(
    entity="aamirmd-georgia-institute-of-technology",
    project="cs6365-fpl-model",
    config=run_config,
)

In [None]:
class ResidualBlock(nn.Module):
    """Two normalized linear layers with a skip connection around them.

    Each linear layer is preceded by its own LayerNorm. Dropout is applied
    after the first activation, and the block input is added back before the
    final LeakyReLU. Input and output both have `dim` features.
    """

    def __init__(self, dim, dropout=0.1):
        super().__init__()
        # Linear layers
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        # One LayerNorm in front of each linear layer
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        # Regularization and non-linearity (negative slope 0.1)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, x):
        hidden = self.dropout(self.activation(self.fc1(self.norm1(x))))
        hidden = self.fc2(self.norm2(hidden))
        # Skip connection: add the untouched input back before activating
        return self.activation(hidden + x)


# Residual network used as the improved model
class FplModel(nn.Module):
    """Linear projection to `hidden_dim`, a stack of residual blocks, then a linear output head."""

    def __init__(self, num_features, output_dim=1, hidden_dim=128, num_blocks=3, dropout=0.1):
        super().__init__()
        self.input_layer = nn.Linear(num_features, hidden_dim)
        residual_stack = [ResidualBlock(hidden_dim, dropout) for _ in range(num_blocks)]
        self.blocks = nn.Sequential(*residual_stack)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        hidden = self.blocks(self.input_layer(x))
        return self.output_layer(hidden)

# Residual-model setup: network, mean-squared-error objective (regression
# task), and AdamW with the learning rate / weight decay chosen above
model = FplModel(num_features, num_blocks=num_blocks)
criterion = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=lr, weight_decay=weight_decay)

In [None]:
# Training loop
model.train()  # training mode
for epoch in range(num_epochs):
    batch_losses = []

    for inputs_batch, targets_batch in dataloader:
        # Clear old gradients, then run the forward pass and compute the loss
        optimizer.zero_grad()
        loss = criterion(model(inputs_batch), targets_batch)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses, logged every epoch and printed every fifth
    avg_loss = sum(batch_losses) / len(dataloader)
    run.log({'avg_loss': avg_loss})
    if epoch % 5 == 4:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

run.finish()

In [None]:
# Save the trained model. torch.save does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/residual-2.pth')

In [None]:
model

# Model testing for KO

In [None]:
# Ground truth: every player's total_points, re-keyed one gameweek earlier so
# that the points scored in GW n+1 line up with the feature row of GW n
ground_truth = (
    df22[["element", "total_points", "GW"]]
    .assign(GW=lambda frame: frame["GW"] - 1)
    .rename(columns={"total_points": "y"})
)

# Drop the columns that are not used as features and attach the target with an
# inner join on (element, GW). Only 'element' is removed afterwards: 'GW'
# stays in the frame because the evaluation cells select rows by gameweek.
test_df = (
    df22
    .drop(columns=["name", "team", "position", "fixture", "kickoff_time", "opponent_team", "round", "transfers_balance"])
    .merge(ground_truth, how="inner", on=["element", "GW"])
    .drop(columns=["element"])
)
test_df

In [None]:
# Pre-processing: remove the columns that are not present in the API,
# then cast everything that is left to 32-bit floats
test_df = (
    test_df
    .drop(columns=['xP', 'selected', 'team_a_score', 'team_h_score', 'value', 'was_home'])
    .astype("float32")
)
test_df

In [None]:
# The 20 per-player statistics used as model inputs
stats = [
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
    'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes',
    'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
    'saves', 'threat', 'total_points', 'transfers_in', 'transfers_out',
    'yellow_cards',
]

# Keep the inputs, followed by the target and the gameweek number
test_df = test_df.loc[:, [*stats, 'y', "GW"]]
test_df

In [None]:
# Rebuild the residual architecture for 20 input features and restore the
# saved weights
num_features = 20
model = FplModel(num_features)
checkpoint = torch.load('models/residual-1.pth')
model.load_state_dict(checkpoint)
# Switch to inference mode (as the last expression this also displays the module)
model.eval()

In [None]:
test_df[test_df["GW"] == 6]

In [None]:
# Per-gameweek MSE of the loaded model on the test season, without re-training
criterion = nn.MSELoss()  # defined here so the cell does not depend on an earlier training cell
loss_over_gameweeks = []
for gw in range(1,38):
    gw_rows = test_df.loc[test_df["GW"] == gw, :]
    playerData = torch.tensor(gw_rows.drop(["GW", "y"], axis=1).to_numpy(), dtype=torch.float32)
    targets_batch = torch.tensor(gw_rows["y"].to_numpy().reshape(-1,1), dtype=torch.float32)
    with torch.no_grad():  # Disables gradient calculation for inference
        predictions = model(playerData)
    loss = criterion(predictions, targets_batch)
    if loss.isnan():
        # A NaN loss (e.g. a gameweek without any rows) repeats the previous
        # gameweek's value. If there is no previous value yet, NaN is stored
        # instead of indexing into an empty list.
        loss_over_gameweeks.append(loss_over_gameweeks[-1] if loss_over_gameweeks else float("nan"))
    else:
        loss_over_gameweeks.append(loss.item())

In [None]:
# Per-gameweek loss on the test season, fine-tuning at gameweeks 10, 20 and 30
loss_over_gameweeks_3 = []
train_at = [10,20,30]
def get_prev(i):
    """Return the first gameweek of the fine-tuning window that ends just before gameweek i.

    The window of the first scheduled gameweek starts at 0. Every later window
    starts at the previous scheduled gameweek, so consecutive windows are
    contiguous. The earlier hard-coded starts (11, 21) left gameweeks 10 and 20
    out of every window.

    i must be one of the entries of train_at; anything else fails the assertion.
    """
    assert i in train_at
    position = train_at.index(i)
    return train_at[position - 1] if position > 0 else 0

# Start from the saved checkpoint so that re-running this cell (or running it
# after another fine-tuning cell) does not continue from already fine-tuned weights.
model.load_state_dict(torch.load('models/residual-1.pth'))
model.eval()
criterion = nn.MSELoss()  # Mean Squared Error loss for regression task

for gw in range(1,38):

    if gw in train_at:
        # Fine-tune on the gameweeks from get_prev(gw) up to, but not including, gw
        window_mask = test_df["GW"].isin(np.arange(get_prev(gw), gw))
        inputs = test_df.loc[window_mask, :].drop(['y','GW'], axis=1).to_numpy()
        print(inputs.shape)
        num_features = inputs.shape[1]
        targets = test_df.loc[window_mask, "y"].to_numpy().reshape(-1,1)
        dataset = FplDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

        num_epochs = 10
        # A new optimizer is created for every fine-tuning round
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        model.train()  # Set model to training mode
        for epoch in range(num_epochs):
            for inputs_batch, targets_batch in dataloader:
                # Forward pass
                outputs = model(inputs_batch)
                loss = criterion(outputs, targets_batch)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        print(f"model trained for {gw}")
        model.eval()

    # Score the current gameweek with the model as it stands
    gw_rows = test_df.loc[test_df["GW"] == gw, :]
    playerData = torch.tensor(gw_rows.drop(["GW", "y"], axis=1).to_numpy(), dtype=torch.float32)
    targets_batch = torch.tensor(gw_rows["y"].to_numpy().reshape(-1,1), dtype=torch.float32)
    with torch.no_grad():  # Disables gradient calculation for inference
        predictions = model(playerData)
    loss = criterion(predictions, targets_batch)
    if loss.isnan():
        # A NaN loss (e.g. a gameweek without any rows) repeats the previous
        # value; NaN is stored when there is no previous value yet.
        loss_over_gameweeks_3.append(loss_over_gameweeks_3[-1] if loss_over_gameweeks_3 else float("nan"))
    else:
        loss_over_gameweeks_3.append(loss.item())

In [None]:
# Per-gameweek loss on the test season, fine-tuning at gameweeks 7, 14, 21, 28 and 35
loss_over_gameweeks_5 = []
train_at = [7,14,21,28,35]
def get_prev(i):
    """Return the first gameweek of the fine-tuning window that ends just before gameweek i.

    The window of the first scheduled gameweek starts at 0. Every later window
    starts at the previous scheduled gameweek, so consecutive windows are
    contiguous. The earlier hard-coded starts (8, 15, 22, 29) left gameweeks
    7, 14, 21 and 28 out of every window.

    i must be one of the entries of train_at; anything else fails the assertion.
    """
    assert i in train_at
    position = train_at.index(i)
    return train_at[position - 1] if position > 0 else 0
# Start from the saved checkpoint. Without this reset the experiment would
# continue from whatever weights an earlier fine-tuning cell left in `model`,
# and the schedules would not be compared from the same starting point.
model.load_state_dict(torch.load('models/residual-1.pth'))
model.eval()
criterion = nn.MSELoss()  # Mean Squared Error loss for regression task

for gw in range(1,38):

    if gw in train_at:
        # Fine-tune on the gameweeks from get_prev(gw) up to, but not including, gw
        window_mask = test_df["GW"].isin(np.arange(get_prev(gw), gw))
        inputs = test_df.loc[window_mask, :].drop(['y','GW'], axis=1).to_numpy()
        print(inputs.shape)
        num_features = inputs.shape[1]
        targets = test_df.loc[window_mask, "y"].to_numpy().reshape(-1,1)
        dataset = FplDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

        num_epochs = 10
        # A new optimizer is created for every fine-tuning round
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        model.train()  # Set model to training mode
        for epoch in range(num_epochs):
            for inputs_batch, targets_batch in dataloader:
                # Forward pass
                outputs = model(inputs_batch)
                loss = criterion(outputs, targets_batch)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        print(f"model trained for {gw}")
        model.eval()

    # Score the current gameweek with the model as it stands
    gw_rows = test_df.loc[test_df["GW"] == gw, :]
    playerData = torch.tensor(gw_rows.drop(["GW", "y"], axis=1).to_numpy(), dtype=torch.float32)
    targets_batch = torch.tensor(gw_rows["y"].to_numpy().reshape(-1,1), dtype=torch.float32)
    with torch.no_grad():  # Disables gradient calculation for inference
        predictions = model(playerData)
    loss = criterion(predictions, targets_batch)
    if loss.isnan():
        # A NaN loss (e.g. a gameweek without any rows) repeats the previous
        # value; NaN is stored when there is no previous value yet.
        loss_over_gameweeks_5.append(loss_over_gameweeks_5[-1] if loss_over_gameweeks_5 else float("nan"))
    else:
        loss_over_gameweeks_5.append(loss.item())

In [None]:
import matplotlib.pyplot as plt

# Loss series to compare
list1 = loss_over_gameweeks
list2 = loss_over_gameweeks_3
list3 = loss_over_gameweeks_5

# Each list gets one entry per gameweek, starting at gameweek 1, so entry k
# belongs to gameweek k + 1. Plot against explicit gameweek numbers so the
# x-axis matches its label.
plt.plot(range(1, len(list1) + 1), list1, label='No re-training')
plt.plot(range(1, len(list2) + 1), list2, label='10,20,30')
plt.plot(range(1, len(list3) + 1), list3, label='7,14,21,28,35')

# Mark the gameweeks at which fine-tuning happened. The loss of gameweek gw
# sits at list index gw - 1.
retrain_gws_3 = [10,20,30]
retrain_gws_5 = [7,14,21,28,35]
plt.scatter(retrain_gws_3, [list2[gw - 1] for gw in retrain_gws_3], color='orange', marker='x')
plt.scatter(retrain_gws_5, [list3[gw - 1] for gw in retrain_gws_5], color='green', marker='x')

# Add labels and legend
plt.xlabel('Gameweek')
plt.ylabel('Loss')
plt.title('Model fine-tuning at different intervals')
plt.legend()

# Show the plot
plt.show()

In [None]:
# Season-average loss of every strategy, rounded to three decimals
for label, losses in (
    ("Average loss for no re-training: ", loss_over_gameweeks),
    ("Average loss for 10,20,30: ", loss_over_gameweeks_3),
    ("Average loss for 7,14,21,28,35: ", loss_over_gameweeks_5),
):
    print(label, np.mean(losses).round(3))

# Actual re-training at different intervals

In [None]:
# Ground truth: every player's total_points, re-keyed one gameweek earlier so
# that the points scored in GW n+1 line up with the feature row of GW n
ground_truth = (
    df22[["element", "total_points", "GW"]]
    .assign(GW=lambda frame: frame["GW"] - 1)
    .rename(columns={"total_points": "y"})
)

# Drop the columns that are not used as features and attach the target with an
# inner join on (element, GW). Only 'element' is removed afterwards: 'GW'
# stays in the frame because the evaluation cells select rows by gameweek.
test_df = (
    df22
    .drop(columns=["name", "team", "position", "fixture", "kickoff_time", "opponent_team", "round", "transfers_balance"])
    .merge(ground_truth, how="inner", on=["element", "GW"])
    .drop(columns=["element"])
)
test_df

In [None]:
# Pre-processing: remove the columns that are not present in the API,
# then cast everything that is left to 32-bit floats
test_df = (
    test_df
    .drop(columns=['xP', 'selected', 'team_a_score', 'team_h_score', 'value', 'was_home'])
    .astype("float32")
)
test_df

In [None]:
# The 20 per-player statistics used as model inputs
stats = [
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
    'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes',
    'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
    'saves', 'threat', 'total_points', 'transfers_in', 'transfers_out',
    'yellow_cards',
]

# Keep the inputs, followed by the target and the gameweek number
test_df = test_df.loc[:, [*stats, 'y', "GW"]]
test_df

In [None]:
# Rebuild the residual architecture for 20 input features and restore the
# saved weights
num_features = 20
model = FplModel(num_features)
checkpoint = torch.load('models/residual-1.pth')
model.load_state_dict(checkpoint)
# Switch to inference mode (as the last expression this also displays the module)
model.eval()

In [None]:
# Per-gameweek MSE of the loaded model on the test season, without re-training
criterion = nn.MSELoss()  # defined here so the cell does not depend on an earlier training cell
loss_over_gameweeks = []
for gw in range(1,38):
    gw_rows = test_df.loc[test_df["GW"] == gw, :]
    playerData = torch.tensor(gw_rows.drop(["GW", "y"], axis=1).to_numpy(), dtype=torch.float32)
    targets_batch = torch.tensor(gw_rows["y"].to_numpy().reshape(-1,1), dtype=torch.float32)
    with torch.no_grad():  # Disables gradient calculation for inference
        predictions = model(playerData)
    loss = criterion(predictions, targets_batch)
    if loss.isnan():
        # A NaN loss (e.g. a gameweek without any rows) repeats the previous
        # gameweek's value. If there is no previous value yet, NaN is stored
        # instead of indexing into an empty list.
        loss_over_gameweeks.append(loss_over_gameweeks[-1] if loss_over_gameweeks else float("nan"))
    else:
        loss_over_gameweeks.append(loss.item())

In [None]:
# Per-gameweek loss on the test season, re-training from scratch at gameweeks 10, 20 and 30
loss_over_gameweeks_3 = []
train_at = [10,20,30]
def get_prev(i):
    """Return the scheduled gameweek that precedes i in train_at, or 0 for the first one.

    Kept in line with the fine-tuning cells: a window that starts at the
    previous scheduled gameweek leaves no gameweek out, whereas the earlier
    hard-coded starts (11, 21) skipped gameweeks 10 and 20.

    i must be one of the entries of train_at; anything else fails the assertion.
    """
    assert i in train_at
    position = train_at.index(i)
    return train_at[position - 1] if position > 0 else 0

criterion = nn.MSELoss()  # Mean Squared Error loss for regression task

for gw in range(1,38):

    if gw in train_at:
        # Training data: every gameweek strictly before gw
        history_mask = test_df["GW"] < gw
        inputs = test_df.loc[history_mask, :].drop(['y','GW'], axis=1).to_numpy()
        print(inputs.shape)
        num_features = inputs.shape[1]
        targets = test_df.loc[history_mask, "y"].to_numpy().reshape(-1,1)
        dataset = FplDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

        # Full re-training: a brand-new model, sized from the data it is about
        # to see, replaces the previous one
        model = FplModel(num_features)
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        num_epochs = 50
        model.train()  # Set model to training mode
        for epoch in range(num_epochs):
            for inputs_batch, targets_batch in dataloader:
                # Forward pass
                outputs = model(inputs_batch)
                loss = criterion(outputs, targets_batch)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        print(f"model trained for {gw}")
        model.eval()

    # Score the current gameweek with the model as it stands
    gw_rows = test_df.loc[test_df["GW"] == gw, :]
    playerData = torch.tensor(gw_rows.drop(["GW", "y"], axis=1).to_numpy(), dtype=torch.float32)
    targets_batch = torch.tensor(gw_rows["y"].to_numpy().reshape(-1,1), dtype=torch.float32)
    with torch.no_grad():  # Disables gradient calculation for inference
        predictions = model(playerData)
    loss = criterion(predictions, targets_batch)
    if loss.isnan():
        # A NaN loss (e.g. a gameweek without any rows) repeats the previous
        # value; NaN is stored when there is no previous value yet.
        loss_over_gameweeks_3.append(loss_over_gameweeks_3[-1] if loss_over_gameweeks_3 else float("nan"))
    else:
        loss_over_gameweeks_3.append(loss.item())

In [None]:
import matplotlib.pyplot as plt

# Loss series to compare
list1 = loss_over_gameweeks
list2 = loss_over_gameweeks_3

# Each list gets one entry per gameweek, starting at gameweek 1, so entry k
# belongs to gameweek k + 1. Plot against explicit gameweek numbers so the
# x-axis matches its label.
plt.plot(range(1, len(list1) + 1), list1, label='No re-training')
plt.plot(range(1, len(list2) + 1), list2, label='10,20,30')

# Mark the gameweeks at which re-training happened. The loss of gameweek gw
# sits at list index gw - 1.
retrain_gws = [10,20,30]
plt.scatter(retrain_gws, [list2[gw - 1] for gw in retrain_gws], color='orange', marker='x')

# Add labels and legend
plt.xlabel('Gameweek')
plt.ylabel('Loss')
plt.title('Full re-training at different points')
plt.legend()

# Show the plot
plt.show()

In [None]:
# Season-average loss of both strategies, rounded to three decimals
for label, losses in (
    ("Average loss for no re-training: ", loss_over_gameweeks),
    ("Average loss for 10,20,30: ", loss_over_gameweeks_3),
):
    print(label, np.mean(losses).round(3))