In [1]:
import pandas as pd
import torch
import numpy as np
import random
from random import shuffle

import wget
import os
import aiohttp
import asyncio
from fpl import FPL
from torch.utils.data import TensorDataset, DataLoader
from player import Player
from team import Team
from data_processor import get_fpl, get_players, get_teams, get_training_datasets, get_all_player_features
from matplotlib import pyplot as plt
import seaborn as sns
import torch.nn as nn
import math
import torch.optim as optim

In [2]:
fpl = await get_fpl()
team_feature_names = ["npxGA"]
teams = get_teams(team_feature_names=team_feature_names, visualize=False)
player_feature_names = ["total_points", "ict_index", "clean_sheets", "saves", "assists"]
players = await get_players(player_feature_names, team_feature_names, visualize=False, num_players=590)

In [35]:
def get_timeseries_dataset(players, input_features = 5, context_window=6, epsilon=1e-6):
    '''
        Args
            players - List of players 
            input_features - Number of feature dimensions; len(player_feature_names)
            context_window - length of prediction context window
        Returns
            Time series dataset that is a numpy array of shape (N, context_window, input_features)
    '''
    timeseries_dataset = []
    for player in players:
        player_points = player.player_features.T
        for i in range(len(player_points) - context_window):
            timeseries_dataset.append(player_points[i:i+context_window,:])
    random.shuffle(timeseries_dataset)
    timeseries_dataset = np.array(timeseries_dataset).astype(float) # (N, context_window, input_features)
    return timeseries_dataset


timeseries_dataset = get_timeseries_dataset(players)
timeseries_dataset.shape # (N, context_window, input_features)

(22522, 6, 5)

In [37]:
def normalize(x, epislon=1e-6):
    '''
    Args
        x - numpy array of shape (N, L, D)
    Returns
        normalized_x - normalized numpy array of shape (N, L, D). Normalized along dimension D
    '''
    means = np.mean(np.mean(x, axis=0), axis=0)
    stds = np.std(np.std(x, axis=0), axis=0)
    normalized_x = (x - means) / (stds+epislon)
    return normalized_x

print(timeseries_dataset[10])
print(normalize(timeseries_dataset[10]))
normalized_timeseries_dataset = normalize(timeseries_dataset)
normalized_timeseries_dataset.shape


[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


(22522, 6, 5)

# Dynamic data augmentation
- Input window is x [0:7]. 2 elements are set to 0 at random
- Target is sampled form x[7:9]
- Why - I want an nearly infinite dataset

In [27]:
class AvgModel(nn.Module):
    def __init__(self):
        pass 
    def forward(self, x):
        # shape of x is (Batch_size, L * D)
        return x.mean(dim=-1)

class PrevModel(nn.Module):
    def __init__(self):
        pass 
    def forward(self, x):
        # shape of x is (Batch_size, L, D)
        return x[:,-1, 0]

class LinearModel(nn.Module):
    def __init__(self, input_window):
        super(LinearModel, self).__init__()
        self.input_window = input_window
        self.fc1 = nn.Linear(input_window, 1).double()
    
    def forward(self, x):
        # shape of x is (Batch_size, L * D)
        return self.fc1(x)


class NonLinearModel(nn.Module):
    def __init__(self, input_window):
        super(NonLinearModel, self).__init__()
        self.model = nn.Sequential(*[nn.Linear(input_window, 100).double(),
                                     nn.ReLU(),
                                     nn.Linear(100, 100).double(),
                                     nn.ReLU(),
                                     nn.Linear(100, 1).double()])
    def forward(self, x):
        # shape of x is (Batch_size, L * D)
        return self.model(x)


input_tensor = torch.tensor([4, 6, 8]).double().reshape((1, 3, 1))
print(input_tensor)

avg_model = AvgModel()
print(avg_model.forward(input_tensor.reshape((-1, 3))))

prev_model = PrevModel()
print(prev_model.forward(input_tensor))

linear_model = LinearModel(input_window=3)
print(linear_model.forward((input_tensor.reshape((-1, 3)))))

non_linear_model = NonLinearModel(input_window=3)
print(non_linear_model.forward((input_tensor.reshape((-1, 3)))))

tensor([[[4.],
         [6.],
         [8.]]], dtype=torch.float64)
tensor([6.], dtype=torch.float64)
tensor([8.], dtype=torch.float64)
tensor([[2.5555]], dtype=torch.float64, grad_fn=<AddmmBackward>)
tensor([[1.6050]], dtype=torch.float64, grad_fn=<AddmmBackward>)


In [38]:
train_indices = int(0.8 * len(timeseries_dataset))
train_dataset, test_dataset = timeseries_dataset[:train_indices], timeseries_dataset[train_indices:]
train_loader = DataLoader(TensorDataset(torch.tensor(train_dataset)), batch_size=1)
test_loader = DataLoader(TensorDataset(torch.tensor(test_dataset)), batch_size=1)
input_window_length = 4

In [55]:
def fit(model, train_loader, epochs=30, input_context_window=6, len_features=5, input_window_length=4):
    optimizer = optim.Adam(model.parameters(), 1e-3)
    for epoch in range(epochs): 
        for [x] in train_loader:
            optimizer.zero_grad()
            input_vector = x[:,:input_window_length]
            if isinstance(model, LinearModel) or isinstance(model, NonLinearModel) or isinstance(model, AvgModel):
                input_vector = input_vector.reshape((-1, input_window_length * len_features))
            outputs_choice = np.random.choice([4])
            outputs = x[:,outputs_choice,0]
            predictions = model.forward(input_vector)
            #print(x.shape, input_vector.shape, outputs.shape, predictions.shape)
            residual = (predictions - outputs)
            loss = (residual * residual).sum() 
            loss.backward()
            optimizer.step()

def eval(model, test_loader, test_choices, input_context_window=6, len_features=5, input_window_length=4):
    sum_loss = 0
    count_loss = 0
    for [ [x], outputs_choice] in zip(test_loader, test_choices):
        input_vector = x[:,:input_window_length]
        if isinstance(model, LinearModel) or isinstance(model, NonLinearModel) or isinstance(model, AvgModel):
            input_vector = input_vector.reshape((-1, input_window_length * len_features))
        outputs = x[:,outputs_choice, 0]
        predictions = model.forward(input_vector)
        sum_loss += (predictions - outputs).abs().mean().item()
        count_loss += 1
        #break
    return sum_loss / count_loss

In [56]:
avg_losses = []
prev_losses = []
linear_losses = []
non_linear_losses = []
for seed in [35]:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    test_choices = [np.random.choice([4]) for _ in test_loader]

    avg_model = AvgModel()
    prev_model = PrevModel()
    linear_model = LinearModel(input_window_length * len(player_feature_names))
    non_linear_model = NonLinearModel(input_window_length * len(player_feature_names))
    
    fit(linear_model, train_loader)
    #fit(non_linear_model, train_loader)
    avg_losses.append(eval(avg_model, test_loader, test_choices))
    prev_losses.append(eval(prev_model, test_loader, test_choices))
    linear_losses.append(eval(linear_model, test_loader, test_choices))
    #non_linear_losses.append(eval(non_linear_model, test_loader, test_choices))
print(sum(avg_losses) / len(avg_losses))
print(sum(prev_losses) / len(prev_losses))
print(sum(linear_losses) / len(linear_losses))
'asd'

1.3317180910099897
1.588679245283019
1.5092664724938767


'asd'