In [1]:
# Third-party and standard-library dependencies used across the notebook.
import pandas as pd
import torch
import numpy as np
import random
from random import shuffle
from matplotlib import pyplot as plt
import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader
import wget
import os
import aiohttp
import asyncio
# First seeding pass (17) for NumPy, `random` and PyTorch.
# NOTE(review): all three generators are re-seeded with `seed = 5` at the end of
# this cell, so these calls only matter if one of the project imports below
# draws random numbers at import time -- confirm, and drop them if not.
np.random.seed(17)
random.seed(17)
torch.manual_seed(17)
# Project-local modules.
from fpl import FPL
from player import Player
from team import Team
# NOTE(review): `get_training_datasets` is imported here, but a function with the
# same name is defined in a later cell of this notebook and shadows this import.
from data_processor import get_fpl, get_players, get_teams, get_training_datasets
from models import PreviousScoreModel, PlayerAvgScoreModel, LinearModel, HierarchialLinearModel, NonLinearModel
# Final seeding pass: one shared seed for NumPy, `random` and PyTorch.
seed = 5
np.random.seed(seed)
random.seed(seed)
# Last expression of the cell: its return value (a torch Generator) is what the
# cell displays.
torch.manual_seed(seed)

<torch._C.Generator at 0x7ff7cf7f2630>

In [2]:
# Top-level `await` is only valid inside an IPython/Jupyter kernel (autoawait);
# this cell will not run as a plain Python script.
fpl = await get_fpl()
# Names of the per-team statistics requested for opponents.
# NOTE(review): presumably non-penalty xG for / against -- confirm against data_processor.
opponent_feature_names = ["npxG","npxGA"]
# Names of the per-player statistics requested. "total_points" is listed first;
# the dataset-building code later in this notebook reads row 0 of
# `player.player_features` as the prediction target, so this order presumably
# matters -- TODO confirm that feature rows follow this list order.
player_feature_names = ["total_points", "ict_index", "clean_sheets", "saves", "assists"]
teams = get_teams(team_feature_names=opponent_feature_names, visualize=False)
players = await get_players(player_feature_names, opponent_feature_names, visualize=False, num_players=580)
# Spot check: display the name of one loaded player.
players[200].name

'Ezgjan Alioski'

In [10]:
def get_heirarchical_datasets(player_features_array, opponent_features_array, total_points_array, points_this_season_array, batch_size, shuffle_samples=False):
    '''
        Function does the following
            - Passes every input through normalize() and keeps the returned means / stds
            - Splits the samples 80% / 20% into train and test sets
            - Wraps both splits in DataLoaders
        Args
            player_features_array: sequence of N per-sample player feature windows, each (D, T)
            opponent_features_array: sequence of N per-sample opponent feature windows, each (D', T)
            total_points_array: sequence of N target values (one scalar per sample)
            points_this_season_array: sequence of N scalars (points accumulated before the target)
            batch_size: batch size used by both DataLoaders
            shuffle_samples: when False (default) the split is sequential -- the first 80% of
                the samples go to train and the remainder to test, in input order. When True
                the samples are permuted with np.random before splitting.
        Returns
            (train_loader, test_loader, normalizers)
            Each batch is (player_features, opponent_features, points_this_season, total_points).
            normalizers is the tuple
            (player_features_means, player_features_stds,
             opponent_features_means, opponent_features_stds,
             points_this_season_means, points_this_season_stds,
             total_points_means, total_points_stds)
    '''
    num_samples = len(player_features_array)
    # The previous implementation computed a random permutation and immediately
    # overwrote it with range(N); the sequential order is kept as the default and
    # shuffling is now an explicit opt-in, so no random state is consumed unless asked.
    if shuffle_samples:
        order = torch.from_numpy(np.random.permutation(num_samples)).long()
    else:
        order = torch.arange(num_samples)
    train_length = int(0.8 * num_samples)
    train_idx, test_idx = order[:train_length], order[train_length:]

    # Convert every input to a tensor and collect its mean / std
    player_features_array, player_features_means, player_features_stds = normalize(player_features_array) # (N, D, T)
    opponent_features_array, opponent_features_means, opponent_features_stds = normalize(opponent_features_array) # (N, D, T)
    total_points_array, total_points_means, total_points_stds = normalize(total_points_array, is_scalar=True) #(N, 1)
    points_this_season_array, points_this_season_means, points_this_season_stds = normalize(points_this_season_array, is_scalar=True) #(N, 1)

    def split(tensor):
        # Index with LongTensors (rather than Python range objects) along the sample axis
        return tensor[train_idx], tensor[test_idx]

    train_player_features_array, test_player_features_array = split(player_features_array)
    train_opponent_features_array, test_opponent_features_array = split(opponent_features_array)
    train_total_points_array, test_total_points_array = split(total_points_array)
    train_points_this_season_array, test_points_this_season_array = split(points_this_season_array)

    train_loader = DataLoader(TensorDataset(train_player_features_array, train_opponent_features_array, train_points_this_season_array, train_total_points_array), batch_size=batch_size)
    test_loader = DataLoader(TensorDataset(test_player_features_array, test_opponent_features_array, test_points_this_season_array, test_total_points_array), batch_size=batch_size)
    normalizers = (player_features_means, player_features_stds, opponent_features_means, opponent_features_stds, points_this_season_means , points_this_season_stds, total_points_means, total_points_stds)
    return train_loader, test_loader, normalizers

def get_training_datasets(players, teams, window=4, batch_size=50, visualize=False, autoregressive=False):
    '''
        Build sliding-window samples for every player and wrap them in data loaders.

        For each window start i, one sample consists of
            - the player's features over columns [i, i + window)            -> (D, window)
            - the upcoming opponent's features over the same columns        -> (D', window)
            - the value in row 0 of the player's features at column i + window (the target)
            - the sum of row 0 of the player's features over columns [0, i + window)
        Row 0 is read as the points row; this presumably relies on "total_points" being
        the first player feature -- TODO confirm against the feature loader.

        Args
            players: iterable of objects exposing `player_features` ((D, L) array) and
                `opponents` (array with one opponent identifier per column)
            teams: iterable of objects exposing `name` and `team_features` ((D', L') array)
            window: number of consecutive columns per sample
            batch_size: forwarded to the dataset builder
            visualize: currently unused by this function
            autoregressive: when True the samples are handed to get_autoregressive_datasets
                (not defined in this part of the notebook), otherwise to
                get_heirarchical_datasets
        Returns
            Whatever the selected dataset builder returns
            ((train_loader, test_loader, normalizers) for get_heirarchical_datasets)
    '''
    player_features_array = []
    opponent_features_array = []
    total_points_array = []
    points_this_season_array = []

    for player in players:
        player_features = player.player_features # (D, L) matrix
        opponents = player.opponents.reshape(-1) # one opponent identifier per column

        # NOTE(review): the bound stops one short of the last usable target column
        # (range(L - window) would also use the final column as a target). Kept
        # unchanged -- confirm whether the final column is excluded on purpose.
        for start in range(player_features.shape[1] - window - 1):
            target = start + window

            # All four pieces of a sample are produced together so the lists can never
            # drift out of step. Previously the opponent features were collected in a
            # separate pass that appended nothing when no team matched, which left the
            # opponent list shorter than (and misaligned with) the other three.
            opponent_team = next((team for team in teams if team.name == opponents[target]), None)
            if opponent_team is None:
                # Unknown opponent: drop the whole sample instead of fabricating features
                continue

            opponent_feature = opponent_team.team_features[:, start:target]
            if opponent_feature.shape[1] != window:
                # Team history does not cover the full window: fall back to zeros
                opponent_feature = np.zeros((opponent_feature.shape[0], window))

            player_features_array.append(player_features[:, start:target])
            opponent_features_array.append(opponent_feature)
            total_points_array.append(player_features[0, target])
            points_this_season_array.append(player_features[0, :target].sum())

    if autoregressive:
        return get_autoregressive_datasets(player_features_array, opponent_features_array, total_points_array, points_this_season_array, batch_size)
    return get_heirarchical_datasets(player_features_array, opponent_features_array, total_points_array, points_this_season_array, batch_size)

def normalize(input_array, is_scalar = False, rescale = False):
    '''
        Convert a batch of samples to a double tensor and compute its mean / std.

        By default the values are returned UNSCALED -- only the statistics are computed --
        which matches the previous behaviour (the scaling step was commented out).
        Pass rescale=True to actually standardise the values.

        Args
            input_array: array-like convertible to float. With is_scalar=False it must have
                shape (N, D, W); with is_scalar=True it is flattened to (N, 1).
            is_scalar: selects between per-feature statistics over a (N, D, W) batch
                (False) and a single mean / std over all values (True)
            rescale: when True the returned tensor is (x - mean) / std. A std that is not
                strictly positive (zero, or NaN for a single sample) is replaced by 1 for
                the division so constant features become 0 instead of NaN / inf.
        Returns
            (tensor, means, stds)
                tensor: torch.float64, shape (N, D, W) or (N, 1)
                means, stds: shape (D,) when is_scalar=False, 0-dim tensors otherwise.
                    stds is the unbiased (torch.std default) estimate and is returned
                    unmodified even when rescale=True.
    '''
    values = np.array(input_array).astype(float)

    if is_scalar:
        input_array = torch.tensor(values.reshape((-1, 1))).double() # (N, 1)
        input_means = torch.mean(input_array)
        input_stds = torch.std(input_array)
        # 0-dim statistics broadcast against (N, 1) as they are
        mean_view, std_view = input_means, input_stds
    else:
        input_array = torch.tensor(values).double() # (N, D, W)
        input_means = torch.mean(input_array, dim=(0, 2)) # Means is d dimensional
        input_stds = torch.std(input_array, dim=(0, 2))
        # Reshape to (1, D, 1) so the statistics broadcast over samples and time steps
        # (replaces the permute -> scale -> permute round trip)
        mean_view, std_view = input_means.view(1, -1, 1), input_stds.view(1, -1, 1)

    if rescale:
        safe_stds = torch.where(std_view > 0, std_view, torch.ones_like(std_view))
        input_array = (input_array - mean_view) / safe_stds

    return input_array, input_means, input_stds


# Build the loaders with the function's defaults (window=4, batch_size=50); the
# third return value (the tuple of means / stds) is discarded here.
train_loader, test_loader, _ = get_training_datasets(players, teams)
# NOTE(review): bare expression in the middle of a cell -- by default only a
# cell's last expression is displayed, so this line shows nothing.
train_loader, test_loader
# Take the first training batch. Batches are ordered (player features, opponent
# features, points this season, total points); only the first and last are kept.
x, _,_, y = next(iter(train_loader))


In [13]:
# Inspect one sample of the first training batch: its player-feature window and
# its target value. Index 14 assumes the batch holds at least 15 samples.
x[14], y[14]

(tensor([[5.0000, 2.0000, 1.0000, 0.0000],
         [6.7000, 4.8000, 1.9000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000],
         [1.0000, 0.0000, 0.0000, 0.0000]], dtype=torch.float64),
 tensor([2.], dtype=torch.float64))