In [1]:
# nb. of goals = 1 character
# inspired by character prediction

In [2]:
import pandas as pd
import numpy as np
import math
from copy import deepcopy

In [3]:
def rolling_mean_n_performance(df, window=5, performance_col='goals_scored'):
    dg = df.sort_values(by=['leg'])[['season', 'team', performance_col]].groupby(
        by=['season', 'team'])[performance_col].rolling(window=window, min_periods=1).mean().reset_index()
    
    new_col_name = f'rolling_{window}_games_avg_{performance_col}'
    df[new_col_name] = dg.set_index('level_2')[performance_col]
    return df

def get_past_feature(df, feat_col, team=True):

    merge_col = 'team' if team else 'opponent'    
    tmp_df = deepcopy(df[['season', 'leg', merge_col, feat_col]])
    tmp_df.loc[:,'next_leg'] = tmp_df['leg'] +1

    tmp_df.rename(columns={'leg': 'previous_leg', 
                           'next_leg':'leg', 
                           feat_col:f'previous_{merge_col}_{feat_col}'},
                  inplace=True)

    df = df.merge(tmp_df, how='left', on=['leg', 'season', merge_col])
    df.drop(columns=['previous_leg'], inplace=True)
    return df

In [4]:
def prepare_data(csv_path, championship, rolling=5):
    df = pd.read_csv(csv_path).drop(columns='Unnamed: 0')
    df['championship'] = championship
    df['goal_diff'] = df['goals_scored'] - df['goals_conceded']
    # cumulative
    df['cum_pts'] = df[['season', 'team', 'nb_points']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goal_diff'] = df[['season', 'team', 'goal_diff']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_scored'] = df[['season', 'team', 'goals_scored']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_conceded'] = df['cum_goals_scored']-df['cum_goal_diff']
    df['rank'] = df[['season', 'leg', 'cum_pts', 'cum_goal_diff', 'cum_goals_scored']].sort_values(
        by=['cum_pts', 'cum_goal_diff', 'cum_goals_scored'], ascending=False).groupby(
        by=['season', 'leg']).cumcount() + 1
    
    df['avg_goals_scored_since_season_start'] = df['cum_goals_scored'].div(df['leg'])
    df['avg_goals_conceded_since_season_start'] = df['cum_goals_conceded'].div(df['leg'])
    df['avg_cum_pts_since_season_start'] = df['cum_pts'].div(df['leg'])
    
    # removed unwanted useless seasons
    data = deepcopy(df[df.season > '2003-2004'])
    data.reset_index(drop=True, inplace=True)
    
    leg_max = data.leg.max()
    
    end_season = data[data.leg==leg_max].rename(columns={'rank':'final_rank', 'cum_pts': 'final_cum_pts'})
    data = data.merge(end_season[['season', 'team', 'final_rank', 'final_cum_pts']], on=['season', 'team'])
    
    # rolling mean
    cols = ['goals_conceded', 'goals_scored', 'nb_points']
    for c in cols:
        data = rolling_mean_n_performance(df=data, window=rolling, performance_col=c)
    # past features
    past_features = {'rank': [True, False], 
                     'rolling_5_games_avg_goals_scored' : [True],
                     'rolling_5_games_avg_goals_conceded': [False],
                     'avg_goals_scored_since_season_start': [True],
                     'avg_goals_conceded_since_season_start': [False],
                     'goals_scored': [True],
                     'goals_conceded': [False],
                     'rolling_5_games_avg_nb_points': [True, False],
                     'nb_points': [True, False]
                    }
    # print(f'length {len(data)}')
    for col, is_team_ll in past_features.items():
        for is_team in is_team_ll:
            # print(f"is_team ={is_team}, col = {col}")
            data = get_past_feature(df=data, feat_col=col, team=is_team)
    
    return data

In [5]:
def get_pivoted(data: pd.DataFrame, break_leg: int, value_col: str = 'goals_scored'):
    
    df = deepcopy(data[data.leg <= break_leg])
    df.rolling_5_games_avg_nb_points = [y if x!=x else x for x, y in 
           zip(df.rolling_5_games_avg_nb_points, df.avg_cum_pts_since_season_start)]
    
    df_pivot = df.pivot_table(index=['season', 'team'], 
                              columns='leg', 
                              values=[value_col]).reset_index()
    
    df_pivot.columns = [f'leg_{l}' if l!='' else n for n, l in df_pivot.columns]
    
    final = df[['season', 'team', 'final_rank', 'final_cum_pts']].drop_duplicates()
    df_last_leg = df[df.leg==break_leg][['season', 
                                          'team', 
                                          'rank', 
                                          'rolling_5_games_avg_nb_points', 
                                          'avg_cum_pts_since_season_start', 
                                          'cum_pts']].reset_index(drop=True)
    
    df_pivot = df_pivot.merge(df_last_leg, on=['season', 'team'])
    
    return df_pivot.merge(final, on=['season', 'team'])

In [6]:
championship_csv = {'ligue-1': 'ligue-1_data_2002_2019',
                   'ligue-2': 'ligue-2_data_2002_2019',
                   'serie-A': 'serie-a_data_2004_2019',
                   'bundesliga': 'bundesliga_data_2004_2019',
                   'premier-league': 'premier-league_data_2004_2019',
                   'liga':'liga_data_2004_2019'}

In [7]:
all_data_dfs = [prepare_data(csv_path=path, championship=champ) for champ, path in championship_csv.items()]

In [8]:
all_data_df = pd.concat(all_data_dfs)

In [9]:
data_exploitable_df = all_data_df.reset_index(drop=True)

In [10]:
data_exploitable_df

Unnamed: 0,country,season,leg,team,play,goals_scored,opponent,goals_conceded,nb_points,championship,...,previous_team_rolling_5_games_avg_goals_scored,previous_opponent_rolling_5_games_avg_goals_conceded,previous_team_avg_goals_scored_since_season_start,previous_opponent_avg_goals_conceded_since_season_start,previous_team_goals_scored,previous_opponent_goals_conceded,previous_team_rolling_5_games_avg_nb_points,previous_opponent_rolling_5_games_avg_nb_points,previous_team_nb_points,previous_opponent_nb_points
0,France,2004-2005,1,Lyon,Home,0,Nice,0,1,ligue-1,...,,,,,,,,,,
1,France,2004-2005,2,Lyon,Home,1,Sochaux,1,1,ligue-1,...,0.000000,1.000000,0.000000,1.000000,0.0,1.0,1.0,3.000000,1.0,3.0
2,France,2004-2005,3,Lyon,Away,1,Metz,1,1,ligue-1,...,0.500000,1.500000,0.500000,1.500000,1.0,2.0,1.0,1.500000,1.0,0.0
3,France,2004-2005,4,Lyon,Home,1,Lille,0,3,ligue-1,...,0.666667,1.333333,0.666667,1.333333,1.0,0.0,1.0,1.666667,1.0,1.0
4,France,2004-2005,5,Lyon,Away,2,Rennes,1,3,ligue-1,...,0.750000,1.250000,0.750000,1.250000,1.0,1.0,1.5,1.000000,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66175,Spain,2018-2019,34,FC Barcelone,Away,2,Alavés,0,3,liga,...,2.000000,1.600000,2.484848,1.484848,2.0,2.0,2.2,0.600000,3.0,1.0
66176,Spain,2018-2019,35,FC Barcelone,Home,1,Levante,0,3,liga,...,2.000000,2.400000,2.470588,1.352941,2.0,4.0,2.2,0.600000,3.0,0.0
66177,Spain,2018-2019,36,FC Barcelone,Away,0,Celta Vigo,2,0,liga,...,1.400000,1.000000,2.428571,1.114286,1.0,0.0,2.6,0.600000,3.0,1.0
66178,Spain,2018-2019,37,FC Barcelone,Home,2,Getafe,0,3,liga,...,1.000000,1.200000,2.361111,1.416667,0.0,2.0,2.0,0.600000,0.0,0.0


In [11]:
# group by championship, season, team
# need goals_scored ordered by leg

# Data Prep

In [12]:
cols = ['championship', 'season', 'team', 'goals_scored', 'leg']

In [13]:
data_seq_df = data_exploitable_df[cols].sort_values(['leg'],ascending=True).groupby(
    by=['championship', 'season', 'team']).goals_scored.apply(list).reset_index()

data_seq_df

Unnamed: 0,championship,season,team,goals_scored
0,bundesliga,2004-2005,Arminia Bielefeld,"[1, 1, 1, 0, 1, 2, 0, 2, 1, 2, 1, 1, 3, 2, 1, ..."
1,bundesliga,2004-2005,Bayer Leverkusen,"[5, 2, 4, 0, 2, 0, 3, 1, 0, 3, 0, 4, 2, 0, 2, ..."
2,bundesliga,2004-2005,Bayern Munich,"[3, 1, 1, 1, 2, 3, 2, 0, 2, 2, 0, 3, 3, 3, 4, ..."
3,bundesliga,2004-2005,Bochum,"[1, 2, 2, 2, 1, 1, 2, 0, 0, 3, 0, 1, 1, 0, 3, ..."
4,bundesliga,2004-2005,Borussia Dortmund,"[2, 3, 1, 2, 2, 1, 2, 0, 0, 1, 1, 0, 0, 2, 0, ..."
...,...,...,...,...
1765,serie-A,2018-2019,SPAL,"[2, 1, 0, 2, 0, 0, 1, 1, 2, 0, 1, 2, 0, 2, 1, ..."
1766,serie-A,2018-2019,Sampdoria Gênes,"[2, 0, 3, 5, 0, 0, 2, 1, 0, 2, 1, 1, 1, 4, 2, ..."
1767,serie-A,2018-2019,Sassuolo,"[1, 2, 5, 1, 3, 2, 1, 0, 0, 2, 2, 1, 1, 0, 3, ..."
1768,serie-A,2018-2019,Torino,"[3, 2, 1, 1, 1, 0, 1, 3, 2, 1, 4, 1, 0, 2, 0, ..."


In [14]:
data_seq_df=data_seq_df.reset_index()

In [15]:
vocab = data_exploitable_df.goals_scored.unique()
vocab

array([ 0,  1,  2,  3,  4,  5,  8,  6,  7,  9, 10])

In [16]:
# chars = tuple(set(text))
int2goal = dict(enumerate(np.append(vocab, [-1])))
goal2int = {ch: ii for ii, ch in int2goal.items()}

# encode the text
def encode(text, encoder):
    return np.array([encoder[ch] for ch in text])

#### One-hot encoding

Doing so modelizes the "character" into a probability distribution : since the character is known when used as input, the distribution is a Dirac.   

In [17]:
np.append(vocab,[-1])

array([ 0,  1,  2,  3,  4,  5,  8,  6,  7,  9, 10, -1])

In [18]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [78]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    ## Get the number of batches we can make
    batch_size_total = batch_size*seq_length

    n_batches = len(arr)//batch_size_total
    
    ## Keep only enough characters to make full batches
    if n_batches > 0 :
        arr = arr[:batch_size_total*n_batches]
        ## Reshape into batch_size rows
        arr = arr.reshape((batch_size, -1))
    else:
        arr = arr[:(len(arr)//seq_length)*seq_length]
        arr = arr.reshape((1, -1))
    
    ## Iterate over the batches using a window of size seq_length
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n: n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

### split

In [20]:
import random

In [21]:
fit_idx = random.choices(list(range(len(data_seq_df))), k=int(0.05*len(data_seq_df)))

In [22]:
test_data = deepcopy(data_seq_df.loc[fit_idx,:]).reset_index(drop=True)

In [23]:
train_data = deepcopy(data_seq_df[~data_seq_df['index'].isin(fit_idx)]).reset_index(drop=True)

### last prep

In [24]:
train_data['goals_scored_2'] = train_data.goals_scored.apply(lambda ll: np.array([-1]*(38-len(ll))+ll))
train_data['goals_scored'] = train_data.goals_scored.apply(lambda ll: np.array(ll))

In [25]:
train_data

Unnamed: 0,index,championship,season,team,goals_scored,goals_scored_2
0,0,bundesliga,2004-2005,Arminia Bielefeld,"[1, 1, 1, 0, 1, 2, 0, 2, 1, 2, 1, 1, 3, 2, 1, ...","[-1, -1, -1, -1, 1, 1, 1, 0, 1, 2, 0, 2, 1, 2,..."
1,1,bundesliga,2004-2005,Bayer Leverkusen,"[5, 2, 4, 0, 2, 0, 3, 1, 0, 3, 0, 4, 2, 0, 2, ...","[-1, -1, -1, -1, 5, 2, 4, 0, 2, 0, 3, 1, 0, 3,..."
2,2,bundesliga,2004-2005,Bayern Munich,"[3, 1, 1, 1, 2, 3, 2, 0, 2, 2, 0, 3, 3, 3, 4, ...","[-1, -1, -1, -1, 3, 1, 1, 1, 2, 3, 2, 0, 2, 2,..."
3,3,bundesliga,2004-2005,Bochum,"[1, 2, 2, 2, 1, 1, 2, 0, 0, 3, 0, 1, 1, 0, 3, ...","[-1, -1, -1, -1, 1, 2, 2, 2, 1, 1, 2, 0, 0, 3,..."
4,4,bundesliga,2004-2005,Borussia Dortmund,"[2, 3, 1, 2, 2, 1, 2, 0, 0, 1, 1, 0, 0, 2, 0, ...","[-1, -1, -1, -1, 2, 3, 1, 2, 2, 1, 2, 0, 0, 1,..."
...,...,...,...,...,...,...
1681,1765,serie-A,2018-2019,SPAL,"[2, 1, 0, 2, 0, 0, 1, 1, 2, 0, 1, 2, 0, 2, 1, ...","[2, 1, 0, 2, 0, 0, 1, 1, 2, 0, 1, 2, 0, 2, 1, ..."
1682,1766,serie-A,2018-2019,Sampdoria Gênes,"[2, 0, 3, 5, 0, 0, 2, 1, 0, 2, 1, 1, 1, 4, 2, ...","[2, 0, 3, 5, 0, 0, 2, 1, 0, 2, 1, 1, 1, 4, 2, ..."
1683,1767,serie-A,2018-2019,Sassuolo,"[1, 2, 5, 1, 3, 2, 1, 0, 0, 2, 2, 1, 1, 0, 3, ...","[1, 2, 5, 1, 3, 2, 1, 0, 0, 2, 2, 1, 1, 0, 3, ..."
1684,1768,serie-A,2018-2019,Torino,"[3, 2, 1, 1, 1, 0, 1, 3, 2, 1, 4, 1, 0, 2, 0, ...","[3, 2, 1, 1, 1, 0, 1, 3, 2, 1, 4, 1, 0, 2, 0, ..."


In [26]:
text1_arr = np.concatenate(train_data.goals_scored.values)
text2_arr = np.concatenate(train_data.goals_scored_2.values)

In [27]:
text1_encoded = encode(text=text1_arr, encoder=goal2int)
text2_encoded = encode(text=text2_arr, encoder=goal2int)

In [28]:
len(text2_arr)

64068

version 1 assumes that goal scoring performances are independent over the seasons : 
it allows full flexibility when choosing batch and sequence length values


version 2 takes the stand that every goal sequence over a single season defines a batch : 
only sequence length can be chosen

In [29]:
batch_size_1, seq_length_1 = 8, 38

In [30]:
batch_size_2, seq_length_2 = len(train_data), 19

# Model Definition

In [31]:
import torch
from torch import nn
import torch.nn.functional as F

In [32]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

No GPU available, training on CPU; consider making n_epochs very small.


In [33]:
class GoalRNN(nn.Module):
    
    def __init__(self, unique_goals, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.goals = unique_goals
        self.int2goal = dict(enumerate(self.goals))
        self.goal2int = {ch: ii for ii, ch in self.int2goal.items()}
        
        ## layers of the model
        self.lstm = nn.LSTM(input_size=len(self.goals), hidden_size=n_hidden, num_layers=n_layers, 
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(p=drop_prob)
        self.fc = nn.Linear(in_features= n_hidden, out_features=len(self.goals))
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        ## Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        out = self.dropout(r_output)
        
        out = out.contiguous().view(-1, self.n_hidden)
        # return the final output and the hidden state
        out = self.fc(out)
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [94]:
def train(net, data, epochs=10, batch_size=5, seq_length=10, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: GoalRNN network
        data: Scored goals data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    global val_data
    global train_data
    global counter
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    print(f"data length : {len(data)}")
    print(f"validation length : {len(data) - val_idx}")
    train_data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_goals = len(net.goals)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(train_data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode data and make them Torch tensors
            x = one_hot_encode(x, n_goals)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
           # try:
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            #except:
            #    print(targets.size())
            #    break
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                if len(val_data)//(batch_size*seq_length) > 0:
                    resize = batch_size*seq_length
                    hidden_dim_val = batch_size
                else:
                    resize = seq_length
                    hidden_dim_val = 1
                
                val_h = net.init_hidden(hidden_dim_val)
                val_losses = []
                net.eval()
                #print(val_data)
                #val_batches = get_batches(val_data, batch_size, seq_length)
                #x,y = next(val_batches)
                #print(x)
                #print(y)
                #print("Val")
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    # print("HERE")
                    x = one_hot_encode(x, n_goals)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    
                    val_loss = criterion(output, targets.view(resize).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [41]:
n_hidden=512
n_layers=2
def get_goalrnn_model(goals_vocab):
    net = GoalRNN(unique_goals=goals_vocab, n_hidden=n_hidden, n_layers=n_layers)
    print(net)
    return net

In [42]:
goalrnn_2 = get_goalrnn_model(goals_vocab=np.append(vocab,[-1]))
goalrnn_1 = get_goalrnn_model(goals_vocab=np.append(vocab,[-1]))

GoalRNN(
  (lstm): LSTM(12, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=12, bias=True)
)
GoalRNN(
  (lstm): LSTM(12, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=12, bias=True)
)


In [96]:
n_epochs = 28 # start smaller if you are just testing initial behavior

# train the model
train(net=goalrnn_2, 
      data=text2_encoded, 
      epochs=n_epochs, 
      batch_size=batch_size_2,
      seq_length=seq_length_2,
      lr=0.001,
      print_every=1)

data length : 64068
validation length : 6407
Epoch: 1/30... Step: 1... Loss: 1.7101... Val Loss: 1.6207
Epoch: 2/30... Step: 2... Loss: 1.7563... Val Loss: 1.5289
Epoch: 3/30... Step: 3... Loss: 1.6823... Val Loss: 1.5269
Epoch: 4/30... Step: 4... Loss: 1.6711... Val Loss: 1.5438
Epoch: 5/30... Step: 5... Loss: 1.6753... Val Loss: 1.5368
Epoch: 6/30... Step: 6... Loss: 1.6636... Val Loss: 1.5285
Epoch: 7/30... Step: 7... Loss: 1.6490... Val Loss: 1.5230
Epoch: 8/30... Step: 8... Loss: 1.6407... Val Loss: 1.5200
Epoch: 9/30... Step: 9... Loss: 1.6345... Val Loss: 1.5194
Epoch: 10/30... Step: 10... Loss: 1.6317... Val Loss: 1.5191
Epoch: 11/30... Step: 11... Loss: 1.6299... Val Loss: 1.5177
Epoch: 12/30... Step: 12... Loss: 1.6255... Val Loss: 1.5164
Epoch: 13/30... Step: 13... Loss: 1.6214... Val Loss: 1.5156
Epoch: 14/30... Step: 14... Loss: 1.6172... Val Loss: 1.5149
Epoch: 15/30... Step: 15... Loss: 1.6141... Val Loss: 1.5146
Epoch: 16/30... Step: 16... Loss: 1.6099... Val Loss: 1.51

# Prediction

In [125]:
def predict(net, curr_goal_scored, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[curr_goal_scored]])
        x = one_hot_encode(x, len(net.goals))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_gs = np.arange(len(net.goals))
        else:
            p, top_gs = p.topk(max(top_k, 2))
            top_gs = top_gs.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        goal_scored_select = np.random.choice(top_gs, size=2, p=p/p.sum())
        #print(f"selected goals are {goal_scored_select}")
        curr_goal_scored = goal_scored_select[0] if goal_scored_select[0]!=-1 else goal_scored_select[1]
        
        # return the encoded value of the predicted goal_scored and the hidden state
        return net.int2goal[curr_goal_scored], h

In [146]:
def sample(net, size, prime_goals_seq, top_k=None):
    
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    # chars = [ch for ch in prime]
    goal_seq = deepcopy(prime_goals_seq)
    h = net.init_hidden(1)
    # pre-train the hidden state
    for goal in prime_goals_seq:
        goal, h = predict(net, goal, h, top_k=top_k)
        
    # last prediction of the loop corresponds to the firt new data
    goal_seq.append(goal)

    # Now pass in the previous character and get a new one
    for ii in range(size-1):
        goal, h = predict(net, goal_seq[-1], h, top_k=top_k)
        goal_seq.append(goal)

    return goal_seq, goal_seq[-size:]

In [139]:
sample(net=goalrnn_2, size=3, prime_goals_seq=[0,0,1,0,4,1,2], top_k=3)

goal_seq : [0, 0, 1, 0, 4, 1, 2]
type goal_seq : <class 'list'>
goal : 2
goal_seq : [0, 0, 1, 0, 4, 1, 2, 2]
goal_seq : [0, 0, 1, 0, 4, 1, 2, 2, 1]


([0, 0, 1, 0, 4, 1, 2, 2, 1, 0], [2, 1, 0])

In [121]:
test_data['goals_score_history'] = test_data.goals_scored.apply(lambda arr: arr[:27])
test_data['goals_score_target'] = test_data.goals_scored.apply(lambda arr: arr[27:])

In [123]:
test_data['prediction_size'] = test_data.championship.apply(lambda champ: 11-4*(champ=='bundesliga'))

In [147]:
test_data['predicted_goals'] = test_data[['goals_score_history', 'prediction_size']].apply(
    lambda row : sample(net=goalrnn_2, size=row[1], prime_goals_seq=row[0], top_k=3)[1], axis=1)

In [148]:
test_data

Unnamed: 0,index,championship,season,team,goals_scored,goals_score_history,goals_score_target,prediction_size,predicted_goals
0,87,bundesliga,2008-2009,VfB Stuttgart,"[1, 0, 2, 0, 3, 0, 4, 1, 2, 0, 1, 2, 0, 1, 2, ...","[1, 0, 2, 0, 3, 0, 4, 1, 2, 0, 1, 2, 0, 1, 2, ...","[3, 2, 2, 4, 2, 2, 1]",7,"[1, 0, 2, 0, 2, 1, 2]"
1,995,ligue-2,2010-2011,Dijon,"[0, 2, 1, 1, 1, 1, 5, 2, 0, 1, 4, 0, 0, 0, 3, ...","[0, 2, 1, 1, 1, 1, 5, 2, 0, 1, 4, 0, 0, 0, 3, ...","[2, 4, 2, 1, 1, 2, 3, 1, 2, 1, 0]",11,"[2, 0, 1, 2, 1, 2, 1, 0, 0, 0, 2]"
2,911,ligue-2,2006-2007,Amiens,"[2, 1, 1, 2, 1, 0, 0, 1, 3, 1, 1, 0, 2, 1, 0, ...","[2, 1, 1, 2, 1, 0, 0, 1, 3, 1, 1, 0, 2, 1, 0, ...","[2, 2, 2, 4, 0, 0, 3, 2, 3, 2, 2]",11,"[1, 1, 0, 0, 1, 0, 2, 1, 1, 1, 0]"
3,730,ligue-1,2012-2013,AC Ajaccio,"[1, 0, 0, 2, 0, 2, 1, 0, 0, 4, 0, 4, 0, 1, 0, ...","[1, 0, 0, 2, 0, 2, 1, 0, 0, 4, 0, 4, 0, 1, 0, ...","[1, 0, 2, 0, 1, 2, 2, 1, 0, 1, 0]",11,"[1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]"
4,361,liga,2008-2009,Numancia,"[0, 3, 1, 0, 0, 0, 0, 2, 1, 2, 4, 1, 0, 2, 0, ...","[0, 3, 1, 0, 0, 0, 0, 2, 1, 2, 4, 1, 0, 2, 0, ...","[2, 3, 0, 0, 1, 1, 2, 0, 1, 0, 0]",11,"[1, 2, 0, 0, 1, 1, 1, 0, 2, 0, 2]"
...,...,...,...,...,...,...,...,...,...
83,1389,premier-league,2014-2015,West Ham,"[0, 3, 1, 2, 3, 1, 2, 3, 2, 2, 0, 1, 1, 2, 3, ...","[0, 3, 1, 2, 3, 1, 2, 3, 2, 2, 0, 1, 1, 2, 3, ...","[0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0]",11,"[4, 9, 10, 8, 10, 10, 9, 9, 8, 8, 8]"
84,843,ligue-1,2017-2018,Nice,"[0, 1, 2, 0, 4, 1, 2, 2, 0, 1, 0, 1, 1, 0, 2, ...","[0, 1, 2, 0, 4, 1, 2, 2, 0, 1, 0, 1, 1, 0, 2, ...","[2, 5, 1, 2, 1, 1, 1, 1, 1, 4, 2]",11,"[1, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0]"
85,1319,premier-league,2011-2012,Manchester United,"[1, 3, 8, 5, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 4, ...","[1, 3, 8, 5, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 4, ...","[2, 5, 1, 2, 2, 0, 4, 4, 0, 2, 1]",11,"[8, 9, 8, 10, 8, 9, 8, 9, 9, 10, 8]"
86,51,bundesliga,2006-2007,VfB Stuttgart,"[2, 3, 1, 3, 1, 2, 3, 1, 3, 4, 2, 2, 1, 1, 0, ...","[2, 3, 1, 3, 1, 2, 3, 1, 3, 4, 2, 2, 1, 1, 0, ...","[4, 2, 2, 1, 2, 3, 2]",7,"[10, 8, 8, 10, 8, 8, 10]"


In [149]:
ll1 = [1,2,3,4]
ll2 = [1,7,0,4]

In [152]:
sum(np.array(ll1) == np.array(ll2))

2

In [153]:
test_data['prediction_correctness'] = test_data[['goals_score_target', 'predicted_goals']].apply(
    lambda row : sum(np.array(row[0]) == np.array(row[1])), axis=1)

In [154]:
test_data.prediction_correctness.sum()

217

In [156]:
test_data.prediction_size.sum()

924

In [157]:
217/924

0.23484848484848486