In [2]:
#Mount Google Drive inside the Colab runtime so files kept there can be opened by path
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Install necessary packages
!pip install torch -q gwpy
!pip install torchbnn -q gwpy
!pip install torchmetrics -q gwpy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ligo-segments (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.6/731.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
#Import all packages
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchbnn as bnn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
#Read in data, randomly sort it
#The shuffle decides which players land in the holdout rows sliced off the end
#of the frame later on, so it is given a fixed random_state: a fresh
#"Restart & Run All" then reproduces the same train/holdout partition.
raw_df = (
    pd.read_csv("drive/MyDrive/Colab Notebooks/OPER 785 14 EPL_Value.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)  #discard the pre-shuffle index so rows are numbered 0..n-1 again
)

#Visualize data
raw_df.head()

Unnamed: 0,name,club,continent,position,age,avg_daily_page_views,fpl_value,fpl_sel_pct,market_value
0,Ben Hamer,Leicester_City,UK,GK,29,132,4.5,0.1,500000
1,Jack Butland,Stoke_City,UK,GK,24,402,5.0,2.8,15000000
2,Dale Stephens,Brighton_and_Hove,UK,CM,27,122,4.5,0.9,5000000
3,Harry Maguire,Leicester_City,UK,CB,24,488,5.0,1.5,8000000
4,John Stones,Manchester_City,UK,CB,23,1078,5.5,2.3,35000000


In [6]:
#One-hot encode club, continent, and position
#dtype is pinned to uint8 (the get_dummies default before pandas 2.0). Newer
#pandas returns boolean columns by default; a frame mixing those with the
#float feature columns converts to an object-dtype numpy array, which
#torch.from_numpy cannot consume.
club = pd.get_dummies(raw_df.club, dtype=np.uint8)
continent = pd.get_dummies(raw_df.continent, dtype=np.uint8)
position = pd.get_dummies(raw_df.position, dtype=np.uint8)

In [7]:
#Separate the player name, the numeric predictors and the target.
#Every numeric column is z-scored: centred on its mean and divided by its
#sample standard deviation (ddof=1), both computed over all rows of raw_df.
#NOTE(review): those statistics therefore include the rows later used as the
#holdout set -- confirm that this is intended.
names = raw_df.loc[:, ['name']]

numeric_columns = ['age', 'avg_daily_page_views', 'fpl_value', 'fpl_sel_pct']
numerical = raw_df.loc[:, numeric_columns]
column_means = numerical.mean(axis=0)
column_stds = numerical.std(axis=0, ddof=1)
numerical_sd = numerical.sub(column_means, axis='columns').div(column_stds, axis='columns')

market_value = raw_df.loc[:, ['market_value']]

In [8]:
#Assemble X side by side: the name column first, then the three one-hot
#blocks, then the standardized numeric columns
feature_blocks = [names, club, continent, position, numerical_sd]
player_features = pd.concat(feature_blocks, axis='columns')
player_features.head()

Unnamed: 0,name,Arsenal,Bournemouth,Brighton_and_Hove,Burnley,Chelsea,Crystal_Palace,Everton,Huddersfield,Leicester_City,...,LM,LW,RB,RM,RW,SS,age,avg_daily_page_views,fpl_value,fpl_sel_pct
0,Ben Hamer,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0.551734,-0.679352,-0.702373,-0.554529
1,Jack Butland,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-0.710941,-0.389725,-0.33143,-0.079218
2,Dale Stephens,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.046664,-0.690079,-0.702373,-0.413697
3,Harry Maguire,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,-0.710941,-0.297474,-0.33143,-0.308072
4,John Stones,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-0.963476,0.335415,0.039514,-0.167239


In [9]:
#Split the shuffled rows by position: the first 437 are used to fit the model,
#everything after that is the holdout set (described as 23 players, roughly
#one club's worth / 5% of the data).
#Column 0 is the player name, so it is left out of X and kept separately for
#labelling the holdout results.
n_model_rows = 437

model_x = player_features.iloc[:n_model_rows, 1:]
model_y = market_value.iloc[:n_model_rows, :]

holdout_x = player_features.iloc[n_model_rows:, 1:]
holdout_y = market_value.iloc[n_model_rows:, :]
holdout_names = player_features.iloc[n_model_rows:, :1]

In [10]:
#PyTorch wants numpy arrays
#The feature matrices are converted with an explicit float dtype. Without it,
#a frame whose one-hot columns are boolean (the get_dummies default from
#pandas 2.0 on) and whose remaining columns are float comes back as an
#object-dtype array, which torch.from_numpy cannot consume.
X = model_x.to_numpy(dtype=float)
Y = model_y.to_numpy()

Xh = holdout_x.to_numpy(dtype=float)
Yh = holdout_y.to_numpy()

In [11]:
#Wrap each numpy array as a torch tensor and cast it to 32-bit float
#Training pair
x = torch.from_numpy(X).float()
y = torch.from_numpy(Y).float()

#Holdout pair
xh = torch.from_numpy(Xh).float()
yh = torch.from_numpy(Yh).float()

In [12]:
#Build simple BNN
#Seed torch's global RNG first so that weight initialisation and the random
#draws made afterwards are repeatable when the notebook is re-run top to bottom
#(assumes torchbnn takes its randomness from torch's global generator -- TODO confirm).
torch.manual_seed(42)

#The input width is read from the feature tensor instead of being hard-coded
#to 42, so the network still fits if the one-hot encoding produces a different
#number of columns (e.g. a club or position missing from another data file).
n_features = x.shape[1]
layer_widths = [n_features, 80, 50, 30, 10, 1]

#One Bayesian linear layer per consecutive pair of widths, each with a
#prior of mean 0 and sigma 1, followed by SELU.
layers = []
for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
    layers.append(bnn.BayesLinear(prior_mu=0, prior_sigma=1, in_features=fan_in, out_features=fan_out))
    layers.append(nn.SELU())
layers.pop()  #the output layer is linear: drop the activation appended after it

model = nn.Sequential(*layers)

#Data-fit term and KL term; the training loop combines them into one cost
mse_loss = nn.MSELoss()
kl_loss = bnn.BKLLoss(reduction='mean', last_layer_only=False)

optimizer = optim.Adam(model.parameters(), lr=0.01)

In [13]:
#Full-batch training: every step runs the whole training tensor through the
#model and minimises MSE plus a weighted KL term.
#The KL weight is set high because the MSE, measured on raw market values, is so large.
kl_weight = 500
n_steps = 5000

for step in range(n_steps):
    #Clear gradients left over from the previous step before the new backward pass
    optimizer.zero_grad()

    pre = model(x)
    mse = mse_loss(pre, y)
    kl = kl_loss(model)
    cost = mse + kl_weight*kl

    cost.backward()
    optimizer.step()

#Report the two loss components from the final step only
final_losses = (mse.item(), kl.item())
print('- MSE : %2.2f, KL : %2.2f' % final_losses)

- MSE : 6765394001920.00, KL : 2.58


In [14]:
#Record repeated stochastic predictions for the holdout players.
#preds has one row per holdout player and one column per forward pass; the
#sizes come from xh and n_draws instead of the hard-coded 23 and 100.
n_draws = 100
n_holdout = xh.shape[0]

preds = np.zeros((n_holdout, n_draws))
with torch.no_grad():  #inference only, so no autograd graph is needed
  for i in range(n_draws):
    y_pred = model(xh)
    #Write exactly one column per pass. The previous slice [:, i:] broadcast
    #each pass over every remaining column and relied on later passes
    #overwriting them.
    preds[:, i] = y_pred.numpy()[:, 0]

#CIs array will save mean, standard deviation, lower and upper 95% bounds, and true value
#The bounds are mean -/+ 1.96 standard deviations of the sampled predictions.
pred_mean = preds.mean(axis=1)
pred_std = preds.std(axis=1)

CIs = np.zeros((n_holdout, 5))
CIs[:, 0] = pred_mean
CIs[:, 1] = pred_std
CIs[:, 2] = pred_mean - 1.96*pred_std
CIs[:, 3] = pred_mean + 1.96*pred_std
#Yh has a single column; index it explicitly rather than assigning a length-1
#row to a scalar slot (that implicit conversion is deprecated in NumPy >= 1.25).
CIs[:, 4] = Yh[:, 0]

#Clean up the messy digits
CIs = np.round(CIs, 0)

In [15]:
#Report, for every holdout player, the true value, the 95% interval, and
#whether the interval contains the true value. When it does not, say on which
#side the true value fell.
for row_idx, (_, _, lower, upper, actual) in enumerate(CIs):
  print('Player Name:', holdout_names.iloc[row_idx,0])
  print('Transfer Market Value Summer of 2017:',actual)
  print('95% CI for Prediction: (',lower,',',upper,')')

  if lower <= actual <= upper:
    print('True Value Contained in Interval')
  elif actual < lower:
    #True value lies below the whole interval
    print('True Value Not Contained in Interval')
    print('Model Overpredicted')
  else:
    #True value lies above the whole interval
    print('True Value Not Contained in Interval')
    print('Model Underpredicted')
  print()

Player Name: Charlie Austin
Transfer Market Value Summer of 2017: 13000000.0
95% CI for Prediction: ( 6667260.0 , 9884248.0 )
True Value Not Contained in Interval
Model Underpredicted

Player Name: Eden Hazard
Transfer Market Value Summer of 2017: 75000000.0
95% CI for Prediction: ( 63578350.0 , 73283711.0 )
True Value Not Contained in Interval
Model Underpredicted

Player Name: Nathan Dyer
Transfer Market Value Summer of 2017: 3500000.0
95% CI for Prediction: ( 1376267.0 , 2857340.0 )
True Value Not Contained in Interval
Model Underpredicted

Player Name: Idrissa Gueye
Transfer Market Value Summer of 2017: 18000000.0
95% CI for Prediction: ( 1503955.0 , 3098603.0 )
True Value Not Contained in Interval
Model Underpredicted

Player Name: Daniel Amartey
Transfer Market Value Summer of 2017: 5000000.0
95% CI for Prediction: ( 6503866.0 , 9228009.0 )
True Value Not Contained in Interval
Model Overpredicted

Player Name: Sadio Mane
Transfer Market Value Summer of 2017: 40000000.0
95% CI for