In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last
# one; cells below that list several bare expressions (shape, shape, head) rely
# on this to show all of their outputs.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import time
import pickle as pkl
import glob
import numpy as np
from datetime import datetime

# Raise the display limits so frames of up to 500 rows / 500 columns are not truncated.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

# Execute utils.ipynb so that its definitions are available in this notebook.
# NOTE(review): matchid_to_year, used further down, is not defined in any visible
# cell -- presumably it is provided by utils.ipynb; confirm.
%run utils.ipynb

In [3]:
from IPython.core.debugger import set_trace

In [4]:
# Load the per-match player scores, then keep only the rows that actually have a
# total score. The two bare `.shape` expressions show the row count before and
# after the filter.
d11_scores = pd.read_csv("player_d11scores.csv")
d11_scores.shape
has_score = d11_scores["tot_score"].notna()
d11_scores = d11_scores[has_score]
d11_scores.shape
d11_scores.head()

(15157, 3)

(14926, 3)

Unnamed: 0,match_id,player,tot_score
0,829729,DR Smith,98.0
1,829729,BB McCullum,60.0
2,829729,SK Raina,55.0
3,829729,F du Plessis,33.0
4,829729,MS Dhoni,7.0


In [6]:


# Build a match_id -> year lookup, calling matchid_to_year once per distinct match.
# NOTE(review): matchid_to_year is not defined in the visible cells; presumably it
# comes from utils.ipynb.
unique_match_ids = np.unique(d11_scores.match_id)
matchid_year_map = dict(zip(unique_match_ids, map(matchid_to_year, unique_match_ids)))
len(matchid_year_map)

284

753

753

In [8]:
# Tag every score row with the year of its match using the precomputed lookup.
# Indexing the dict directly (rather than .get) keeps the KeyError on an unmapped id.
d11_scores["season"] = list(map(matchid_year_map.__getitem__, d11_scores["match_id"]))
d11_scores

Unnamed: 0,match_id,player,tot_score,season
0,829729,DR Smith,98.0,2015
1,829729,BB McCullum,60.0,2015
2,829729,SK Raina,55.0,2015
3,829729,F du Plessis,33.0,2015
4,829729,MS Dhoni,7.0,2015
5,829729,DJ Bravo,78.0,2015
6,829729,RA Jadeja,14.0,2015
7,829729,R Ashwin,4.0,2015
8,829729,MM Sharma,25.0,2015
9,829729,IC Pandey,37.0,2015


In [9]:
# Sorted list of the distinct seasons present in the score table.
np.unique(d11_scores["season"])

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019])

In [22]:
# Walk-forward evaluation: for each season, train on every earlier season and
# validate on the season itself. One fitted model per season is kept in `pmfs`.
# NOTE: PairwiseMF is defined in a later cell of this notebook; that cell must
# have been run before this one.
pmfs = []
for season in [2015, 2016, 2017, 2018, 2019]:
    print("------ "+str(season)+" ------")

    past_scores = d11_scores[d11_scores.season < season]
    current_scores = d11_scores[d11_scores.season == season]
    seen_scores = d11_scores[d11_scores.season <= season]

    # Columns: only players that appeared before this season, so players who
    # debut in the validation season are dropped from the matrix.
    player_idx = {name: col for col, name in enumerate(np.unique(past_scores.player))}
    print(len(player_idx))

    # Rows: every match up to and including this season.
    matchid_idx = {match_id: r for r, match_id in enumerate(np.unique(seen_scores.match_id))}
    print(len(matchid_idx))

    # Dense match x player score matrix; entries with no score stay at 0.
    data = np.zeros((len(matchid_idx), len(player_idx)))
    for match_id, player, score in zip(seen_scores["match_id"], seen_scores["player"], seen_scores["tot_score"]):
        if player in player_idx:
            data[matchid_idx[match_id], player_idx[player]] = score

    train_rows = [matchid_idx[match_id] for match_id in np.unique(past_scores.match_id)]
    val_rows = [matchid_idx[match_id] for match_id in np.unique(current_scores.match_id)]
    X = data[train_rows]
    Xval = data[val_rows]

    print(X.shape, Xval.shape)

    pmf = PairwiseMF(len(player_idx), 5, alpha=0.0001, adam_b1=0, adam_b2=0)
    pmf.fit(X, valX=Xval, num_iter=500)

    pmfs.append(pmf)

    print("TR Score : {:.4f} | CV Score : {:.4f}".format(pmf.score(X), pmf.score(Xval)))

------ 2015 ------
235
514
((457, 235), (57, 235))
---------- 0 ----------
---------- 50 ----------
---------- 100 ----------
---------- 150 ----------
---------- 200 ----------
TR Score : 2.5916 | CV Score : 1.2561
------ 2016 ------
246
574
((514, 246), (60, 246))
---------- 0 ----------
---------- 50 ----------
---------- 100 ----------
---------- 150 ----------
TR Score : 2.5371 | CV Score : 2.6476
------ 2017 ------
262
633
((574, 262), (59, 262))
---------- 0 ----------
---------- 50 ----------
---------- 100 ----------
---------- 150 ----------
---------- 200 ----------
TR Score : 2.4113 | CV Score : 4.0932
------ 2018 ------
274
693
((633, 274), (60, 274))
---------- 0 ----------
---------- 50 ----------
---------- 100 ----------
---------- 150 ----------
TR Score : 2.2500 | CV Score : 2.5583
------ 2019 ------
283
753
((693, 283), (60, 283))
---------- 0 ----------
---------- 50 ----------
---------- 100 ----------
---------- 150 ----------
TR Score : 2.1189 | CV Score : 1.814

In [21]:
class PairwiseMF():
    """Pairwise factorisation of a row-by-column score matrix.

    Every column ("point") owns one latent vector. Within a row, for each ordered
    pair (i, j) of distinct columns whose entries are strictly positive, the
    row's value at column i is approximated by the dot product of the latent
    vectors of i and j. Entries that are zero or negative are treated as
    unobserved and ignored.
    """

    def __init__(self, num_points, num_latent, alpha=0.001, beta=0, adam_b1=0.9, adam_b2=0.999):
        """Create the model with randomly initialised latent vectors.

        Parameters
        ----------
        num_points : int
            Number of columns of the matrices passed to `fit` / `score`.
        num_latent : int
            Length of each latent vector.
        alpha : float
            Step size.
        beta : float
            L2 penalty weight applied to the latent vector being updated.
        adam_b1, adam_b2 : float
            Decay rates of the first and second moment estimates. When both are
            0 the update is a plain gradient step (no second-moment scaling).

        Note: reseeds NumPy's global RNG with 0 so initialisation is repeatable.
        """
        np.random.seed(0)
        self.latent_feats = np.random.normal(scale=1./num_latent, size=(num_points, num_latent))
        self.alpha = alpha
        self.beta = beta
        self.adam_b1 = adam_b1
        self.adam_b2 = adam_b2

    def fit(self, X, valX=None, num_iter=10, random_seed=0, early_stop_factor=20):
        """Train the latent vectors on the rows of `X`.

        Parameters
        ----------
        X : 2-D numpy array, one column per point
            Training rows. The array is copied; the caller's data is never
            reordered.
        valX : optional
            Accepted for interface compatibility; not used by the training loop.
        num_iter : int
            Maximum number of passes over `X`.
        random_seed : int
            Seed for NumPy's global RNG (controls the per-pass row shuffling).
        early_stop_factor : number
            Training stops once more than `num_iter / early_stop_factor` passes
            have gone by without the training score (see `score`) decreasing.

        After training, `latent_feats` holds the parameters of the pass with the
        lowest training score, and `best_score` / `best_itr` describe that pass.
        This holds whether training stopped early or ran all `num_iter` passes.

        Rows with fewer than two strictly positive entries contain no pair and
        are skipped.

        Raises
        ------
        ValueError
            If an update contains a non-finite value.
        """
        np.random.seed(random_seed)

        # Private copy: rows are shuffled in place on every pass, and callers
        # frequently hand in views of a larger matrix.
        X = np.array(X, copy=True)

        tiny = np.finfo(np.float32).eps
        plain_step = self.adam_b1 == 0 and self.adam_b2 == 0

        past_m1 = np.zeros_like(self.latent_feats)
        past_m2 = np.zeros_like(self.latent_feats)

        best_itr = 0
        best_score = float("inf")
        best_lf = np.copy(self.latent_feats)
        for itr in range(num_iter):
            if itr % 50 == 0:
                print("---------- "+str(itr)+" ----------")
            np.random.shuffle(X)

            # Bias-correction denominators depend only on the pass number.
            m1_denom = 1 - self.adam_b1**(itr+1)
            m2_denom = 1 - self.adam_b2**(itr+1)

            for x in X:
                xids = np.where(x > 0)[0]
                num_obs = len(xids)
                if num_obs < 2:
                    # No (i, j) pair exists; averaging over num_obs - 1 would
                    # divide by zero.
                    continue

                # Fancy indexing copies, so every pair in this row sees the
                # parameters as they were before the row's update.
                curr_lf = self.latent_feats[xids, :]
                curr_m1 = past_m1[xids, :]
                curr_m2 = past_m2[xids, :]

                updt = np.zeros_like(curr_lf)
                updt_m1 = np.zeros_like(curr_lf)
                updt_m2 = np.zeros_like(curr_lf)
                for i in range(num_obs):
                    for j in range(num_obs):
                        if i == j:
                            continue
                        e = x[xids[i]] - (curr_lf[i]*curr_lf[j]).sum()
                        g = - e*curr_lf[j] + self.beta*curr_lf[i]

                        m1 = self.adam_b1*curr_m1[i] + (1-self.adam_b1)*g
                        m2 = self.adam_b2*curr_m2[i] + (1-self.adam_b2)*(g**2)

                        m1_corr = m1/m1_denom
                        if plain_step:
                            updt[i] -= self.alpha * m1_corr
                        else:
                            m2_corr = m2/m2_denom
                            updt[i] -= self.alpha * m1_corr / (np.sqrt(m2_corr) + tiny)

                        updt_m1[i] += m1
                        updt_m2[i] += m2

                # Average the accumulated contributions over the partners of each point.
                updt /= num_obs - 1
                updt_m1 /= num_obs - 1
                updt_m2 /= num_obs - 1

                if not np.isfinite(updt.sum()):
                    raise ValueError(
                        "non-finite update at iteration {}: update={}, latent={}, observed={}".format(
                            itr, updt, curr_lf, x[xids]))

                self.latent_feats[xids, :] += updt
                past_m1[xids, :] = updt_m1
                past_m2[xids, :] = updt_m2

            curr_score = self.score(X)

            if best_score > curr_score:
                best_itr = itr
                best_score = curr_score
                best_lf = np.copy(self.latent_feats)

            if itr - best_itr > num_iter/early_stop_factor:
                break

        # Always finish on the best pass seen, not only when stopping early.
        self.latent_feats = best_lf
        self.best_score = best_score
        self.best_itr = best_itr

    def score(self, X):
        """Return the mean signed residual of the pairwise predictions on `X`.

        For each row, the residuals `value_i - dot(latent_i, latent_j)` over all
        ordered pairs of distinct, strictly positive columns are averaged; the
        result is the mean of those per-row averages. Rows with fewer than two
        strictly positive entries are ignored, and NaN is returned when no row
        qualifies.

        NOTE(review): residuals are signed (neither squared nor absolute), so
        errors of opposite sign cancel, and `fit` selects the pass minimising
        this value. Confirm whether a squared error was intended.
        """
        row_means = []
        for x in X:
            xids = np.where(x > 0)[0]
            if len(xids) < 2:
                continue
            lf = self.latent_feats[xids, :]

            residuals = [
                x[xids[i]] - (lf[i]*lf[j]).sum()
                for i in range(len(xids))
                for j in range(len(xids))
                if i != j
            ]
            row_means.append(np.mean(residuals))
        if not row_means:
            return float("nan")
        return np.mean(row_means)

In [None]:
# Fresh model for the ad-hoc experiment in the following cells; sized by the
# `player_idx` left behind by the last iteration of the season loop.
pmf = PairwiseMF(num_points=len(player_idx), num_latent=5, alpha=0.0001, adam_b1=0, adam_b2=0)

In [None]:
# Scratch cell: reseed NumPy's global RNG, then shuffle the rows of `data` in place.
# `data` is whatever the season loop left behind (the matrix of its last iteration).
# NOTE(review): re-running this cell shuffles the already-shuffled array again,
# so the resulting row order depends on how many times the cell has been run.
np.random.seed(0)
np.random.shuffle(data)

In [None]:
# Ad-hoc split: first 400 rows of `data` for training, the remainder passed as validation.
train_rows, holdout_rows = data[:400], data[400:]
pmf.fit(train_rows, valX=holdout_rows, num_iter=500)

In [None]:
# Sample of 21 strictly positive scores used for manual debugging (apparently
# pasted from a printed NumPy array). Written as a real array literal: the
# original whitespace-separated form is not valid Python.
x = np.array([33., 17., 18., 39., 27., 37., 39., 4., 18., 43., 2., 38., 29., 33.,
              43., 27., 101., 14., 91., 4., 54.])

In [None]:
# Scratch fragment: accumulates, for each observed point i, a plain gradient step
# over all partners j != i, then averages over the number of partners.
# NOTE(review): this cannot run as a top-level cell as written -- `self`, `lf`
# and `xids` are not defined at notebook scope in any visible cell (they are
# locals/attributes inside PairwiseMF's methods).
updt = np.zeros_like(lf)
for i in range(len(xids)):
    for j in range(len(xids)):
        if i!=j:
            # residual of predicting x at point i from the (i, j) dot product
            e = x[xids[i]] - (lf[i]*lf[j]).sum()
            updt[i] += self.alpha * (e * lf[j] - self.beta * lf[i])
updt /= len(xids) - 1