In [28]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix,csr_matrix,eye,bmat
from scipy.sparse.linalg import eigs,inv,gmres
from scipy.stats import norm
import pickle
import multiprocessing
%matplotlib inline

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by a trusted step of this pipeline.
# Pickles are byte streams, so they are opened in binary mode ('rb'). Text mode can
# corrupt binary-protocol pickles on platforms that translate line endings.
with open('inds_to_name.pkl','rb') as pickleFile:
    inds_to_name = pickle.load(pickleFile)

with open('hole_tups.pkl','rb') as pickleFile:
    hole_tups = pickle.load(pickleFile)

In [3]:
# Numeric suffix of the 'cats<epsilon>' data directory read throughout the notebook.
epsilon = 10
# key_file.csv is read without a header row; its first column becomes the index.
key = pd.read_csv('cats%g/key_file.csv' % (epsilon,),header=None,index_col=0)
# Invert the table: the remaining values of each row (as a tuple) map back to that
# row's index label.
key_dict = dict((tuple(row_values), row_label)
                for row_label, row_values in key.T.to_dict('list').iteritems())

In [4]:
# Peek at three (tuple -> index label) entries to sanity-check the lookup table.
list(key_dict.items())[:3]

[((2009, 481, 2, 730, 5), 17697),
 ((2015, 5, 4, 5, 14), 35863),
 ((2012, 54, 1, 609, 14), 27879)]

In [5]:
# Problem dimensions derived from the two loaded lookup structures.
n_players, n_holes = len(inds_to_name), len(hole_tups)
# Count the distinct combinations of the first two fields of the hole tuples
# (presumably year and tournament id -- confirm against the pipeline that wrote hole_tups).
n_tournaments = pd.DataFrame(np.array(hole_tups))[[0,1]].drop_duplicates().shape[0]

In [6]:
# Show the three problem dimensions on one space-separated line.
print('%s %s %s' % (n_players,n_holes,n_tournaments))

1992 38319 546


In [7]:
def load_sparse_csc(filename):
    """Load a CSC sparse matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape``.

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` file to read.

    Returns
    -------
    scipy.sparse.csc_matrix
        The matrix rebuilt from the stored arrays.
    """
    # np.load on an .npz returns a lazy archive object that keeps the file open.
    # This function is called inside tight loops, so close it deterministically.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        shape = tuple(loader['shape'])
    return csc_matrix((data,indices,indptr),shape = shape)

In [8]:
def my_norm(x,BETA):
    """Zero-mean normal density at ``x`` divided by its value at 0.

    ``BETA`` is the standard deviation passed to ``norm.pdf``; the result is 1 at
    ``x == 0``.
    """
    peak_density = norm.pdf(0,0,BETA)
    density_at_x = norm.pdf(x,0,BETA)
    return density_at_x/peak_density

In [9]:
def inflate(cat,tournament_group,holes_to_inflate,n_tournament_groups,BETA,window_size=28):
    """Accumulate the time-weighted matrices for one tournament group and return the windowed system.

    For every hole in ``holes_to_inflate`` and every sub-category in ``cats[cat]``
    the per-hole matrix is loaded, stacked vertically ``n_tournament_groups`` times
    (block ``k`` scaled by ``my_norm(tournament_group-k, BETA)``) and summed into
    ``mat``.  ``mat_1`` is accumulated the same way from the 0/1 indicator of the
    loaded matrix's non-zero entries.

    The pair ``(mat, mat_1)`` is cached on the function object under
    ``inflate.__dict__[tournament_group]``; once ``tournament_group`` exceeds
    ``window_size`` the entry cached ``window_size`` groups earlier is deleted.
    Both the deletion and the read-back below raise ``KeyError`` if a required
    group was never cached, so the function expects to be called once per group
    in increasing order.

    Parameters
    ----------
    cat : key of the module-level ``cats`` dict.
    tournament_group : int, group being built (also the cache key).
    holes_to_inflate : iterable of keys of the module-level ``key_dict``.
    n_tournament_groups : int, number of vertical blocks per matrix.
    BETA : standard deviation handed to ``my_norm``.
    window_size : int, number of most recent groups kept in the output.

    Returns
    -------
    (out_mat, out_mat1) : tuple of csc matrices built by horizontally joining, for
        every cached group in the window, the row range of that group's matrix
        that corresponds to the window.
    """
    n_rows = n_players*n_tournament_groups
    mat = csc_matrix((n_rows,n_players),dtype=float)
    mat_1 = csc_matrix((n_rows,n_players),dtype=float)
    # The block weights depend only on the group offset, so compute them once.
    weights = [my_norm(tournament_group-k,BETA) for k in range(1,n_tournament_groups+1)]
    for j in holes_to_inflate:
        ind = key_dict[j]
        for c in cats[cat]:
            # Read each file a single time and reuse it for every block of both matrices.
            hole_mat = load_sparse_csc('cats%g/%s_%d.npz' % (epsilon,c,ind))
            hole_indicator = (hole_mat!=0).astype(float)
            mat += bmat([[hole_mat*w] for w in weights],format='csc')
            mat_1 += bmat([[hole_indicator*w] for w in weights],format='csc')
    # Evict the group that just fell out of the window, then cache this one.
    if tournament_group>window_size:
        del inflate.__dict__[tournament_group-window_size]
    inflate.__dict__[tournament_group] = (mat,mat_1)
    # Row range and cached groups covered by the window ending at tournament_group.
    first_row = max(0,tournament_group-window_size)*n_players
    last_row = n_players*tournament_group
    groups_in_window = range(max(1,tournament_group-window_size+1),tournament_group+1)
    out_mat = bmat([[inflate.__dict__[i][0][first_row:last_row] for i in groups_in_window]],format='csc')
    out_mat1 = bmat([[inflate.__dict__[i][1][first_row:last_row] for i in groups_in_window]],format='csc')
    return (out_mat,out_mat1)

In [10]:
def alpha(A,a):
    """Return ``a`` divided by the real part of A's largest-magnitude eigenvalue.

    Side effect: stored entries of ``A`` that are below 1e-6 or NaN are set to 0
    in place before the eigenvalue is computed, so the caller's matrix is modified.
    """
    stored = A.data  # same array object as A.data, so the edits below hit A itself
    stored[stored<1e-6] = 0
    stored[np.isnan(stored)]=0
    eigenvalues,_ = eigs(A,k=1,which='LM')
    dominant = eigenvalues[0].real
    return a/dominant

In [11]:
def solve(mat,mat_1,a,min_reps,x_guess=None,x_guess1=None):
    """Solve the two damped linear systems and return per-player ratio and weight.

    For each of ``mat`` and ``mat_1`` the system ``(I - alpha(M, a) * M) w = M.sum(1)``
    is solved with GMRES, optionally warm-started from ``x_guess`` / ``x_guess1``.
    Entries of the first solution are zeroed wherever the second solution is below
    ``min_reps``.

    Side effects: both input matrices are cleaned in place, and the full solution
    vectors are stored as ``solve.w_a`` and ``solve.w_g`` for later warm starts.

    Returns
    -------
    (ratio, weight) : the last ``n_players`` entries of ``w_a / w_g`` and of ``w_g``.
        No guard is applied to the division; zeros in ``w_g`` propagate as inf/NaN.
    """
    # The mask comes from mat_1 and is applied to both matrices (same stored layout
    # is assumed for mat and mat_1 -- TODO confirm).
    negligible = mat_1.data<1e-6
    mat.data[negligible] = 0
    mat_1.data[negligible] = 0
    for cleaned in (mat,mat_1):
        cleaned.data[np.isnan(cleaned.data)] = 0

    def damped_solution(M,x0):
        # alpha() also edits M in place before M is used in the product.
        S = eye(M.shape[0],format='csc')-alpha(M,a)*M
        return gmres(S,M.sum(1),x0=x0)[0]

    w_a = damped_solution(mat,x_guess)
    w_g = damped_solution(mat_1,x_guess1)

    w_a[w_g<min_reps]=0
    solve.w_a,solve.w_g = w_a,w_g

    ratio = w_a/w_g
    return (ratio[-n_players:],w_g[-n_players:])

In [12]:
# Maps each reported category to the list of file-name prefixes whose matrices are
# summed for it (see the 'cats%g/%s_%d.npz' paths built in inflate).
cats = {
    'green0': ['green0','fringe0'],
    'green5': ['green5','fringe5'],
    'green10': ['green10','fringe10'],
    'green20': ['green20','fringe20'],
    'rough0': ['prough0','irough0'],
    'rough90': ['prough90','irough90'],
    'rough375': ['prough375','irough375'],
    'fairway0': ['fairway0'],
    'fairway300': ['fairway300'],
    'fairway540': ['fairway540'],
    'bunker': ['bunker'],
    'tee3': ['tee3'],
    'tee45': ['tee45'],
}

In [14]:
# --- Run configuration -------------------------------------------------------
cat = 'green0'
ranks,reps = [],[]
BETA = 6            # standard deviation passed to my_norm for the group weights
a = .8              # numerator handed to alpha() inside solve()
bin_size = 4        # tournaments per tournament group
window_size = 28    # number of most recent groups kept in the solved system
max_holes = 400     # only the first max_holes hole tuples are processed
n_tournament_groups = int(math.ceil(n_tournaments/float(bin_size)))
current_group = 0
tournament_groups=[set()]
tournaments = set()
holes_to_inflate = []
for hole_ind,tup in enumerate(hole_tups):
    if hole_ind>=max_holes:
        # Nothing past the cap is used, so stop instead of walking the rest of the list.
        break
    tournaments.add(tuple(tup[0:2]))
    # Floor division keeps the group index an int regardless of division semantics.
    tournament_group = (len(tournaments)-1)//bin_size
    if tournament_group>current_group:
        # A new group has started: build and solve the system from the holes
        # collected for the group that just ended.
        A,G = inflate(cat,tournament_group,holes_to_inflate,n_tournament_groups,BETA,window_size)
        if current_group==0:
            res = solve(A,G,a,1)
        else:
            # Warm start: the previous solution (without its oldest block once the
            # window is full) followed by a copy of its last n_players entries.
            start = 0 if tournament_group<=window_size else n_players
            w_a_approx = np.append(solve.w_a[start:],solve.w_a[-n_players:])
            w_g_approx = np.append(solve.w_g[start:],solve.w_g[-n_players:])
            res = solve(A,G,a,1,w_a_approx,w_g_approx)
        ranks.append(res[0])
        reps.append(res[1])
        print('Tournament Group %d done' % tournament_group)
        current_group = tournament_group
        tournament_groups.append(set())
        holes_to_inflate = []
    tournament_groups[current_group].add(tuple(tup[0:2]))
    holes_to_inflate.append(tuple(tup))

Tournament Group 1 done


In [27]:
def run_a_slice(slice):
    """Build the weighted matrices for every ``(group, hole tuples)`` pair in ``slice``.

    Intended as the worker function for ``multiprocessing.Pool.map``.  It reads the
    module-level ``cat``, ``n_tournament_groups`` and ``BETA``.  The parameter name
    shadows the builtin ``slice``; it is kept for interface compatibility.

    Parameters
    ----------
    slice : iterable of ``(group, tups)`` pairs, where ``tups`` are keys of ``key_dict``.

    Returns
    -------
    dict mapping each group to its ``(mat, mat_1)`` pair of csc matrices.
    """
    def inflate(cat,tournament_group,holes_to_inflate,n_tournament_groups,BETA,window_size=28):
        # Local variant of the module-level inflate: same accumulation, but it
        # neither caches on the function object nor builds the windowed output
        # (window_size is accepted but unused here).
        mat = csc_matrix((n_players*n_tournament_groups,n_players),dtype=float)
        mat_1 = csc_matrix((n_players*n_tournament_groups,n_players),dtype=float)
        # The block weights depend only on the group offset, so compute them once.
        weights = [my_norm(tournament_group-k,BETA) for k in range(1,n_tournament_groups+1)]
        for j in holes_to_inflate:
            ind = key_dict[j]
            for c in cats[cat]:
                # Read each file a single time and reuse it for every block of both matrices.
                hole_mat = load_sparse_csc('cats%g/%s_%d.npz' % (epsilon,c,ind))
                hole_indicator = (hole_mat!=0).astype(float)
                mat += bmat([[hole_mat*w] for w in weights],format='csc')
                mat_1 += bmat([[hole_indicator*w] for w in weights],format='csc')
        return {tournament_group:(mat,mat_1)}
    d = {}
    for group,tups in slice:
        d.update(inflate(cat,group,tups,n_tournament_groups,BETA))
    return d

In [15]:
# Walk every hole tuple once and bucket the holes by tournament group.
bin_size = 4
n_tournament_groups = int(math.ceil(n_tournaments/float(bin_size)))
current_group = 0
tournament_groups=[set()]
tournaments = set()
group_to_tups = {}
holes_to_inflate = []
for tup in hole_tups:
    tournaments.add(tuple(tup[0:2]))
    # Floor division keeps the group index an int (it is used as a list index
    # below) regardless of division semantics.
    tournament_group = (len(tournaments)-1)//bin_size
    if tournament_group>current_group:
        current_group = tournament_group
        tournament_groups.append(set())
        # The holes gathered so far belong to the group that just ended; they are
        # filed under the index of the group that is starting.
        group_to_tups[current_group] = holes_to_inflate
        holes_to_inflate = []
    tournament_groups[current_group].add(tuple(tup[0:2]))
    holes_to_inflate.append(tuple(tup))
# NOTE(review): the holes of the final group remain in holes_to_inflate and are
# never added to group_to_tups -- confirm this is intended.

In [30]:
def partition (lst, n):
    """Split ``lst`` into ``n`` interleaved sub-lists.

    Sub-list ``i`` holds the elements at positions ``i, i+n, i+2n, ...``; trailing
    sub-lists are empty when ``lst`` has fewer than ``n`` elements.  ``lst`` must
    support extended slicing (e.g. a list).
    """
    # range instead of xrange: same result, and it does not depend on a
    # Python-2-only builtin.
    return [lst[i::n] for i in range(n)]

In [None]:
# Leave one core free, but never ask for fewer than one worker
# (multiprocessing.Pool rejects a process count of 0).
num_cores = max(1,multiprocessing.cpu_count()-1)
# list(...) so the (group, tups) pairs can be sliced by partition.
slices = partition(list(group_to_tups.items()),num_cores)
pool = multiprocessing.Pool(num_cores)
try:
    # One dict of {group: (mat, mat_1)} per slice, in slice order.
    results = pool.map(run_a_slice, slices)
finally:
    # Release the workers even if a slice raised.
    pool.close()
    pool.join()
In [None]:
# NOTE(review): `data` is not defined anywhere in the visible notebook; it must be a
# DataFrame with at least the columns Year, Player_Index, Permanent_Tournament_#,
# Course_# and Finishing_Pct -- load it before running this cell.
master_df = pd.DataFrame({'Player_Index':[],'Permanent_Tournament_#':[],'Course_#':[],'Field_Strength':[],
                          'Finishing_Pct':[],'Rating':[],'Reps':[],'Pct_Reps':[]})
# Collect the per-group frames and concatenate once after the loop instead of
# re-copying master_df on every iteration.
group_frames = [master_df]
for j in range(len(ranks)):
    df = pd.DataFrame({'player_ind':range(n_players),
                       'rank':ranks[j],
                       'reps':reps[j]}).dropna()
    # Percentile rank of reps among players with non-zero reps; everyone else stays at 0.
    pct_reps = pd.Series(df.reps[df.reps!=0]).rank(pct=True)
    df.insert(len(df.columns),'pct_reps',[0]*len(df))
    df.loc[df.reps!=0,'pct_reps'] = pct_reps
    rank_dict,reps_dict,pct_reps_dict = df['rank'].to_dict(),df['reps'].to_dict(),df['pct_reps'].to_dict()
    # tournament_groups holds 2-tuples built from tup[0:2], so index them
    # (calling .split on a tuple raises AttributeError).
    years = [int(t[0]) for t in tournament_groups[j+1]]
    t_ids = [int(t[1]) for t in tournament_groups[j+1]]
    # NOTE(review): the two isin() filters are independent, so a year from one
    # tournament can pair with the id of another -- confirm that is acceptable.
    df2 = data[data.Year.isin(years) & data['Permanent_Tournament_#'].isin(t_ids)]
    grouped = df2.groupby(['Player_Index','Permanent_Tournament_#','Course_#'],as_index=False)
    df3 = grouped['Finishing_Pct'].mean()
    df3['Rating'] = df3['Player_Index'].map(rank_dict)
    df3['Reps'] = df3['Player_Index'].map(reps_dict)
    df3['Pct_Reps'] = df3['Player_Index'].map(pct_reps_dict)
    # Drop players without a usable rating (exactly 0 or missing).
    df3 = df3[df3.Rating!=0].dropna()
    # Field strength = mean rating of the remaining players in each tournament.
    tourn_strength_dict = df3.groupby('Permanent_Tournament_#')['Rating'].mean().to_dict()
    df3['Field_Strength'] = df3['Permanent_Tournament_#'].map(tourn_strength_dict)
    group_frames.append(df3)
master_df = pd.concat(group_frames)

# master_df = master_df.dropna()
# master_df = master_df[master_df.Rating!=0]

In [None]:
# One row per distinct (index, first name, last name) combination, keyed by Player_Index.
# NOTE(review): `data` is not defined in the visible notebook, and this cell replaces
# the inds_to_name that was loaded from inds_to_name.pkl earlier.
df = (data[['Player_Index','Player_First_Name','Player_Last_Name']]
      .drop_duplicates()
      .set_index('Player_Index'))
# Player index -> "First, Last" (the two name columns joined with ', ').
inds_to_name = dict((player_ind, ', '.join(name_parts))
                    for player_ind, name_parts in df.T.to_dict('list').iteritems())
# Reverse lookup: joined name -> player index.
names_to_ind = dict((player_name, player_ind)
                    for player_ind, player_name in inds_to_name.iteritems())

In [None]:
# Leaderboard for the fourth solved group (ranks[3]): highest rating first,
# rows with a missing value removed.
(pd.DataFrame({'player_ind':range(n_players),
               'player_name':[inds_to_name[i] for i in range(n_players)],
               'rank':ranks[3]})
   .sort_values(by='rank',ascending=False)
   .dropna())