In [1]:
import pandas as pd
import numpy as np
import random, re
from scipy.stats import norm
import matplotlib.pyplot as plt


In [19]:
# load a text version of the kenpom data and creat a dictionary
def loadkp(fname):
    keys = 'Rank        Team    Conf    W-L     AdjEM   AdjO    AdjD    AdjT    Luck    SOSAdjEM        SOSOppO SOSOppD NCSOSAdjEM'.split()
    f = open(fname).readlines()
    kpd = {}
    for line in f:
        lsp = line.split('\t')
        if 'Strength' in lsp[0] or 'Rank' in lsp[0]: pass
        else:
            datanorank =  [lsp[i] for i in [0,1,2,3,4,5,7,9,11,13,15,17,19]]
            datanorank[1] = re.split('(\d+)',datanorank[1])[0].strip()
            teamd = dict(zip(keys,datanorank))
            if len(re.split('(\d+)',lsp[1])) > 1:
                ncaaseed = re.split('(\d+)',lsp[1])[1].strip()
                teamd['ncaaseed'] = ncaaseed
            kpd[datanorank[1]] = teamd
    return kpd

# function to calculate kenpom probabilities. a is away team, b is home team. 
def kpprob(a,b,kpd,std=7,homea=0):
    adjemdiff = ((float(kpd[a]['AdjEM']) - float(kpd[b]['AdjEM']))*
                (float(kpd[a]['AdjT']) + float(kpd[b]['AdjT']))/200.0) - homea
    bprob = norm.cdf(0,adjemdiff,std)
    aprob = 1.0-bprob
    return aprob,bprob,adjemdiff

# pick a winner base on the above kenpom probability fucntion
def pickw(pfunc,kpd,g,**kwargs):
    if random.random() <= pfunc(g[0].strip(),g[1].strip(),kpd,**kwargs)[0]: return g[0].strip()
    else: return g[1].strip()

In [22]:
years = [str(val).zfill(2) for val in range(2,19)]
fullresd = {}
allkp = {}
for year in years:
    print year
    kpd = loadkp('kp{0}.txt'.format(year))
    allkp['20'+year] = kpd
    resd = pd.read_csv('b10_{0}.csv'.format(year))

    awayt = resd['Visitor/Neutral'].values
    homet = resd['Home/Neutral'].values
    pdiff = resd['PTS'] - resd['PTS.1']
    resd['PD.act'] = pdiff

    expdiffs = [kpprob(v[0],v[1],kpd,std=11,homea=0)[2] for v in zip(awayt,homet)]
    resd['PD.exp'] = expdiffs

    resd['PD.ha'] = resd['PD.act'] - resd['PD.exp']

    teams = list(set(list(homet)))
#     for team in sorted(teams):
#         print team, resd.loc[resd['Home/Neutral'] == team, 'PD.ha'].mean()
        
    fullresd['20'+year] = resd

02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18


In [24]:
fullresd['2018']

Unnamed: 0,Date,Visitor/Neutral,PTS,Home/Neutral,PTS.1,OT,Notes,PD.act,PD.exp,PD.ha
0,Fri Dec 1 2017,Purdue,80,Maryland,75,,,5,7.502700,-2.502700
1,Fri Dec 1 2017,Illinois,68,Northwestern,72,OT,,-4,-1.271100,-2.728900
2,Sat Dec 2 2017,Penn St.,77,Iowa,73,,,4,7.595520,-3.595520
3,Sat Dec 2 2017,Indiana,55,Michigan,69,,,-14,-9.047200,-4.952800
4,Sat Dec 2 2017,Ohio St.,83,Wisconsin,58,,,25,6.796280,18.203720
5,Sun Dec 3 2017,Maryland,92,Illinois,91,OT,,1,5.693160,-4.693160
6,Sun Dec 3 2017,Nebraska,57,Michigan St.,86,,,-29,-7.976640,-21.023360
7,Sun Dec 3 2017,Rutgers,67,Minnesota,89,,,-22,-1.364790,-20.635210
8,Sun Dec 3 2017,Northwestern,69,Purdue,74,,,-5,-11.602350,6.602350
9,Mon Dec 4 2017,Iowa,64,Indiana,77,,,-13,-1.574350,-11.425650


In [25]:
pan = pd.Panel(fullresd)

In [47]:
pan['2002']['PD.ha'].dropna().mean()

-6.8497302840909127

In [26]:
years = range(2002,2018)
stds = []
for year in years:
    std = pan[str(year)]['PD.exp'].std()
    stds.append(std)
    
print np.array(std).mean()
    

6.16020548983


In [61]:
years = range(2002,2018)
for team in teams:
    ranks, homeas = [], []
    for year in years:
        stryear = str(year)
        homea_yr = pan[stryear].loc[pan[stryear]['Home/Neutral'] == team, 'PD.ha'].mean()
        rank = allkp[stryear][team]['Rank']
        ranks.append(rank)
        homeas.append(homea_yr)

    print team, np.nanmean(np.array(homeas))


Northwestern -3.30489294271
Penn St. -3.69490145399
Wisconsin -5.1313155599
Ohio St. -4.64177692708
Michigan St. -5.38495300781
Michigan -4.15913175781
Purdue -6.07407303385
Minnesota -4.33457042101
Illinois -4.08198338542
Rutgers -0.453445925926
Nebraska -4.39356787037
Indiana -5.09584558594
Iowa -5.30968264757
Maryland -3.50976111111


In [32]:
print homeas
# print np.array(homeas).mean()
# print np.array(homeas).std()

[True, True, True, True, True, True, True, True, True, True, True, True, True, -4.744759444444444, -4.983985, -0.8005388888888897]
