In [126]:
import pandas as pd
import numpy as np
import random, re
from scipy.stats import norm
import matplotlib.pyplot as plt


In [147]:
# load a text version of the kenpom data and creat a dictionary
def loadkp(fname):
    keys = 'Rank        Team    Conf    W-L     AdjEM   AdjO    AdjD    AdjT    Luck    SOSAdjEM        SOSOppO SOSOppD NCSOSAdjEM'.split()
    f = open(fname).readlines()
    kpd = {}
    for line in f:
        lsp = line.split('\t')
        if 'Strength' in lsp[0] or 'Rank' in lsp[0]: pass
        else:
            datanorank =  [lsp[i] for i in [0,1,2,3,4,5,7,9,11,13,15,17,19]]
            datanorank[1] = re.split('(\d+)',datanorank[1])[0].strip()
            teamd = dict(zip(keys,datanorank))
            if len(re.split('(\d+)',lsp[1])) > 1:
                ncaaseed = re.split('(\d+)',lsp[1])[1].strip()
                teamd['ncaaseed'] = ncaaseed
            kpd[datanorank[1]] = teamd
    return kpd

# function to calculate kenpom probabilities. a is away team, b is home team. 
def kpprob(a,b,kpd,std=7,homea=0):
    adjemdiff = ((float(kpd[a]['AdjEM']) - float(kpd[b]['AdjEM']))*
                (float(kpd[a]['AdjT']) + float(kpd[b]['AdjT']))/200.0) - homea
    bprob = norm.cdf(0,adjemdiff,std)
    aprob = 1.0-bprob
    return aprob,bprob,adjemdiff

# pick a winner base on the above kenpom probability fucntion
def pickw(pfunc,kpd,g,**kwargs):
    if random.random() <= pfunc(g[0].strip(),g[1].strip(),kpd,**kwargs)[0]: return g[0].strip()
    else: return g[1].strip()

In [150]:
years = [str(val).zfill(2) for val in range(2,18)]
fullresd = {}
allkp = {}
for year in years:
#     print year
    kpd = loadkp('kp{0}.txt'.format(year))
    allkp['20'+year] = kpd
    resd = pd.read_csv('b10_{0}.csv'.format(year))

    awayt = resd['Visitor/Neutral'].values
    homet = resd['Home/Neutral'].values
    pdiff = resd['PTS'] - resd['PTS.1']
    resd['PD.act'] = pdiff

    expdiffs = [kpprob(v[0],v[1],kpd,std=11,homea=0)[2] for v in zip(awayt,homet)]
    resd['PD.exp'] = expdiffs

    resd['PD.ha'] = resd['PD.act'] - resd['PD.exp']

    teams = list(set(list(homet)))
#     for team in sorted(teams):
#         print team, resd.loc[resd['Home/Neutral'] == team, 'PD.ha'].mean()
        
    fullresd['20'+year] = resd

In [151]:
pan = pd.Panel(fullresd)

In [152]:
years = range(2002,2018)
stds = []
for year in years:
    std = pan[str(year)]['PD.exp'].std()
    stds.append(std)
    
print np.array(std).mean()
    

6.16020548983


In [153]:
years = range(2002,2018)
for team in teams:
    ranks, homeas = [], []
    for year in years:
        stryear = str(year)
        homea_yr = pan[stryear].loc[pan[stryear]['Home/Neutral'] == team, 'PD.ha'].mean()
        rank = allkp[stryear][team]['Rank']
        ranks.append(rank)
        homeas.append(homea_yr)
    print team, np.array(homea_yr).mean()



Northwestern -2.90505611111
Iowa -6.05741333333
Wisconsin -4.08229944444
Ohio St. -4.27883166667
Purdue -8.54400222222
Michigan -3.27778166667
Michigan St. -9.95439444444
Minnesota -3.15343555556
Nebraska -1.30769777778
Illinois -3.57257777778
Rutgers 0.844396666667
Indiana -2.21855
Penn St. -4.51881
Maryland -0.800538888889


In [154]:
print np.array(homeas).mean()
print np.array(homeas).std()

nan
nan


In [155]:
kppan = pd.Panel(allkp)

In [156]:
kppan['2016'].loc['AdjD']
kppan['2016']

Unnamed: 0,Abilene Christian,Air Force,Akron,Alabama,Alabama A&M,Alabama St.,Albany,Alcorn St.,American,Appalachian St.,...,William & Mary,Winston Salem St.,Winthrop,Wisconsin,Wofford,Wright St.,Wyoming,Xavier,Yale,Youngstown St.
AdjD,112.3,106.0,101.5,97.4,113.4,110.7,102.7,112.6,107.4,110.9,...,104.5,,107.7,93.6,108.5,99.4,108.5,94.6,96.0,113.9
AdjEM,-15.24,-6.92,+8.63,+8.07,-13.28,-11.59,+4.25,-14.55,-15.34,-8.17,...,+8.79,,+0.69,+16.55,-2.22,+2.01,-0.19,+22.52,+14.33,-10.70
AdjO,97.1,99.1,110.1,105.5,100.1,99.1,106.9,98.0,92.1,102.7,...,113.3,,108.4,110.1,106.3,101.4,108.3,117.1,110.4,103.2
AdjT,67.8,67.0,67.5,64.8,65.2,69.0,66.2,65.5,61.3,70.3,...,68.3,,72.8,63.3,65.2,65.1,64.6,71.9,66.0,72.4
Conf,Slnd,MWC,MAC,SEC,SWAC,SWAC,AE,SWAC,Pat,SB,...,CAA,,BSth,B10,SC,Horz,MWC,BE,Ivy,Horz
Luck,+.025,+.087,+.025,+.093,-.046,+.029,-.030,+.103,+.111,-.015,...,-.015,,+.034,-.022,-.017,+.057,-.033,+.047,+.026,+.007
NCSOSAdjEM,-0.96,-4.70,-3.29,+4.92,+0.76,+4.32,-4.63,+3.06,+0.25,+1.80,...,+1.24,,-2.57,+1.24,+7.24,-1.15,+0.44,+1.73,-1.00,+0.63
Rank,317,242,88,96,301,286,121,313,319,258,...,87,,157,38,187,146,166,14,47,282
SOSAdjEM,-7.76,+0.69,-1.32,+9.70,-9.75,-7.89,-8.12,-9.04,-4.72,-0.42,...,+1.93,,-6.36,+9.57,-1.17,-1.68,+1.93,+8.16,-0.85,-0.15
SOSOppD,109.2,103.8,105.9,100.8,107.4,107.2,108.9,107.8,107.3,104.4,...,103.9,,108.2,100.7,106.1,106.1,103.5,100.0,106.4,104.7


In [158]:
kpd['Kentucky']['ncaaseed']

'2'