In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

In [2]:
# Elo K-factor: scales the size of every rating update (see elo_update).
K = 20.0
# Rating points credited to the side playing on its home court when a game
# is scored.
HOME_ADVANTAGE = 100.0

In [3]:
# Folder holding the input CSVs. Set the NCAA_DATA_DIR environment variable to
# point somewhere else; the default is the path this notebook was written with.
DATA_DIR = os.environ.get(
    "NCAA_DATA_DIR", "C:\\Users\\bcara\\kaggle\\NCAA_ML_2018\\Data"
)

# Regular-season results: each row names a winning and a losing team.
rs = pd.read_csv(os.path.join(DATA_DIR, "RegularSeasonCompactResults.csv"))
rs.head(3)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0


In [4]:
# Every team that shows up as either a winner or a loser.
team_ids = set(rs.WTeamID) | set(rs.LTeamID)
len(team_ids)

364

In [5]:
# Current rating per team id; every team starts from the same 1500 baseline.
elo_dict = {team: 1500 for team in team_ids}

In [6]:
# Winning margin in points (winner's score minus loser's score).
rs['margin'] = rs['WScore'] - rs['LScore']
# Placeholder columns, filled in later when the ratings are computed.
rs['w_elo'] = rs['l_elo'] = None

In [7]:
def elo_pred(elo1, elo2):
    """Predicted win probability for the side rated `elo1` against `elo2`.

    Base-10 logistic curve on a 400-point scale; equal ratings give 0.5.
    """
    exponent = -(elo1 - elo2) / 400.
    return 1. / (10. ** exponent + 1.)

def expected_margin(elo_diff):
    """Linear function of the rating gap: 7.5 plus 0.006 per rating point.

    elo_update divides its margin-of-victory term by this value.
    """
    return 0.006 * elo_diff + 7.5

def elo_update(w_elo, l_elo, margin):
    """Score one finished game and return the prediction and rating change.

    Args:
        w_elo: rating used for the winning side.
        l_elo: rating used for the losing side.
        margin: winner's score minus loser's score.

    Returns:
        A `(pred, update)` tuple. `pred` is `elo_pred(w_elo, l_elo)`, the
        probability the model gave the winner. `update` is the number of
        rating points the caller adds to the winner and subtracts from the
        loser.
    """
    win_prob = elo_pred(w_elo, l_elo)
    # Margin-of-victory multiplier: grows with the margin, divided by a term
    # that grows with the winner's rating edge.
    margin_mult = ((margin + 3.) ** 0.8) / expected_margin(w_elo - l_elo)
    # The less likely the win was, the larger the update.
    rating_shift = K * margin_mult * (1 - win_prob)
    return win_prob, rating_shift

In [8]:
# Guard: the frame must carry the default 0..n-1 index, in order.
assert (rs.index.values == np.array(range(len(rs)))).all(), "Index is out of order."

In [9]:
# Win probability the model assigned to the eventual winner, one per game.
preds = []
# Ratings of winner and loser right after each game. They are gathered in
# plain lists and written to the frame once at the end, instead of doing one
# slow `rs.loc[i, ...]` write per game into object-dtype columns.
w_elos, l_elos = [], []

# NOTE: elo_dict is updated in place, so re-running this cell without first
# re-creating elo_dict continues from the already-updated ratings.

# Walk the games in row order.
game_rows = zip(rs['WTeamID'].values, rs['LTeamID'].values,
                rs['margin'].values, rs['WLoc'].values)
for w, l, margin, wloc in game_rows:

    # Does either team get a home-court advantage?
    # WLoc "H": the winner gets it; "A": the loser gets it; otherwise nobody.
    w_ad, l_ad = 0., 0.
    if wloc == "H":
        w_ad += HOME_ADVANTAGE
    elif wloc == "A":
        l_ad += HOME_ADVANTAGE

    # The home bonus only affects this game's prediction and update size;
    # it is never stored in elo_dict.
    pred, update = elo_update(elo_dict[w] + w_ad,
                              elo_dict[l] + l_ad,
                              margin)
    elo_dict[w] += update
    elo_dict[l] -= update
    preds.append(pred)

    # Remember the post-game ratings for this row.
    w_elos.append(elo_dict[w])
    l_elos.append(elo_dict[l])

# Store the post-game ratings in the games dataframe (one assignment each).
rs['w_elo'] = w_elos
rs['l_elo'] = l_elos

In [10]:
# Show the last ten games, now carrying the margin and rating columns.
rs.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,margin,w_elo,l_elo
150674,2017,131,1436,56,1107,53,H,0,3,1652.42,1542.46
150675,2017,131,1437,74,1166,60,N,0,14,2097.4,1864.51
150676,2017,131,1458,76,1321,48,N,0,28,1953.23,1770.25
150677,2017,131,1463,73,1217,71,N,0,2,1591.42,1581.21
150678,2017,132,1246,82,1116,65,N,0,17,2070.61,1809.06
150679,2017,132,1276,71,1458,56,N,0,15,1904.71,1936.38
150680,2017,132,1343,71,1463,59,N,0,12,1710.38,1584.06
150681,2017,132,1348,70,1433,63,N,0,7,1790.62,1841.22
150682,2017,132,1374,71,1153,56,N,0,15,1965.93,1906.94
150683,2017,132,1407,59,1402,53,N,0,6,1447.51,1449.7


In [11]:
# Mean negative log of the probability given to each actual winner
# (the log loss of the predictions collected in `preds`).
-np.mean(np.log(preds))

0.5322015765612655

In [12]:
def final_elo_per_season(df, team_id):
    """Return `team_id`'s rating from its last recorded game of each season.

    Args:
        df: games frame with columns Season, DayNum, WTeamID, LTeamID,
            w_elo and l_elo. It is not modified.
        team_id: id to look up in WTeamID / LTeamID.

    Returns:
        DataFrame with columns `team_id`, `season` and `season_elo`, one row
        per season in which the team appears. Rows keep the index labels of
        the games they were taken from.
    """
    played = (df.WTeamID == team_id) | (df.LTeamID == team_id)

    # Filter first, then sort chronologically and keep each season's final
    # game. Only those few rows are copied, rather than the whole frame on
    # every call.
    d = (
        df.loc[played, :]
        .sort_values(['Season', 'DayNum'])
        .drop_duplicates(['Season'], keep='last')
        .copy()
    )

    # Take the rating from whichever side the team was on in that game.
    w_mask = d.WTeamID == team_id
    l_mask = d.LTeamID == team_id
    d['season_elo'] = None
    d.loc[w_mask, 'season_elo'] = d.loc[w_mask, 'w_elo']
    d.loc[l_mask, 'season_elo'] = d.loc[l_mask, 'l_elo']

    return pd.DataFrame({
        'team_id': team_id,
        'season': d.Season,
        'season_elo': d.season_elo
    })

In [13]:
# One small frame per team, stacked into a single table.
df_list = [final_elo_per_season(rs, team_id) for team_id in team_ids]
season_elos = pd.concat(df_list)

In [14]:
# Spot-check ten random rows; the fixed seed keeps the displayed output the
# same from one run to the next.
season_elos.sample(10, random_state=0)

Unnamed: 0,season,season_elo,team_id
15184,1988,1427.33,1203
19360,1989,1450.45,1451
27583,1991,1206.79,1302
78616,2003,1572.43,1325
47595,1996,1674.16,1158
150679,2017,1904.71,1276
108062,2009,1242.82,1115
150635,2017,1709.66,1382
39485,1994,1136.55,1119
113380,2010,1910.63,1207


In [15]:
# Write the per-season ratings to disk without the row index.
season_elos.to_csv("season_elos.csv", index=False)