# Collaborative Filtering with Neural Nets

In [1]:
# for data manipulation
import numpy as np
import pandas as pd
import os

# use surprise for collaborative filtering
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


# plot
import matplotlib.pyplot as plt

## Read in data

#### Game data

In [2]:
game_data_path = "data/final_game_data/"
files = os.listdir(game_data_path)
season = pd.read_csv(game_data_path + files[0])

In [3]:
season.shape

(2632, 98)

In [4]:
game_data_path + files[0]

'data/final_game_data/2007-2008_games_final.csv'

#### Odds data

In [5]:
odds_data_path = "data/odds_data_processed/"
odds_files = os.listdir(odds_data_path)

In [6]:
odds = pd.read_csv(odds_data_path + odds_files[1])

In [7]:
odds = odds.drop(['Unnamed: 0'], axis = 1)
odds.shape

(1288, 13)

In [8]:
odds.columns

Index(['Date', 'Home', 'Away', 'OU', 'Spread', 'OU_2H', 'Spread_2H', 'ML_home',
       'ML_away', 'Points', 'Win Margin', '2H Points', '2H Win Margin'],
      dtype='object')

In [9]:
odds_data_path + odds_files[1]

'data/odds_data_processed/2007-2008.csv'

#### Reconciling names

In [10]:
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }
odds_names = {}
for name in list(pd.unique(odds.Home)):
    found = False
    for s_name in season_names:
        if name in s_name:
            found = True
            odds_names[name] = season_names[s_name]
    if not found: print(name)
odds_names["LA Lakers"] = "LAL"
odds_names["LA Clippers"] = "LAC"
odds_names["Okla City"] = "OKC"

LA Lakers
LA Clippers


In [11]:
odds["Home"] = odds["Home"].apply(lambda x: odds_names[x])
odds["Away"] = odds["Away"].apply(lambda x: odds_names[x])

season["team"] = season["team"].apply(lambda x: season_names[x])
season["opponent"] = season["opponent"].apply(lambda x: season_names[x])

#### Merging the two data tables

In [12]:
def make_index(row, col1, col2, col3):
    return str(row[col1]) + str(row[col2]) + str(row[col3])

def find_category(row):
    ref = row["Index"]
    if row["home"] == 0:
        ref = ref[:-6] + ref[-3:] + ref[-6:-3]
    odds_row = odds.loc[odds["Index"] == ref]
    #print(list(odds_row["Points"]))
    try:
        return list(odds_row["Points"])[0]
    except:
        return 0

season["date"] = season["date"].apply(lambda x: str(x)[:-1])

season["Index"] = season.apply(lambda x: make_index(x, "date", "team", "opponent"), axis=1)

odds["Date"] = odds["Date"].apply(lambda x: "".join(x.split("-")))

odds["Index"] = odds.apply(lambda x: make_index(x, "Date", "Home", "Away"), axis=1)

In [13]:
season["Outcome"] = season.apply(lambda x: find_category(x), axis = 1) ##### CHANGE THIS TO DEAL WITH OTHER INDICES

#merged = merged.drop(["Unnamed: 0_x", "Unnamed: 0_y", "date", "Home", "Away", "index"], axis = 1)

In [14]:
season.sample(5)

Unnamed: 0.1,Unnamed: 0,team,opponent,date,index,team_STL%,team_FT,team_2PA,team_FG,team_DRB,...,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Index,Outcome
2052,2052,POR,LAC,20080321,Portland Trail Blazers69,5.9,14.0,58.0,42.0,24.0,...,28.0,16.0,2.0,0.219,0.637181,0.164384,0.615385,1,20080321PORLAC,209
239,239,MIA,BOS,20071116,Boston Celtics8,8.8,16.0,56.0,35.0,29.0,...,40.0,18.0,2.0,0.254,0.582869,0.239437,0.357143,0,20071116MIABOS,183
575,575,CLE,CHA,20071208,Charlotte Bobcats18,8.7,23.0,55.0,32.0,25.0,...,34.0,48.0,8.0,0.75,0.56391,0.5625,0.444444,0,20071208CLECHA,189
1440,1440,SAC,SEA,20080206,Sacramento Kings47,10.1,13.0,76.0,36.0,28.0,...,54.0,26.0,7.0,0.342,0.600412,0.263158,0.5,1,20080206SACSEA,197
1677,1677,POR,LAL,20080226,Los Angeles Lakers57,6.8,21.0,63.0,30.0,30.0,...,41.0,17.0,6.0,0.224,0.574988,0.197368,0.473684,0,20080226PORLAL,179


In [15]:
in_data = season.set_index("Index")
in_data = in_data.drop(["index", "Unnamed: 0"], axis = 1)
in_data = in_data.sort_index()

In [16]:
in_data.shape

(2632, 97)

In [17]:
in_data.sample(1)

Unnamed: 0_level_0,team,opponent,date,team_STL%,team_FT,team_2PA,team_FG,team_DRB,team_ORB%,team_AST,...,opponent_STL,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Outcome
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20080303NYKNOP,NYK,NOP,20080303,2.3,13.0,72.0,35.0,35.0,24.5,18.0,...,5.0,52.0,20.0,6.0,0.227,0.516529,0.159091,0.4,1,188


## Extracting the data

In [18]:
### for every team: past 3 games stats (them and opponent) + opponent season averages

#### Computing running season averages by team

In [19]:
dates = pd.unique(in_data.date)

season_averages = {}

for date in dates:
    # get all past games
    past_games = in_data[in_data.date < date]
    # means
    season_averages[date] = past_games.groupby('team').mean()

#### Computing the past n games for every matchup

In [20]:
n = 3
home_only = in_data[in_data.home == 1]

In [21]:
## build a list of games for every team
past_n = {}

for date in dates:
    team_map = {}
    past_games = in_data[in_data.date < date]
    for team in pd.unique(home_only.team):
        #get the past games for team
        past_team = past_games[past_games.team == team].tail(3)
        team_map[team] = past_team
    past_n[date] = team_map        

#### Making a dataset

In [22]:
## one-hot encode team names
teams = season_names.values()
encoding = {}
index = 0
for team in teams:
    if team not in encoding:
        encoding[team] = index
        index += 1
        
empty_list = [0 for j in range(index + 1)]
encoded = {}
for team in teams:
    if team in encoded: continue
    
    copy = empty_list[:]
    
    i = encoding[team]
    copy[i] = 1
    encoded[team] = copy

In [23]:
X = []
y = []

for i, row in home_only.iterrows():
    
    home_team = row["team"]
    away_team = row["opponent"]
    
    date = row["date"]
    
    past_n_home = past_n[date][home_team]
    past_n_away = past_n[date][away_team]
    
    avgs = season_averages[date]
    
    if past_n_home.shape[0] < n or past_n_away.shape[0] < n: continue
    
    ################ AWAY TEAM PAST GAMES
    data_home = []
    for j, row_2 in past_n_home.iterrows():
        cur_data = []

        team = row["team"]
        opponent = row["opponent"]

        cur_data.extend(encoded[team])
        cur_data.extend(encoded[opponent])
        cur_data.extend(row.drop(["team", "opponent", "date"]).values)

        opp_stats = avgs.loc[opponent].values

        cur_data.extend(opp_stats)

        data_home.append(cur_data)
    
    ################ AWAY TEAM PAST GAMES
    data_away = []
    for j, row_2 in past_n_away.iterrows():
        cur_data = []

        team = row["team"]
        opponent = row["opponent"]

        cur_data.extend(encoded[team])
        cur_data.extend(encoded[opponent])
        cur_data.extend(row.drop(["team", "opponent", "date"]).values)

        opp_stats = avgs.loc[opponent].values

        cur_data.extend(opp_stats)

        data_away.append(cur_data)
    
    ################ MERGE THE TWO
    data = []
    for i in range(len(data_home)):
        cur_data = data_home[i]
        cur_data.extend(data_away[i])
        data.append(cur_data)
    
    X.append(data)
    y.append(row["Outcome"])

In [24]:
X = np.array(X)
y = np.array(y)

X = X[y > 0]
y = y[y > 0]

p = np.random.permutation(len(X))
X = X[p]
y = y[p]

val = 0.2
val = round(len(X) * val)
val_X = X[:val]
val_y = y[:val]
X = X[val:]
y = y[val:]

In [25]:
#### Specify the model architecture
class LSTMModel(nn.Module):

    def __init__(self, input_dim, hidden_dim, target_size, num_layers, batch_size, time_steps):
        super(LSTMModel, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.time_steps = time_steps
        
        # Initialize LSTM unit
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=False)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2out = nn.Linear(hidden_dim, target_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size , hidden_dim)
        return (torch.zeros(self.num_layers, self.time_steps, self.hidden_dim),
                torch.zeros(self.num_layers, self.time_steps, self.hidden_dim))
        
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, input_seq):
        lstm_out, self.hidden = self.lstm(input_seq, self.hidden)
        
        pred = self.hidden2out(lstm_out)
        
        return pred

In [40]:
## Define the model
model = LSTMModel(input_dim = 504,
                     hidden_dim = 20,
                     target_size = 1,
                     num_layers = 1,
                     batch_size = 10, 
                     time_steps = 3)
                     
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [None]:
losses = []
val_losses = []

for epoch in range(500):   # again, normally you would NOT do 300 epochs, it is toy data
    train_loss = 0
    for i in range(0, len(X), model.batch_size):
        if i + model.batch_size >= len(X) : continue
        
        #Pytorch accumulates gradients. We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM, detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network.
        batch_input = X[i : i + model.batch_size] #.reshape((X.shape[1], model.batch_size, X.shape[2]))
        batch = Variable(torch.from_numpy(batch_input)).type(torch.FloatTensor)
                                                    
        targets = Variable(torch.from_numpy(y[i : i + model.batch_size])).type(torch.FloatTensor)

        # Step 3. Run our forward pass.
        scores = model(batch)
        scores = scores[:, -1].reshape((model.batch_size)) # we only care about the last output

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(scores, targets)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.detach().numpy()
        
    ## validation loss
    if (epoch + 1) % 10 == 0:
        print("----------")
        print("Losses after {} iterations:".format(epoch))
        print("Train: {}".format(loss.detach().numpy()))
#        with torch.no_grad():
#             batch_input = val_X
#             batch = Variable(torch.from_numpy(batch_input)).type(torch.FloatTensor)
#             targets = Variable(torch.from_numpy(val_y)).type(torch.FloatTensor)
#             scores = model(batch)
#             scores = scores[:, -1].reshape((len(val_y))) # we only care about the last output
#             val_loss = loss_function(scores, targets)
#             print("Val: {}".format(val_loss))
#             val_losses.append(val_loss)
#             losses.append(train_loss/len(X))

----------
Losses after 9 iterations:
Train: 322.2840270996094
----------
Losses after 19 iterations:
Train: 339.21392822265625
----------
Losses after 29 iterations:
Train: 353.0520935058594
----------
Losses after 39 iterations:
Train: 358.0295715332031
----------
Losses after 49 iterations:
Train: 359.78619384765625
----------
Losses after 59 iterations:
Train: 360.4023132324219
----------
Losses after 69 iterations:
Train: 360.6178283691406
----------
Losses after 79 iterations:
Train: 360.6932067871094
----------
Losses after 89 iterations:
Train: 360.7195739746094
----------
Losses after 99 iterations:
Train: 360.7287292480469
----------
Losses after 109 iterations:
Train: 360.73175048828125
----------
Losses after 119 iterations:
Train: 360.7330017089844
----------
Losses after 129 iterations:
Train: 360.7333984375
----------
Losses after 139 iterations:
Train: 360.73370361328125
----------
Losses after 149 iterations:
Train: 360.7336730957031
----------
Losses after 159 iterati

In [31]:
# See what the scores are after training
with torch.no_grad():
    batch_input = val_X
    batch = Variable(torch.from_numpy(batch_input)).type(torch.FloatTensor)
    targets = Variable(torch.from_numpy(val_y)).type(torch.FloatTensor)
    scores = model(batch)
    scores = scores[:, -1].reshape((len(val_y))) # we only care about the last output
    val_loss = loss_function(scores, targets)
    print(val_loss)

tensor(420.3976)
