# Collaborative Filtering with Neural Nets

In [2]:
# for data manipulation
import numpy as np
import pandas as pd
import os

# use surprise for collaborative filtering
import torch
import torch.nn as nn
from torch.autograd import Variable

## Read in data

#### Game data

In [10]:
def getData(year):
    game_data_path = "data/final_game_data/"
    game_filepath = str(year) + "-" + str(year + 1) + "_games_final.csv"
    season = pd.read_csv(game_data_path + game_filepath)
    
    odds_data_path = "data/odds_data_processed/"
    odds_filepath = str(year) + "-" + str(year + 1) + ".csv"
    odds = pd.read_csv(odds_data_path + odds_filepath)
    odds = odds.drop(['Unnamed: 0'], axis = 1)
    
    return season, odds

In [3]:
game_data_path = "data/final_game_data/"
files = os.listdir(game_data_path)
season = pd.read_csv(game_data_path + "2008-2009_games_final.csv")

In [14]:
season.shape

(2630, 98)

#### Odds data

In [5]:
odds_data_path = "data/odds_data_processed/"
odds_files = os.listdir(odds_data_path)

In [6]:
odds = pd.read_csv(odds_data_path + "2008-2009.csv")

In [12]:
odds = odds.drop(['Unnamed: 0'], axis = 1)
odds.shape

(1305, 13)

In [13]:
odds.columns

Index(['Date', 'Home', 'Away', 'OU', 'Spread', 'OU_2H', 'Spread_2H', 'ML_home',
       'ML_away', 'Points', 'Win Margin', '2H Points', '2H Win Margin'],
      dtype='object')

In [29]:
season, odds = getData(2008)

#### Reconciling names

In [8]:
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }
odds_names = {}
for name in list(pd.unique(odds.Home)):
    found = False
    for s_name in season_names:
        if name in s_name:
            found = True
            odds_names[name] = season_names[s_name]
    if not found: print(name)
odds_names["LA Lakers"] = "LAL"
odds_names["LA Clippers"] = "LAC"
odds_names["Okla City"] = "OKC"

LA Lakers
Okla City
LA Clippers


In [9]:
odds["Home"] = odds["Home"].apply(lambda x: odds_names[x])
odds["Away"] = odds["Away"].apply(lambda x: odds_names[x])

season["team"] = season["team"].apply(lambda x: season_names[x])
season["opponent"] = season["opponent"].apply(lambda x: season_names[x])

In [32]:
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }

In [18]:
def cleanNames(season, odds):
    odds_names = {}
    for name in list(pd.unique(odds.Home)):
        found = False
        for s_name in season_names:
            if name in s_name:
                found = True
                odds_names[name] = season_names[s_name]
    odds_names["LA Lakers"] = "LAL"
    odds_names["LA Clippers"] = "LAC"
    odds_names["Okla City"] = "OKC"
    
    odds["Home"] = odds["Home"].apply(lambda x: odds_names[x])
    odds["Away"] = odds["Away"].apply(lambda x: odds_names[x])

    season["team"] = season["team"].apply(lambda x: season_names[x])
    season["opponent"] = season["opponent"].apply(lambda x: season_names[x])
    
    return season, odds


In [30]:
season, odds = cleanNames(season, odds)

#### Merging the two data tables

In [22]:
def make_index(row, col1, col2, col3):
    return str(row[col1]) + str(row[col2]) + str(row[col3])

def find_category(row):
    ref = row["Index"]
    if row["home"] == 0:
        ref = ref[:-6] + ref[-3:] + ref[-6:-3]
    odds_row = odds.loc[odds["Index"] == ref]
    #print(list(odds_row["Points"]))
    try:
        return list(odds_row["Points"])[0]
    except:
        return 0

In [28]:
def makeIndices(season, odds):
    season["date"] = season["date"].apply(lambda x: str(x)[:-1])
    season["Index"] = season.apply(lambda x: make_index(x, "date", "team", "opponent"), axis=1)
    
    odds["Date"] = odds["Date"].apply(lambda x: "".join(x.split("-")))
    odds["Index"] = odds.apply(lambda x: make_index(x, "Date", "Home", "Away"), axis=1)
    
    season["Outcome"] = season.apply(lambda x: find_category(x), axis = 1) ##### CHANGE THIS TO DEAL WITH OTHER INDICES
    
    in_data = season.set_index("Index")
    in_data = in_data.drop(["index", "Unnamed: 0"], axis = 1)
    in_data = in_data.sort_index()
    
    return season, odds, in_data

In [10]:
season["date"] = season["date"].apply(lambda x: str(x)[:-1])

season["Index"] = season.apply(lambda x: make_index(x, "date", "team", "opponent"), axis=1)

odds["Date"] = odds["Date"].apply(lambda x: "".join(x.split("-")))

odds["Index"] = odds.apply(lambda x: make_index(x, "Date", "Home", "Away"), axis=1)

In [11]:
season["Outcome"] = season.apply(lambda x: find_category(x), axis = 1) ##### CHANGE THIS TO DEAL WITH OTHER INDICES

#merged = merged.drop(["Unnamed: 0_x", "Unnamed: 0_y", "date", "Home", "Away", "index"], axis = 1)

In [12]:
season.sample(5)

Unnamed: 0.1,Unnamed: 0,team,opponent,date,index,team_STL%,team_FT,team_2PA,team_FG,team_DRB,...,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Index,Outcome
1509,1509,MIN,NOP,20090208,New Orleans Hornets48,6.9,15.0,57.0,37.0,31.0,...,33.0,39.0,4.0,0.574,0.593001,0.470588,0.2,0,20090208MINNOP,198
72,72,ORL,SAC,20081101,Orlando Magic3,6.8,25.0,58.0,44.0,25.0,...,28.0,24.0,5.0,0.324,0.609035,0.27027,0.333333,1,20081101ORLSAC,224
1313,1313,MIN,MIL,20090126,Milwaukee Bucks48,7.6,22.0,48.0,30.0,35.0,...,43.0,19.0,7.0,0.224,0.444516,0.164706,0.071429,0,20090126MINMIL,173
2046,2046,DET,LAC,20090320,Detroit Pistons68,7.6,15.0,69.0,44.0,24.0,...,32.0,19.0,4.0,0.268,0.567036,0.211268,0.454545,1,20090320DETLAC,198
659,659,SAC,LAL,20081212,Los Angeles Lakers22,7.1,23.0,60.0,37.0,31.0,...,46.0,35.0,4.0,0.417,0.56338,0.25,0.466667,0,20081212SACLAL,215


In [13]:
in_data = season.set_index("Index")
in_data = in_data.drop(["index", "Unnamed: 0"], axis = 1)
in_data = in_data.sort_index()

In [14]:
in_data.shape

(2630, 97)

In [15]:
in_data.sample(1)

Unnamed: 0_level_0,team,opponent,date,team_STL%,team_FT,team_2PA,team_FG,team_DRB,team_ORB%,team_AST,...,opponent_STL,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Outcome
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090311MEMMIN,MEM,MIN,20090311,5.9,12.0,61.0,32.0,23.0,12.8,16.0,...,8.0,48.0,21.0,3.0,0.269,0.596057,0.230769,0.454545,0,183


In [31]:
season, odds, in_data = makeIndices(season, odds)

## Extracting the data

In [16]:
### for every team: past 3 games stats (them and opponent) + opponent season averages

#### Super ugly function

In [34]:
def createFinalData(in_data):
    dates = pd.unique(in_data.date)

    season_averages = {}

    for date in dates:
        # get all past games
        past_games = in_data[in_data.date < date]
        # means
        season_averages[date] = past_games.groupby('team').mean()

    n = 3
    home_only = in_data[in_data.home == 1]

    ## build a list of games for every team
    past_n = {}

    for date in dates:
        team_map = {}
        past_games = in_data[in_data.date < date]
        for team in pd.unique(home_only.team):
            #get the past games for team
            past_team = past_games[past_games.team == team].tail(3)
            team_map[team] = past_team
        past_n[date] = team_map 

    ## one-hot encode team names
    teams = season_names.values()
    encoding = {}
    index = 0
    for team in teams:
        if team not in encoding:
            encoding[team] = index
            index += 1

    empty_list = [0 for j in range(index + 1)]
    encoded = {}
    for team in teams:
        if team in encoded: continue

        copy = empty_list[:]

        i = encoding[team]
        copy[i] = 1
        encoded[team] = copy

    X = []
    y = []

    for i, row in home_only.iterrows():

        home_team = row["team"]
        away_team = row["opponent"]

        date = row["date"]

        past_n_home = past_n[date][home_team]
        past_n_away = past_n[date][away_team]

        avgs = season_averages[date]

        if past_n_home.shape[0] < n or past_n_away.shape[0] < n: continue

        ################ AWAY TEAM PAST GAMES
        data_home = []
        for j, row_2 in past_n_home.iterrows():
            cur_data = []

            team = row["team"]
            opponent = row["opponent"]

            cur_data.extend(encoded[team])
            cur_data.extend(encoded[opponent])
            cur_data.extend(row.drop(["team", "opponent", "date"]).values)

            opp_stats = avgs.loc[opponent].values

            cur_data.extend(opp_stats)

            data_home.append(cur_data)

        ################ AWAY TEAM PAST GAMES
        data_away = []
        for j, row_2 in past_n_away.iterrows():
            cur_data = []

            team = row["team"]
            opponent = row["opponent"]

            cur_data.extend(encoded[team])
            cur_data.extend(encoded[opponent])
            cur_data.extend(row.drop(["team", "opponent", "date"]).values)

            opp_stats = avgs.loc[opponent].values

            cur_data.extend(opp_stats)

            data_away.append(cur_data)

        ################ MERGE THE TWO
        data = []
        for i in range(len(data_home)):
            cur_data = data_home[i]
            cur_data.extend(data_away[i])
            data.append(cur_data)

        X.append(data)
        y.append(row["Outcome"])

    return X, y

In [35]:
X, y = createFinalData(in_data)

#### Computing running season averages by team

In [17]:
dates = pd.unique(in_data.date)

season_averages = {}

for date in dates:
    # get all past games
    past_games = in_data[in_data.date < date]
    # means
    season_averages[date] = past_games.groupby('team').mean()

#### Computing the past n games for every matchup

In [18]:
n = 3
home_only = in_data[in_data.home == 1]

In [19]:
## build a list of games for every team
past_n = {}

for date in dates:
    team_map = {}
    past_games = in_data[in_data.date < date]
    for team in pd.unique(home_only.team):
        #get the past games for team
        past_team = past_games[past_games.team == team].tail(3)
        team_map[team] = past_team
    past_n[date] = team_map        

#### Making a dataset

In [20]:
## one-hot encode team names
teams = season_names.values()
encoding = {}
index = 0
for team in teams:
    if team not in encoding:
        encoding[team] = index
        index += 1
        
empty_list = [0 for j in range(index + 1)]
encoded = {}
for team in teams:
    if team in encoded: continue
    
    copy = empty_list[:]
    
    i = encoding[team]
    copy[i] = 1
    encoded[team] = copy

In [21]:
X = []
y = []

for i, row in home_only.iterrows():
    
    home_team = row["team"]
    away_team = row["opponent"]
    
    date = row["date"]
    
    past_n_home = past_n[date][home_team]
    past_n_away = past_n[date][away_team]
    
    avgs = season_averages[date]
    
    if past_n_home.shape[0] < n or past_n_away.shape[0] < n: continue
    
    ################ AWAY TEAM PAST GAMES
    data_home = []
    for j, row_2 in past_n_home.iterrows():
        cur_data = []

        team = row["team"]
        opponent = row["opponent"]

        cur_data.extend(encoded[team])
        cur_data.extend(encoded[opponent])
        cur_data.extend(row.drop(["team", "opponent", "date"]).values)

        opp_stats = avgs.loc[opponent].values

        cur_data.extend(opp_stats)

        data_home.append(cur_data)
    
    ################ AWAY TEAM PAST GAMES
    data_away = []
    for j, row_2 in past_n_away.iterrows():
        cur_data = []

        team = row["team"]
        opponent = row["opponent"]

        cur_data.extend(encoded[team])
        cur_data.extend(encoded[opponent])
        cur_data.extend(row.drop(["team", "opponent", "date"]).values)

        opp_stats = avgs.loc[opponent].values

        cur_data.extend(opp_stats)

        data_away.append(cur_data)
    
    ################ MERGE THE TWO
    data = []
    for i in range(len(data_home)):
        cur_data = data_home[i]
        cur_data.extend(data_away[i])
        data.append(cur_data)
    
    X.append(data)
    y.append(row["Outcome"])

In [36]:
X = np.array(X)
y = np.array(y)

In [41]:
X.shape

(1004, 3, 504)

In [42]:
y.shape

(1004,)

In [39]:
def trainValSplit(X, y):
    X = np.array(X)
    y = np.array(y)

    X = X[y > 0]
    y = y[y > 0]

    p = np.random.permutation(len(X))
    X = X[p]
    y = y[p]

    val = 0.2
    val = round(len(X) * val)
    X_val = X[:val]
    y_val = y[:val]
    X = X[val:]
    y = y[val:]
    
    return X, y, X_val, y_val

In [40]:
X, y, X_val, y_val = trainValSplit(X, y)

In [43]:
y = torch.from_numpy(y[:,np.newaxis]).type(torch.FloatTensor)
X = torch.from_numpy(X.reshape((X.shape[0], -1))).type(torch.FloatTensor)
y_val = torch.from_numpy(y_val[:,np.newaxis]).type(torch.FloatTensor)
X_val = torch.from_numpy(X_val.reshape((X_val.shape[0], -1))).type(torch.FloatTensor)

In [44]:
# Split train/test:
print(y[1:10])

tensor([[213.],
        [210.],
        [202.],
        [191.],
        [247.],
        [207.],
        [167.],
        [229.],
        [179.]])


In [106]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_output):
        super(Net, self).__init__()
        
        # layer 1 fully connected 150 units
        self.lin1 = nn.Linear(n_feature, 500)
        
        # layer 2 fully connected 50 units
        self.lin2 = nn.Linear(500, 100)
        
        # layer 3 fully connected 1 unit (output)
        self.lin3 = nn.Linear(100, n_output)
        
        # dropouts
        self.drop1 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.4)
        self.drop3 = nn.Dropout(0.25)

    def forward(self, x):
        # perform dropout on input vector embeddings
        # x = self.drop1(x)
        x = F.relu(self.lin1(x))
        # x = self.drop2(F.relu(self.lin1(x)))
        x = F.relu(self.lin2(x))
        # x = self.drop3(F.relu(self.lin2(x)))
        x = self.lin3(x)
        
        return x  

net = Net(n_feature=1512, n_output=1)     # define the network
print(net)  # net architecture

optimizer = torch.optim.SGD(net.parameters(), lr=0.000001)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

# plt.ion()   # something about plotting

for t in range(500):
    prediction = net(X)     # input x and predict based on x

    loss = loss_func(prediction, y)     # must be (1. nn output, 2. target)

    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients
    
    # Do validation loss
    with torch.no_grad():
        pred_val = net(X_val)
        loss_val = loss_func(pred_val, y_val)

    if t % 5 == 0:
        # plot and show learning process
        '''
        plt.cla()
        plt.scatter(x.data.numpy(), y.data.numpy())
        plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
        plt.text(0.5, 0, 'Loss=%.4f' % loss.data.numpy(), fontdict={'size': 20, 'color':  'red'})
        plt.pause(0.1)
        '''
        print('Loss=%.4f' % loss.data.numpy())
        print('Val Loss=%.4f' % loss_val.data.numpy())

# plt.ioff()
# plt.show()

Net(
  (lin1): Linear(in_features=1512, out_features=500, bias=True)
  (lin2): Linear(in_features=500, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=1, bias=True)
  (drop1): Dropout(p=0.5)
  (drop2): Dropout(p=0.4)
  (drop3): Dropout(p=0.25)
)
Loss=40131.9453
Loss=536.7881
Loss=495.8045
Loss=489.6784
Loss=483.7281
Loss=477.7469
Loss=471.0899
Loss=465.7350
Loss=460.8159
Loss=456.0045
Loss=451.1470
Loss=446.2653
Loss=441.5219
Loss=436.9084
Loss=432.3907
Loss=427.9353
Loss=423.5046
Loss=419.0998
Loss=414.7785
Loss=410.5339
Loss=406.3308
Loss=402.1418
Loss=397.9559
Loss=393.7902
Loss=389.6584
Loss=385.5572
Loss=381.4409
Loss=377.2537
Loss=372.9801
Loss=368.7366
Loss=364.6370
Loss=360.6323
Loss=356.6897
Loss=352.7888
Loss=348.9171
Loss=345.0619
Loss=341.2221
Loss=337.3980
Loss=333.5872
Loss=329.7929
Loss=326.0143
Loss=322.2527
Loss=318.5057
Loss=314.7734
Loss=311.0550
Loss=307.3494
Loss=303.6575
Loss=299.9797
Loss=296.3138
Loss=292.6609
Loss=289.0222
Loss=285.3