# Collaborative Filtering with Neural Nets

In [77]:
# for data manipulation
import numpy as np
import pandas as pd
import os

# use PyTorch for the neural-network collaborative filtering model
import torch
import torch.nn as nn
from torch.autograd import Variable

## Read in data

#### Game data

In [2]:
# Directory containing the game-data CSV files.
game_data_path = "data/final_game_data/"
files = os.listdir(game_data_path)

# NOTE(review): os.listdir returns entries in arbitrary order, so files[0] is
# not guaranteed to be the same file on every machine -- confirm which file
# is intended here.
season = pd.read_csv(os.path.join(game_data_path, files[0]))

In [3]:
# Sanity check: (rows, columns) of the loaded game table.
season.shape

(2632, 98)

#### Odds data

In [4]:
# Directory containing the processed odds CSV files, and the names of its entries.
# NOTE(review): os.listdir order is arbitrary; any positional index into
# odds_files may pick a different file on another machine.
odds_data_path = "data/odds_data_processed/"
odds_files = os.listdir(odds_data_path)

In [5]:
# Load the second listed odds file.
# NOTE(review): presumably this is the file covering the same season as
# `season`; verify, since the listing order is not guaranteed.
odds = pd.read_csv(os.path.join(odds_data_path, odds_files[1]))

In [6]:
# Discard the 'Unnamed: 0' column, then show the resulting (rows, columns).
odds = odds.drop(columns=['Unnamed: 0'])
odds.shape

(1288, 13)

In [7]:
# List the columns available in the odds table.
odds.columns

Index(['Date', 'Home', 'Away', 'OU', 'Spread', 'OU_2H', 'Spread_2H', 'ML_home',
       'ML_away', 'Points', 'Win Margin', '2H Points', '2H Win Margin'],
      dtype='object')

#### Reconciling names

In [8]:
# Full franchise name (as used in the game data) -> three-letter team code.
# Several historical names share a code (e.g. both Nets entries map to 'BRK').
season_names = {
    'Golden State Warriors':'GSW',
    'Los Angeles Lakers': 'LAL',
    'San Antonio Spurs': 'SAS',
    'Cleveland Cavaliers': 'CLE',
    'Denver Nuggets': 'DEN',
    'Indiana Pacers': 'IND',
    'Memphis Grizzlies': 'MEM',
    'New Jersey Nets': 'BRK',
    'Brooklyn Nets': 'BRK',
    'New Orleans Hornets': 'NOP',
    'New Orleans Pelicans': 'NOP',
    'Orlando Magic': 'ORL',
    'Toronto Raptors': 'TOR',
    'Miami Heat': 'MIA',
    'Seattle SuperSonics': 'SEA',
    'Utah Jazz': 'UTA',
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Charlotte Bobcats': 'CHA',
    'Charlotte Hornets': 'CHA',
    'Chicago Bulls': 'CHI',
    'Los Angeles Clippers': 'LAC',
    'Minnesota Timberwolves': 'MIN',
    'Phoenix Suns': 'PHO',
    'Dallas Mavericks': 'DAL',
    'Houston Rockets': 'HOU',
    'Milwaukee Bucks': 'MIL',
    'Philadelphia 76ers': 'PHI',
    'Washington Wizards': 'WAS',
    'Detroit Pistons': 'DET',
    'New York Knicks': 'NYK',
    'Sacramento Kings': 'SAC',
    'Portland Trail Blazers': 'POR',
    'Oklahoma City Thunder': 'OKC',
}

# Map each name used in the odds table to a team code by substring match
# against the full franchise names. When several full names contain the odds
# name, the last one in `season_names` wins. Names with no match are printed.
odds_names = {}
for odds_name in pd.unique(odds.Home):
    abbreviations = [
        abbr for full_name, abbr in season_names.items() if odds_name in full_name
    ]
    if abbreviations:
        odds_names[odds_name] = abbreviations[-1]
    else:
        print(odds_name)

# Manual entries for odds-table spellings that are not substrings of any
# full franchise name.
odds_names.update({
    "LA Lakers": "LAL",
    "LA Clippers": "LAC",
    "Okla City": "OKC",
})

LA Lakers
LA Clippers


In [9]:
# Replace team names with three-letter codes in both tables. A name missing
# from the lookup raises KeyError.
# NOTE: this overwrites the columns in place, so re-running the cell on
# already-converted data fails with KeyError.
for odds_col in ("Home", "Away"):
    odds[odds_col] = odds[odds_col].apply(lambda name: odds_names[name])

for season_col in ("team", "opponent"):
    season[season_col] = season[season_col].apply(lambda name: season_names[name])

#### Merging the two data tables

In [10]:
def make_index(row, col1, col2, col3):
    """Return the string forms of ``row[col1]``, ``row[col2]`` and ``row[col3]`` concatenated."""
    return "".join(str(row[col]) for col in (col1, col2, col3))

def find_category(row):
    """Look up the ``Points`` value in the global ``odds`` table for a game row.

    ``row["Index"]`` is a key whose last six characters are two three-letter
    team codes. The ``odds`` keys are built as date + Home + Away, so when
    ``row["home"] == 0`` the two trailing codes are swapped before the lookup.

    Returns the ``Points`` value of the first matching odds row, or 0 when no
    odds row matches. Any other failure (for example a missing ``Points``
    column) propagates instead of being silently turned into 0.
    """
    ref = row["Index"]
    if row["home"] == 0:
        # Swap the two trailing three-letter codes; the prefix is left untouched.
        ref = ref[:-6] + ref[-3:] + ref[-6:-3]

    points = odds.loc[odds["Index"] == ref, "Points"].tolist()
    return points[0] if points else 0

# Drop the final character of each game date's string form.
# NOTE(review): presumably the raw value carries one trailing character beyond
# YYYYMMDD -- confirm. This is not idempotent: re-running the cell removes
# another character.
season["date"] = season["date"].map(lambda raw_date: str(raw_date)[:-1])

# Game key: date + team + opponent.
season["Index"] = season.apply(
    make_index, axis=1, args=("date", "team", "opponent")
)

# Remove the '-' separators from the odds dates.
odds["Date"] = odds["Date"].map(lambda raw_date: raw_date.replace("-", ""))

# Odds key: date + home team + away team.
odds["Index"] = odds.apply(
    make_index, axis=1, args=("Date", "Home", "Away")
)

In [11]:
# Attach the odds table's "Points" value to every game row (0 when no odds row
# matches the game key).
##### CHANGE THIS TO DEAL WITH OTHER INDICES
season["Outcome"] = season.apply(find_category, axis=1)

#merged = merged.drop(["Unnamed: 0_x", "Unnamed: 0_y", "date", "Home", "Away", "index"], axis = 1)

In [12]:
# Spot-check five random rows, including the new "Index" and "Outcome" columns.
season.sample(5)

Unnamed: 0.1,Unnamed: 0,team,opponent,date,index,team_STL%,team_FT,team_2PA,team_FG,team_DRB,...,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Index,Outcome
1877,1877,POR,CLE,20080310,Cleveland Cavaliers64,6.3,12.0,57.0,31.0,27.0,...,39.0,33.0,4.0,0.485,0.533204,0.367647,0.357143,0,20080310PORCLE,168
1537,1537,PHO,GSW,20080213,Golden State Warriors52,5.0,21.0,66.0,45.0,33.0,...,34.0,13.0,2.0,0.135,0.589855,0.083333,0.380952,0,20080213PHOGSW,238
194,194,IND,BOS,20071113,Indiana Pacers7,2.1,20.0,58.0,29.0,31.0,...,42.0,44.0,8.0,0.603,0.546773,0.452055,0.285714,1,20071113INDBOS,187
1090,1090,TOR,POR,20080113,Toronto Raptors37,3.9,18.0,73.0,45.0,37.0,...,52.0,14.0,3.0,0.131,0.481619,0.121495,0.380952,1,20080113TORPOR,225
2328,2328,HOU,SEA,20080409,Houston Rockets78,4.4,13.0,47.0,39.0,39.0,...,35.0,19.0,3.0,0.216,0.41511,0.159091,0.25,1,20080409HOUSEA,183


In [13]:
# Model input table: keyed by the game "Index" string, without the leftover
# "index" / "Unnamed: 0" columns, sorted by key. The key starts with the date,
# so rows end up ordered by date string.
in_data = (
    season
    .set_index("Index")
    .drop(columns=["index", "Unnamed: 0"])
    .sort_index()
)

In [14]:
# Sanity check: (rows, columns) of the model input table.
in_data.shape

(2632, 97)

In [15]:
# Show one random row of the model input table.
in_data.sample(1)

Unnamed: 0_level_0,team,opponent,date,team_STL%,team_FT,team_2PA,team_FG,team_DRB,team_ORB%,team_AST,...,opponent_STL,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Outcome
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20080214CHIMIA,CHI,MIA,20080214,7.3,20.0,72.0,38.0,40.0,29.5,19.0,...,7.0,35.0,23.0,4.0,0.311,0.546838,0.189189,0.47619,1,191


## Extracting the data

In [16]:
### for every team: past 3 games stats (them and opponent) + opponent season averages

#### Computing running season averages by team

In [17]:
# Every distinct game date in the input table.
dates = pd.unique(in_data.date)

# date -> DataFrame of per-team means over all games played strictly before
# that date (indexed by team code). The first date maps to an empty frame.
season_averages = {}

for date in dates:
    # Only games before `date`, so a game never contributes to its own averages.
    past_games = in_data[in_data.date < date]
    # numeric_only=True skips the string columns (opponent, date). Without it,
    # pandas >= 2.0 raises TypeError when asked for the mean of object columns.
    season_averages[date] = past_games.groupby('team').mean(numeric_only=True)

#### Computing the past n games for every matchup

In [18]:
# Number of most recent games per team used as history for each matchup.
n = 3

# Rows where the "home" flag is 1.
home_only = in_data.loc[in_data["home"] == 1]

In [32]:
## build a list of games for every team
past_n = {}

for date in dates:
    team_map = {}
    past_games = in_data[in_data.date < date]
    for team in pd.unique(home_only.team):
        #get the past games for team
        past_team = past_games[past_games.team == team].tail(3)
        team_map[team] = past_team
    past_n[date] = team_map        

#### Making a dataset

In [60]:
## one-hot encode team names
teams = season_names.values()
encoding = {}
index = 0
for team in teams:
    if team not in encoding:
        encoding[team] = index
        index += 1
        
empty_list = [0 for j in range(index + 1)]
encoded = {}
for team in teams:
    if team in encoded: continue
    
    copy = empty_list[:]
    
    i = encoding[team]
    copy[i] = 1
    encoded[team] = copy

In [69]:
def _past_game_features(past_games, avgs):
    """Return one feature list per row of ``past_games``.

    Each list is: one-hot of the row's team, one-hot of the row's opponent,
    the row's remaining columns (everything except team/opponent/date), and
    the opponent's row from ``avgs`` (per-team averages).
    """
    feature_rows = []
    for _, past_row in past_games.iterrows():
        # Read the PAST game's row. Reading the current game's row here would
        # repeat the same values at every step and put the current game's own
        # "Outcome" (the label) into the features.
        team = past_row["team"]
        opponent = past_row["opponent"]

        features = []
        features.extend(encoded[team])
        features.extend(encoded[opponent])
        features.extend(past_row.drop(["team", "opponent", "date"]).values)
        # NOTE(review): uses the past game's opponent with averages as of the
        # current game's date -- confirm this is the intended pairing.
        features.extend(avgs.loc[opponent].values)

        feature_rows.append(features)
    return feature_rows


X = []
y = []

# One sample per home-team row: the last n games of both teams, step by step.
for _, row in home_only.iterrows():

    home_team = row["team"]
    away_team = row["opponent"]

    date = row["date"]

    past_n_home = past_n[date][home_team]
    past_n_away = past_n[date][away_team]

    avgs = season_averages[date]

    # Skip games where either team does not yet have n past games.
    if past_n_home.shape[0] < n or past_n_away.shape[0] < n: continue

    ################ HOME TEAM PAST GAMES
    data_home = _past_game_features(past_n_home, avgs)

    ################ AWAY TEAM PAST GAMES
    data_away = _past_game_features(past_n_away, avgs)

    ################ MERGE THE TWO
    # Step k of the sample = home team's k-th past game followed by the away
    # team's k-th past game.
    data = [home_feats + away_feats
            for home_feats, away_feats in zip(data_home, data_away)]

    X.append(data)
    y.append(row["Outcome"])

In [72]:
# Convert the accumulated samples and labels from Python lists to NumPy arrays.
X, y = np.array(X), np.array(y)