# Collaborative Filtering with Neural Nets

In [55]:
# for data manipulation
import numpy as np
import pandas as pd
import os

# use surprise for collaborative filtering
from surprise import Reader, Dataset
from surprise import SVD, evaluate
from surprise import NMF

## Read in data

#### Game data

In [105]:
# Folder containing the game-level CSV files.
game_data_path = "data/final_game_data/"
# NOTE(review): os.listdir returns entries in arbitrary order, so the file at
# position 0 may differ between machines — confirm which file is intended.
files = os.listdir(game_data_path)
# Load the first listed file as the working game table.
season = pd.read_csv(os.path.join(game_data_path, files[0]))

In [106]:
# Sanity check: (rows, columns) of the loaded game table.
season.shape

(2632, 98)

#### Odds data

In [107]:
# Folder containing the processed odds CSV files.
odds_data_path = "data/odds_data_processed/"
# NOTE(review): os.listdir returns entries in arbitrary order; any positional
# pick from this list depends on the machine's directory ordering.
odds_files = os.listdir(odds_data_path)

In [108]:
# Load the second listed odds file.
# NOTE(review): presumably the file matching the season loaded above — confirm,
# since the position in the listing is not guaranteed.
odds = pd.read_csv(os.path.join(odds_data_path, odds_files[1]))

In [109]:
# Remove the "Unnamed: 0" column (presumably an index column saved with the CSV).
# errors="ignore" keeps this cell safe to re-run once the column is already gone;
# without it a second run raises KeyError.
odds = odds.drop(columns=["Unnamed: 0"], errors="ignore")
odds.shape

(1288, 13)

In [110]:
# List the odds columns available for merging with the game table.
odds.columns

Index(['Date', 'Home', 'Away', 'OU', 'Spread', 'OU_2H', 'Spread_2H', 'ML_home',
       'ML_away', 'Points', 'Win Margin', '2H Points', '2H Win Margin'],
      dtype='object')

#### Reconciling names

In [111]:
# Full franchise name (as it appears in the game table) -> three-letter team code.
# Several historical names share a code (e.g. both Nets names map to 'BRK').
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }

# Odds-table team name -> three-letter team code.
# An odds name is resolved when it occurs as a substring of a full franchise name.
odds_names = {}

# Collect names from both Home and Away: both columns are later translated
# through odds_names, so a team seen only as Away must be covered too.
odds_team_names = pd.unique(pd.concat([odds["Home"], odds["Away"]]))

for name in odds_team_names:
    codes = [code for full_name, code in season_names.items() if name in full_name]
    if codes:
        # When several franchise names contain the odds name, the last one wins.
        odds_names[name] = codes[-1]
    else:
        # Report names that need a manual entry below.
        print(name)

# Manual entries for abbreviated names that the substring match cannot resolve.
odds_names["LA Lakers"] = "LAL"
odds_names["LA Clippers"] = "LAC"
odds_names["Okla City"] = "OKC"

LA Lakers
LA Clippers


In [112]:
# Translate team names to three-letter codes in both tables.
# Looking up through __getitem__ raises KeyError on an unmapped name instead of
# silently producing NaN.
# NOTE(review): not idempotent — re-running after the names are already codes
# raises KeyError.
odds["Home"] = odds["Home"].map(odds_names.__getitem__)
odds["Away"] = odds["Away"].map(odds_names.__getitem__)

season["team"] = season["team"].map(season_names.__getitem__)
season["opponent"] = season["opponent"].map(season_names.__getitem__)

#### Merging the two data tables

In [113]:
def make_index(row, col1, col2, col3):
    """Concatenate three fields of ``row`` into a single string key.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``col1``, ``col2`` and ``col3``.
    col1, col2, col3 : hashable
        Keys whose values are converted with ``str`` and joined in this order.

    Returns
    -------
    str
        ``str(row[col1]) + str(row[col2]) + str(row[col3])``.
    """
    return "".join(str(row[col]) for col in (col1, col2, col3))

def find_category(row):
    """Return the ``Points`` value of the odds row matching one game row.

    The lookup key is ``row["Index"]``. When ``row["home"] == 0`` the last two
    three-character segments of the key are swapped before the lookup, so the
    key is compared against ``odds["Index"]`` with the two teams in the
    opposite order.

    Reads the module-level ``odds`` DataFrame (columns ``Index`` and ``Points``).

    Parameters
    ----------
    row : mapping-like
        Must provide ``"Index"`` (str) and ``"home"``.

    Returns
    -------
    The first matching ``Points`` value, or ``0`` when no odds row has the key.

    Raises
    ------
    KeyError
        If ``odds`` lacks the ``Index`` or ``Points`` column. Such errors used
        to be swallowed by a bare ``except`` and reported as ``0``.
    """
    ref = row["Index"]
    if row["home"] == 0:
        # Swap the two trailing 3-character team codes.
        ref = ref[:-6] + ref[-3:] + ref[-6:-3]
    points = odds.loc[odds["Index"] == ref, "Points"]
    if points.empty:
        # No odds recorded for this game: keep the 0 sentinel callers expect.
        return 0
    return list(points)[0]

# Convert each date to a string and drop its last character.
# NOTE(review): presumably removes a trailing suffix digit so the value matches
# the odds dates — confirm. Not idempotent: every re-run strips one more character.
season["date"] = season["date"].map(lambda raw_date: str(raw_date)[:-1])

# Composite game key for the game table: date + team + opponent.
season["Index"] = season.apply(make_index, axis=1, args=("date", "team", "opponent"))

# Strip the dashes out of the odds dates.
odds["Date"] = odds["Date"].map(lambda dashed: dashed.replace("-", ""))

# Composite game key for the odds table: date + home team + away team.
odds["Index"] = odds.apply(make_index, axis=1, args=("Date", "Home", "Away"))

In [114]:
# Attach the looked-up odds value to every game row (find_category yields 0
# when no odds row matches).
# TODO: change this to deal with other indices.
season["Outcome"] = season.apply(find_category, axis=1)

#merged = merged.drop(["Unnamed: 0_x", "Unnamed: 0_y", "date", "Home", "Away", "index"], axis = 1)

In [118]:
# Peek at five random rows to check the new Index and Outcome columns.
# No random_state is set, so the rows shown change on every run.
season.sample(5)

Unnamed: 0.1,Unnamed: 0,team,opponent,date,index,team_STL%,team_FT,team_2PA,team_FG,team_DRB,...,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Index,Outcome
1459,1459,POR,DET,20080208,Detroit Pistons49,6.3,16.0,67.0,33.0,24.0,...,37.0,15.0,7.0,0.205,0.571608,0.136986,0.5625,0,20080208PORDET,173
673,673,SAC,PHI,20071214,Philadelphia 76ers23,9.7,18.0,68.0,43.0,26.0,...,31.0,24.0,4.0,0.324,0.585383,0.216216,0.4375,0,20071214SACPHI,208
348,348,ORL,CHA,20071123,Orlando Magic14,4.3,22.0,52.0,38.0,38.0,...,50.0,20.0,3.0,0.222,0.465587,0.144444,0.238095,1,20071123ORLCHA,197
255,255,NYK,SAC,20071116,Sacramento Kings9,7.4,33.0,83.0,40.0,34.0,...,46.0,46.0,6.0,0.535,0.578878,0.395349,0.4375,0,20071116NYKSAC,234
1935,1935,PHI,CHI,20080314,Chicago Bulls65,7.4,40.0,72.0,33.0,31.0,...,44.0,20.0,8.0,0.22,0.531062,0.197802,0.315789,0,20080314PHICHI,216


In [128]:
# Key the game table by the composite game key, drop two leftover
# bookkeeping columns, and order the rows by key.
in_data = (
    season
    .set_index("Index")
    .drop(columns=["index", "Unnamed: 0"])
    .sort_index()
)

In [129]:
# Sanity check: (rows, columns) after re-indexing and dropping columns.
in_data.shape

(2632, 97)

In [130]:
# Show one random row of the model input table (unseeded, so it varies per run).
in_data.sample(1)

Unnamed: 0_level_0,team,opponent,date,team_STL%,team_FT,team_2PA,team_FG,team_DRB,team_ORB%,team_AST,...,opponent_STL,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home,Outcome
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20071212PHOUTA,PHO,UTA,20071212,9.6,19.0,70.0,40.0,29.0,31.9,22.0,...,12.0,49.0,27.0,5.0,0.303,0.485726,0.191011,0.428571,1,201


## Extracting the data

In [131]:
### for every team: past 3 games stats (them and opponent) + opponent season averages

#### Computing running season averages by team

In [137]:
dates = pd.unique(in_data.date)

# date -> DataFrame of per-team means over all games strictly before that date.
# For the earliest date there are no past games, so its entry is empty.
season_averages = {}

for date in dates:
    # Strictly earlier games only, so a game never contributes to its own average.
    past_games = in_data[in_data.date < date]
    # numeric_only=True skips the string columns (e.g. opponent, date); without
    # it, recent pandas versions raise TypeError instead of dropping them.
    season_averages[date] = past_games.groupby('team').mean(numeric_only=True)

#### Computing the past n games for every matchup

In [140]:
# Number of most recent games to keep per team.
n = 3
# Keep only the rows flagged as home games.
home_only = in_data.loc[in_data["home"] == 1]

In [164]:
## build a list of games for every team
past_n = {}

for date in dates:
    team_map = {}
    past_games = home_only[home_only.date < date]
    for team in pd.unique(home_only.team):
        #get the past games for team
        past_team = past_games[(past_games.team == team) | (past_games.opponent == team)].tail(3)
        team_map[team] = past_team
    past_n[date] = team_map        