# Attempt at making an LSTM

In [156]:
import numpy as np
import pandas as pd
import os

In [157]:
# pytorch for lstm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

## Loading the data

#### Game Data

In [226]:
game_data_path = "data/processed_game_data/"
# The reshaped table is later saved back into this same directory with a
# "_final.csv" suffix. Skip those outputs here: they no longer contain the
# home_name / away_name columns, so picking one up as input on a re-run
# would break the steps below.
# NOTE: os.listdir returns entries in arbitrary order, so which file ends up
# at files[0] is platform-dependent.
files = [
    name for name in os.listdir(game_data_path)
    if not name.endswith("_final.csv")
]  # TODO: loop over every file instead of only using files[0]

# Load a single season file, drop the unnamed leading column (presumably a
# saved row index) and keep one row per (date, home team, away team).
season = pd.read_csv(game_data_path + files[0])  # TODO: use the loop variable
season = season.drop(["Unnamed: 0"], axis = 1)
season = season.drop_duplicates(subset=["date", "home_name", "away_name"])

# Sort every column of the game table into one of three groups:
#   away_cols    - columns whose name contains "away"
#   home_cols    - columns whose name contains "home"
#   neutral_cols - everything else
# Columns whose name contains "name" (home_name / away_name) are left out of
# all groups; they are handled separately when rows are rebuilt.
cols = season.columns

away_cols = []
home_cols = []
neutral_cols = []
# Away column names with everything up to the first underscore removed;
# these are reused later as the stat part of "team_*" / "opponent_*" names.
stripped_cols = []

for col in cols:
    if "name" in col:
        continue
    if "away" in col:
        away_cols.append(col)
        # Split only on the first underscore so stat names that themselves
        # contain "_" are kept whole instead of being cut short (which could
        # also create duplicate stripped names).
        stripped_cols.append(col.split("_", 1)[1])
    elif "home" in col:
        home_cols.append(col)
    else:
        neutral_cols.append(col)

In [233]:
neutral_cols

['date', 'index']

In [234]:
# Column layout of the per-team table: identifiers, the side-independent
# columns, the team's own stats, the opponent's stats, then the home flag.
new_cols = [
    "team",
    "opponent",
    *neutral_cols,
    *("team_" + stat for stat in stripped_cols),
    *("opponent_" + stat for stat in stripped_cols),
    "home",
]

In [235]:
# Turn every game into two rows, one from each team's point of view.
# Each row is laid out as: team, opponent, neutral columns, own stats,
# opponent stats, home flag (1 for the home side, 0 for the away side).
new_data = []

for _, game in season.iterrows():
    home_team = game["home_name"]
    away_team = game["away_name"]

    shared_values = list(game[neutral_cols])
    home_values = list(game[home_cols])
    away_values = list(game[away_cols])

    # Home team's perspective
    new_data.append(
        [home_team, away_team, *shared_values, *home_values, *away_values, 1]
    )

    # Away team's perspective: own/opponent stats swap places
    new_data.append(
        [away_team, home_team, *shared_values, *away_values, *home_values, 0]
    )

In [236]:
# Assemble the per-team rows into a DataFrame using the column layout in new_cols.
season_revamped = pd.DataFrame(new_data, columns=new_cols)

In [None]:
# Save the reshaped table next to its source file, named "<source stem>_final.csv".
# NOTE(review): this writes into the same directory that is listed to find
# input files, so later directory listings will also contain this output.
# NOTE(review): the stem is everything before the FIRST "." in the file name.
season_revamped.to_csv(game_data_path + files[0].split(".")[0] + "_final.csv")  #### MOD

In [239]:
# From here on `season` refers to the reshaped per-team table (same object, not a copy).
season = season_revamped

#### Odds Data

In [240]:
# Directory holding the processed odds files, and the names of its entries.
# NOTE: os.listdir returns entries in arbitrary order.
odds_data_path = "data/odds_data_processed/"
odds_files = os.listdir(odds_data_path)

In [241]:
# Load a single odds file.
# NOTE(review): this takes entry [1] of an unordered directory listing while
# the game data uses entry [0] of its own listing — confirm that both files
# cover the same season.
odds = pd.read_csv(odds_data_path + odds_files[1])

In [242]:
odds.shape

(1288, 14)

In [243]:
odds.sample(5)

Unnamed: 0.1,Unnamed: 0,Date,Home,Away,OU,Spread,OU_2H,Spread_2H,ML_home,ML_away,Points,Win Margin,2H Points,2H Win Margin
1234,1234,2008-04-28,Denver,LA Lakers,228.0,3.0,112.0,1.0,160,-190,208,-6,90,4
1126,1126,2008-04-08,Charlotte,Minnesota,198.0,3.5,101.5,2.0,-200,170,240,2,134,-10
949,949,2008-03-14,Dallas,Indiana,212.0,12.5,104.5,3.0,-3000,1500,213,19,105,3
1019,1019,2008-03-24,Boston,Philadelphia,194.0,10.0,96.0,5.0,-525,425,185,-5,85,-9
963,963,2008-03-16,New York,Atlanta,202.0,2.0,102.0,1.0,120,-140,207,-11,105,-7


#### Reconciling names

In [244]:
# Maps full team names to three-letter codes.
# Some pairs of names deliberately share one code (New Jersey / Brooklyn Nets
# -> BRK, New Orleans Hornets / Pelicans -> NOP, Charlotte Bobcats / Hornets
# -> CHA), whereas Seattle SuperSonics and Oklahoma City Thunder have
# separate codes (SEA, OKC).
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }

In [245]:
# Map the team names used in the odds table to the three-letter codes by
# looking for each odds name inside the full names of season_names.
# Names with no match are printed so they can be added by hand.
odds_names = {}
for short_name in pd.unique(odds.Home):
    matching_codes = [
        code
        for full_name, code in season_names.items()
        if short_name in full_name
    ]
    if matching_codes:
        # Several full names can contain the same short name; the last
        # match in season_names order is the one that is kept.
        odds_names[short_name] = matching_codes[-1]
    else:
        print(short_name)

LA Lakers
LA Clippers


In [246]:
# The odds table abbreviates "Los Angeles" as "LA", which is not a substring
# of any full team name, so these two entries are added by hand.
odds_names["LA Lakers"] = "LAL"
odds_names["LA Clippers"] = "LAC"

In [247]:
# Replace the home team name with its three-letter code (KeyError on an unmapped name).
odds["Home"] = odds["Home"].apply(lambda x: odds_names[x])

In [248]:
# Replace the away team name with its three-letter code.
# NOTE(review): odds_names is built from the Home column only, so a team that
# appears only as an away side would raise KeyError here.
odds["Away"] = odds["Away"].apply(lambda x: odds_names[x])

In [252]:
# Replace full team names with their three-letter codes on both sides.
for side in ("team", "opponent"):
    season[side] = season[side].apply(lambda full_name: season_names[full_name])

### Merging the two tables

In [253]:
def make_index(row, col1, col2, col3):
    """Return a key made of three fields of ``row`` joined without separator.

    ``row`` is anything that supports ``row[key]`` lookups; ``col1``,
    ``col2`` and ``col3`` are the keys to read, in output order. Each value
    is converted with ``str`` before concatenation.
    """
    return "".join(str(row[key]) for key in (col1, col2, col3))

In [254]:
# Convert each date to text and drop its last character.
# NOTE(review): presumably removes a trailing suffix so the value lines up
# with the odds table's date string — confirm against the raw data.
# NOTE: not idempotent — re-running this cell strips one more character.
season["date"] = season["date"].apply(lambda x: str(x)[:-1])

In [255]:
# Merge key for the game table: date + team code + opponent code, concatenated.
season["Index"] = season.apply(lambda x: make_index(x, "date", "team", "opponent"), axis=1)

In [256]:
# Remove the dashes from the odds dates (e.g. "2008-04-28" -> "20080428").
odds["Date"] = odds["Date"].apply(lambda day: day.replace("-", ""))

In [257]:
# Merge key for the odds table: date + home code + away code, concatenated.
odds["Index"] = odds.apply(lambda x: make_index(x, "Date", "Home", "Away"), axis=1)

In [258]:
# Join odds and game rows that share the same key (pd.merge defaults to an inner join).
# The odds key is date + Home + Away, so of the two rows built per game only
# the one written from the home team's perspective can match.
merged = pd.merge(odds, season, on='Index')

In [260]:
# Drop the leftover unnamed column, the game table's date and index columns,
# and the odds table's Home/Away columns. The odds "Date" and the game
# table's team/opponent columns are kept.
merged = merged.drop(["Unnamed: 0", "date", "Home", "Away", "index"], axis = 1)

In [261]:
merged.sample(1)

Unnamed: 0,Date,OU,Spread,OU_2H,Spread_2H,ML_home,ML_away,Points,Win Margin,2H Points,...,opponent_HOB,opponent_STL,opponent_TRB,opponent_FTA,opponent_BLK,opponent_FTr,opponent_TS%,opponent_FT/FGA,opponent_3P%,home
779,20080222,189.0,2.5,97,2,-165,145,179,1,94,...,1.5625,9.0,31.0,29.0,4.0,0.414,0.537699,0.3,0.307692,1


In [262]:
# Column groups of the odds table: the betting lines, and the game outcomes
# used as prediction targets.
odds_cols = ["OU", "Spread", "OU_2H", "Spread_2H", "ML_home", "ML_away"]
labels = ["Points", "Win Margin", "2H Points", "2H Win Margin"]
# Positions 0..3, one per entry of `labels`.
label_index = [0, 1, 2, 3]
# NOTE(review): "Home" and "Away" were already dropped from `merged` above.
non_numeric = ['Date','Home', 'Away']

## Format Data for the LSTM (using season data)

In [None]:
# drop non numeric features

In [82]:
# Same label names and positions as defined for the merged table; only
# non_numeric differs, now listing the single column dropped from the
# season-only table.
# NOTE(review): these label columns come from the odds table; the season
# table built above has no columns with these names.
labels = ["Points", "Win Margin", "2H Points", "2H Win Margin"]
label_index = [0, 1, 2, 3]
non_numeric = ['index']

In [83]:
# Key the per-team table by its merge key and remove the columns listed in non_numeric.
data = season.set_index("Index").drop(non_numeric, axis = 1)

In [84]:
data.shape

(1316, 96)

In [85]:
# Pick the rows of the first team code (in season_names order) that appears
# in `data`; `m` and `team_name` keep that selection after the loop.
# The team / opponent columns are compared exactly. A substring search over
# the concatenated index also matches codes that only arise across the
# boundary between the two team codes (e.g. "...MEMIND" contains "MIN").
# As with the substring search, rows in which the team appears as the
# opponent are included too.
for team_name in season_names.values():
    involved = (data["team"] == team_name) | (data["opponent"] == team_name)
    m = data[involved]
    if m.shape[0] > 0: break

In [86]:
# Order the selected rows by their index string (date + team + opponent).
m = m.sort_index()
# Position of the column used as the prediction target.
# NOTE(review): with the column order built earlier in this notebook,
# position 0 is "team" (a string), not a numeric label — confirm the target.
label_col = 0
# Feature columns start at this position; columns before it are not used as inputs.
# NOTE(review): positions 0-2 are team, opponent and date, so starting at 4
# also skips the first stat column — confirm this is intended.
start_data_cols = 4
rows = m.shape[0]
# Number of preceding rows grouped into each input window.
N_PREV = 3

In [102]:
# Build sliding windows over the rows of `m`:
#   X[i] - feature values (columns from start_data_cols on) of the N_PREV
#          rows that precede row r
#   y[i] - value of column `label_col` in row r
# The first N_PREV rows only fill the window and produce no sample.
X = []
y = []

# Rolling buffer holding the feature rows of the most recent N_PREV rows.
current_data = []

for r in range(rows):
    row_values = m.iloc[r].values

    if len(current_data) == N_PREV:
        # Store a copy of the window. `current_data` keeps being mutated
        # below, so appending the list itself would make every entry of X
        # refer to the same object and end up equal to the final window.
        X.append(list(current_data))
        y.append(row_values[label_col])

    row = row_values[start_data_cols:]
    current_data.append(row)
    if len(current_data) > N_PREV:
        current_data.pop(0)

X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

In [103]:
X.shape

(79, 3, 92)

In [104]:
y.shape

(79,)

In [105]:
m.shape

(82, 96)

In [106]:
season.shape

(1316, 98)

## LSTM