# Attempt at making an LSTM

In [1]:
import numpy as np
import pandas as pd
import os

In [69]:
# pytorch for lstm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

## Loading the data

#### Game Data

In [2]:
game_data_path = "data/processed_game_data/"
files = os.listdir(game_data_path)

In [3]:
season = pd.read_csv(game_data_path + files[0])

In [4]:
season.shape

(2632, 97)

In [5]:
season.sample(5)

Unnamed: 0.1,Unnamed: 0,date,home_name,away_name,home_STL%,home_FT,home_2PA,home_FG,home_DRB,home_ORB%,...,away_HOB,away_STL,away_TRB,away_FTA,away_BLK,away_FTr,away_TS%,away_FT/FGA,away_3P%,index
1791,1791,200803040,San Antonio Spurs,New Jersey Nets,9.3,25.0,65.0,26.0,38.0,28.0,...,1.535714,7.0,42.0,16.0,4.0,0.213,0.426621,0.133333,0.235294,New Jersey Nets60
1182,1182,200801190,Orlando Magic,Portland Trail Blazers,3.5,13.0,59.0,40.0,36.0,32.4,...,1.605263,6.0,37.0,11.0,5.0,0.121,0.490401,0.098901,0.5625,Orlando Magic42
2258,2258,200804040,Milwaukee Bucks,Indiana Pacers,13.9,25.0,76.0,37.0,31.0,28.0,...,1.676471,7.0,42.0,35.0,3.0,0.486,0.600686,0.402778,0.421053,Milwaukee Bucks75
2052,2052,200803210,Portland Trail Blazers,Los Angeles Clippers,5.9,14.0,58.0,42.0,24.0,33.3,...,1.634146,8.0,28.0,16.0,2.0,0.219,0.637181,0.164384,0.615385,Portland Trail Blazers69
652,652,200712140,Boston Celtics,Milwaukee Bucks,13.5,26.0,55.0,35.0,30.0,27.5,...,1.588235,8.0,42.0,13.0,0.0,0.165,0.483947,0.126582,0.210526,Boston Celtics21


#### Odds Data

In [6]:
odds_data_path = "data/odds_data_processed/"
odds_files = os.listdir(odds_data_path)

In [7]:
odds = pd.read_csv(odds_data_path + odds_files[1])

In [8]:
odds.shape

(1288, 14)

In [9]:
odds.sample(5)

Unnamed: 0.1,Unnamed: 0,Date,Home,Away,OU,Spread,OU_2H,Spread_2H,ML_home,ML_away,Points,Win Margin,2H Points,2H Win Margin
266,266,2007-12-07,Boston,Toronto,185.0,10.5,93.0,1.0,-1300,850,196,28,94,6
1061,1061,2008-03-29,Denver,Golden State,239.0,7.0,119.0,4.0,-355,295,231,7,105,3
480,480,2008-01-07,Phoenix,Denver,226.0,7.0,110.5,1.5,-350,290,252,22,115,3
524,524,2008-01-13,Atlanta,Chicago,188.5,5.0,95.5,3.5,-175,155,189,21,71,1
231,231,2007-12-02,San Antonio,Portland,186.5,15.0,95.5,2.0,-1500,950,179,21,90,8


#### Reconciling names

In [10]:
season_names = {'Golden State Warriors':'GSW',
                'Los Angeles Lakers': 'LAL',
                'San Antonio Spurs': 'SAS',
                'Cleveland Cavaliers': 'CLE',
                'Denver Nuggets': 'DEN',
                'Indiana Pacers': 'IND',
                'Memphis Grizzlies': 'MEM',
                'New Jersey Nets': 'BRK',
                'Brooklyn Nets': 'BRK',
                'New Orleans Hornets': 'NOP',
                'New Orleans Pelicans': 'NOP',
                'Orlando Magic': 'ORL',
                'Toronto Raptors': 'TOR',
                'Miami Heat': 'MIA',
                'Seattle SuperSonics': 'SEA',
                'Utah Jazz': 'UTA',
                'Atlanta Hawks': 'ATL',
                'Boston Celtics': 'BOS',
                'Charlotte Bobcats': 'CHA',
                'Charlotte Hornets': 'CHA',
                'Chicago Bulls': 'CHI',
                'Los Angeles Clippers': 'LAC',
                'Minnesota Timberwolves': 'MIN',
                'Phoenix Suns': 'PHO',
                'Dallas Mavericks': 'DAL',
                'Houston Rockets': 'HOU',
                'Milwaukee Bucks': 'MIL',
                'Philadelphia 76ers': 'PHI',
                'Washington Wizards': 'WAS',
                'Detroit Pistons': 'DET',
                'New York Knicks': 'NYK',
                'Sacramento Kings': 'SAC',
                'Portland Trail Blazers': 'POR',
                'Oklahoma City Thunder': 'OKC'
        }

In [11]:
odds_names = {}
for name in list(pd.unique(odds.Home)):
    found = False
    for s_name in season_names:
        if name in s_name:
            found = True
            odds_names[name] = season_names[s_name]
    if not found: print(name)

LA Lakers
LA Clippers


In [12]:
odds_names["LA Lakers"] = "LAL"
odds_names["LA Clippers"] = "LAC"

In [13]:
odds["Home"] = odds["Home"].apply(lambda x: odds_names[x])

In [14]:
odds["Away"] = odds["Away"].apply(lambda x: odds_names[x])

In [15]:
season["home_name"] = season["home_name"].apply(lambda x: season_names[x])
season["away_name"] = season["away_name"].apply(lambda x: season_names[x])

### Merging the two tables

In [16]:
def make_index(row, col1, col2, col3):
    return str(row[col1]) + str(row[col2]) + str(row[col3])

In [17]:
season["date"] = season["date"].apply(lambda x: str(x)[:-1])

In [18]:
season["Index"] = season.apply(lambda x: make_index(x, "date", "home_name", "away_name"), axis=1)

In [19]:
season = season.drop_duplicates(subset=["Index"])

In [20]:
odds["Date"] = odds["Date"].apply(lambda x: "".join(x.split("-")))

In [21]:
odds["Index"] = odds.apply(lambda x: make_index(x, "Date", "Home", "Away"), axis=1)

In [22]:
merged = pd.merge(odds, season, on='Index')

In [23]:
merged = merged.drop(["Unnamed: 0_x", "Unnamed: 0_y", "date", "home_name", "away_name", "index"], axis = 1)

In [24]:
merged.sample(1)

Unnamed: 0,Date,Home,Away,OU,Spread,OU_2H,Spread_2H,ML_home,ML_away,Points,...,away_PTS,away_HOB,away_STL,away_TRB,away_FTA,away_BLK,away_FTr,away_TS%,away_FT/FGA,away_3P%
1258,20080512,CLE,BOS,185.5,7.0,91.5,0.5,-125,105,165,...,77.0,1.592593,3.0,38.0,26.0,4.0,0.371,0.472741,0.285714,0.214286


In [None]:
odds_cols = ["OU", "Spread", "OU_2H", "Spread_2H", "ML_home", "ML_away"]
labels = ["Points", "Win Margin", "2H Points", "2H Win Margin"]
label_index = [0, 1, 2, 3]
non_numeric = ['Date','Home', 'Away']

## Format Data from LSTM (using season-data)

In [52]:
data = season.drop(["Unnamed: 0", "date", "home_name", "away_name", "index"], axis = 1)

In [82]:
labels = ["Points", "Win Margin", "2H Points", "2H Win Margin"]
label_index = [0, 1, 2, 3]
non_numeric = ['index']

In [83]:
data = season.set_index("Index")
data = data.drop(non_numeric, axis = 1)

In [84]:
data.shape

(1316, 96)

In [85]:
for team_name in list(season_names.values()):
    m = data[data.index.str.contains(team_name)]
    if m.shape[0] > 0: break

In [86]:
m = m.sort_index()
label_col = 0
start_data_cols = 4
rows = m.shape[0]
N_PREV = 3

In [102]:
X = []
y = []

current_data = []

for r in range(rows):
    if len(current_data) == N_PREV:
        X.append(current_data)
        y.append(m.iloc[r].values[label_col])
        
    row = m.iloc[r].values[start_data_cols:]
    current_data.append(row)
    if len(current_data) > N_PREV:
        current_data.pop(0)

X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

In [103]:
X.shape

(79, 3, 92)

In [104]:
y.shape

(79,)

In [105]:
m.shape

(82, 96)

In [106]:
season.shape

(1316, 98)

## LSTM

In [107]:
class LSTM_Model(nn.Module):

    def __init__(self, input_dim, hidden_dim):
        #super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden_to_pred = nn.Linear(hidden_dim, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, input_data):
        
        lstm_out, self.hidden = self.lstm(
            input_data.view(len(input_data), 1, -1), self.hidden)
        
        output = self.hidden_to_pred(lstm_out.view(len(input_data), -1))        
        return output

In [108]:
INPUT_DIM = len(X[0])
HIDDEN_DIM = 5

In [109]:
model = LSTM_Model(INPUT_DIM, HIDDEN_DIM)
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for i, x in enumerate(X):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        label = y[i]

        # Step 3. Run our forward pass.
        out = model(x)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(out, label)
        loss.backward()
        optimizer.step()

AttributeError: cannot assign module before Module.__init__() call

In [139]:
model = nn.LSTM(input_size=len(X[0]), hidden_size=5, num_layers=1, batch_first=True)

# input dim: (batch_size, time_steps, in_size): (79, 3, 92)

input_seq = torch.from_numpy(X[:5])

output_seq, _ = model(input_seq)
last_output = output_seq[-1]

loss = nn.CrossEntropyLoss()
target = Variable(torch.LongTensor(batch_size).random_(0, classes_no-1))
err = loss(last_output, target)
err.backward()

RuntimeError: input.size(-1) must be equal to input_size. Expected 3, got 92

In [136]:
input_seq.shape

torch.Size([5, 3, 92])

In [125]:
X.shape

(79, 3, 92)

In [97]:
type(X.astype(np.float16))

numpy.ndarray

In [98]:
X.dtype

dtype('O')