In [1]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import itertools
from tqdm import tqdm
import feat_eng

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the labels (semicolon-separated) and attach them to the raw training
# table through the shared 'ID' column.
label = pd.read_csv('./challenge_output_data_training_file_nba_challenge.csv', sep=';')
train = pd.read_csv('./train.csv').merge(label, on='ID')

# Enrich the merged table with the project-local feature-engineering helpers,
# applied in the same order as before: add_fg first, then the incremental features.
train = (
    train
    .pipe(feat_eng.add_fg)
    .pipe(feat_eng.add_incremental_features)
)

# A random 80/20 train/validation split of this table was tried here previously
# and is intentionally left disabled.

100%|██████████| 1439/1439 [00:16<00:00, 89.42it/s]
100%|██████████| 1440/1440 [01:04<00:00, 22.21it/s]
100%|██████████| 1440/1440 [00:19<00:00, 72.23it/s]
100%|██████████| 1440/1440 [01:05<00:00, 21.84it/s]


In [3]:
# Feature frame: everything except the identifier and the target.
X = train.drop(labels=['ID', 'label'], axis='columns')
# Target vector as a plain NumPy array.
y = train['label'].values

In [None]:
from sklearn.model_selection import KFold

# Five consecutive folds; shuffle=False is KFold's default and is only spelled
# out here so the (unshuffled) fold layout is explicit to the reader.
kf = KFold(n_splits=5, shuffle=False)

In [4]:
# Pivot the wide training table (columns named "<feature>_<step>") into a long
# table with one row per (ID, time_step) and one column per feature.
df1 = train
n_steps = 1440  # time steps are numbered 1..1440 in the column suffixes

# Bucket the feature columns by their step suffix in a single pass instead of
# re-splitting every column name on each of the 1440 iterations.
cols_by_step = {}
for col in df1.columns:
    if col in ('ID', 'label'):
        continue
    parts = col.split('_')
    if len(parts) > 1:  # a column without a "_<step>" suffix belongs to no time step
        cols_by_step.setdefault(parts[1], []).append(col)

# Collect the per-step slices and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame every iteration (quadratic cost).
frames = []
for sec in range(1, n_steps + 1):
    print('\r Train pivotation %d/%d' % (sec, n_steps), end='')
    df_sec = df1[['ID', 'label'] + cols_by_step.get(str(sec), [])]
    # Strip the "_<step>" suffix so every slice exposes the same feature names.
    df_sec.columns = [k.split('_')[0] if k not in ['ID', 'time_step', 'label'] else k for k in df_sec.columns]
    df_sec = df_sec.assign(time_step=sec)
    frames.append(df_sec)

df1_tot = pd.concat(frames).sort_values(['ID', 'time_step'])
print('Train table pivoted')

 Train pivotation 1440/1441Train table pivoted


In [5]:
# Feature columns fed to the sequence models, in the order they appear along
# the last axis of the input tensors (17 entries).
cols = [
    'score',
    'offensive rebound',
    'defensive rebound',
    'offensive foul',
    'defensive foul',
    'assist',
    'lost ball',
    'steals',
    'bad pass',
    'block',
    'miss',
    'two pts',
    'three pts',
    'fg',
    'total rebound',
    'turnover',
    'fga',
]

In [6]:
# Build the model inputs from the long table:
#   X -> one (time_step, feature) matrix per ID, stacked into a single array
#   y -> one label per ID, as a column vector
df_X = df1_tot[df1_tot.time_step < 1441]

# A single groupby pass replaces one full boolean scan of the table per ID.
# sort=False keeps groups in order of first appearance (the same order that
# ID.unique() yields), and taking X and y from the same grouping guarantees
# that row i of X and row i of y refer to the same ID.
by_id = df_X.groupby('ID', sort=False)

# The label is repeated on every row of an ID, so its mean is the label itself;
# selecting the column first avoids averaging every feature column for nothing.
y = by_id['label'].mean().values
y = y.reshape((len(y), 1))

X = np.array([grp[cols].values for _, grp in by_id])
print('X_train created')

X_train created


In [7]:
#import run_lstm
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.models import load_model

from keras.layers import BatchNormalization, Conv2D, Reshape, LSTM,Bidirectional, Dropout, Dense, TimeDistributed, Lambda, MaxPool2D
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical

In [8]:
def deepconvlstm(num_classes, nb_timestep=1440, nb_features=17):
    """Build and compile a Conv2D + stacked-LSTM classifier.

    The input of shape (nb_timestep, nb_features) is batch-normalised, given a
    trailing channel axis, passed through two 1x1 Conv2D / MaxPool2D(2) stages,
    flattened back to a sequence and fed to two LSTM layers. A per-time-step
    sigmoid Dense layer follows, and only the last time step is returned.

    Parameters
    ----------
    num_classes : int
        Number of sigmoid output units (columns of the target array).
    nb_timestep : int, optional
        Length of each input sequence. Defaults to 1440.
    nb_features : int, optional
        Number of features per time step. Defaults to 17.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss, Adam (lr=1e-4) and the
        accuracy metric.
    """
    n_filters = 64

    # Each MaxPool2D(2) uses 'valid' padding and therefore floor-halves both
    # spatial axes. Deriving the pooled sizes keeps the Reshape below consistent
    # with the actual tensor: for 17 features this is 17 -> 8 -> 4, not the
    # previously hard-coded 3, which made the model fail to build.
    pooled_steps = (nb_timestep // 2) // 2
    pooled_feats = (nb_features // 2) // 2

    my_model = Sequential()

    my_model.add(BatchNormalization(input_shape=(nb_timestep, nb_features)))
    # Add a channel axis so the sequence can be processed by Conv2D.
    my_model.add(Reshape([nb_timestep, nb_features, 1]))
    my_model.add(Conv2D(n_filters, 1, activation="relu"))
    my_model.add(MaxPool2D(2))
    my_model.add(BatchNormalization())
    my_model.add(Conv2D(n_filters, 1, activation="relu"))
    my_model.add(MaxPool2D(2))
    my_model.add(BatchNormalization())
    # Merge the pooled feature axis with the channels to get a sequence again.
    my_model.add(Reshape([pooled_steps, pooled_feats * n_filters]))
    my_model.add(LSTM(128, return_sequences=True))
    my_model.add(LSTM(128, return_sequences=True))
    my_model.add(Dropout(0.2))
    # The output width follows num_classes so it agrees with the Lambda's
    # declared output_shape (it used to be hard-coded to 2).
    my_model.add(TimeDistributed(Dense(num_classes, activation="sigmoid", kernel_regularizer=l2(0.0001))))
    # Keep only the prediction emitted at the last time step.
    my_model.add(Lambda(lambda x: x[:, -1, :], output_shape=[num_classes]))
    my_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])
    print("Model DeepConvLSTM created")
    return my_model

In [9]:
def bidirectional_lstm(num_classes, nb_timestep=1440, nb_features=17):
    """Build and compile a single-layer bidirectional LSTM classifier.

    The input of shape (nb_timestep, nb_features) is batch-normalised and read
    by a bidirectional LSTM (128 units per direction, outputs concatenated).
    Dropout and a sigmoid Dense output layer follow.

    Parameters
    ----------
    num_classes : int
        Number of sigmoid output units (columns of the target array). The
        notebook passes ``y.shape[1]``, i.e. 1.
    nb_timestep : int, optional
        Length of each input sequence. Defaults to 1440.
    nb_features : int, optional
        Number of features per time step. Defaults to 17.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss, Adam (lr=1e-4) and the
        accuracy metric.
    """
    my_model = Sequential()
    my_model.add(BatchNormalization(input_shape=(nb_timestep, nb_features)))
    # return_sequences=False: only the final state of each direction is kept,
    # so the layer outputs a single 256-wide vector per sample.
    my_model.add(Bidirectional(LSTM(128, return_sequences=False), merge_mode='concat'))
    my_model.add(Dropout(0.2))
    # The output width honours num_classes (it was previously ignored and fixed to 1).
    my_model.add(Dense(num_classes, activation="sigmoid", kernel_regularizer=l2(0.0001)))
    my_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])
    # The message now names this model; it used to repeat "DeepConvLSTM".
    print("Model Bidirectional LSTM created")
    return my_model

In [10]:
import gc
# Run a full garbage-collection pass before the training cells. Because this is
# the cell's last expression, the value returned by gc.collect() is what the
# notebook displays as the cell output.
gc.collect()

292

In [20]:
import os
import pickle
#dic = {}

# Make sure the output folder exists before any training starts; otherwise the
# first d.save() can fail on the missing directory after a full fold of training.
os.makedirs('./lstm_models', exist_ok=True)

# Train one bidirectional LSTM per fold of `kf` and store each one on disk as
# ./lstm_models/bidirec_model_<fold index>.
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    # Number of target columns, used to size the model's output layer.
    num_classes = y_train.shape[1]
    d = bidirectional_lstm(num_classes)
    d.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1, validation_data=(X_val, y_val))
    d.save('./lstm_models/bidirec_model_%d' % i)
    # Drop the reference so the fold's model can be garbage-collected.
    del d

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_3 (Batch (None, 1440, 17)          68        
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               149504    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 149,829
Trainable params: 149,795
Non-trainable params: 34
_________________________________________________________________
Model DeepConvLSTM created
Train on 10061 samples, validate on 2515 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_nor

In [21]:
# Reload the five per-fold models saved as ./lstm_models/bidirec_model_<k>,
# in fold order, and bind them to model_0 ... model_4.
model_0, model_1, model_2, model_3, model_4 = (
    load_model('./lstm_models/bidirec_model_%d' % fold) for fold in range(5)
)

In [11]:
# Size the output layer from the number of target columns, then train a single
# bidirectional LSTM on the full training set (no validation data is passed).
_, num_classes = y.shape
bidirec = bidirectional_lstm(num_classes)
# Last expression of the cell: the returned History object is displayed.
bidirec.fit(x=X, y=y, batch_size=32, epochs=5, verbose=1)

Model DeepConvLSTM created
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f59cd956748>

In [12]:
import os

# Ensure the target folder exists so the save cannot fail on a missing
# directory, then persist the model trained on the full training set.
os.makedirs('./lstm_models', exist_ok=True)
bidirec.save('./lstm_models/bidirec_model')

In [22]:
# Hold out 20% of X with a fixed seed (random_state=42) for reproducibility.
# NOTE(review): only X is passed, so no matching split of y is produced here;
# the labels for df_train / df_val cannot be recovered from this call alone.
# NOTE(review): in file order X is the NumPy array built with np.array above,
# so despite the df_ prefix these are presumably arrays, not DataFrames — confirm.
df_train, df_val = train_test_split(X, test_size=0.2, random_state=42)

In [29]:
# Load the raw test table and apply the same two feature-engineering steps as
# for the training data; add_fg is told it is handling the test set.
test = (
    pd.read_csv('./test.csv')
    .pipe(feat_eng.add_fg, test=True)
    .pipe(feat_eng.add_incremental_features)
)

100%|██████████| 1439/1439 [00:12<00:00, 115.65it/s]
100%|██████████| 1440/1440 [00:23<00:00, 60.71it/s]
100%|██████████| 1440/1440 [00:19<00:00, 75.38it/s]
100%|██████████| 1440/1440 [00:27<00:00, 52.33it/s]


In [33]:
# Pivot the wide test table (columns named "<feature>_<step>") into a long
# table with one row per (ID, time_step) and one column per feature.
df1 = test
n_steps = 1440  # time steps are numbered 1..1440 in the column suffixes

# Bucket the feature columns by their step suffix in a single pass instead of
# re-splitting every column name on each of the 1440 iterations.
cols_by_step = {}
for col in df1.columns:
    if col == 'ID':
        continue
    parts = col.split('_')
    if len(parts) > 1:  # a column without a "_<step>" suffix belongs to no time step
        cols_by_step.setdefault(parts[1], []).append(col)

# Collect the per-step slices and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame every iteration (quadratic cost).
frames = []
for sec in range(1, n_steps + 1):
    print('\r test pivotation %d/%d' % (sec, n_steps), end='')
    df_sec = df1[['ID'] + cols_by_step.get(str(sec), [])]
    # Strip the "_<step>" suffix so every slice exposes the same feature names.
    df_sec.columns = [k.split('_')[0] if k not in ['ID', 'time_step'] else k for k in df_sec.columns]
    df_sec = df_sec.assign(time_step=sec)
    frames.append(df_sec)

df1_tot = pd.concat(frames).sort_values(['ID', 'time_step'])
print('test table pivoted')

 test pivotation 1440/1441test table pivoted


In [34]:
# Feature columns, in the same order as used to build the training tensor.
cols = ['score', 'offensive rebound', 'defensive rebound',
        'offensive foul', 'defensive foul', 'assist', 'lost ball', 
        'steals', 'bad pass', 'block', 'miss', 'two pts', 'three pts', 
        'fg', 'total rebound', 'turnover', 'fga']

df_X = df1_tot[df1_tot.time_step < 1441]

# One (time_step, feature) matrix per ID, stacked into a single array.
# A single groupby pass replaces one full boolean scan of the table per ID;
# sort=False keeps the IDs in order of first appearance in the pivoted table,
# i.e. the same order that df1_tot.ID.unique() yields.
X_test = np.array([grp[cols].values for _, grp in df_X.groupby('ID', sort=False)])
print('X_test created')

X_test created


In [36]:
# Sigmoid outputs of the model for every test sequence. predict() is used
# instead of Sequential.predict_proba(), which returns the same values but was
# removed from later Keras releases. Rounding to 0/1 happens when the
# submission file is written.
X_tot = bidirec.predict(X_test)

In [39]:
# Accuracy of the rounded predictions on X, the same data the model was fitted
# on, so this is a training accuracy rather than a validation score.
# predict() replaces Sequential.predict_proba(), which returns the same values
# but was removed from later Keras releases.
X_ = bidirec.predict(X)
np.mean(np.round(X_) == y)

0.7292461832061069

In [38]:
# The predictions in X_tot follow the ID order of the pivoted test table
# (df1_tot, sorted by ID), which is not necessarily the row order of test.csv.
# Take the IDs from the same table so each prediction is paired with its own ID.
test_ids = df1_tot['ID'].unique()
assert len(test_ids) == len(X_tot), 'df1_tot does not match the predictions in X_tot'

Y = pd.DataFrame({
    'ID': test_ids,
    # Round the sigmoid outputs to hard 0/1 labels, stored as integers and
    # flattened from the (n, 1) prediction array to a 1-D column.
    'label': np.round(X_tot).astype(int).ravel(),
}, columns=['ID', 'label'])
Y.to_csv('pred_bidirection_lstm.csv', index=False)

In [None]:
#Pred = pd.DataFrame(np.argmax(y_pred, axis=1), columns= ['label'])
#Pred['ID'] = id_test
#Pred[['ID', 'label']].to_csv('pred_v3.csv', index=False)
#print('Prediction exported')