In [20]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import itertools
from tqdm import tqdm

%matplotlib inline

In [79]:
test = pd.read_csv('./test.csv')

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
label = pd.read_csv('./challenge_output_data_training_file_nba_challenge.csv', sep=';')
train = pd.merge(train, label, on='ID')

In [3]:
df_train, df_val = train_test_split(train, test_size=0.2, random_state=42)

### Rules

In [4]:
diff_goals = 8
temp = df_train[(df_train.score_1440 <= -1*diff_goals) | (df_train.score_1440 >= diff_goals)]
same_mt_f = df_train[((df_train.score_1440 <= -1*diff_goals) & (df_train.label == 0)) | ((df_train.score_1440 >= diff_goals) & (df_train.label == 1))]
print('Probability that the %d pt margin half-time leader (%.3f of full train set) is the full-time winner %.3f' % (diff_goals, len(temp)/len(df_train), len(same_mt_f)/len(temp)))

Probability that the 8 pt margin half-time leader (0.464 of full train set) is the full-time winner 0.854


In [6]:
df_train_2 = df_train[~df_train.ID.isin(temp.ID.values)]
diff_assists = 6
same_mt_f = df_train_2[((df_train_2.assist_1440 <= -1*diff_assists) & (df_train_2.label == 0)) 
                   | ((df_train_2.assist_1440 >= diff_assists) & (df_train_2.label == 1))]
temp2 = df_train_2[(df_train_2.assist_1440 <= -1*diff_assists) | (df_train_2.assist_1440 >= diff_assists)]
print('Probability that the %d assist margin half-time leader (%.3f of remaining train set) is the full-time winner %.3f' % (diff_assists, len(temp2)/len(df_train_2), len(same_mt_f)/len(temp2)))

Probability that the 6 assist margin half-time leader (0.138 of remaining train set) is the full-time winner 0.610


In [32]:
diff_goals = 1
temp = df_train_2[(df_train_2.score_1440 <= -1*diff_goals) | (df_train_2.score_1440 >= diff_goals)]
same_mt_f = df_train_2[((df_train_2.score_1440 <= -1*diff_goals) & (df_train_2.label == 0)) | ((df_train_2.score_1440 >= diff_goals) & (df_train_2.label == 1))]
print('Probability that the %d pt margin half-time leader (%.3f of remaining train set) is the full-time winner %.3f' % (diff_goals, len(temp)/len(df_train_2), len(same_mt_f)/len(temp)))

Probability that the 1 pt margin half-time leader (0.921 of remaining train set) is the full-time winner 0.617


In [4]:
def predict_from_rules(x): 
    # Score margin at HT
    diff_goals = 8
    if x.score_1440 <= -1*diff_goals:
        return 0.15
    if x.score_1440 >= diff_goals:
        return 0.85
    
    diff_goals = 6
    if x.score_1440 <= -1*diff_goals:
        return 0.28
    if x.score_1440 >= diff_goals:
        return 0.72
    
    diff_goals = 1
    if x.score_1440 <= -1*diff_goals:
        return 0.38
    if x.score_1440 >= diff_goals:
        return 0.62
    
    # Assist margin at HT
    diff_assists = 6
    if x.assist_1440 <= -1*diff_assists:
        return 0.39
    if x.assist_1440 >= diff_assists:
        return 0.61
    
    return 0.5

In [5]:
df_temp = df_val.copy()
df_temp['prediction'] = df_temp.apply(predict_from_rules, axis=1)

In [6]:
(df_temp['prediction'].apply(round) == df_temp['label']).mean()

0.7233704292527822

### Random Forest

In [7]:
y_train = df_train.label.values
X_train = df_train.drop(['ID', 'label'], axis=1)

In [8]:
y_val = df_val.label.values
X_val = df_val.drop(['ID', 'label'], axis=1)

In [9]:
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3)

In [10]:
_ = rf.fit(X_train, y_train)

In [11]:
y_pred = rf.predict(X_val)
np.mean(y_pred == y_val)

0.7452305246422893

In [12]:
df_temp['RF_prediction'] = rf.predict_proba(X_val)[:,1]

In [133]:
i, k = 2,5
df_temp['pred'] = (df_temp['prediction']*i + df_temp['RF_prediction']*k)/(i+k)
(df_temp['pred'].apply(round) == df_temp['label']).mean()

0.744435612082671

### Ensemble methods

In [13]:
from keras.models import load_model

In [14]:
my_model = load_model('./my_model_convlstm.h5')

#### Data preparation

In [78]:
def prepare_data(df, conf, test=False):
    """
    :param conf: score min to keep RF predictions
    """
    to_pred = df[(df['RF_prediction'] > 1-conf) & (df_temp['RF_prediction'] < conf)]
    ids = to_pred.ID.values
    # Add total rebound
    for k in range(1,1441):
        to_pred['total rebound_%d' % k] = to_pred['offensive rebound_%d' % k] + to_pred['defensive rebound_%d' % k]
    temp = to_pred
    if not test:
        cols_kept = ['ID', 'label', 'prediction', 'RF_prediction']
    else:
        cols_kept = ['ID', 'prediction', 'RF_prediction']
    cols_values = ['score', 'offensive rebound', 'defensive rebound',
           'offensive foul', 'defensive foul', 'assist', 'lost ball',
           'steals', 'bad pass', 'block', 'miss', 'total rebound']
    cols_sec = [k for k in temp.columns if any(x in k for x in cols_values)]
    df_id = pd.DataFrame()

    for sec in tqdm(range(1, 1441)):
        df_sec = temp[cols_kept + [k for k in cols_sec if k.split('_')[1] == str(sec)]]
        df_sec.columns = cols_kept + cols_values
        df_sec = df_sec.assign(time_step = sec)
        df_id = pd.concat([df_id, df_sec])
    df_id.ID = df_id.ID.astype("category")
    df_id.ID.cat.set_categories(ids, inplace=True)
    df_id = df_id.sort_values(["ID", 'time_step'])
    X_val = []
    for id_ in tqdm(ids):
        X_val.append(df_id[df_id.ID == id_][cols_values].values)
    X_val = np.array(X_val)
    print(X_val.shape)
    return X_val, to_pred

In [66]:
X_val, to_pred = prepare_data(df_temp, 0.58)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
100%|██████████| 1440/1440 [00:45<00:00, 31.58it/s]
100%|██████████| 518/518 [00:00<00:00, 1372.35it/s]

(518, 1440, 12) (518, 1)





#### Prediction

In [67]:
y_pred = my_model.predict(X_val, batch_size=32)

In [68]:
to_pred['LSTM'] = y_pred[:,1]
lstm_pred = to_pred[['ID','LSTM']].set_index('ID').to_dict()['LSTM']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [69]:
df_temp['LSTM'] = df_temp.apply(lambda x:lstm_pred.get(x.ID, x.RF_prediction), axis=1)

In [75]:
l = []
for i,j,k in itertools.permutations(np.arange(10), 3):
    df_temp['pred'] = (df_temp['prediction']*i + df_temp['RF_prediction']*k + df_temp['LSTM']*j)/(i+k+j)
    r = (df_temp['pred'].apply(round) == df_temp['label']).mean()
    l.append([(i,j,k), r])
sorted(l, key=lambda x:x[1], reverse=True)[:3]

[[(1, 8, 9), 0.75],
 [(1, 7, 9), 0.7496025437201908],
 [(1, 9, 6), 0.7496025437201908]]

### Test

#### RF

In [153]:
X_test = test.drop(['ID'], axis=1)
test['label'] = rf.predict_proba(X_test)[:,1]
test['label'] = test['label'].apply(round)

In [154]:
test[['ID', 'label']].to_csv('pred_RF.csv', index=False)

#### Ensemble

In [80]:
X_test = test.drop(['ID'], axis=1)
test['RF_prediction'] = rf.predict_proba(X_test)[:,1]
test['prediction'] = test.apply(predict_from_rules, axis=1)
X_test, to_pred_test = prepare_data(test, 0.58, True)

  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
100%|██████████| 1440/1440 [00:22<00:00, 63.37it/s]
100%|██████████| 306/306 [00:00<00:00, 1431.62it/s]

(306, 1440, 12)





In [81]:
y_ = my_model.predict(X_test, batch_size=32)
to_pred_test['LSTM'] = y_[:,1]
lstm_pred = to_pred_test[['ID','LSTM']].set_index('ID').to_dict()['LSTM']
test['LSTM'] = test.apply(lambda x:lstm_pred.get(x.ID, x.RF_prediction), axis=1)
i, j, k = 1, 8, 9
test['label'] = (test['prediction']*i + test['RF_prediction']*k + test['LSTM']*j)/(i+k+j)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [86]:
test[['ID', 'label']].to_csv('pred_RF2.csv', index=False)