In [40]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import itertools

%matplotlib inline

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
label = pd.read_csv('./challenge_output_data_training_file_nba_challenge.csv', sep=';')
train = pd.merge(train, label, on='ID')

In [3]:
df_train, df_val = train_test_split(train, test_size=0.2, random_state=42)

### Rules

In [4]:
diff_goals = 8
temp = df_train[(df_train.score_1440 <= -1*diff_goals) | (df_train.score_1440 >= diff_goals)]
same_mt_f = df_train[((df_train.score_1440 <= -1*diff_goals) & (df_train.label == 0)) | ((df_train.score_1440 >= diff_goals) & (df_train.label == 1))]
print('Probability that the %d pt margin half-time leader (%.3f of full train set) is the full-time winner %.3f' % (diff_goals, len(temp)/len(df_train), len(same_mt_f)/len(temp)))

Probability that the 8 pt margin half-time leader (0.464 of full train set) is the full-time winner 0.854


In [6]:
df_train_2 = df_train[~df_train.ID.isin(temp.ID.values)]
diff_assists = 6
same_mt_f = df_train_2[((df_train_2.assist_1440 <= -1*diff_assists) & (df_train_2.label == 0)) 
                   | ((df_train_2.assist_1440 >= diff_assists) & (df_train_2.label == 1))]
temp2 = df_train_2[(df_train_2.assist_1440 <= -1*diff_assists) | (df_train_2.assist_1440 >= diff_assists)]
print('Probability that the %d assist margin half-time leader (%.3f of remaining train set) is the full-time winner %.3f' % (diff_assists, len(temp2)/len(df_train_2), len(same_mt_f)/len(temp2)))

Probability that the 6 assist margin half-time leader (0.138 of remaining train set) is the full-time winner 0.610


In [32]:
diff_goals = 1
temp = df_train_2[(df_train_2.score_1440 <= -1*diff_goals) | (df_train_2.score_1440 >= diff_goals)]
same_mt_f = df_train_2[((df_train_2.score_1440 <= -1*diff_goals) & (df_train_2.label == 0)) | ((df_train_2.score_1440 >= diff_goals) & (df_train_2.label == 1))]
print('Probability that the %d pt margin half-time leader (%.3f of remaining train set) is the full-time winner %.3f' % (diff_goals, len(temp)/len(df_train_2), len(same_mt_f)/len(temp)))

Probability that the 1 pt margin half-time leader (0.921 of remaining train set) is the full-time winner 0.617


In [123]:
def predict_from_rules(x): 
    # Score margin at HT
    diff_goals = 8
    if x.score_1440 <= -1*diff_goals:
        return 0.15
    if x.score_1440 >= diff_goals:
        return 0.85
    
    diff_goals = 6
    if x.score_1440 <= -1*diff_goals:
        return 0.28
    if x.score_1440 >= diff_goals:
        return 0.72
    
    diff_goals = 1
    if x.score_1440 <= -1*diff_goals:
        return 0.38
    if x.score_1440 >= diff_goals:
        return 0.62
    
    # Assist margin at HT
    diff_assists = 6
    if x.assist_1440 <= -1*diff_assists:
        return 0.39
    if x.assist_1440 >= diff_assists:
        return 0.61
    
    return 0.5

In [124]:
df_temp = df_val.copy()
df_temp['prediction'] = df_temp.apply(predict_from_rules, axis=1)

In [125]:
(df_temp['prediction'].apply(round) == df_temp['label']).mean()

0.7233704292527822

### Random Forest

In [42]:
y_train = df_train.label.values
X_train = df_train.drop(['ID', 'label'], axis=1)

In [45]:
y_val = df_val.label.values
X_val = df_val.drop(['ID', 'label'], axis=1)

In [116]:
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3)

In [117]:
_ = rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [118]:
y_pred = rf.predict(X_val)
np.mean(y_pred == y_val)

0.744435612082671

In [130]:
df_temp['RF_prediction'] = rf.predict_proba(X_val)[:,1]

In [133]:
i, k = 2,5
df_temp['pred'] = (df_temp['prediction']*i + df_temp['RF_prediction']*k)/(i+k)
(df_temp['pred'].apply(round) == df_temp['label']).mean()

0.744435612082671

### Test

In [153]:
X_test = test.drop(['ID'], axis=1)
test['label'] = rf.predict_proba(X_test)[:,1]
test['label'] = test['label'].apply(round)

In [154]:
test[['ID', 'label']].to_csv('pred_RF.csv', index=False)