In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn import metrics as sk_metrics

In [2]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History 
from keras.models import load_model

Using TensorFlow backend.


# Data fetch

In [3]:
def data_fetch(csv_name):
    
    print("Reading the data...")
    df = pd.read_csv('datasets/{}.csv'.format(csv_name))
    
    return df

# Train Test Split

In [4]:
def split_train_test(df,label_col,test_size=0.2):
    
    X = df.loc[:,df.columns != label_col]
    y = df[label_col]

    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=42)
    
    return train_x, test_x, train_y, test_y

# RFE

In [25]:
def rfe_feat_selection(train_x,train_y,label_col,num_features):    
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    print("Selecting the best features for training...")
    step = int(np.ceil(train_x.shape[1] / 100))
    estimator = RandomForestClassifier(warm_start=True, random_state=42)
    selector = RFE(estimator,step=step,n_features_to_select=num_features,verbose=1) 
    selector = selector.fit(train_x, train_y)
    print('No of selected features:',selector.n_features_)
    global selected_cols
    selected_cols = []
    for val,col in zip(selector.support_,train_x.columns):
        if(val == True):
            selected_cols.append(col)
    return selected_cols

# Model Training

In [6]:
def get_search_space():
    
    space = {'num_layers': hp.choice('num_layers',['one_hidden', 'two_hidden']),

                'units1': hp.choice('units1', [32, 64, 128, 256,512]),
                'units2': hp.choice('units2', [32, 64, 128, 256,512]),

                'dropout1': hp.uniform('dropout1', .25,.75),
                'dropout2': hp.uniform('dropout2',  .25,.75),

                'batch_size' : hp.choice('batch_size', [16,32,64,128]),

#                 'nb_epochs' :  500,
                'optimizer': hp.choice('optimizer',['rmsprop', 'adam', 'nadam','sgd']),
                'activation': hp.choice('activation',['relu','sigmoid']),

                'early_stop_rounds': hp.choice('early_stop_rounds',[10,20,30,40,50]),
            }
    return space

In [7]:
def data(csv_name,label_col,num_features):
    
    data = data_fetch(csv_name)
    
    pre_processed_data = data.drop(columns=['Name'])
    
    train_x, test_x, train_y, test_y = split_train_test(df=pre_processed_data,label_col=label_col)
    
    best_features = rfe_feat_selection(train_x,train_y,label_col=label_col,num_features=num_features)
    
    best_features.append(label_col)
    
    feature_selected_data = pre_processed_data[best_features]
    
    x_train, x_test, y_train, y_test = split_train_test(df=feature_selected_data,label_col=label_col)
    
    return data, x_train, x_test, y_train, y_test

In [45]:
def create_model(params):
    
    x_train_temp = x_train.copy()
    x_test_temp = x_test.copy()
    y_train_temp = y_train.copy()
    y_test_temp = y_test.copy()
    
    model = Sequential()
    model.add(Dense(params['units1'], input_shape=(x_train_temp.shape[1],)))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout1']))

#     If we choose 'two_hidden', add an additional layer
    if(params['num_layers'] == 'two_hidden'):
        model.add(Dense(params['units2']))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))
        
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['accuracy'],
                  optimizer=params['optimizer'])
    
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=params['early_stop_rounds'])
    history = History()
    
    model.fit(x_train_temp, y_train_temp,
              batch_size=params['batch_size'],
              epochs=500,
              callbacks=[early_stop, history],
              verbose=0,
              validation_split=0.2)
    
    loss,acc = model.evaluate(x_test_temp, y_test_temp, verbose=0)
    
    # In cases where the loss turns out to be nan (due to bad network architecture)
    # An Assertion error is raised by hyperopt. Because of the nan value of loss.
    # So, to avoid such a case, we update loss to infinity in that case.
    if(np.isnan(acc)):
        print("Testing set Accuracy: NaN")
        return {'loss': np.inf, 'status': STATUS_OK, 'model': model}
    
    print("Testing set Accuracy: %.2f%%" % (acc*100))
    
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

In [9]:
def get_best_model(csv_name,label_col,num_features):
    
    global x_train,x_test,y_train,y_test
    input_df, x_train, x_test, y_train, y_test = data(csv_name=csv_name,label_col=label_col,num_features=num_features)
    
    
    trials=Trials()
    space = get_search_space()
    print("Moulding the network architecture specifically for your data...")
    print("Running evaluatins with various architectures and hyper-parameters...")
    best = fmin(create_model, space, algo=tpe.suggest, max_evals=3, trials=trials)
    best_model = trials.best_trial['result']['model']
    
#     print('\nBest params are:\n')
#     print(best)
#     print('\nBest model:\n')
#     print(best_model.summary())
    
    
    scaled_feature_df = pd.concat([x_train,x_test])
    label_df = pd.concat([y_train,y_test])
    
    pred_df = make_predictions(model=best_model,df=scaled_feature_df)
    
    output_df = pd.merge(input_df,pred_df['predictions'].to_frame(),left_index=True,right_index=True)
    
    return best_model, output_df

In [10]:
def make_predictions(model,df):
    
    # Prdeicting on whole df
    
    predictions = model.predict(df).flatten()
    df['predictions'] = predictions
    
    return df

In [11]:
def driver(csv_name,label_col,num_features):
    
    best_model, output_df = get_best_model(csv_name=csv_name,label_col=label_col,num_features=num_features)
    
#     display_results(best_model=best_model, output_df=output_df)
    
    # Save the model and update config db if user wants to save 
#     save(best_model=best_model, output_df=output_df,training_dummy_columns=training_dummy_columns)
    return best_model,output_df

In [46]:
if __name__ == '__main__':

    csv_name = 'nba_input'
    label_col = 'TARGET_5Yrs'
    num_features = 4
    
    best_model,output_df = driver(csv_name=csv_name,label_col=label_col,num_features=num_features)

Reading the data...
Selecting the best features for training...
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
No of selected features: 4
Moulding the network architecture specifically for your data...
Running evaluatins with various architectures and hyper-parameters...
Testing set Accuracy: 69.92%
Testing set Accuracy: 70.68%
Testing set Accuracy: 69.92%


In [28]:
selected_cols

['GP', 'MIN', 'FG%', 'FT%', 'TARGET_5Yrs']

In [30]:
best_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 64)                320       
_________________________________________________________________
activation_9 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
activation_10 (Activation)   (None, 64)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
__________

# Making predictions on the whole df

In [31]:
output_df['preds'] = np.where(output_df.predictions>=0.5,1,0)

In [33]:
output_df

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs,predictions,preds
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0,0.396241,0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0,0.316577,0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0,0.682243,1
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0,0.525699,1
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0,0.476779,0
5,Tony Bennett,75,11.4,3.7,1.5,3.5,42.3,0.3,1.1,32.5,...,0.2,0.7,0.8,1.8,0.4,0.0,0.7,0.0,0.590159,1
6,Don MacLean,62,10.9,6.6,2.5,5.8,43.5,0.0,0.1,50.0,...,0.5,1.4,2.0,0.6,0.2,0.1,0.7,1.0,0.491318,0
7,Tracy Murray,48,10.3,5.7,2.3,5.4,41.5,0.4,1.5,30.0,...,0.8,0.9,1.7,0.2,0.2,0.1,0.7,1.0,0.399361,0
8,Duane Cooper,65,9.9,2.4,1.0,2.4,39.2,0.1,0.5,23.3,...,0.2,0.6,0.8,2.3,0.3,0.0,1.1,0.0,0.547772,1
9,Dave Johnson,42,8.5,3.7,1.4,3.5,38.3,0.1,0.3,21.4,...,0.4,0.7,1.1,0.3,0.2,0.0,0.7,0.0,0.439888,0


# Analysing results on the test data

In [34]:
test = x_test.copy()

In [35]:
test['predict'] = best_model.predict_classes(x_test)

In [36]:
test['actual'] = y_test

In [38]:
sk_metrics.confusion_matrix(test.actual,test.predict)

array([[ 54,  36],
       [ 37, 139]])

In [39]:
sk_metrics.accuracy_score(test.actual,test.predict)

0.7255639097744361

In [40]:
sk_metrics.precision_score(test.actual,test.predict)

0.7942857142857143

In [41]:
sk_metrics.recall_score(test.actual,test.predict)

0.7897727272727273

In [42]:
sk_metrics.f1_score(test.actual,test.predict)

0.7920227920227919