# SVC with hyperparameter tuning

No standardisation

Pre-requisite: nommesen_april-week2-prepare_data.ipynb

In [1]:
import pandas as pd
import numpy as np
import math

# SKLearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

import joblib

import xgboost as xgb
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin



## Loading the training data set

In [2]:
alltrain = np.load('../data/processed/alltrain.npy')

print("Dimension of training data - unsplit", alltrain.shape)

df_alltrain = pd.DataFrame(alltrain, 
             columns=['ID',
                 'Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers',
     'TARGET_5Yrs'
                     ])

df_alltrain['TARGET_5Yrs'] = df_alltrain['TARGET_5Yrs'].astype(dtype ='int64')
df_alltrain['ID'] = df_alltrain['ID'].astype(dtype ='str')

df_alltrain.head()

Dimension of training data - unsplit (8000, 21)


Unnamed: 0,ID,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,...,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers,TARGET_5Yrs
0,3799.0,80.0,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,...,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800.0,75.0,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,...,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801.0,85.0,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,...,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802.0,63.0,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,...,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803.0,63.0,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,...,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


## Feature selection
All features are used

In [3]:
target = df_alltrain.pop('TARGET_5Yrs')
features = df_alltrain[['Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers']]

# target.info()
# features.info()

## Splitting data
Split randomly the dataset with random_state=8 into 2 different sets: training data (80%) and validation data (20%)

In [4]:
X_train, X_val, y_train, y_val = train_test_split (features, 
                                                   target,
                                                   test_size=0.2,
                                                   random_state=8
                                                  )

print("Dimension of features training data", X_train.shape)
print("Dimension of target training data", y_train.shape)
print("Dimension of features validation data", X_val.shape)
print("Dimension of targer validation data", y_val.shape)

Dimension of features training data (6400, 19)
Dimension of target training data (6400,)
Dimension of features validation data (1600, 19)
Dimension of targer validation data (1600,)


In [5]:
X_train

Unnamed: 0,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,Free Throw Made,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers
3617,64.0,13.9,4.3,1.5,3.6,43.0,-0.2,-0.1,11.9,1.4,1.9,69.2,0.6,1.7,2.2,0.6,0.4,0.1,0.7
1120,81.0,20.8,7.6,2.7,6.2,44.8,0.4,1.1,12.1,1.8,2.1,79.0,1.7,4.0,5.6,0.7,0.6,0.1,1.3
3873,50.0,5.6,1.4,0.5,1.5,36.4,0.0,0.5,-3.1,0.2,0.2,68.6,0.1,0.2,0.4,1.5,0.4,-0.3,0.3
153,72.0,19.3,5.0,2.0,5.0,40.4,0.1,0.5,25.3,0.9,1.6,48.4,0.7,2.0,2.9,1.1,0.5,0.2,0.9
2960,45.0,19.7,6.1,2.5,5.9,42.1,0.1,0.4,-2.7,1.1,1.7,62.0,1.0,1.5,2.6,1.5,0.4,0.3,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,74.0,46.4,18.7,7.4,16.3,45.3,0.8,2.3,42.2,3.1,4.5,68.0,1.9,6.2,8.4,3.4,1.4,0.3,3.0
2409,59.0,14.8,3.4,1.0,3.2,33.7,0.5,1.7,14.3,1.3,2.0,56.3,0.7,1.2,2.1,1.5,0.9,0.1,0.7
2033,55.0,15.6,3.3,1.1,2.4,44.4,0.6,1.8,32.0,0.7,1.0,72.7,1.1,2.3,3.3,1.0,0.5,0.3,0.8
1364,91.0,47.9,12.1,4.2,9.6,46.0,0.4,1.3,-0.6,3.2,4.6,66.4,3.0,6.5,9.8,4.2,0.9,0.6,1.6


## Model Development

### SVC using the balanced mode (SVC1)

In [6]:
# instantiate the model (using the "balanced" mode)
svc1_train = SVC(kernel='poly', 
                 class_weight='balanced', 
                 probability=True)

# fit the model with data
svc1_train.fit(X_train, y_train)


In [7]:
# save the model to disk
joblib.dump(svc1_train, '../models/svc1.sav')

['../models/svc1.sav']

### SVC with hyperparameter tuning using Hyperopt

In [None]:
# Define the search space for xgboost hyperparameters

space={
    # 'c' : hp.quniform('c', 0.1, 100, 10),
       'kernel' : hp.choice('kernel', ['poly', 'sigmoid', 'rbf', 'linear']),
       'degree' : hp.quniform('degree', 1, 5, 1),
       'gamma' : hp.choice('gamma', ['auto', 'scale']),
       'class_weight' : 'balanced'       
    }


# Define a function called `objective` with the following logics:
# input parameters: hyperparameter seacrh space (`space`)
# logics: train a xgboost model with the search space and calculate the average accuracy score for cross validation with 10 folds
# output parameters: dictionary with the loss score and STATUS_OK


def objective(space):
    from sklearn.model_selection import cross_val_score
    
    svc = SVC(
        # c = space['c'],
        kernel = space['kernel'],
        degree = int(space['degree']),
        gamma = space['gamma'],
        class_weight='balanced', 
        probability=True
    )
            
    auc = cross_val_score(svc, X_train, y_train, cv=10, scoring="roc_auc").mean()

    return{'loss': auc, 'status': STATUS_OK }



# Launch Hyperopt search and save the result in a variable called `best`
best = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=10
)


# Print the best set of hyperparameters
print("Best: ", best)


 10% 1/10 [01:48<16:18, 108.75s/trial, best loss: 0.5000939849624061]

## Model Evaluation

In [None]:
# Load the model from the file 
svc3_from_joblib = joblib.load('../models/svc3.sav') 

In [None]:
# Predict on training set
y_pred_ontrain = svc3_from_joblib.predict(X_train)

# Accuracy
print( accuracy_score(y_pred_ontrain, y_train) )

### Prediction on validation data

In [None]:
y_pred_onval = svc1_from_joblib.predict(X_val)
y_pred_onval

### Confusion Matrix

In [None]:
cnf_matrix = metrics.confusion_matrix(y_val, y_pred_onval)
cnf_matrix


In [None]:
class_names = [0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label');

### Accuracy, Precision, and Recall

In [None]:
target_names = ['career years played < 5', 'career years played >= 5']
print(classification_report(y_val, y_pred_onval, target_names=target_names))

### ROC

In [None]:
# y_pred_proba = logreg.predict_proba(X_test)[::,1]
y_proba_onval = svc1_from_joblib.predict_proba(X_val)[::,1]  # official
fpr, tpr, _ = metrics.roc_curve(y_val,  y_proba_onval)
auc = metrics.roc_auc_score(y_val, y_proba_onval)

plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

## Test on unseen data

### Loading the test data set

In [None]:
test = np.load('../data/processed/test.npy')

print("Dimension of training data - unsplit", test.shape)

df_test = pd.DataFrame(test, 
             columns=['ID',
                 'Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers'
                     ])

df_test['ID'] = df_test['ID'].astype(int)
df_test['ID'] = df_test['ID'].astype(str)

# df_test_indexed=df_test.set_index('ID')

df_test.head()

In [None]:
df_test.info()

In [None]:
df_features_test = df_test[['Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers']]

df_features_test.head()

### Predictions

In [None]:
# Use the loaded model to make predictions 
y_pred_ontest = svc1_from_joblib.predict(df_features_test)

df_test_predictions = pd.DataFrame(y_pred_ontest, columns=['TARGET_5Yrs'])

df_test_predictions.TARGET_5Yrs.value_counts()

In [None]:
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

sns.countplot(x = 'TARGET_5Yrs',
              data = df_test_predictions,
              palette = 'hls'
             )
plt.show()
plt.savefig('count_plot')

### Submit Prediction to Kaggle

In [None]:
# Use the loaded model to make predictions 
y_test = svc1_from_joblib.predict_proba(df_features_test)

In [None]:
y_test

In [None]:
df_test_probabilites = pd.DataFrame(y_test, 
             columns=['TARGET_5Yrs_0',
                 'TARGET_5Yrs'
                     ])

In [None]:
df_test_probabilites.head()

In [None]:
output = df_test.join(df_test_probabilites)
output=output.set_index('ID')

output

In [None]:
output['TARGET_5Yrs'].to_csv('../models/nommesen_april_week3_model-svc1.csv')