# probability calibration_SVC

In [1]:
#Since competitors are expected to predict probabilities, this notebook focuses on specific models that estimate probabilities well.
#In the plot below, the closer a model's line follows the empirical line drawn with black dashes,
#the better the model estimates the probability,
#i.e. the better calibrated it is for probability prediction.
#The plot comes from sklearn, and sklearn has a built-in classifier for probability calibration.
#The plot below shows the performance of 3 ML models: SVC, LR and Naive Bayes.
#You can pick 3 different models from here, namely isotonic + sigmoid calibration for SVC and LR.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn import svm, pipeline
from sklearn.kernel_approximation import (RBFSampler,Nystroem)
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split, KFold
from sklearn.metrics import log_loss

In [5]:
# Load the Numerai training and tournament CSV files from the dataset directory.
datadir = "../../Notebooks/24March18/numerai_dataset_99/"
df_train = pd.read_csv(datadir + "numerai_training_data.csv")
df_tournament = pd.read_csv(datadir + "numerai_tournament_data.csv")

# Keep only the tournament rows whose data_type is "validation".
is_validation = df_tournament["data_type"].isin(["validation"])
df_valid = df_tournament[is_validation]

# Feature columns are those whose name contains "feature";
# the target column is taken to be the last column of the training frame.
feature_cols = [name for name in df_train.columns if "feature" in name]
target_col = df_train.columns[-1]

In [6]:
# Extract the feature matrices as arrays: training rows and validation rows.
data_train, data_test = (
    frame[feature_cols].values for frame in (df_train, df_valid)
)

# Replace the era labels with integer codes so they can serve as CV group ids.
df_train["era"] = df_train["era"].factorize()[0]

# Sampling Numerai Data set

To create a meaningful sample data set,
the data should be equally distributed across the target labels,
and the training data should represent every era.
Either the pandas `sample` method or sklearn's `train_test_split` works fine.

In [74]:
# Draw a seeded random half of the training rows with pandas.
df_train_s = df_train.sample(random_state=5, frac=0.5)

In [75]:
# Alternative sampling with sklearn: split the training frame into two halves.
# The whole frame is passed as X, so x_tr / x_te keep every column of df_train
# (including `target`), while y_tr / y_te hold the matching target values.
# random_state is fixed so the split is reproducible across kernel restarts.
x_tr, x_te, y_tr, y_te = train_test_split(
    df_train, df_train.target, test_size=0.5, random_state=5
)

In [76]:
# Class balance of the pandas sample: the two labels are almost equally distributed.
df_train_s["target"].value_counts()

0    98430
1    98376
Name: target, dtype: int64

In [77]:
# Class balance of the sklearn split half: the two labels are almost equally distributed.
x_te["target"].value_counts()

0    98425
1    98382
Name: target, dtype: int64

In [78]:
# Per-era label counts in the sklearn split half.
x_te.groupby("era")["target"].value_counts()

era     target
era1    1          400
        0          397
era10   0          628
        1          615
era100  1          953
        0          945
era101  0          945
        1          944
era102  1         1005
        0          984
era103  1          927
        0          923
era104  0          936
        1          930
era105  1          966
        0          935
era106  0          947
        1          913
era107  0          944
        1          928
era108  1          785
        0          755
era109  1          912
        0          885
era11   1          646
        0          594
era110  0          940
        1          934
era111  0          959
        1          932
                  ... 
era86   1          863
        0          847
era87   1          914
        0          886
era88   1          921
        0          897
era89   0          917
        1          881
era9    0          602
        1          593
era90   0          949
        1          

In [79]:
# Per-era label counts in the pandas sample.
df_train_s.groupby("era")["target"].value_counts()

era     target
era1    1         410
        0         398
era10   0         639
        1         632
era100  1         954
        0         921
era101  0         971
        1         954
era102  1         991
        0         972
era103  0         952
        1         908
era104  0         967
        1         933
era105  0         982
        1         899
era106  0         982
        1         936
era107  1         945
        0         913
era108  0         789
        1         765
era109  1         914
        0         905
era11   0         638
        1         598
era110  0         965
        1         917
era111  0         971
        1         950
                 ... 
era86   1         873
        0         834
era87   0         931
        1         893
era88   0         942
        1         879
era89   0         941
        1         927
era9    0         626
        1         606
era90   0         905
        1         849
era91   0         843
        1        

# Probability Calibration with isotonic and sigmoid method

Try each folding method to see which one results in a better log loss.

In [8]:
# Compare an uncalibrated LinearSVC against sigmoid- and isotonic-calibrated
# versions under three fold strategies (GroupKFold by era, plain cv=10,
# shuffled StratifiedKFold), scoring each by log loss on the validation rows.
LSVM_C = 1e-2  # regularisation strength shared by every LinearSVC below

gkfcv = GroupKFold(n_splits=10)
skfcv = StratifiedKFold(n_splits=10, random_state=27, shuffle=True)

# Materialise the fold indices once. A bare generator passed as `cv` is
# exhausted after one fit; a list lets the estimators be fitted again and
# guarantees both calibration methods see exactly the same folds.
gkf_splits = list(gkfcv.split(data_train, df_train.target, groups=df_train.era))
# StratifiedKFold ignores `groups`, so it is not passed here.
skf_splits = list(skfcv.split(data_train, df_train.target))


def calibrated_lsvm(method, cv):
    """Return an unfitted CalibratedClassifierCV around a fresh LinearSVC.

    method: calibration method, "sigmoid" or "isotonic".
    cv: fold specification forwarded to CalibratedClassifierCV
        (an int or an iterable of (train, test) index pairs).
    """
    return CalibratedClassifierCV(svm.LinearSVC(C=LSVM_C), cv=cv, method=method)


Lsvm = svm.LinearSVC(C=LSVM_C)

sigmoid_calibrated_Lsvm_gkfcv = calibrated_lsvm("sigmoid", gkf_splits)
sigmoid_calibrated_Lsvm_cv = calibrated_lsvm("sigmoid", 10)
sigmoid_calibrated_Lsvm_skfcv = calibrated_lsvm("sigmoid", skf_splits)

isotonic_calibrated_Lsvm_gkfcv = calibrated_lsvm("isotonic", gkf_splits)
isotonic_calibrated_Lsvm_cv = calibrated_lsvm("isotonic", 10)
isotonic_calibrated_Lsvm_skfcv = calibrated_lsvm("isotonic", skf_splits)

# Keep each label next to its estimator so the two can never drift out of sync.
named_clfs = [
    ("Lsvm", Lsvm),
    ("sigmoid_calibrated_Lsvm_gkfcv", sigmoid_calibrated_Lsvm_gkfcv),
    ("sigmoid_calibrated_Lsvm_cv", sigmoid_calibrated_Lsvm_cv),
    ("sigmoid_calibrated_Lsvm_skfcv", sigmoid_calibrated_Lsvm_skfcv),
    ("isotonic_calibrated_Lsvm_gkfcv", isotonic_calibrated_Lsvm_gkfcv),
    ("isotonic_calibrated_Lsvm_cv", isotonic_calibrated_Lsvm_cv),
    ("isotonic_calibrated_Lsvm_skfcv", isotonic_calibrated_Lsvm_skfcv),
]
clf_names = [name for name, _ in named_clfs]
clfs = [clf for _, clf in named_clfs]

for clf_name, clf in named_clfs:
    clf.fit(data_train, df_train.target)

    if hasattr(clf, "predict_proba"):
        prob_pos = clf.predict_proba(data_test)[:, 1]
    else:
        # No predict_proba available: min-max scale the decision scores into
        # [0, 1]. These are not calibrated probabilities, only a baseline.
        scores = clf.decision_function(data_test)
        prob_pos = (scores - scores.min()) / (scores.max() - scores.min())

    print("Classifier:", clf_name)
    print("\tLogLoss: %1.6f" % log_loss(df_valid.target, prob_pos))

Classifier: Lsvm
	LogLoss: 0.723624
Classifier: sigmoid_calibrated_Lsvm_gkfcv
	LogLoss: 0.692757
Classifier: sigmoid_calibrated_Lsvm_cv
	LogLoss: 0.692756
Classifier: sigmoid_calibrated_Lsvm_skfcv
	LogLoss: 0.692844
Classifier: isotonic_calibrated_Lsvm_gkfcv
	LogLoss: 0.692842
Classifier: isotonic_calibrated_Lsvm_cv
	LogLoss: 0.692799
Classifier: isotonic_calibrated_Lsvm_skfcv
	LogLoss: 0.692942
