In [20]:
'''
This is a snippet of code showing how to select the hyper-parameters
of a MKL method using boolean kernels
Author: Ivano Lauriola, ivano.lauriola@phd.unipd.it
'''
import MKLpy
import pandas as pd
import numpy as np
import sklearn.svm as svm
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold, cross_val_score
from sklearn.multiclass import OneVsRestClassifier  # support from multiclass
import time
from sklearn.svm import SVC



# Load scikit-learn's iris dataset; the feature matrix is bound to X and the
# class labels to Y, and both names are read by later cells.
print ('loading \'iris\' (multiclass) dataset...', end='')
from sklearn.datasets import load_iris
ds = load_iris()
X = ds.data
Y = ds.target

# WARNING: be sure that your matrix is not sparse! EXAMPLE:
#   from sklearn.datasets import load_svmlight_file
#   X,Y = load_svmlight_file(...)
#   X = X.toarray()

print ('done')

loading 'iris' (multiclass) dataset...done


In [46]:
def open_pickle_file(path, pickle_file, encoding="ASCII"):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    path : str
        Directory that contains the pickle file.
    pickle_file : str
        Name of the pickle file inside ``path``.
    encoding : str, optional
        Forwarded to ``pickle.load``. The default "ASCII" is ``pickle.load``'s
        own default, so existing two-argument calls behave as before. Pass
        "latin1" when reading pickles written by Python 2 (as the feature
        pickles loaded elsewhere in this notebook are).

    Returns
    -------
    object
        Whatever object was serialised in the file.

    Raises
    ------
    OSError
        Propagated from ``open`` if the file cannot be opened.
    pickle.UnpicklingError
        Propagated from ``pickle.load`` if the content is not a valid pickle.
    """
    file_loc = os.path.join(path, pickle_file)
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(file_loc, "rb") as handle:
        return pickle.load(handle, encoding=encoding)

In [47]:
# Display the dimensions of the array currently bound to X.
# NOTE(review): the recorded output (569, 30) has the same row count as the
# Y.shape output shown after the breast-cancer cell, so it presumably reflects
# stale kernel state rather than the iris data loaded above - confirm by
# re-running from a fresh kernel.
X.shape

(569, 30)

In [48]:
#preprocess data
print ('preprocessing data...', end='')
#boolean kernels can be applied on binary-valued data, i.e. {0,1}.
#in this example, we binarize a real-valued dataset
# X and Y are whatever an earlier cell bound; this cell does not load data itself.
from MKLpy.preprocessing import binarization
binarizer = binarization.AverageBinarizer()
Xbin = binarizer.fit_transform(X,Y)
print ('done')

#compute normalized monotone conjunctive kernels with arity c = 0,1,...,4.
print ('computing monotone Conjunctive Kernels...', end='')
from MKLpy.metrics import pairwise
from MKLpy.preprocessing import kernel_normalization
#WARNING: the maximum arity of the conjunctive kernel depends on the number of active variables for each example,
# that is 4 in the case of iris dataset binarized
# KL holds one normalized kernel matrix per arity, in increasing order of c.
KL = [kernel_normalization(pairwise.monotone_conjunctive_kernel(Xbin, c=c)) for c in range(5)]
print ('done')

preprocessing data...done
computing monotone Conjunctive Kernels...done


In [49]:

#train/test KL split (N.B. here we split a kernel list directly)
# NOTE: this import rebinds train_test_split, which the top-of-notebook imports
# took from sklearn.model_selection.
from MKLpy.model_selection import train_test_split
KLtr,KLte,Ytr,Yte = train_test_split(KL, Y, test_size=.3, random_state=42)

#MKL algorithms
from MKLpy.algorithms import EasyMKL, KOMD	#KOMD is not a MKL algorithm but a simple kernel machine like the SVM
# NOTE: cross_val_score is rebound here as well (sklearn's version was imported at the top).
from MKLpy.model_selection import cross_val_score, cross_val_predict
from sklearn.svm import SVC
import numpy as np
# Only the progress message is printed here (without a newline); the lambda
# search loop itself is in a later cell.
print ('tuning lambda for EasyMKL...', end='')
base_learner = SVC(C=10000)	#a very large C approximates a hard-margin SVM
# Empty record that the later lambda-search cell fills with {'lam', 'score'}.
best_results = {}


tuning lambda for EasyMKL...

In [50]:

#MKL algorithms
from MKLpy.algorithms import AverageMKL, EasyMKL, KOMD	#KOMD is not a MKL algorithm but a simple kernel machine like the SVM
# #select the base-learner
# #MKL algorithms use a hard-margin SVM as base learner (or KOMD in the case of EasyMKL). It is possible to define a different base learner
# from sklearn.svm import SVC
# base_learner = SVC(C=0.1)
# clf = EasyMKL(learner=base_learner)
# clf = clf.fit(KLtr,Ytr)
# Fit the averaging baseline on the training kernel list and keep its combined kernel.
print ('training AverageMKL...', end='')
clf = AverageMKL().fit(KLtr,Ytr)	#a wrapper for averaging kernels
print ('done')
print(clf.weights)			#print the weights of the combination of base kernels
K_average = clf.ker_matrix	#the combined kernel matrix


# clf is rebound below, so only K_average retains anything from the AverageMKL fit.
print ('training EasyMKL...', end='')
clf = EasyMKL(lam=0.1).fit(KLtr,Ytr)		#combining kernels with the EasyMKL algorithm
#lam is a hyper-parameter in [0,1]
print ('done')
print (clf.weights)

# Evaluation is disabled in this cell; the block below is kept for reference only.
# #evaluate the solution
# from sklearn.metrics import accuracy_score, roc_auc_score
# y_pred = clf.predict(KLte)					#predictions
# y_score = clf.decision_function(KLte)		#rank
# accuracy = accuracy_score(Yte, y_pred)
# roc_auc = roc_auc_score(Yte, y_score)
# print ('Accuracy score: %.3f, roc AUC score: %.3f' % (accuracy, roc_auc))




training AverageMKL...done
[0.2 0.2 0.2 0.2 0.2]
training EasyMKL...done
[-9.71714556e-17  6.14493694e-02  1.64774518e-01  3.03677199e-01
  4.70098913e-01]


In [51]:
# Rebind clf to a fresh, unfitted AverageMKL; the EasyMKL model fitted in the
# previous cell is no longer reachable through this name.
clf=AverageMKL()

In [52]:
#load data
# Reloads iris, rebinding X and Y, and records the distinct class labels.
print ('loading \'iris\' dataset...', end='')
from sklearn.datasets import load_iris
import numpy as np
ds = load_iris()
X,Y = ds.data, ds.target
classes = np.unique(Y)
print ('done [%d classes]' % len(classes))

'''
WARNING: be sure that your matrix is not sparse! EXAMPLE:
from sklearn.datasets import load_svmlight_file
X,Y = load_svmlight_file(...)
X = X.toarray()
'''

#compute homogeneous polynomial kernels with degrees 1,2,3.
print ('computing Homogeneous Polynomial Kernels...', end='')
from MKLpy.metrics import pairwise
KL = [pairwise.homogeneous_polynomial_kernel(X, degree=d) for d in range(1,4)]
# NOTE: Xbin and kernel_normalization are not defined in this cell; they come
# from the earlier binarization cell, so Xbin reflects the X that was bound
# when that cell ran, not necessarily the X reloaded above.
KLconjuctive = [kernel_normalization(pairwise.monotone_conjunctive_kernel(Xbin, c=c)) for c in range(5)]
print ('done')


# Training is disabled in this cell; the block below is kept for reference only.
# #MKL algorithms
# from MKLpy.algorithms import EasyMKL
# print ('training EasyMKL...', end='')
# clf = EasyMKL(lam=0.1).fit(KL,Y)		#combining kernels with the EasyMKL algorithm
# #multiclass_strategy should be 'ovo' for one-vs-one decomposition strategy, and 'ova' for one-vs-all/rest strategy
# print ('done')

# print (clf.weights)

loading 'iris' dataset...done [3 classes]
computing Homogeneous Polynomial Kernels...done


In [110]:
# Display the dimensions of the array currently bound to X.
# NOTE(review): this cell's execution count (110) is far ahead of its
# neighbours, so the recorded output presumably comes from a later kernel
# state than the cell above - confirm by re-running in order.
X.shape

(569, 30)

In [53]:


# Load the breast-cancer dataset; X (features) and Y (labels) are rebound here.
print ('loading \'breast cancer\' dataset...', end='')
from sklearn.datasets import load_breast_cancer
ds = load_breast_cancer()
X = ds.data
Y = ds.target
print ('done')

# WARNING: be sure that your matrix is not sparse! EXAMPLE:
#   from sklearn.datasets import load_svmlight_file
#   X,Y = load_svmlight_file(...)
#   X = X.toarray()

# Preprocess: scale every feature into [0,1], then normalize each example
# so that ||X_i||_2^2 = 1.
print ('preprocessing data...', end='')
from MKLpy.preprocessing import normalization, rescale_01
X = normalization(rescale_01(X))

# Train/test split; this import rebinds train_test_split to scikit-learn's version.
from sklearn.model_selection import train_test_split
Xtr,Xte,Ytr,Yte = train_test_split(X,Y, test_size=.25, random_state=42)
print ('done')

loading 'breast cancer' dataset...done
preprocessing data...done


In [54]:
# pd.DataFrame(X), print(np.unique(Y))
# Display the shape of the label vector Y.
Y.shape

(569,)

In [55]:
#compute homogeneous polynomial kernels with degrees 0,1,2,...,10.
print ('computing Homogeneous Polynomial Kernels...', end='')
from MKLpy.metrics import pairwise
# Training kernels are computed from Xtr alone; test kernels are computed
# between Xte and Xtr. Both lists hold one matrix per degree, in the same order.
KLtr = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(11)]
KLte = [pairwise.homogeneous_polynomial_kernel(Xte,Xtr, degree=d) for d in range(11)]
print ('done')


computing Homogeneous Polynomial Kernels...done


In [56]:
#MKL algorithms
from MKLpy.algorithms import AverageMKL, EasyMKL, KOMD	#KOMD is not a MKL algorithm but a simple kernel machine like the SVM
# Fit the averaging baseline on the training kernel list and keep its combined kernel.
print ('training AverageMKL...', end='')
clf = AverageMKL().fit(KLtr,Ytr)	#a wrapper for averaging kernels
print ('done')
print(clf.weights)			#print the weights of the combination of base kernels
K_average = clf.ker_matrix	#the combined kernel matrix


# clf is rebound below, so only K_average retains anything from the AverageMKL fit.
print ('training EasyMKL...', end='')
clf = EasyMKL(lam=0.1).fit(KLtr,Ytr)		#combining kernels with the EasyMKL algorithm
#lam is a hyper-parameter in [0,1]
print ('done')
print (clf.weights)

training AverageMKL...done
[0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909
 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909]
training EasyMKL...done
[-1.08387208e-24  1.35673700e-02  2.76971359e-02  4.30853034e-02
  6.02816730e-02  7.96701534e-02  1.01481855e-01  1.25820437e-01
  1.52689783e-01  1.82019528e-01  2.13686761e-01]


In [57]:
#evaluate the solution
# clf at this point is whatever the previous cell left bound (there, the
# EasyMKL(lam=0.1) model); the scores printed below refer to that model.
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = clf.predict(KLte) #predictions
y_score = clf.decision_function(KLte) #rank
accuracy = accuracy_score(Yte, y_pred)
roc_auc = roc_auc_score(Yte, y_score)
print ('Accuracy score: %.3f, roc AUC score: %.3f' % (accuracy, roc_auc))


#select the base-learner
#MKL algorithms use a hard-margin SVM as base learner (or KOMD in the case of EasyMKL). It is possible to define a different base learner
# NOTE: from here on clf is a new EasyMKL wrapping this SVC, fitted after the
# scores above were printed; base_learner replaces any earlier binding of that name.
# NOTE(review): the SVC is created with kernel='rbf' while the MKL model is
# fitted on kernel matrices - confirm how MKLpy configures the learner's kernel.
from sklearn.svm import SVC
base_learner = SVC(C=0.1, kernel='rbf')
clf = EasyMKL(learner=base_learner)
clf = clf.fit(KLtr,Ytr)

Accuracy score: 0.923, roc AUC score: 0.988


clf.predict(KLte)

In [58]:
# Grid search over EasyMKL's lambda using 5-fold cross-validation on the
# training kernels (KLtr, Ytr); base_learner and cross_val_predict come from
# earlier cells.
# Start from an empty record so this cell is idempotent: re-running it (or
# running it after the kernels changed) must not compare against a score left
# over from a previous search.
best_results = {}
for lam in [0, 0.01, 0.1, 0.2, 0.9, 1]:	#possible lambda values for the EasyMKL algorithm
    #MKLpy.model_selection.cross_val_predict performs the cross validation automatically, it optimizes the accuracy
    #the counterpart cross_val_score optimizes the roc_auc_score (use score='roc_auc')
    #WARNING: these functions will change in the next version
    scores = cross_val_predict(KLtr, Ytr, EasyMKL(learner=base_learner, lam=lam), n_folds=5, score='accuracy')
    acc = np.mean(scores)
    # Strict '<' keeps the first lambda in the list when several reach the same mean score.
    if not best_results or best_results['score'] < acc:
        best_results = {'lam' : lam, 'score' : acc}

In [59]:
#evaluation on the test set
from sklearn.metrics import accuracy_score
# Presumably closes the 'tuning lambda for EasyMKL...' progress message that an
# earlier cell printed without a trailing newline.
print ('done')
# Refit on the full training kernels with the lambda selected by the search
# cell, then score the predictions on the held-out kernels.
clf = EasyMKL(learner=base_learner, lam=best_results['lam']).fit(KLtr,Ytr)
y_pred = clf.predict(KLte)
accuracy = accuracy_score(Yte, y_pred)
print ('accuracy on the test set: %.3f, with lambda=%.2f' % (accuracy, best_results['lam']))

done
accuracy on the test set: 0.923, with lambda=0.00


In [60]:
import os

# Storage locations used by the cells below.
# NOTE(review): these are absolute paths specific to one machine; consider
# moving them to configuration or environment variables.
workDrive = '/media/ak/WorkDrive/'
mountDrive = '/media/ak/e7ab6eee-f896-4a12-9128-b31bcfb0cd38'  
dataOnlyDrive = '/media/ak/DataOnly'# external data-only drive

In [61]:
# Display the directory entries found directly under dataOnlyDrive.
os.listdir(dataOnlyDrive)

['VXX',
 'VXX_US_Equity-20181027',
 'XM1_Comdty-20181028',
 'TY1_Comdty-20181028',
 'BTCUSD.PERP.BMEX',
 'US1_Comdty-20181028',
 'FinData',
 '.Trash-1000',
 'SYNT_2states',
 'data for testing',
 'raw bloomberg data',
 'crypto',
 'YM1_Comdty_quotes',
 'TU1_Comdty-20181028',
 'old models',
 'YM1_Comdty-20181028',
 'Data',
 'YM1_Comdty_trades',
 'lost+found']

In [62]:
# NOTE(review): data_only_drive is not read by any cell visible here and points
# somewhere different from dataOnlyDrive defined earlier - confirm which is current.
data_only_drive = '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2'  # external data-only drive
# Root data directory, taken from the FINANCE_DATA environment variable.
# os.getenv returns None when the variable is unset, in which case the
# os.path.join calls in the following cell will raise a TypeError.
data_dir = os.getenv('FINANCE_DATA')

In [63]:

# Resolve the label folder and pick one symbol by its position in the listing.
labels = os.path.join(data_dir, 'Labels')
symbols = os.listdir(labels)
idxSymbol = 2
idxDate = 9

# Build the symbol's MODEL_BASED folder path once and reuse it for the listing
# and for the sub-folder selected by position 3.
featuresDates = os.path.join(data_dir, symbols[idxSymbol], 'MODEL_BASED')
featuresDateList = os.listdir(featuresDates)
featuresDateItems = os.path.join(featuresDates, featuresDateList[3])

# File names of the feature pickles inside the selected sub-folder.
listFeaturesPickle = os.listdir(featuresDateItems)

In [64]:
 
# Display the file names found in the selected feature folder.
os.listdir(featuresDateItems)

['CNA.L_3_states_features_date:_20171006_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20171018_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180219_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20171023_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180209_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180228_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180404_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20170926_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180213_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20171020_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20171030_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180206_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180409_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20171027_now:_20181229_.pickle',
 'CNA.L_3_states_features_date:_20180403_now:_20181229_.pickle',
 'CNA.L_3_states_features

In [65]:
# Full path of the first entry in listFeaturesPickle.
# NOTE(review): that list comes from os.listdir, whose ordering is arbitrary,
# so "first" may differ between machines or runs.
featuresPickle =os.path.join(featuresDateItems,listFeaturesPickle[0])

In [66]:
def get_date_from_file(file_, numb_):
    """Return the entry ``file_[numb_]`` with its final extension removed.

    Only the extension is stripped (via ``os.path.splitext``); the rest of
    the name is returned unchanged, so any further parsing of the name is
    left to the caller.
    """
    chosen_name = file_[numb_]
    stem, _extension = os.path.splitext(chosen_name)
    return stem

In [67]:
# Strip the extension from the first pickle name with the helper defined above,
# then take the sixth underscore-separated field of what remains.
labelsDate = get_date_from_file(listFeaturesPickle, 0).split("_")[5]

In [68]:
# Load the selected feature pickle using the 'latin1' encoding.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# The context manager closes the file handle even if unpickling fails
# (the bare open() call used before left it open).
with open(featuresPickle, "rb") as features_handle:
    pickle_to_file = pickle.load(features_handle, encoding='latin1')

In [69]:
# Pull the four feature sets out of the loaded pickle by position.
# NOTE(review): the position-to-name mapping (0 fischer, 1 information,
# 2 gamma, 3 ksi) is taken as given - confirm against the code that wrote the pickle.
fischerFeatures, informationFeatures, gammaFeatures, ksiFeatures = (
    pickle_to_file[0],
    pickle_to_file[1],
    pickle_to_file[2],
    pickle_to_file[3],
)


In [70]:
# Folder with the NON_DIRECTIONAL labels of the selected symbol.
# NOTE: `labels` must still be the path string here; a later cell rebinds that
# name to a DataFrame column, after which re-running this cell would fail.
symbolLabel = os.path.join(labels,symbols[idxSymbol],'NON_DIRECTIONAL')

Unnamed: 0,fischer_score_dlambda,fischer_score_dsigma,fischer_score_dweight
0,2.624230e-16,0.000000,1.169507
1,1.435258e+01,-2237.318650,-1.815221
2,3.245063e+01,-4479.369060,-4.803551
3,5.163437e+01,-4481.338888,-3.297853
4,7.046081e+01,-3919.231495,-6.284311
...,...,...,...
3335,3.650906e+04,824452.905449,-1049.424133
3336,3.653781e+04,825583.945390,-1052.242669
3337,3.657620e+04,839426.960859,-1054.973891
3338,3.630492e+04,828852.744933,-1055.223745


In [72]:
# Path of the label CSV whose file name is the date string held in labelsDate.
labelsFile = os.path.join(symbolLabel,labelsDate+".csv")

In [82]:
# Read the label CSV and replace every missing value with 0.
# NOTE(review): the fill applies to all columns, including the label column -
# confirm that 0 is a sensible label for rows where it was missing.
labelsDf= pd.read_csv(labelsFile).fillna(0)

In [84]:
# Select the label column used as the prediction target.
# NOTE: this rebinds `labels`, which earlier cells set to the labels directory
# path; cells that build paths from `labels` cannot be re-run after this one.
labels =labelsDf['label_PrMov__window_5__thres_arbitrary__0.1']

In [86]:
# Target vector for the experiment: the label column selected in the previous cell.
Y_all = labels

In [111]:
# Feature matrix for the experiment: the Fisher-score features converted to a NumPy array.
X_all = np.asarray(fischerFeatures)

In [112]:
# Display the dimensions of the feature matrix.
X_all.shape

(3340, 3)

In [116]:
# rescale_01 and normalization are not imported in this cell; they are the
# MKLpy.preprocessing helpers imported by an earlier cell.
X_rescaled = rescale_01(X_all) #feature scaling in [0,1]
X_normz = normalization(X_rescaled) #||X_i||_2^2 = 1
# X_normz= np.asanyarray(X_normz)
# #train/test split

# NOTE(review): train_test_split has been imported from both
# MKLpy.model_selection and sklearn.model_selection in earlier cells; which one
# runs here depends on cell execution order - confirm.
# test_size=.85 holds out 85% of the rows, leaving 15% for training.
Xtr,Xte,Ytr,Yte = train_test_split(X_normz,Y_all, test_size=.85, random_state=42)
print ('done')

done


In [117]:
#compute homogeneous polynomial kernels with degrees 0,1,2,...,10.
print ('computing Homogeneous Polynomial Kernels...', end='')
from MKLpy.metrics import pairwise
# Training kernels are computed from Xtr alone; test kernels are computed
# between Xte and Xtr. Both lists hold one matrix per degree, in the same order.
KLtr = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(11)]
KLte = [pairwise.homogeneous_polynomial_kernel(Xte,Xtr, degree=d) for d in range(11)]
print ('done')

computing Homogeneous Polynomial Kernels...done


In [118]:
#MKL algorithms
from MKLpy.algorithms import AverageMKL, EasyMKL, KOMD	#KOMD is not a MKL algorithm but a simple kernel machine like the SVM
# Fit the averaging baseline on the training kernel list and keep its combined kernel.
print ('training AverageMKL...', end='')
clf = AverageMKL().fit(KLtr,Ytr)	#a wrapper for averaging kernels
print ('done')
print(clf.weights)			#print the weights of the combination of base kernels
K_average = clf.ker_matrix	#the combined kernel matrix


# clf is rebound below, so only K_average retains anything from the AverageMKL fit.
print ('training EasyMKL...', end='')
clf = EasyMKL(lam=0.1).fit(KLtr,Ytr)		#combining kernels with the EasyMKL algorithm
#lam is a hyper-parameter in [0,1]
print ('done')
print (clf.weights)

training AverageMKL...done
[0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909
 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909]
training EasyMKL...done
[8.74391642e-15 1.47818727e-02 9.72459861e-03 1.07557648e-02
 2.84257802e-02 4.47950332e-02 6.17903138e-02 9.37720229e-02
 1.50867167e-01 2.36132749e-01 3.48954698e-01]


In [119]:
#evaluate the solution
# clf at this point is whatever the previous cell left bound (there, the
# EasyMKL(lam=0.1) model); the scores printed below refer to that model.
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = clf.predict(KLte) #predictions
y_score = clf.decision_function(KLte) #rank
accuracy = accuracy_score(Yte, y_pred)
roc_auc = roc_auc_score(Yte, y_score)
print ('Accuracy score: %.3f, roc AUC score: %.3f' % (accuracy, roc_auc))


#select the base-learner
#MKL algorithms use a hard-margin SVM as base learner (or KOMD in the case of EasyMKL). It is possible to define a different base learner
# NOTE: from here on clf is a new EasyMKL wrapping this SVC, fitted after the
# scores above were printed; base_learner replaces any earlier binding of that name.
# NOTE(review): the SVC is created with kernel='rbf' while the MKL model is
# fitted on kernel matrices - confirm how MKLpy configures the learner's kernel.
from sklearn.svm import SVC
base_learner = SVC(C=0.1, kernel='rbf')
clf = EasyMKL(learner=base_learner)
clf = clf.fit(KLtr,Ytr)

Accuracy score: 0.784, roc AUC score: 0.669


In [120]:
# NOTE(review): this cell repeats the AverageMKL/EasyMKL training cell above
# line for line and its recorded output is identical; it also overwrites the
# clf fitted with the SVC base learner in the previous cell. Consider removing
# it or factoring the shared steps into a function.
from MKLpy.algorithms import AverageMKL, EasyMKL, KOMD	#KOMD is not a MKL algorithm but a simple kernel machine like the SVM
print ('training AverageMKL...', end='')
clf = AverageMKL().fit(KLtr,Ytr)	#a wrapper for averaging kernels
print ('done')
print(clf.weights)			#print the weights of the combination of base kernels
K_average = clf.ker_matrix	#the combined kernel matrix


print ('training EasyMKL...', end='')
clf = EasyMKL(lam=0.1).fit(KLtr,Ytr)		#combining kernels with the EasyMKL algorithm
#lam is a hyper-parameter in [0,1]
print ('done')
print (clf.weights)

training AverageMKL...done
[0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909
 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909]
training EasyMKL...done
[8.74391642e-15 1.47818727e-02 9.72459861e-03 1.07557648e-02
 2.84257802e-02 4.47950332e-02 6.17903138e-02 9.37720229e-02
 1.50867167e-01 2.36132749e-01 3.48954698e-01]
