In [1]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics.pairwise import pairwise_kernels
import joblib
from sklearn import metrics

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

In [2]:
#Hard dataset

X, y = make_classification(n_classes=3, n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=2)

In [3]:
a, b = make_classification(n_classes=3, n_samples=500, n_features=6, n_informative=5, n_redundant=1, random_state=2)
c, d = make_classification(n_classes=3, n_samples=500, n_features=6, n_informative=3, n_redundant=3, random_state=44)
X = np.concatenate((a,c))
y= np.concatenate((b,d))

In [4]:
#Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [5]:
#Serious dataset

X, y = make_classification(n_classes=3, n_samples=1000, n_features=6, n_informative=4, n_redundant=2, random_state=2)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [7]:
from random import randint
import random
from joblib import Parallel, delayed

def _validate(metric, bootstrap_method):
    
    if bootstrap_method is not None:
        if bootstrap_method not in ("rejection","randomchoice"):
            raise ValueError(
                'Invalid preset "%s" for bootstrap_method'
                % bootstrap_method
                )
    
    if metric is not None:
        if metric not in ("rbf", "laplacian"):
            raise ValueError(
                'Invalid preset "%s" for kernel metric'
                % metric
                )
            
def _pickDiverseSeed(sim_matrix,seed=None):
    #creating seed for specific training set, choosing by inverse similarity to previous set
    if seed is None:
            return randint(0,len(sim_matrix)-1)
    else:
        #questo è probabilmente il codice più cringe che io abbia mai scritto,
        #ma non riesco a fargli ritornare un int se non specificando l'indice

        sim_n = 1-sim_matrix[seed]

        candidate = np.random.choice(np.arange(len(sim_matrix)), size=1, replace=True,
                                     p=sim_n/np.sum(sim_n))[0]

        return candidate
    
def _pickDiverseSeed2(sim_matrix,seed): 
    #Randomly picks from an array of the most different seeds from all used ones, does not improve much
    l=len(seed)
    if len(seed) == 0:
        return randint(0,len(sim_matrix)-1)
    else:
        
        candidates = np.zeros(l)
        
        for i in range(0,len(seed)):

            sim_n = 1-sim_matrix[seed[i]]

            candidates[i] = np.random.choice(np.arange(len(sim_matrix)), size=1, replace=True, 
                                             p=sim_n/np.sum(sim_n))[0]
        

        return int(np.random.choice(candidates, size=1, replace=True)[0])
    


In [8]:
def _randomchoice_bootstrap(X, y, similarity_metric, n_estimators, verbose=0):
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    G = pairwise_kernels(X, metric=similarity_metric)
    
    seed=None
    
    for i in range(0,n_estimators):
        instance_X = np.empty(np.shape(X))
        instance_y = np.empty(np.shape(y))
        
        #creating seed for specific training set
        
        if verbose>0:
            oldseed=seed
        
        seed = _pickDiverseSeed(G,seed)
        
        if verbose>0 and oldseed is not None:
            print("Chose new seed",seed,"based on previous one:",oldseed,"w/ similarity:",G[seed][oldseed])
                  

        instance_X[0] = X[seed]
        instance_y[0] = y[seed]
        
        pool = np.arange(np.shape(X)[0])
        pool_prob = np.zeros(np.shape(X)[0])
        pool_tot = np.sum(G[seed])
        
        
        for i in range(np.shape(X)[0]):
            pool_prob[i] = G[seed][i]/pool_tot
        
        if False:  #adds noise, does not increase accuracy
            for i in range(0,int(np.shape(X)[0]/2)):
                offset = 0.3 #random.uniform(0, 1)
                
                index = randint(0,np.shape(X)[0]-1)
                while pool_prob[index] + offset > 1: 
                    index = randint(0,np.shape(X)[0]-1)
                pool_prob[index] += offset
                
                index = randint(0,np.shape(X)[0])-1
                while pool_prob[index] - offset < 0: 
                    index = randint(0,np.shape(X)[0]-1)
                pool_prob[index] -= offset
                
                
#         print(pool_prob)
        
#         print(G[seed])
        
        pick = np.random.choice(pool, size=(len(pool))-1, replace=True, p=pool_prob)
        
        for i in range(len(pick)):
            instance_X[i+1] = X[pick[i]]
            instance_y[i+1] = y[pick[i]]
        
        X_sets.append(instance_X)
        y_sets.append(instance_y)
    
    return X_sets,y_sets

#Test
X_sets,y_sets = _randomchoice_bootstrap(X_train,y_train,"rbf",n_estimators=50,verbose=1)

Chose new seed 721 based on previous one: 514 w/ similarity: 0.3083267850196758
Chose new seed 720 based on previous one: 721 w/ similarity: 0.0007143098538543377
Chose new seed 763 based on previous one: 720 w/ similarity: 4.678039932872618e-08
Chose new seed 3 based on previous one: 763 w/ similarity: 2.256928105067765e-06
Chose new seed 667 based on previous one: 3 w/ similarity: 0.015369182477468175
Chose new seed 444 based on previous one: 667 w/ similarity: 0.09834566024735635
Chose new seed 262 based on previous one: 444 w/ similarity: 6.78640565208693e-09
Chose new seed 660 based on previous one: 262 w/ similarity: 0.026275220510936757
Chose new seed 322 based on previous one: 660 w/ similarity: 0.009304210644180577
Chose new seed 764 based on previous one: 322 w/ similarity: 1.127815256132459e-06
Chose new seed 277 based on previous one: 764 w/ similarity: 0.005416084375754907
Chose new seed 139 based on previous one: 277 w/ similarity: 0.007268065705649296
Chose new seed 666 

In [9]:
#Parallel functions

def _parallel_fit(n_estimators, base_estimator, X, y, i):
       
    model = base_estimator.fit(X[i], y[i])
    model_name = "model"+str(i)+".pkl"
    joblib.dump(model, model_name)
    
def _parallel_models_predict(votes, sim_means, X, Z, i, j):
    print("test")
    model_name = "model"+str(j)+".pkl"
    model = joblib.load(model_name)
                
    votes.append(model.predict_proba(X[i].reshape(1, -1))) 
    sim_means[j] = np.mean(Z[j][i])
    
def _parallel_rejection_bootstrap(G,seed,n_estimators,X,y,X_sets,y_sets):
    instance_X = np.empty(np.shape(X))
    instance_y = np.empty(np.shape(y))

    seed = _pickDiverseSeed(G,seed)

    instance_X[0] = X[seed]
    instance_y[0] = y[seed]       

    n_entries = 1

    #populating training set


    if 1-(len(X_sets)+1)/n_estimators>0.8:
        p_thresh = 0.8
    else:
        p_thresh = np.round(1-(len(X_sets)+1)/n_estimators,1)

    #print("populating set", i, "with probability acceptance",p_thresh)


    while n_entries < len(X): #until the pool isn't filled up

        rand = randint(0,len(X)-1)

        if _accept_entry(G[seed,rand],n_entries/(len(X)-1),p_thresh):

            instance_X[n_entries] = X[rand]
            instance_y[n_entries] = y[rand]
            n_entries+=1

    X_sets.append(instance_X)
    y_sets.append(instance_y)



In [10]:
from sklearn.metrics import pairwise_distances


def _rejection_bootstrap(X, y, similarity_metric, n_estimators, n_jobs):
    
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    G = pairwise_kernels(X, metric=similarity_metric)
    
    seed=None

# CURRENTLY RETURNS ERROR IF n_jobs!=1
#     Parallel(n_jobs=n_jobs)(delayed(_parallel_rejection_bootstrap)(G,seed,n_estimators,X,y,X_sets,y_sets) 
#                                   for i in range(0,n_estimators))
    
    for i in range(0,n_estimators):  #for each estimator  
            _parallel_rejection_bootstrap(G,seed,n_estimators,X,y,X_sets,y_sets)   
    
    return X_sets,y_sets

def _accept_entry(similarity_score, p_ratio, prob_threshold = 0.7):
    #p_ratio gradually shifts the decisional weight from similarity_score to randomness as the test set gets filled up.
    #Specifically, at least least a small number of entries are accepted only through randomness (p_ratio=1)
    #this way it's unlikely for the method to come up with sets with only one class (if not impossible, but I need to check)
    
    #It is also possible to modify the prob_threshold in order to get more or less random values, assuring more variance.
    #I think this tweakability could prove useful to adapt the model to specific instances.
    
    #PLEASE NOTE THAT THIS CODE MAKES SENSE WITH NORMALIZED SIMILARITY SCORES
    #NORMALIZATION FOR FUNCTION THAT DO NOT RETURN [0,1] VALUES STILL NEEDS TO BE IMPLEMENTED.
    #It's not like it doesn't work, but prob calculations take the high road
    
    prob = similarity_score*(1-p_ratio)+ random.random()*p_ratio
    if prob > prob_threshold:
        return True
    else:
        return False
    
    
#Test
X_sets, y_sets = _rejection_bootstrap(X_train, y_train,'rbf',50,1)

In [29]:
"""
   
    Parameters
    ----------
    n_estimators : int, default=50
        The number of models to train.
        
    base_estimator : estimator, default=DecisionTreeClassifier()
        The estimator fitted on  each bootstrapped set.     
        
    n_jobs : int, default=1
        Number of parallel jobs during fitting.
        
    similarity_metric : {"rbf", "laplacian", "cosine"}, string, default="rbf"
        The metric used for pairwise_kernels().
        
    bootstrap_method={"rejection", "randomchoice"}, string, default=TODO
        The bootstrap method of choice.
    
    verbose : int, default = 0
        Controls verbosity during fitting and predicting, 0 being none and 3 being the most detailed. 
    
    


"""



class CustomEstimator(BaseEstimator, ClassifierMixin):
    
    def __init__(self, n_estimators=50,
                base_estimator=DecisionTreeClassifier(),
                n_jobs=1,
                bootstrap_method="rejection",
                similarity_metric="rbf",
                verbose = 0):
        
        
        
        self.n_estimators = n_estimators 
        self.base_estimator = base_estimator
        self.n_jobs = n_jobs
        self.bootstrap_method = bootstrap_method
        self.similarity_metric = similarity_metric
        self.verbose = verbose 
        
        _validate(self.similarity_metric, self.bootstrap_method)
        
    def fit(self,X,y):
        
        self.n_classes_= np.max(y)+1
        
        if self.verbose > 0:
            print("Bootstrapping...")
            
        if self.bootstrap_method=="rejection":  
            self.X_sets, self.y_sets = _rejection_bootstrap(X, y, self.similarity_metric, self.n_estimators, self.n_jobs)
        elif self.bootstrap_method=="randomchoice": 
            self.X_sets, self.y_sets = _randomchoice_bootstrap(X, y, self.similarity_metric, self.n_estimators)
        
        
        if self.verbose > 0:
            print("Fitting...")
        
        Parallel(n_jobs=self.n_jobs)(delayed(_parallel_fit)(self.n_estimators, self.base_estimator, self.X_sets, self.y_sets, i) 
                           for i in range(0,self.n_estimators))
            
        
        self.X_ = X
        self.y_ = y

        if self.verbose > 0:
            print("Done!")
        return self
                
    
    def predict(self, X):
        
        predicted_probabilitiy = self.predict_proba2(X)
        return np.argmax(predicted_probabilitiy, axis=1)
    
    def predict_proba(self, X):
            
        if self.verbose > 0:
            print("Predicting...")
            
        out = np.zeros(shape=(len(X),self.n_classes_))
        
        #compute similarity between instance and each training set
        
        if self.verbose > 0:
            print("Computing models similarity to instance...")
        
        Z=[]
        for j in range (0,self.n_estimators):
            Z.append(pairwise_kernels(X, self.X_sets[j], metric=self.similarity_metric))
#             if self.verbose > 1:
#                 print("Similarity matrix for instance to model",j,":",Z[j])
        
        
        
        if self.verbose > 0:
            print("Making predictions with each trained model...")
            
          
        for i in range(0,len(X)): #for each sample to predict
            if self.verbose > 1:  
                print("\n--- EVALUATING SAMPLE",i," ---")
            
            votes = []
            votes_weighted = []
            sim_means = np.zeros(self.n_estimators)
            
            voting_power = np.zeros(self.n_estimators)
            
            for j in range (0,self.n_estimators): #for each estimator trained
               
                #load estimator model
                model_name = "model"+str(j)+".pkl"
                model = joblib.load(model_name)
                
                votes.append(model.predict_proba(X[i].reshape(1, -1))) 
                sim_means[j] = np.mean(Z[j][i])
                
                
            #DOES NOT WORK WITH n_jobs !=0    
#             Parallel(n_jobs=self.n_jobs)(delayed(_parallel_models_predict)(votes, sim_means, X, Z, i, j) 
#                                          for j in range(0,self.n_estimators))
                
                
            for k in range (0,self.n_estimators):
                     
                voting_power[k] = (100*sim_means[k]/np.sum(sim_means))/100
                votes_weighted.append(votes[k]*voting_power[k])
                
                if self.verbose > 2:
                    print("model",k,"votes",np.round(votes[k],4),"having ~",np.round(sim_means[k],4),
                          "similarity (voting power:",np.round(voting_power[k],4),")")
                
            out[i]=(np.sum(votes_weighted,axis=0))
            if self.verbose > 1:
                print("FINAL PREDICTION:",out[i],)
            
        
        if self.verbose > 0:
            print("Done!")

        return out
    
    def predict_proba2(self, X):
        #New faster implementation but a bit less human-readable. Avoids loading all models for each sample evaluation.
        #It looks like the similarity scores are "less similar" from each other, further investigation required.
        
        if self.verbose > 0:
            print("Predicting...")
            
        out = np.zeros(shape=(len(X),self.n_classes_))
        
        #compute similarity between instance and each training set
        
        if self.verbose > 0:
            print("Computing models similarity to instance...")
        
        Z=np.zeros(shape=(self.n_estimators,len(X),len(self.X_sets[0])))
#         print(np.shape(Z))
            
            
        #print("Z",Z[0][0])
            
        #init
            
        preds = []
        sim_means = np.zeros(self.n_estimators)

        voting_power = np.zeros(self.n_estimators)

        for i in range (0,self.n_estimators): #for each estimator trained
            
            Z[i]=(pairwise_kernels(X, self.X_sets[i], metric=self.similarity_metric))
#             if self.verbose > 1:
#                 print("Similarity matrix for to model",i,":",Z[i])

            #load estimator model
            model_name = "model"+str(i)+".pkl"
            model = joblib.load(model_name)
          
            preds.append(model.predict_proba(X))
            
            
#         print(len(X),len(preds),self.n_classes_)  
#         print(len(a) for a in preds)
        
        votes = np.zeros((len(X),len(preds),self.n_classes_))
#         print(votes)
#         print(votes[0][0])#[estimator][sample]
#         print(preds[0][0])
        
        for i in range (0,len(X)): #for each sample to predict
            
            if self.verbose > 2:  
                    print("\n--- EVALUATING SAMPLE",i," ---")
            
            for j in range (0,self.n_estimators): #for each model
                sim_means[j] = np.mean(Z[j][i])
            
            for j in range (0,self.n_estimators):
                
                
                
                
                votes[i][j] = preds[j][i] * ((100*sim_means[j]/np.sum(sim_means))/100)
                
                if self.verbose > 2:
                    print("model",j,"votes",np.round(votes[i][j],4),"having ~",np.round(sim_means[j],4),
                          "similarity (voting power:",np.round(((100*sim_means[j]/np.sum(sim_means))/100),4),")")
                    
            out[i]=(np.sum(votes[i],axis=0))
            if self.verbose > 1:
                print("FINAL PREDICTION:",out[i],)
                
        return(out)
#         print(votes)
        
            
            
            
            
            
        
            
clf = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='laplacian', bootstrap_method="randomchoice",verbose=1)

clf.fit(X_train, y_train)
y_pred=clf.predict_proba2(X_test)



Bootstrapping...
Fitting...
Done!
Predicting...
Computing models similarity to instance...


In [32]:
# Test instance 

clf = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='laplacian', bootstrap_method="randomchoice",verbose=3)

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print("Accuracy =",metrics.accuracy_score(y_test, y_pred))

Bootstrapping...
Fitting...
Done!
Predicting...
Computing models similarity to instance...

--- EVALUATING SAMPLE 0  ---
model 0 votes [0.0209 0.     0.    ] having ~ 0.1809 similarity (voting power: 0.0209 )
model 1 votes [0.0197 0.     0.    ] having ~ 0.1709 similarity (voting power: 0.0197 )
model 2 votes [0.0126 0.     0.    ] having ~ 0.1095 similarity (voting power: 0.0126 )
model 3 votes [0.0226 0.     0.    ] having ~ 0.1961 similarity (voting power: 0.0226 )
model 4 votes [0.     0.     0.0096] having ~ 0.0828 similarity (voting power: 0.0096 )
model 5 votes [0.0145 0.     0.    ] having ~ 0.1256 similarity (voting power: 0.0145 )
model 6 votes [0.0267 0.     0.    ] having ~ 0.2315 similarity (voting power: 0.0267 )
model 7 votes [0.0211 0.     0.    ] having ~ 0.1832 similarity (voting power: 0.0211 )
model 8 votes [0.0347 0.     0.    ] having ~ 0.301 similarity (voting power: 0.0347 )
model 9 votes [0.0153 0.     0.    ] having ~ 0.1328 similarity (voting power: 0.0153 )


model 33 votes [0.     0.     0.0167] having ~ 0.1319 similarity (voting power: 0.0167 )
model 34 votes [0.     0.     0.0259] having ~ 0.2047 similarity (voting power: 0.0259 )
model 35 votes [0.     0.     0.0308] having ~ 0.244 similarity (voting power: 0.0308 )
model 36 votes [0.     0.     0.0124] having ~ 0.0982 similarity (voting power: 0.0124 )
model 37 votes [0.     0.     0.0201] having ~ 0.1593 similarity (voting power: 0.0201 )
model 38 votes [0.     0.     0.0126] having ~ 0.0994 similarity (voting power: 0.0126 )
model 39 votes [0.     0.     0.0262] having ~ 0.207 similarity (voting power: 0.0262 )
model 40 votes [0.     0.     0.0205] having ~ 0.1622 similarity (voting power: 0.0205 )
model 41 votes [0.     0.0126 0.    ] having ~ 0.0995 similarity (voting power: 0.0126 )
model 42 votes [0.     0.     0.0136] having ~ 0.1076 similarity (voting power: 0.0136 )
model 43 votes [0.     0.     0.0241] having ~ 0.1907 similarity (voting power: 0.0241 )
model 44 votes [0.     

model 30 votes [0.0202 0.     0.    ] having ~ 0.2795 similarity (voting power: 0.0202 )
model 31 votes [0.0236 0.     0.    ] having ~ 0.327 similarity (voting power: 0.0236 )
model 32 votes [0.0222 0.     0.    ] having ~ 0.3078 similarity (voting power: 0.0222 )
model 33 votes [0.0224 0.     0.    ] having ~ 0.3106 similarity (voting power: 0.0224 )
model 34 votes [0.0162 0.     0.    ] having ~ 0.2239 similarity (voting power: 0.0162 )
model 35 votes [0.0173 0.     0.    ] having ~ 0.2391 similarity (voting power: 0.0173 )
model 36 votes [0.0205 0.     0.    ] having ~ 0.2846 similarity (voting power: 0.0205 )
model 37 votes [0.0208 0.     0.    ] having ~ 0.2881 similarity (voting power: 0.0208 )
model 38 votes [0.0196 0.     0.    ] having ~ 0.2714 similarity (voting power: 0.0196 )
model 39 votes [0.0196 0.     0.    ] having ~ 0.2712 similarity (voting power: 0.0196 )
model 40 votes [0.0205 0.     0.    ] having ~ 0.2842 similarity (voting power: 0.0205 )
model 41 votes [0.0191

model 46 votes [0.     0.0185 0.    ] having ~ 0.2298 similarity (voting power: 0.0185 )
model 47 votes [0.     0.0223 0.    ] having ~ 0.2763 similarity (voting power: 0.0223 )
model 48 votes [0.     0.0202 0.    ] having ~ 0.2509 similarity (voting power: 0.0202 )
model 49 votes [0.     0.0219 0.    ] having ~ 0.2718 similarity (voting power: 0.0219 )
FINAL PREDICTION: [0.13868727 0.84066039 0.02065234]

--- EVALUATING SAMPLE 50  ---
model 0 votes [0.     0.0215 0.    ] having ~ 0.3077 similarity (voting power: 0.0215 )
model 1 votes [0.0216 0.     0.    ] having ~ 0.3092 similarity (voting power: 0.0216 )
model 2 votes [0.0151 0.     0.    ] having ~ 0.2155 similarity (voting power: 0.0151 )
model 3 votes [0.0228 0.     0.    ] having ~ 0.3257 similarity (voting power: 0.0228 )
model 4 votes [0.     0.0113 0.    ] having ~ 0.1621 similarity (voting power: 0.0113 )
model 5 votes [0.     0.0159 0.    ] having ~ 0.2267 similarity (voting power: 0.0159 )
model 6 votes [0.     0.0246 0. 

model 48 votes [0.0192 0.     0.    ] having ~ 0.1068 similarity (voting power: 0.0192 )
model 49 votes [0.0213 0.     0.    ] having ~ 0.1185 similarity (voting power: 0.0213 )
FINAL PREDICTION: [0.90091357 0.0847676  0.01431883]

--- EVALUATING SAMPLE 61  ---
model 0 votes [0.     0.     0.0183] having ~ 0.2483 similarity (voting power: 0.0183 )
model 1 votes [0.     0.     0.0214] having ~ 0.291 similarity (voting power: 0.0214 )
model 2 votes [0.     0.     0.0249] having ~ 0.3392 similarity (voting power: 0.0249 )
model 3 votes [0.     0.     0.0192] having ~ 0.2616 similarity (voting power: 0.0192 )
model 4 votes [0.     0.     0.0229] having ~ 0.3113 similarity (voting power: 0.0229 )
model 5 votes [0.     0.     0.0215] having ~ 0.292 similarity (voting power: 0.0215 )
model 6 votes [0.     0.     0.0168] having ~ 0.228 similarity (voting power: 0.0168 )
model 7 votes [0.     0.     0.0197] having ~ 0.2678 similarity (voting power: 0.0197 )
model 8 votes [0.     0.     0.0133] 

model 37 votes [0.0157 0.     0.    ] having ~ 0.1477 similarity (voting power: 0.0157 )
model 38 votes [0.     0.0292 0.    ] having ~ 0.2751 similarity (voting power: 0.0292 )
model 39 votes [0.     0.     0.0149] having ~ 0.1403 similarity (voting power: 0.0149 )
model 40 votes [0.     0.0185 0.    ] having ~ 0.1749 similarity (voting power: 0.0185 )
model 41 votes [0.0279 0.     0.    ] having ~ 0.2631 similarity (voting power: 0.0279 )
model 42 votes [0.     0.     0.0257] having ~ 0.2426 similarity (voting power: 0.0257 )
model 43 votes [0.     0.     0.0154] having ~ 0.1457 similarity (voting power: 0.0154 )
model 44 votes [0.     0.0203 0.    ] having ~ 0.1913 similarity (voting power: 0.0203 )
model 45 votes [0.     0.0279 0.    ] having ~ 0.2634 similarity (voting power: 0.0279 )
model 46 votes [0.     0.0143 0.    ] having ~ 0.135 similarity (voting power: 0.0143 )
model 47 votes [0.     0.0258 0.    ] having ~ 0.2437 similarity (voting power: 0.0258 )
model 48 votes [0.    

model 44 votes [0.    0.    0.019] having ~ 0.3068 similarity (voting power: 0.019 )
model 45 votes [0.     0.     0.0215] having ~ 0.3463 similarity (voting power: 0.0215 )
model 46 votes [0.    0.    0.019] having ~ 0.3059 similarity (voting power: 0.019 )
model 47 votes [0.     0.     0.0214] having ~ 0.3444 similarity (voting power: 0.0214 )
model 48 votes [0.     0.     0.0212] having ~ 0.3411 similarity (voting power: 0.0212 )
model 49 votes [0.     0.0231 0.    ] having ~ 0.3716 similarity (voting power: 0.0231 )
FINAL PREDICTION: [0.01865673 0.16676229 0.81458098]

--- EVALUATING SAMPLE 85  ---
model 0 votes [0.     0.     0.0165] having ~ 0.109 similarity (voting power: 0.0165 )
model 1 votes [0.     0.0194 0.    ] having ~ 0.1283 similarity (voting power: 0.0194 )
model 2 votes [0.     0.     0.0227] having ~ 0.1497 similarity (voting power: 0.0227 )
model 3 votes [0.     0.     0.0188] having ~ 0.1244 similarity (voting power: 0.0188 )
model 4 votes [0.     0.     0.0216] ha

model 38 votes [0.     0.     0.0181] having ~ 0.1183 similarity (voting power: 0.0181 )
model 39 votes [0.     0.     0.0207] having ~ 0.1353 similarity (voting power: 0.0207 )
model 40 votes [0.     0.     0.0233] having ~ 0.1522 similarity (voting power: 0.0233 )
model 41 votes [0.     0.     0.0179] having ~ 0.1167 similarity (voting power: 0.0179 )
model 42 votes [0.     0.     0.0169] having ~ 0.1106 similarity (voting power: 0.0169 )
model 43 votes [0.     0.     0.0237] having ~ 0.1546 similarity (voting power: 0.0237 )
model 44 votes [0.     0.     0.0253] having ~ 0.1651 similarity (voting power: 0.0253 )
model 45 votes [0.     0.     0.0178] having ~ 0.1164 similarity (voting power: 0.0178 )
model 46 votes [0.     0.0215 0.    ] having ~ 0.1404 similarity (voting power: 0.0215 )
model 47 votes [0.     0.     0.0195] having ~ 0.1271 similarity (voting power: 0.0195 )
model 48 votes [0.     0.     0.0178] having ~ 0.1161 similarity (voting power: 0.0178 )
model 49 votes [0.   

model 49 votes [0.     0.0214 0.    ] having ~ 0.1758 similarity (voting power: 0.0214 )
FINAL PREDICTION: [0.17603298 0.63042086 0.19354616]

--- EVALUATING SAMPLE 116  ---
model 0 votes [0.     0.0234 0.    ] having ~ 0.2322 similarity (voting power: 0.0234 )
model 1 votes [0.     0.0196 0.    ] having ~ 0.1945 similarity (voting power: 0.0196 )
model 2 votes [0.0131 0.     0.    ] having ~ 0.1305 similarity (voting power: 0.0131 )
model 3 votes [0.0206 0.     0.    ] having ~ 0.204 similarity (voting power: 0.0206 )
model 4 votes [0.     0.0098 0.    ] having ~ 0.0969 similarity (voting power: 0.0098 )
model 5 votes [0.0137 0.     0.    ] having ~ 0.1357 similarity (voting power: 0.0137 )
model 6 votes [0.0252 0.     0.    ] having ~ 0.2498 similarity (voting power: 0.0252 )
model 7 votes [0.0227 0.     0.    ] having ~ 0.2251 similarity (voting power: 0.0227 )
model 8 votes [0.0301 0.     0.    ] having ~ 0.2984 similarity (voting power: 0.0301 )
model 9 votes [0.0153 0.     0.    

model 22 votes [0.0231 0.     0.    ] having ~ 0.2544 similarity (voting power: 0.0231 )
model 23 votes [0.0196 0.     0.    ] having ~ 0.2157 similarity (voting power: 0.0196 )
model 24 votes [0.0213 0.     0.    ] having ~ 0.2341 similarity (voting power: 0.0213 )
model 25 votes [0.     0.0168 0.    ] having ~ 0.1845 similarity (voting power: 0.0168 )
model 26 votes [0.0183 0.     0.    ] having ~ 0.201 similarity (voting power: 0.0183 )
model 27 votes [0.0179 0.     0.    ] having ~ 0.197 similarity (voting power: 0.0179 )
model 28 votes [0.024 0.    0.   ] having ~ 0.2641 similarity (voting power: 0.024 )
model 29 votes [0.0179 0.     0.    ] having ~ 0.1966 similarity (voting power: 0.0179 )
model 30 votes [0.0184 0.     0.    ] having ~ 0.2029 similarity (voting power: 0.0184 )
model 31 votes [0.0186 0.     0.    ] having ~ 0.2045 similarity (voting power: 0.0186 )
model 32 votes [0.     0.     0.0203] having ~ 0.2237 similarity (voting power: 0.0203 )
model 33 votes [0.021 0.   

model 45 votes [0.     0.     0.0199] having ~ 0.3115 similarity (voting power: 0.0199 )
model 46 votes [0.     0.     0.0201] having ~ 0.3139 similarity (voting power: 0.0201 )
model 47 votes [0.     0.     0.0202] having ~ 0.3154 similarity (voting power: 0.0202 )
model 48 votes [0.     0.     0.0221] having ~ 0.3453 similarity (voting power: 0.0221 )
model 49 votes [0.     0.     0.0225] having ~ 0.3522 similarity (voting power: 0.0225 )
FINAL PREDICTION: [0.02013565 0.01912566 0.96073869]

--- EVALUATING SAMPLE 150  ---
model 0 votes [0.     0.     0.0168] having ~ 0.1137 similarity (voting power: 0.0168 )
model 1 votes [0.022 0.    0.   ] having ~ 0.1492 similarity (voting power: 0.022 )
model 2 votes [0.     0.0211 0.    ] having ~ 0.1427 similarity (voting power: 0.0211 )
model 3 votes [0.     0.0243 0.    ] having ~ 0.1647 similarity (voting power: 0.0243 )
model 4 votes [0.     0.0182 0.    ] having ~ 0.123 similarity (voting power: 0.0182 )
model 5 votes [0.     0.0191 0.    

model 36 votes [0.0246 0.     0.    ] having ~ 0.3692 similarity (voting power: 0.0246 )
model 37 votes [0.     0.     0.0186] having ~ 0.2795 similarity (voting power: 0.0186 )
model 38 votes [0.0235 0.     0.    ] having ~ 0.3522 similarity (voting power: 0.0235 )
model 39 votes [0.0178 0.     0.    ] having ~ 0.2673 similarity (voting power: 0.0178 )
model 40 votes [0.0212 0.     0.    ] having ~ 0.3188 similarity (voting power: 0.0212 )
model 41 votes [0.0222 0.     0.    ] having ~ 0.3331 similarity (voting power: 0.0222 )
model 42 votes [0.     0.0222 0.    ] having ~ 0.3332 similarity (voting power: 0.0222 )
model 43 votes [0.0185 0.     0.    ] having ~ 0.277 similarity (voting power: 0.0185 )
model 44 votes [0.0207 0.     0.    ] having ~ 0.31 similarity (voting power: 0.0207 )
model 45 votes [0.0244 0.     0.    ] having ~ 0.3665 similarity (voting power: 0.0244 )
model 46 votes [0.0169 0.     0.    ] having ~ 0.2543 similarity (voting power: 0.0169 )
model 47 votes [0.0242 0

model 0 votes [0.0182 0.     0.    ] having ~ 0.2342 similarity (voting power: 0.0182 )
model 1 votes [0.0218 0.     0.    ] having ~ 0.2795 similarity (voting power: 0.0218 )
model 2 votes [0.0234 0.     0.    ] having ~ 0.3008 similarity (voting power: 0.0234 )
model 3 votes [0.0209 0.     0.    ] having ~ 0.269 similarity (voting power: 0.0209 )
model 4 votes [0.0201 0.     0.    ] having ~ 0.2582 similarity (voting power: 0.0201 )
model 5 votes [0.0197 0.     0.    ] having ~ 0.2526 similarity (voting power: 0.0197 )
model 6 votes [0.0182 0.     0.    ] having ~ 0.2342 similarity (voting power: 0.0182 )
model 7 votes [0.0198 0.     0.    ] having ~ 0.2549 similarity (voting power: 0.0198 )
model 8 votes [0.0132 0.     0.    ] having ~ 0.169 similarity (voting power: 0.0132 )
model 9 votes [0.0235 0.     0.    ] having ~ 0.3014 similarity (voting power: 0.0235 )
model 10 votes [0.0232 0.     0.    ] having ~ 0.2986 similarity (voting power: 0.0232 )
model 11 votes [0.0213 0.     0. 

In [17]:
#Test parallelizzazione
clf1 = CustomEstimator(n_estimators=200, base_estimator=DecisionTreeClassifier(), n_jobs=1,
                             similarity_metric='rbf', bootstrap_method="randomchoice")

clf2 = CustomEstimator(n_estimators=200, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='rbf', bootstrap_method="randomchoice")
print("\n --- NON-PARALLEL ---\n  Fit:")
%time clf1.fit(X_train, y_train)
print("  Predict:")
%time clf1.predict(X_test)
print("\n --- PARALLEL ---\n  Fit:")
%time clf2.fit(X_train, y_train)
print("  Predict:")
%time clf2.predict(X_test)


 --- NON-PARALLEL ---
  Fit:
CPU times: total: 1.12 s
Wall time: 1.2 s
  Predict:
CPU times: total: 2.02 s
Wall time: 1.88 s

 --- PARALLEL ---
  Fit:
CPU times: total: 5.22 s
Wall time: 5.69 s
  Predict:
CPU times: total: 1.86 s
Wall time: 1.67 s


array([2, 1, 0, 0, 0, 1, 1, 0, 2, 1, 2, 0, 1, 2, 2, 0, 1, 0, 1, 1, 2, 1,
       1, 0, 0, 1, 1, 1, 2, 0, 0, 2, 1, 0, 2, 2, 2, 0, 2, 2, 1, 0, 0, 0,
       2, 1, 2, 0, 1, 0, 2, 2, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 1, 0, 2, 0,
       0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 0,
       1, 1, 2, 2, 0, 0, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 2, 2, 1,
       1, 0, 1, 2, 0, 1, 1, 1, 0, 2, 2, 0, 1, 2, 0, 0, 0, 1, 2, 0, 1, 1,
       1, 2, 2, 0, 0, 1, 2, 0, 2, 0, 0, 1, 2, 2, 0, 2, 0, 0, 2, 1, 0, 0,
       2, 2, 1, 0, 2, 0, 2, 2, 0, 1, 1, 1, 1, 1, 2, 0, 0, 2, 1, 0, 2, 2,
       2, 0, 0, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 2, 0, 0, 0, 2, 2, 0, 1, 1,
       2, 1], dtype=int64)

In [25]:
c1 = DecisionTreeClassifier()
c2 = CustomEstimator(n_estimators=50,base_estimator=c1,verbose=1,n_jobs=2)
c3 = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)

c1.fit(X_train,y_train)
c2.fit(X_train,y_train)
c3.fit(X_train,y_train)


print(c1.predict_proba(X_test))
print(c2.predict_proba(X_test))
print(c3.predict_proba(X_test))

Bootstrapping...
Fitting...
Done!
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0

In [27]:
#Accuracy comparison w/different parameters
from sklearn.ensemble import VotingClassifier

clf1 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier())
clf2 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),bootstrap_method="randomchoice")
clf3 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),similarity_metric="laplacian")
clf4 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),similarity_metric="laplacian",bootstrap_method="randomchoice")

estimators = [("rbf/rej",clf1),("rbf/raC",clf2),("Lap/rej",clf3),("Lap/raC",clf4)]

for i in range(0,len(estimators)):
    estimators[i][1].fit(X_train, y_train)
    y_pred = estimators[i][1].predict(X_test)
    print(estimators[i][0],":",metrics.accuracy_score(y_test, y_pred))
    

rbf/rej : 0.845
rbf/raC : 0.83
Lap/rej : 0.84
Lap/raC : 0.85


In [20]:
# Comparative tests

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)

svc_clf = SVC(probability=True)

tree_clf = DecisionTreeClassifier(max_depth=16)

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

custom_clf = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier())

custom_clf2 = CustomEstimator(n_estimators=50,similarity_metric="laplacian")

custom_clf3 = CustomEstimator(n_estimators=50,similarity_metric="laplacian", bootstrap_method="randomchoice")

print("Accuracy scores:")
for clf in (rnd_clf, svc_clf, tree_clf, bag_clf, custom_clf, custom_clf2, custom_clf3):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,":",metrics.accuracy_score(y_test, y_pred))
    
#     results, names = list(), list()
#     scores = evaluate_model(clf, X, y)
#     results.append(scores)
#     names.append(clf.__class__.__name__)
#     print('%s %.3f (%.3f)' % ("Cross validation score:", np.mean(scores), np.std(scores)))



Accuracy scores:
RandomForestClassifier : 0.805
SVC : 0.855
DecisionTreeClassifier : 0.81
BaggingClassifier : 0.795
CustomEstimator : 0.825
CustomEstimator : 0.845
CustomEstimator : 0.835


In [21]:
from sklearn.metrics.pairwise import check_pairwise_arrays

pairwise_kernels(X, Y=None, metric="cosine")

#[‘additive_chi2’, ‘chi2’, ‘linear’, ‘poly’, ‘polynomial’, ‘rbf’, ‘laplacian’, ‘sigmoid’, ‘cosine’]

#normalized metrics: rbf, laplacian




array([[ 1.        ,  0.36713288, -0.49438263, ...,  0.93817176,
         0.26485983,  0.72455268],
       [ 0.36713288,  1.        , -0.70683046, ...,  0.46766292,
         0.65104316, -0.10009633],
       [-0.49438263, -0.70683046,  1.        , ..., -0.35923055,
        -0.43796732,  0.0947128 ],
       ...,
       [ 0.93817176,  0.46766292, -0.35923055, ...,  1.        ,
         0.211382  ,  0.79763951],
       [ 0.26485983,  0.65104316, -0.43796732, ...,  0.211382  ,
         1.        , -0.36418777],
       [ 0.72455268, -0.10009633,  0.0947128 , ...,  0.79763951,
        -0.36418777,  1.        ]])

In [22]:
#Debug parallelizzazione
#Ipotesi: non funziona se in funzione ci sono delle variabili iterate



def doStuff(a):
    return 0
    
a = []

for i in range (0,5):
    doStuff(a)
    
b = []
    
b = Parallel(n_jobs=-1)(delayed(doStuff)(b) for i in range(0,5))

print(a)
print(b)

[]
[0, 0, 0, 0, 0]


In [23]:
a = np.array([[1, 2, 3], [4,5,6]])
a.transpose(1, 0)

array([[1, 4],
       [2, 5],
       [3, 6]])