In [1]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics.pairwise import pairwise_kernels
import joblib
from sklearn import metrics

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

In [2]:
a, b = make_classification(n_classes=3, n_samples=500, n_features=6, n_informative=5, n_redundant=1, random_state=2)
c, d = make_classification(n_classes=3, n_samples=500, n_features=6, n_informative=3, n_redundant=3, random_state=44)
X = np.concatenate((a,c))
y= np.concatenate((b,d))

In [3]:
#Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [4]:
#Hard dataset

X, y = make_classification(n_classes=3, n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=2)

In [5]:
#Serious dataset

X, y = make_classification(n_classes=3, n_samples=1000, n_features=6, n_informative=4, n_redundant=2, random_state=2)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [7]:
from random import randint
import random
from joblib import Parallel, delayed

def _validate(metric, bootstrap_method):
    
    if bootstrap_method is not None:
        if bootstrap_method not in ("rejection","randomchoice"):
            raise ValueError(
                'Invalid preset "%s" for bootstrap_method'
                % bootstrap_method
                )
    
    if metric is not None:
        if metric not in ("rbf", "laplacian"):
            raise ValueError(
                'Invalid preset "%s" for kernel metric'
                % metric
                )
            
def _pickDiverseSeed(sim_matrix,seed=None):
    #creating seed for specific training set, choosing by inverse similarity to previous set
    if seed is None:
            return randint(0,len(sim_matrix)-1)
    else:
        #questo è probabilmente il codice più cringe che io abbia mai scritto,
        #ma non riesco a fargli ritornare un int se non specificando l'indice

        sim_n = 1-sim_matrix[seed]

        candidate = np.random.choice(np.arange(len(sim_matrix)), size=1, replace=True,
                                     p=sim_n/np.sum(sim_n))[0]

        return candidate
    
def _pickDiverseSeed2(sim_matrix,seed): 
    #Randomly picks from an array of the most different seeds from all used ones, does not improve much
    l=len(seed)
    if len(seed) == 0:
        return randint(0,len(sim_matrix)-1)
    else:
        
        candidates = np.zeros(l)
        
        for i in range(0,len(seed)):

            sim_n = 1-sim_matrix[seed[i]]

            candidates[i] = np.random.choice(np.arange(len(sim_matrix)), size=1, replace=True, 
                                             p=sim_n/np.sum(sim_n))[0]
        

        return int(np.random.choice(candidates, size=1, replace=True)[0])
    


In [8]:
def _randomchoice_bootstrap(X, y, similarity_metric, n_estimators, verbose=0):
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    G = pairwise_kernels(X, metric=similarity_metric)
    
    seed=None
    
    for i in range(0,n_estimators):
        instance_X = np.empty(np.shape(X))
        instance_y = np.empty(np.shape(y))
        
        #creating seed for specific training set
        
        if verbose>0:
            oldseed=seed
        
        seed = _pickDiverseSeed(G,seed)
        
        if verbose>0 and oldseed is not None:
            print("Chose new seed",seed,"based on previous one:",oldseed,"w/ similarity:",G[seed][oldseed])
                  

        instance_X[0] = X[seed]
        instance_y[0] = y[seed]
        
        pool = np.arange(np.shape(X)[0])
        pool_prob = np.zeros(np.shape(X)[0])
        pool_tot = np.sum(G[seed])
        
        
        for i in range(np.shape(X)[0]):
            pool_prob[i] = G[seed][i]/pool_tot
        
        if False:  #adds noise, does not increase accuracy
            for i in range(0,int(np.shape(X)[0]/2)):
                offset = 0.3 #random.uniform(0, 1)
                
                index = randint(0,np.shape(X)[0]-1)
                while pool_prob[index] + offset > 1: 
                    index = randint(0,np.shape(X)[0]-1)
                pool_prob[index] += offset
                
                index = randint(0,np.shape(X)[0])-1
                while pool_prob[index] - offset < 0: 
                    index = randint(0,np.shape(X)[0]-1)
                pool_prob[index] -= offset
                
                
#         print(pool_prob)
        
#         print(G[seed])
        
        pick = np.random.choice(pool, size=(len(pool))-1, replace=True, p=pool_prob)
        
        for i in range(len(pick)):
            instance_X[i+1] = X[pick[i]]
            instance_y[i+1] = y[pick[i]]
        
        X_sets.append(instance_X)
        y_sets.append(instance_y)
    
    return X_sets,y_sets

#Test
X_sets,y_sets = _randomchoice_bootstrap(X_train,y_train,"rbf",n_estimators=50,verbose=1)

Chose new seed 786 based on previous one: 617 w/ similarity: 0.0024939498453446494
Chose new seed 382 based on previous one: 786 w/ similarity: 0.0323211315436734
Chose new seed 527 based on previous one: 382 w/ similarity: 2.0862726053867576e-05
Chose new seed 135 based on previous one: 527 w/ similarity: 0.0036318991594347493
Chose new seed 676 based on previous one: 135 w/ similarity: 0.25304228800651596
Chose new seed 313 based on previous one: 676 w/ similarity: 0.2539719296453532
Chose new seed 258 based on previous one: 313 w/ similarity: 0.012027045433394489
Chose new seed 729 based on previous one: 258 w/ similarity: 7.502842138105072e-07
Chose new seed 81 based on previous one: 729 w/ similarity: 2.745256012186141e-07
Chose new seed 153 based on previous one: 81 w/ similarity: 0.17081572751387095
Chose new seed 416 based on previous one: 153 w/ similarity: 0.20530198573370415
Chose new seed 228 based on previous one: 416 w/ similarity: 0.19762111933319987
Chose new seed 210 b

In [9]:
#Parallel functions

def _parallel_fit(n_estimators, base_estimator, X, y, i):
       
    model = base_estimator.fit(X[i], y[i])
    model_name = "model"+str(i)+".pkl"
    joblib.dump(model, model_name)
    
def _parallel_models_predict(votes, sim_means, X, Z, i, j):
    print("test")
    model_name = "model"+str(j)+".pkl"
    model = joblib.load(model_name)
                
    votes.append(model.predict_proba(X[i].reshape(1, -1))) 
    sim_means[j] = np.mean(Z[j][i])
    
def _parallel_rejection_bootstrap(G,seed,n_estimators,X,y,X_sets,y_sets):
    instance_X = np.empty(np.shape(X))
    instance_y = np.empty(np.shape(y))

    seed = _pickDiverseSeed(G,seed)

    instance_X[0] = X[seed]
    instance_y[0] = y[seed]       

    n_entries = 1

    #populating training set


    if 1-(len(X_sets)+1)/n_estimators>0.8:
        p_thresh = 0.8
    else:
        p_thresh = np.round(1-(len(X_sets)+1)/n_estimators,1)

    #print("populating set", i, "with probability acceptance",p_thresh)


    while n_entries < len(X): #until the pool isn't filled up

        rand = randint(0,len(X)-1)

        if _accept_entry(G[seed,rand],n_entries/(len(X)-1),p_thresh):

            instance_X[n_entries] = X[rand]
            instance_y[n_entries] = y[rand]
            n_entries+=1

    X_sets.append(instance_X)
    y_sets.append(instance_y)



In [10]:
from sklearn.metrics import pairwise_distances


def _rejection_bootstrap(X, y, similarity_metric, n_estimators, n_jobs):
    
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    G = pairwise_kernels(X, metric=similarity_metric)
    
    seed=None

# CURRENTLY RETURNS ERROR IF n_jobs!=1
#     Parallel(n_jobs=n_jobs)(delayed(_parallel_rejection_bootstrap)(G,seed,n_estimators,X,y,X_sets,y_sets) 
#                                   for i in range(0,n_estimators))
    
    for i in range(0,n_estimators):  #for each estimator  
            _parallel_rejection_bootstrap(G,seed,n_estimators,X,y,X_sets,y_sets)   
    
    return X_sets,y_sets

def _accept_entry(similarity_score, p_ratio, prob_threshold = 0.7):
    #p_ratio gradually shifts the decisional weight from similarity_score to randomness as the test set gets filled up.
    #Specifically, at least least a small number of entries are accepted only through randomness (p_ratio=1)
    #this way it's unlikely for the method to come up with sets with only one class (if not impossible, but I need to check)
    
    #It is also possible to modify the prob_threshold in order to get more or less random values, assuring more variance.
    #I think this tweakability could prove useful to adapt the model to specific instances.
    
    #PLEASE NOTE THAT THIS CODE MAKES SENSE WITH NORMALIZED SIMILARITY SCORES
    #NORMALIZATION FOR FUNCTION THAT DO NOT RETURN [0,1] VALUES STILL NEEDS TO BE IMPLEMENTED.
    #It's not like it doesn't work, but prob calculations take the high road
    
    prob = similarity_score*(1-p_ratio)+ random.random()*p_ratio
    if prob > prob_threshold:
        return True
    else:
        return False
    
    
#Test
X_sets, y_sets = _rejection_bootstrap(X_train, y_train,'rbf',50,1)

In [40]:
"""
   
    Parameters
    ----------
    n_estimators : int, default=50
        The number of models to train.
        
    base_estimator : estimator, default=DecisionTreeClassifier()
        The estimator fitted on  each bootstrapped set.     
        
    n_jobs : int, default=1
        Number of parallel jobs during fitting.
        
    similarity_metric : {"rbf", "laplacian", "cosine"}, string, default="rbf"
        The metric used for pairwise_kernels().
        
    bootstrap_method={"rejection", "randomchoice"}, string, default=TODO
        The bootstrap method of choice.
    
    verbose : int, default = 0
        Controls verbosity during fitting and predicting, 0 being none and 3 being the most detailed. 
    
    


"""



class CustomEstimator(BaseEstimator, ClassifierMixin):
    
    def __init__(self, n_estimators=50,
                base_estimator=DecisionTreeClassifier(),
                n_jobs=1,
                bootstrap_method="rejection",
                similarity_metric="rbf",
                verbose = 0):
        
        
        
        self.n_estimators = n_estimators 
        self.base_estimator = base_estimator
        self.n_jobs = n_jobs
        self.bootstrap_method = bootstrap_method
        self.similarity_metric = similarity_metric
        self.verbose = verbose 
        
        _validate(self.similarity_metric, self.bootstrap_method)
        
    def fit(self,X,y):
        
        self.n_classes_= np.max(y)+1
        
        if self.verbose > 0:
            print("Bootstrapping...")
            
        if self.bootstrap_method=="rejection":  
            self.X_sets, self.y_sets = _rejection_bootstrap(X, y, self.similarity_metric, self.n_estimators, self.n_jobs)
        elif self.bootstrap_method=="randomchoice": 
            self.X_sets, self.y_sets = _randomchoice_bootstrap(X, y, self.similarity_metric, self.n_estimators)
        
        
        if self.verbose > 0:
            print("Fitting...")
        
        Parallel(n_jobs=self.n_jobs)(delayed(_parallel_fit)(self.n_estimators, self.base_estimator, self.X_sets, self.y_sets, i) 
                           for i in range(0,self.n_estimators))
            
        
        self.X_ = X
        self.y_ = y

        if self.verbose > 0:
            print("Done!")
        return self
                
    
    def predict(self, X):
        
        predicted_probabilitiy = self.predict_proba(X)
        return np.argmax(predicted_probabilitiy, axis=1)
    
    def predict_proba_old(self, X):
            
        if self.verbose > 0:
            print("Predicting...")
            
        out = np.zeros(shape=(len(X),self.n_classes_))
        
        #compute similarity between instance and each training set
        
        if self.verbose > 0:
            print("Computing models similarity to instance...")
        
        Z=[]
        for j in range (0,self.n_estimators):
            Z.append(pairwise_kernels(X, self.X_sets[j], metric=self.similarity_metric))
#             if self.verbose > 1:
#                 print("Similarity matrix for instance to model",j,":",Z[j])
        
        
        
        if self.verbose > 0:
            print("Making predictions with each trained model...")
            
          
        for i in range(0,len(X)): #for each sample to predict
            if self.verbose > 1:  
                print("\n--- EVALUATING SAMPLE",i," ---")
            
            votes = []
            votes_weighted = []
            sim_means = np.zeros(self.n_estimators)
            
            voting_power = np.zeros(self.n_estimators)
            
            for j in range (0,self.n_estimators): #for each estimator trained
               
                #load estimator model
                model_name = "model"+str(j)+".pkl"
                model = joblib.load(model_name)
                
                votes.append(model.predict_proba(X[i].reshape(1, -1))) 
                sim_means[j] = np.mean(Z[j][i])
                
                
            #DOES NOT WORK WITH n_jobs !=0    
#             Parallel(n_jobs=self.n_jobs)(delayed(_parallel_models_predict)(votes, sim_means, X, Z, i, j) 
#                                          for j in range(0,self.n_estimators))
                
                
            for k in range (0,self.n_estimators):
                     
                voting_power[k] = (100*sim_means[k]/np.sum(sim_means))/100
                votes_weighted.append(votes[k]*voting_power[k])
                
                if self.verbose > 2:
                    print("model",k,"votes",np.round(votes[k],4),"having ~",np.round(sim_means[k],4),
                          "similarity (voting power:",np.round(voting_power[k],4),")")
                
            out[i]=(np.sum(votes_weighted,axis=0))
            if self.verbose > 1:
                print("FINAL PREDICTION:",out[i],)
            
        
        if self.verbose > 0:
            print("Done!")

        return out
    
    def predict_proba(self, X):
        #New faster implementation but a bit less human-readable. Avoids loading all models for each sample evaluation.
        #It looks like the similarity scores are "less similar" from each other, further investigation required.
        
        if self.verbose > 0:
            print("Predicting...")
            
        out = np.zeros(shape=(len(X),self.n_classes_))        
        Z=np.zeros(shape=(self.n_estimators,len(X),len(self.X_sets[0])))            
        preds = []
        sim_means = np.zeros(self.n_estimators)

        if self.verbose > 0:
            print("Computing predictions for each model...")
            
        for i in range (0,self.n_estimators): #for each estimator trained
            
            Z[i]=(pairwise_kernels(X, self.X_sets[i], metric=self.similarity_metric))
            
            #load estimator model
            model_name = "model"+str(i)+".pkl"
            model = joblib.load(model_name)
          
            preds.append(model.predict_proba(X))
        
        votes = np.zeros((len(X),len(preds),self.n_classes_))
        
        if self.verbose > 0:
            print("Computing similarities between models and samples...")
        
        for i in range (0,len(X)): #for each sample to predict
            
            if self.verbose > 2:  
                    print("\n--- EVALUATING SAMPLE",i," ---")
            
            for j in range (0,self.n_estimators): #for each model
                sim_means[j] = np.mean(Z[j][i])
            
            for j in range (0,self.n_estimators):
                #weight votes
                votes[i][j] = preds[j][i] * ((100*sim_means[j]/np.sum(sim_means))/100)
                
                if self.verbose > 2:
                    print("model",j,"votes",np.round(votes[i][j],4),"having ~",np.round(sim_means[j],4),
                          "similarity (voting power:",np.round(((100*sim_means[j]/np.sum(sim_means))/100),4),")")
                    
            out[i]=(np.sum(votes[i],axis=0))
            if self.verbose > 1:
                print("SAMPLE",i,"PREDICTION:",out[i],)
                
                
        if self.verbose > 0:
            print("Done!")
                
        return(out)
        
            
            
            
            
            
        
            
clf = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='laplacian', bootstrap_method="randomchoice",verbose=3)

clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_test)



Bootstrapping...
Fitting...
Done!
Predicting...
Computing predictions for each model...
Computing similarities between models and samples...

--- EVALUATING SAMPLE 0  ---
model 0 votes [0.     0.     0.0225] having ~ 0.2405 similarity (voting power: 0.0225 )
model 1 votes [0.     0.     0.0136] having ~ 0.1453 similarity (voting power: 0.0136 )
model 2 votes [0.     0.     0.0223] having ~ 0.2384 similarity (voting power: 0.0223 )
model 3 votes [0.     0.     0.0236] having ~ 0.2519 similarity (voting power: 0.0236 )
model 4 votes [0.     0.     0.0205] having ~ 0.2184 similarity (voting power: 0.0205 )
model 5 votes [0.     0.     0.0144] having ~ 0.1538 similarity (voting power: 0.0144 )
model 6 votes [0.     0.     0.0163] having ~ 0.1735 similarity (voting power: 0.0163 )
model 7 votes [0.     0.     0.0248] having ~ 0.2647 similarity (voting power: 0.0248 )
model 8 votes [0.     0.     0.0247] having ~ 0.2635 similarity (voting power: 0.0247 )
model 9 votes [0.     0.     0.0246] 

model 1 votes [0.0303 0.     0.    ] having ~ 0.3226 similarity (voting power: 0.0303 )
model 2 votes [0.0149 0.     0.    ] having ~ 0.1589 similarity (voting power: 0.0149 )
model 3 votes [0.     0.0124 0.    ] having ~ 0.1324 similarity (voting power: 0.0124 )
model 4 votes [0.019 0.    0.   ] having ~ 0.2024 similarity (voting power: 0.019 )
model 5 votes [0.0288 0.     0.    ] having ~ 0.3061 similarity (voting power: 0.0288 )
model 6 votes [0.     0.0259 0.    ] having ~ 0.2752 similarity (voting power: 0.0259 )
model 7 votes [0.0165 0.     0.    ] having ~ 0.1759 similarity (voting power: 0.0165 )
model 8 votes [0.0148 0.     0.    ] having ~ 0.158 similarity (voting power: 0.0148 )
model 9 votes [0.0144 0.     0.    ] having ~ 0.1536 similarity (voting power: 0.0144 )
model 10 votes [0.0197 0.     0.    ] having ~ 0.2099 similarity (voting power: 0.0197 )
model 11 votes [0.     0.0291 0.    ] having ~ 0.3098 similarity (voting power: 0.0291 )
model 12 votes [0.0218 0.     0.   

model 35 votes [0.     0.     0.0211] having ~ 0.2579 similarity (voting power: 0.0211 )
model 36 votes [0.     0.0183 0.    ] having ~ 0.2242 similarity (voting power: 0.0183 )
model 37 votes [0.     0.0212 0.    ] having ~ 0.2589 similarity (voting power: 0.0212 )
model 38 votes [0.0202 0.     0.    ] having ~ 0.2466 similarity (voting power: 0.0202 )
model 39 votes [0.     0.0212 0.    ] having ~ 0.2589 similarity (voting power: 0.0212 )
model 40 votes [0.0172 0.     0.    ] having ~ 0.21 similarity (voting power: 0.0172 )
model 41 votes [0.    0.021 0.   ] having ~ 0.2565 similarity (voting power: 0.021 )
model 42 votes [0.    0.021 0.   ] having ~ 0.2565 similarity (voting power: 0.021 )
model 43 votes [0.021 0.    0.   ] having ~ 0.2574 similarity (voting power: 0.021 )
model 44 votes [0.0204 0.     0.    ] having ~ 0.2491 similarity (voting power: 0.0204 )
model 45 votes [0.     0.0193 0.    ] having ~ 0.2355 similarity (voting power: 0.0193 )
model 46 votes [0.     0.0213 0.   

model 46 votes [0.     0.     0.0194] having ~ 0.1963 similarity (voting power: 0.0194 )
model 47 votes [0.     0.0185 0.    ] having ~ 0.1865 similarity (voting power: 0.0185 )
model 48 votes [0.0224 0.     0.    ] having ~ 0.2263 similarity (voting power: 0.0224 )
model 49 votes [0.     0.     0.0221] having ~ 0.2232 similarity (voting power: 0.0221 )
SAMPLE 67 PREDICTION: [0.0996146  0.36183132 0.53855408]

--- EVALUATING SAMPLE 68  ---
model 0 votes [0.     0.     0.0209] having ~ 0.333 similarity (voting power: 0.0209 )
model 1 votes [0.     0.     0.0185] having ~ 0.2956 similarity (voting power: 0.0185 )
model 2 votes [0.     0.     0.0192] having ~ 0.3066 similarity (voting power: 0.0192 )
model 3 votes [0.     0.     0.0178] having ~ 0.2845 similarity (voting power: 0.0178 )
model 4 votes [0.     0.     0.0218] having ~ 0.3478 similarity (voting power: 0.0218 )
model 5 votes [0.     0.     0.0187] having ~ 0.2987 similarity (voting power: 0.0187 )
model 6 votes [0.     0.     

model 0 votes [0.     0.     0.0178] having ~ 0.1326 similarity (voting power: 0.0178 )
model 1 votes [0.     0.0264 0.    ] having ~ 0.1963 similarity (voting power: 0.0264 )
model 2 votes [0.    0.014 0.   ] having ~ 0.1041 similarity (voting power: 0.014 )
model 3 votes [0.     0.0115 0.    ] having ~ 0.0857 similarity (voting power: 0.0115 )
model 4 votes [0.     0.0192 0.    ] having ~ 0.1431 similarity (voting power: 0.0192 )
model 5 votes [0.     0.     0.0269] having ~ 0.2003 similarity (voting power: 0.0269 )
model 6 votes [0.     0.0249 0.    ] having ~ 0.1855 similarity (voting power: 0.0249 )
model 7 votes [0.     0.0171 0.    ] having ~ 0.1273 similarity (voting power: 0.0171 )
model 8 votes [0.     0.0173 0.    ] having ~ 0.1286 similarity (voting power: 0.0173 )
model 9 votes [0.     0.0138 0.    ] having ~ 0.1024 similarity (voting power: 0.0138 )
model 10 votes [0.     0.0202 0.    ] having ~ 0.1501 similarity (voting power: 0.0202 )
model 11 votes [0.     0.0285 0.   

model 0 votes [0.     0.0159 0.    ] having ~ 0.1378 similarity (voting power: 0.0159 )
model 1 votes [0.     0.0311 0.    ] having ~ 0.2692 similarity (voting power: 0.0311 )
model 2 votes [0.0146 0.     0.    ] having ~ 0.1264 similarity (voting power: 0.0146 )
model 3 votes [0.     0.0121 0.    ] having ~ 0.105 similarity (voting power: 0.0121 )
model 4 votes [0.     0.0185 0.    ] having ~ 0.16 similarity (voting power: 0.0185 )
model 5 votes [0.     0.0292 0.    ] having ~ 0.2535 similarity (voting power: 0.0292 )
model 6 votes [0.     0.0264 0.    ] having ~ 0.2287 similarity (voting power: 0.0264 )
model 7 votes [0.    0.016 0.   ] having ~ 0.1383 similarity (voting power: 0.016 )
model 8 votes [0.     0.0144 0.    ] having ~ 0.1246 similarity (voting power: 0.0144 )
model 9 votes [0.0139 0.     0.    ] having ~ 0.1205 similarity (voting power: 0.0139 )
model 10 votes [0.     0.0192 0.    ] having ~ 0.1665 similarity (voting power: 0.0192 )
model 11 votes [0.0308 0.     0.    ] 

model 0 votes [0.     0.     0.0219] having ~ 0.1771 similarity (voting power: 0.0219 )
model 1 votes [0.     0.     0.0131] having ~ 0.1064 similarity (voting power: 0.0131 )
model 2 votes [0.     0.     0.0253] having ~ 0.2048 similarity (voting power: 0.0253 )
model 3 votes [0.     0.     0.0285] having ~ 0.2308 similarity (voting power: 0.0285 )
model 4 votes [0.     0.     0.0208] having ~ 0.1679 similarity (voting power: 0.0208 )
model 5 votes [0.     0.     0.0142] having ~ 0.1149 similarity (voting power: 0.0142 )
model 6 votes [0.     0.     0.0156] having ~ 0.1265 similarity (voting power: 0.0156 )
model 7 votes [0.     0.     0.0229] having ~ 0.1851 similarity (voting power: 0.0229 )
model 8 votes [0.     0.     0.0238] having ~ 0.1923 similarity (voting power: 0.0238 )
model 9 votes [0.     0.     0.0266] having ~ 0.215 similarity (voting power: 0.0266 )
model 10 votes [0.     0.     0.0196] having ~ 0.1584 similarity (voting power: 0.0196 )
model 11 votes [0.     0.     0.

model 0 votes [0.     0.     0.0211] having ~ 0.0912 similarity (voting power: 0.0211 )
model 1 votes [0.    0.011 0.   ] having ~ 0.0474 similarity (voting power: 0.011 )
model 2 votes [0.     0.     0.0254] having ~ 0.1094 similarity (voting power: 0.0254 )
model 3 votes [0.     0.     0.0285] having ~ 0.123 similarity (voting power: 0.0285 )
model 4 votes [0.     0.0198 0.    ] having ~ 0.0856 similarity (voting power: 0.0198 )
model 5 votes [0.     0.0128 0.    ] having ~ 0.0553 similarity (voting power: 0.0128 )
model 6 votes [0.     0.     0.0149] having ~ 0.0645 similarity (voting power: 0.0149 )
model 7 votes [0.     0.     0.0228] having ~ 0.0984 similarity (voting power: 0.0228 )
model 8 votes [0.     0.     0.0296] having ~ 0.1278 similarity (voting power: 0.0296 )
model 9 votes [0.     0.     0.0282] having ~ 0.1218 similarity (voting power: 0.0282 )
model 10 votes [0.     0.     0.0183] having ~ 0.0789 similarity (voting power: 0.0183 )
model 11 votes [0.     0.     0.0115

model 0 votes [0.     0.0183 0.    ] having ~ 0.249 similarity (voting power: 0.0183 )
model 1 votes [0.     0.0248 0.    ] having ~ 0.3375 similarity (voting power: 0.0248 )
model 2 votes [0.     0.0152 0.    ] having ~ 0.2064 similarity (voting power: 0.0152 )
model 3 votes [0.     0.0125 0.    ] having ~ 0.1702 similarity (voting power: 0.0125 )
model 4 votes [0.     0.0206 0.    ] having ~ 0.2809 similarity (voting power: 0.0206 )
model 5 votes [0.     0.0262 0.    ] having ~ 0.3565 similarity (voting power: 0.0262 )
model 6 votes [0.     0.0245 0.    ] having ~ 0.3343 similarity (voting power: 0.0245 )
model 7 votes [0.     0.0178 0.    ] having ~ 0.2421 similarity (voting power: 0.0178 )
model 8 votes [0.     0.0176 0.    ] having ~ 0.2399 similarity (voting power: 0.0176 )
model 9 votes [0.    0.015 0.   ] having ~ 0.2038 similarity (voting power: 0.015 )
model 10 votes [0.     0.0208 0.    ] having ~ 0.2834 similarity (voting power: 0.0208 )
model 11 votes [0.     0.0253 0.    

model 0 votes [0.0228 0.     0.    ] having ~ 0.3064 similarity (voting power: 0.0228 )
model 1 votes [0.     0.     0.0143] having ~ 0.1932 similarity (voting power: 0.0143 )
model 2 votes [0.     0.0216 0.    ] having ~ 0.2905 similarity (voting power: 0.0216 )
model 3 votes [0.     0.0219 0.    ] having ~ 0.2952 similarity (voting power: 0.0219 )
model 4 votes [0.0216 0.     0.    ] having ~ 0.2908 similarity (voting power: 0.0216 )
model 5 votes [0.     0.     0.0152] having ~ 0.2046 similarity (voting power: 0.0152 )
model 6 votes [0.     0.     0.0169] having ~ 0.2271 similarity (voting power: 0.0169 )
model 7 votes [0.     0.0231 0.    ] having ~ 0.311 similarity (voting power: 0.0231 )
model 8 votes [0.0247 0.     0.    ] having ~ 0.3327 similarity (voting power: 0.0247 )
model 9 votes [0.     0.     0.0239] having ~ 0.3213 similarity (voting power: 0.0239 )
model 10 votes [0.0211 0.     0.    ] having ~ 0.2842 similarity (voting power: 0.0211 )
model 11 votes [0.     0.     0.

In [41]:
# Test instance 

clf = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='laplacian', bootstrap_method="randomchoice",verbose=1)

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print("Accuracy =",metrics.accuracy_score(y_test, y_pred))

Bootstrapping...
Fitting...
Done!
Predicting...
Computing predictions for each model...
Computing similarities between models and samples...
Done!
Accuracy = 0.84


In [None]:
#Test parallelizzazione
clf1 = CustomEstimator(n_estimators=200, base_estimator=DecisionTreeClassifier(), n_jobs=1,
                             similarity_metric='rbf')

clf2 = CustomEstimator(n_estimators=200, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='rbf')
print("\n --- NON-PARALLEL ---\n  Fit:")
%time clf1.fit(X_train, y_train)
print("  Predict:")
%time clf1.predict(X_test)
print("\n --- PARALLEL ---\n  Fit:")
%time clf2.fit(X_train, y_train)
print("  Predict:")
%time clf2.predict(X_test)


 --- NON-PARALLEL ---
  Fit:


In [14]:
c1 = DecisionTreeClassifier()
c2 = CustomEstimator(n_estimators=50,base_estimator=c1,verbose=1,n_jobs=2)
c3 = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)

c1.fit(X_train,y_train)
c2.fit(X_train,y_train)
c3.fit(X_train,y_train)


print(c1.predict_proba(X_test))
print(c2.predict_proba(X_test))
print(c3.predict_proba(X_test))

Bootstrapping...
Fitting...
Done!
[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0

In [15]:
#Accuracy comparison w/different parameters
from sklearn.ensemble import VotingClassifier

clf1 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier())
clf2 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),bootstrap_method="randomchoice")
clf3 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),similarity_metric="laplacian")
clf4 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),similarity_metric="laplacian",bootstrap_method="randomchoice")

estimators = [("rbf/rej",clf1),("rbf/raC",clf2),("Lap/rej",clf3),("Lap/raC",clf4)]

for i in range(0,len(estimators)):
    estimators[i][1].fit(X_train, y_train)
    y_pred = estimators[i][1].predict(X_test)
    print(estimators[i][0],":",metrics.accuracy_score(y_test, y_pred))
    

rbf/rej : 0.885
rbf/raC : 0.85
Lap/rej : 0.875
Lap/raC : 0.845


In [20]:
# Comparative tests

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)

svc_clf = SVC(probability=True)

tree_clf = DecisionTreeClassifier(max_depth=16)

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

custom_clf = CustomEstimator(n_estimators=50)

custom_clf2 = CustomEstimator(n_estimators=50, base_estimator=SVC(probability=True))

custom_clf3 = CustomEstimator(n_estimators=50,similarity_metric="laplacian", bootstrap_method="randomchoice")

print("Accuracy scores:")
for clf in (rnd_clf, svc_clf, tree_clf, bag_clf, custom_clf, custom_clf2, custom_clf3):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,":",metrics.accuracy_score(y_test, y_pred))
    
#     results, names = list(), list()
#     scores = evaluate_model(clf, X, y)
#     results.append(scores)
#     names.append(clf.__class__.__name__)
#     print('%s %.3f (%.3f)' % ("Cross validation score:", np.mean(scores), np.std(scores)))



Accuracy scores:
RandomForestClassifier : 0.82
SVC : 0.83
DecisionTreeClassifier : 0.77
BaggingClassifier : 0.78
CustomEstimator : 0.835
CustomEstimator : 0.855
CustomEstimator : 0.845


In [17]:
from sklearn.metrics.pairwise import check_pairwise_arrays

pairwise_kernels(X, Y=None, metric="cosine")

#[‘additive_chi2’, ‘chi2’, ‘linear’, ‘poly’, ‘polynomial’, ‘rbf’, ‘laplacian’, ‘sigmoid’, ‘cosine’]

#normalized metrics: rbf, laplacian




array([[ 1.        ,  0.36713288, -0.49438263, ...,  0.93817176,
         0.26485983,  0.72455268],
       [ 0.36713288,  1.        , -0.70683046, ...,  0.46766292,
         0.65104316, -0.10009633],
       [-0.49438263, -0.70683046,  1.        , ..., -0.35923055,
        -0.43796732,  0.0947128 ],
       ...,
       [ 0.93817176,  0.46766292, -0.35923055, ...,  1.        ,
         0.211382  ,  0.79763951],
       [ 0.26485983,  0.65104316, -0.43796732, ...,  0.211382  ,
         1.        , -0.36418777],
       [ 0.72455268, -0.10009633,  0.0947128 , ...,  0.79763951,
        -0.36418777,  1.        ]])

In [18]:
#Debug parallelizzazione
#Ipotesi: non funziona se in funzione ci sono delle variabili iterate



def doStuff(a):
    return 0
    
a = []

for i in range (0,5):
    doStuff(a)
    
b = []
    
b = Parallel(n_jobs=-1)(delayed(doStuff)(b) for i in range(0,5))

print(a)
print(b)

[]
[0, 0, 0, 0, 0]


In [19]:
a = np.array([[1, 2, 3], [4,5,6]])
a.transpose(1, 0)

array([[1, 4],
       [2, 5],
       [3, 6]])