In [1]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics.pairwise import pairwise_kernels
import joblib
from sklearn import metrics

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

import datetime



In [2]:
# Real dataset
import os
import pandas as pd
PATH = os.path.join("Datasets", "20newsgroups.csv")
dataset = pd.read_csv("Datasets/20newsgroups.csv",header=0)

X = dataset.iloc[:,:len(dataset.columns)-1].to_numpy()
y = dataset.iloc[:,-1:].to_numpy()
y = np.ravel(y.astype(int))


In [3]:
a, b = make_classification(n_classes=3, n_samples=500, n_features=6, n_informative=5, n_redundant=1, random_state=2)
c, d = make_classification(n_classes=3, n_samples=500, n_features=6, n_informative=3, n_redundant=3, random_state=44)
X = np.concatenate((a,c))
y= np.concatenate((b,d))

In [4]:
#Serious dataset

X, y = make_classification(n_classes=3, n_samples=1000, n_features=6, n_informative=4, n_redundant=2, random_state=2)

In [5]:
#Hard dataset

X, y = make_classification(n_classes=3, n_samples=5000, n_features=20, n_informative=15, n_redundant=5, random_state=2)

In [6]:
#Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [8]:
from random import randint
import random
from joblib import Parallel, delayed

#support functions

def _validate(metric, bootstrap_method):
    
    if bootstrap_method is not None:
        if bootstrap_method not in ("rejection","randomchoice"):
            raise ValueError(
                'Invalid preset "%s" for bootstrap_method'
                % bootstrap_method
                )
    
    if metric is not None:
        if metric not in ("rbf", "laplacian"):
            raise ValueError(
                'Invalid preset "%s" for kernel metric'
                % metric
                )
            
def _pickDiverseSeed(K,seed=None):
    #creating seed for specific training set, choosing by inverse similarity to previous set
    if seed is None:
            return randint(0,len(K)-1)
    else:
        #questo è probabilmente il codice più cringe che io abbia mai scritto,
        #ma non riesco a fargli ritornare un int se non specificando l'indice

        sim_n = 1-K[seed]

        candidate = np.random.choice(np.arange(len(K)), size=1, replace=True,
                                     p=sim_n/np.sum(sim_n))[0]

        return candidate
    
def _pickDiverseSeed2(sim_matrix,seed): 
    #Randomly picks from an array of the most different seeds from all used ones, does not improve much
    l=len(seed)
    if len(seed) == 0:
        return randint(0,len(sim_matrix)-1)
    else:
        
        candidates = np.zeros(l)
        
        for i in range(0,len(seed)):

            sim_n = 1-sim_matrix[seed[i]]

            candidates[i] = np.random.choice(np.arange(len(sim_matrix)), size=1, replace=True, 
                                             p=sim_n/np.sum(sim_n))[0]
        

        return int(np.random.choice(candidates, size=1, replace=True)[0])
    


In [9]:
#Parallel functions

def _parallel_fit(estimator, X, y, i):
#     model = base_estimator.fit(X[i], y[i])
#     model_name = "model"+str(i)+".pkl"
#     joblib.dump(model, model_name)
    return estimator.fit(X[i], y[i])
    
def _parallel_models_predict(estimator, X):

#     model_name = "model"+str(j)+".pkl"
#     model = joblib.load(model_name)
    return estimator.predict_proba(X)

    
def _parallel_rejection_bootstrap(K,seed,n_estimators,X,y,X_sets,y_sets):
    instance_X = np.empty(np.shape(X))
    instance_y = np.empty(np.shape(y))

    seed = _pickDiverseSeed(K,seed)

    instance_X[0] = X[seed]
    instance_y[0] = y[seed]       

    n_entries = 1

    #populating training set


    if 1-(len(X_sets)+1)/n_estimators>0.8:
        p_thresh = 0.8
    else:
        p_thresh = np.round(1-(len(X_sets)+1)/n_estimators,1)

    #print("populating set", i, "with probability acceptance",p_thresh)


    while n_entries < len(X): #until the pool isn't filled up

        rand = randint(0,len(X)-1)

        if _accept_entry(K[seed,rand],n_entries/(len(X)-1),p_thresh):

            instance_X[n_entries] = X[rand]
            instance_y[n_entries] = y[rand]
            n_entries+=1

    X_sets.append(instance_X)
    y_sets.append(instance_y)



In [10]:
#bootstrap functions

def _rejection_bootstrap(X, y, K, n_estimators, n_jobs):
    
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    
    seed=None

# CURRENTLY RETURNS ERROR IF n_jobs!=1
#     Parallel(n_jobs=n_jobs)(delayed(_parallel_rejection_bootstrap)(G,seed,n_estimators,X,y,X_sets,y_sets) 
#                                   for i in range(0,n_estimators))
    
    for i in range(0,n_estimators):  #for each estimator  
            _parallel_rejection_bootstrap(K,seed,n_estimators,X,y,X_sets,y_sets)   
    
    return X_sets,y_sets

def _accept_entry(similarity_score, p_ratio, prob_threshold = 0.7):
    #p_ratio gradually shifts the decisional weight from similarity_score to randomness as the test set gets filled up.
    #Specifically, at least least a small number of entries are accepted only through randomness (p_ratio=1)
    #this way it's unlikely for the method to come up with sets with only one class (if not impossible, but I need to check)
    
    #It is also possible to modify the prob_threshold in order to get more or less random values, assuring more variance.
    #I think this tweakability could prove useful to adapt the model to specific instances.
    
    #PLEASE NOTE THAT THIS CODE MAKES SENSE WITH NORMALIZED SIMILARITY SCORES
    #NORMALIZATION FOR FUNCTION THAT DO NOT RETURN [0,1] VALUES STILL NEEDS TO BE IMPLEMENTED.
    #It's not like it doesn't work, but prob calculations take the high road
    
    prob = similarity_score*(1-p_ratio)+ random.random()*p_ratio
    if prob > prob_threshold:
        return True
    else:
        return False
    
    
def _randomchoice_bootstrap(X, y, K, n_estimators, verbose=0):
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    
    seed=None
    
    for i in range(0,n_estimators): #for each new set
        instance_X = np.empty(np.shape(X))
        instance_y = np.empty(np.shape(y))
        
        #creating seed for specific training set
        
        if verbose>1:
            oldseed=seed
        
        seed = _pickDiverseSeed(K,seed)
        
        if verbose>1 and oldseed is None:
            print("Chose seed",seed)
            
        if verbose>1 and oldseed is not None:
            print("Chose new seed",seed,"based on previous one:",oldseed,"w/ similarity:",K[seed][oldseed])
                  

        instance_X[0] = X[seed]
        instance_y[0] = y[seed]
        
        pool = np.arange(np.shape(X)[0]) #The "bingo sheet"
        pool_prob = np.zeros(np.shape(X)[0]) #Probability to be picked for each index in the sheet
        pool_tot = np.sum(K[seed]) #Sum of similarities with every point to the seed
        
        
        for j in range(np.shape(X)[0]):
            pool_prob[j] = K[seed][j]/pool_tot
        
        if False:  #adds noise, does not increase accuracy
            for i in range(0,int(np.shape(X)[0]/2)):
                offset = 0.3 #random.uniform(0, 1)
                
                index = randint(0,np.shape(X)[0]-1)
                while pool_prob[index] + offset > 1: 
                    index = randint(0,np.shape(X)[0]-1)
                pool_prob[index] += offset
                
                index = randint(0,np.shape(X)[0])-1
                while pool_prob[index] - offset < 0: 
                    index = randint(0,np.shape(X)[0]-1)
                pool_prob[index] -= offset
                
        #Extraction
        pick = np.random.choice(pool, size=(len(pool))-1, replace=True, p=pool_prob)
        
        if verbose > 2:
            print("Populating set",i,"...")
            
        for j in range(len(pick)):
            instance_X[j+1] = X[pick[j]]
            instance_y[j+1] = y[pick[j]]
        
        X_sets.append(instance_X)
        y_sets.append(instance_y)
    
    return X_sets,y_sets

In [11]:
"""
   
    Parameters
    ----------
    n_estimators : int, default=50
        The number of models to train.
        
    base_estimator : estimator, default=DecisionTreeClassifier()
        The estimator fitted on  each bootstrapped set.     
        
    n_jobs : int, default=1
        Number of parallel jobs during fitting.
        
    similarity_metric : {"rbf", "laplacian", "cosine"}, string, default="rbf"
        The metric used for pairwise_kernels().
        
    bootstrap_method={"rejection", "randomchoice"}, string, default=TODO
        The bootstrap method of choice.
    
    verbose : int, default = 0
        Controls verbosity during fitting and predicting, 0 being none and 3 being the most detailed. 
    
    


"""



class CustomEstimator(BaseEstimator, ClassifierMixin):
    
    def __init__(self, n_estimators=50,
                base_estimator=DecisionTreeClassifier(),
                n_jobs=1,
                bootstrap_method="rejection",
                similarity_metric="rbf",
                verbose = 0):
        
        
        
        self.n_estimators = n_estimators 
        self.base_estimator = base_estimator
        self.n_jobs = n_jobs
        self.bootstrap_method = bootstrap_method
        self.similarity_metric = similarity_metric
        self.verbose = verbose 
        
        _validate(self.similarity_metric, self.bootstrap_method)
        
    def fit(self,X,y):
        
        self.n_classes_= np.max(y)+1
        
        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Bootstrapping...")
            
        K = pairwise_kernels(X, metric=self.similarity_metric)
        
        if self.verbose > 1:
            print("Calculated similarity matrix of shape",np.shape(K))
            
        if self.bootstrap_method=="rejection":  
            self.X_sets, self.y_sets = _rejection_bootstrap(X, y, K, self.n_estimators, self.n_jobs)
        elif self.bootstrap_method=="randomchoice": 
            self.X_sets, self.y_sets = _randomchoice_bootstrap(X, y, K, self.n_estimators, verbose=self.verbose)
        
        
        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Generating models...")
        
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_parallel_fit)(self.base_estimator, self.X_sets, self.y_sets, i) 
                                                        for i in range(0,self.n_estimators))
            
        self.X_ = X
        self.y_ = y

        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Done!")
        return self
                
    
    def predict(self, X):
        
        predicted_probability = self.predict_proba(X)
        return np.argmax(predicted_probability, axis=1)
    
    
#     def predict_proba_old(self, X):
            
#         if self.verbose > 0:
#             print("Predicting...")
            
#         out = np.zeros(shape=(len(X),self.n_classes_))
        
#         #compute similarity between instance and each training set
        
#         if self.verbose > 0:
#             print("Computing models similarity to instance...")
        
#         Z=[]
#         for j in range (0,self.n_estimators):
#             Z.append(pairwise_kernels(X, self.X_sets[j], metric=self.similarity_metric))
# #             if self.verbose > 1:
# #                 print("Similarity matrix for instance to model",j,":",Z[j])
        
        
        
#         if self.verbose > 0:
#             print("Making predictions with each trained model...")
            
          
#         for i in range(0,len(X)): #for each sample to predict
#             if self.verbose > 1:  
#                 print("\n--- EVALUATING SAMPLE",i," ---")
            
#             votes = []
#             votes_weighted = []
#             sim_means = np.zeros(self.n_estimators)
            
#             voting_power = np.zeros(self.n_estimators)
            
#             for j in range (0,self.n_estimators): #for each estimator trained
                
#                 votes.append(self.estimators_[j].predict_proba(X[i].reshape(1, -1))) 
                
#                 sim_means[j] = np.mean(Z[j][i])
                
                
#             for k in range (0,self.n_estimators):
                     
#                 voting_power[k] = (100*sim_means[k]/np.sum(sim_means))/100
#                 votes_weighted.append(votes[k]*voting_power[k])
                
#                 if self.verbose > 2:
#                     print("model",k,"votes",np.round(votes[k],4),"having ~",np.round(sim_means[k],4),
#                           "similarity (voting power:",np.round(voting_power[k],4),")")
                    
#             print(np.sum(votes_weighted[i],axis=0))
#             out[i]=(np.sum(votes_weighted,axis=0))
#             if self.verbose > 1:
#                 print("FINAL PREDICTION:",out[i],)
            
        
#         if self.verbose > 0:
#             print("Done!")

#         return out
    
    def predict_proba(self, X):
        #New faster implementation but a bit less human-readable. Avoids loading all models for each sample evaluation.
        
        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Predicting...")
            
        out = np.zeros(shape=(len(X),self.n_classes_))        
        Z = np.zeros(shape=(self.n_estimators,len(X),len(self.X_sets[0])))            
        preds = []
        sim_means = np.zeros(self.n_estimators)

        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Computing predictions for each model...")
            
        for i in range (0,self.n_estimators): #for each estimator trained            
            Z[i] = (pairwise_kernels(X, self.X_sets[i], metric = self.similarity_metric))
            
            #load estimator model
#             model_name = "model"+str(i)+".pkl"
#             model = joblib.load(model_name)
          
#             preds.append(model.predict_proba(X))
              
        preds = Parallel(n_jobs=self.n_jobs)(delayed(_parallel_models_predict)(self.estimators_[j], X) 
                                     for j in range(0,self.n_estimators))
        
        print(preds)
        
        if (self.similarity_metric=='rbf'):
            pass
        
        
        votes = np.zeros((len(X),len(preds),self.n_classes_))
        
        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Computing similarities between models and samples...")
            
        
        
        for i in range (0,len(X)): #for each sample to predict
            
            if self.verbose > 2:  
                    print("\n--- EVALUATING SAMPLE",i," ---")
            
            for j in range (0,self.n_estimators): #for each model
                sim_means[j] = np.mean(Z[j][i])
            
            
            for j in range (0,self.n_estimators):
                #weight votes
                #print(np.shape(votes[i][j]),"a",len(preds[j][i]))
                votes[i][j] = preds[j][i] * ((100*sim_means[j]/np.sum(sim_means))/100)
                
                if self.verbose > 2:
                    print("model",j,"votes",np.argmax(votes[i][j]),"having ~",np.round(sim_means[j],4),
                          "similarity (voting power:",np.round(((100*sim_means[j]/np.sum(sim_means))/100),4),")")
                    
            out[i]=(np.sum(votes[i],axis=0))

    
            if self.verbose > 1:
                print("SAMPLE",i,"PREDICTION:",out[i],)
                
                
        if self.verbose > 0:
            print(datetime.datetime.now().time(),"Done!")
                
        return(out)
        

In [12]:
import dill as pickle

In [13]:
# Test instance 

clf = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='laplacian', bootstrap_method="randomchoice",verbose=3)

clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_test)

08:25:01.549945 Bootstrapping...
Calculated similarity matrix of shape (120, 120)
Chose seed 98
Populating set 0 ...
Chose new seed 35 based on previous one: 98 w/ similarity: 0.6537697851298472
Populating set 1 ...
Chose new seed 83 based on previous one: 35 w/ similarity: 0.5352614285189903
Populating set 2 ...
Chose new seed 112 based on previous one: 83 w/ similarity: 0.21762105686523284
Populating set 3 ...
Chose new seed 80 based on previous one: 112 w/ similarity: 0.4274149319487267
Populating set 4 ...
Chose new seed 30 based on previous one: 80 w/ similarity: 0.16948344949947006
Populating set 5 ...
Chose new seed 74 based on previous one: 30 w/ similarity: 0.10025884372280375
Populating set 6 ...
Chose new seed 48 based on previous one: 74 w/ similarity: 0.5352614285189903
Populating set 7 ...
Chose new seed 15 based on previous one: 48 w/ similarity: 0.20189651799465538
Populating set 8 ...
Chose new seed 36 based on previous one: 15 w/ similarity: 0.24050846320834207
Popula

model 49 votes 2 having ~ 0.2797 similarity (voting power: 0.012 )
SAMPLE 28 PREDICTION: [0. 0. 1.]

--- EVALUATING SAMPLE 29  ---
model 0 votes 1 having ~ 0.5472 similarity (voting power: 0.0205 )
model 1 votes 1 having ~ 0.5661 similarity (voting power: 0.0212 )
model 2 votes 1 having ~ 0.6036 similarity (voting power: 0.0226 )
model 3 votes 1 having ~ 0.4804 similarity (voting power: 0.018 )
model 4 votes 1 having ~ 0.5757 similarity (voting power: 0.0216 )
model 5 votes 1 having ~ 0.444 similarity (voting power: 0.0167 )
model 6 votes 1 having ~ 0.5504 similarity (voting power: 0.0206 )
model 7 votes 1 having ~ 0.5687 similarity (voting power: 0.0213 )
model 8 votes 1 having ~ 0.4456 similarity (voting power: 0.0167 )
model 9 votes 1 having ~ 0.6008 similarity (voting power: 0.0225 )
model 10 votes 1 having ~ 0.5235 similarity (voting power: 0.0196 )
model 11 votes 1 having ~ 0.6066 similarity (voting power: 0.0227 )
model 12 votes 1 having ~ 0.5441 similarity (voting power: 0.0204

In [14]:
# #Accuracy comparison w/different parameters

# clf1 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier())
# clf2 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),bootstrap_method="randomchoice")
# clf3 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),similarity_metric="laplacian")
# clf4 = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),similarity_metric="laplacian",bootstrap_method="randomchoice")

# estimators = [("rbf/rej",clf1),("rbf/raC",clf2),("Lap/rej",clf3),("Lap/raC",clf4)]

# for i in range(0,len(estimators)):
#     estimators[i][1].fit(X_train, y_train)
#     y_pred = estimators[i][1].predict(X_test)
#     print(estimators[i][0],":",metrics.accuracy_score(y_test, y_pred))
    

In [15]:
from sklearn.metrics import accuracy_score,balanced_accuracy_score,roc_auc_score,f1_score

# 'acc': accuracy_score(y_test, y_pred),
# 'balacc': balanced_accuracy_score(y_test, y_pred),
# 'microf1': f1_score(y_test, y_pred, average='micro'),
# 'macrof1': f1_score(y_test, y_pred, average='macro')

print("Accuracy =",accuracy_score(y_test, y_pred))
print("Balanced accuracy =",balanced_accuracy_score(y_test, y_pred))
print("Micro F1 score =",f1_score(y_test, y_pred, average='micro'))
print("Macro F1 score =",f1_score(y_test, y_pred, average='macro'))

ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [None]:
#Test parallelizzazione
clf1 = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=1,
                             similarity_metric='rbf')

clf2 = CustomEstimator(n_estimators=50, base_estimator=DecisionTreeClassifier(), n_jobs=-1,
                             similarity_metric='rbf')
print("\n --- NON-PARALLEL ---\n  Fit:")
%time clf1.fit(X_train, y_train)
print("  Predict:")
%time clf1.predict(X_test)
print("\n --- PARALLEL ---\n  Fit:")
%time clf2.fit(X_train, y_train)
print("  Predict:")
%time clf2.predict(X_test)

In [None]:
# Comparative tests

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)

svc_clf = SVC(probability=True)

tree_clf = DecisionTreeClassifier(max_depth=16)

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

custom_clf = CustomEstimator(similarity_metric="laplacian", bootstrap_method="randomchoice",n_estimators=50, n_jobs=-1)

custom_clf2 = CustomEstimator(n_estimators=50, base_estimator=SVC(probability=True), n_jobs=-1)

custom_clf3 = CustomEstimator(n_estimators=50, similarity_metric="laplacian", bootstrap_method="randomchoice", n_jobs=-1)

for clf in (rnd_clf, svc_clf, tree_clf, bag_clf, custom_clf, custom_clf2, custom_clf3):
    
    print("---",clf.__class__.__name__,"---")
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print("Accuracy =",accuracy_score(y_test, y_pred))
    print("Balanced accuracy =",balanced_accuracy_score(y_test, y_pred))
    print("Micro F1 score =",f1_score(y_test, y_pred, average='micro'))
    print("Macro F1 score =",f1_score(y_test, y_pred, average='macro'),"\n")
    
#     results, names = list(), list()
#     scores = evaluate_model(clf, X, y)
#     results.append(scores)
#     names.append(clf.__class__.__name__)
#     print('%s %.3f (%.3f)' % ("Cross validation score:", np.mean(scores), np.std(scores)))



In [None]:
from sklearn.metrics.pairwise import check_pairwise_arrays

pairwise_kernels(X, Y=None, metric="cosine")

#[‘additive_chi2’, ‘chi2’, ‘linear’, ‘poly’, ‘polynomial’, ‘rbf’, ‘laplacian’, ‘sigmoid’, ‘cosine’]

#normalized metrics: rbf, laplacian




In [None]:
#Debug parallelizzazione
#Ipotesi: non funziona se in funzione ci sono delle variabili iterate

lst = []
lst4=[]
lst2 = [randint(0,10),randint(0,10),randint(0,10),randint(0,10),randint(0,10)]
lst3 = [randint(0,10),randint(0,10),randint(0,10),randint(0,10),randint(0,10)]

print (lst2) 
print(lst3)

def doStuff(i):
    return lst2[i]+lst3[i], lst2[i]
     
    
lst = Parallel(n_jobs=-1)(delayed(doStuff)(i) for i in range(0,5))

lst,lst4

In [None]:
a = np.array([[1, 2, 3], [4,5,6]])
a.transpose(1, 0)

In [None]:
clf = CustomEstimator(n_estimators=50, n_jobs=-1,
                             similarity_metric='laplacian', bootstrap_method="randomchoice",verbose=0)

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [None]:
rnd_clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)

rnd_clf.fit(X_train, y_train)
y_pred=rnd_clf.predict(X_test)

In [None]:
import datetime
now = datetime.datetime.now()
print(now.time())



In [None]:
print(datetime.datetime.now().time())