In [1]:
import numpy as np
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics.pairwise import pairwise_kernels
import joblib
from sklearn import metrics

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

In [2]:
#Serious dataset
# Synthetic 3-class problem: 1000 samples, 6 features (4 informative, 2 redundant), fixed seed.
# Running this cell rebinds the notebook-wide X and y; the later dataset cells rebind them again,
# so whichever dataset cell ran last is the one the rest of the notebook uses.
from sklearn.datasets import make_classification
X, y = make_classification(n_classes=3, n_samples=1000, n_features=6, n_informative=4, n_redundant=2, random_state=2)

In [3]:
#Hard dataset
# Synthetic 3-class problem: 1000 samples, 20 features (15 informative, 5 redundant), fixed seed.
# Relies on make_classification having been imported by the "Serious dataset" cell.
# Rebinds the notebook-wide X and y.

X, y = make_classification(n_classes=3, n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=2)

In [4]:
#Toy dataset
# Loads the iris data set bundled with scikit-learn and rebinds the notebook-wide X and y to it.
# `iris` itself is kept because the last cell of the notebook reads iris.data / iris.target again.
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the currently selected dataset for evaluation.
# The split is seeded (same seed as the synthetic datasets above) so that a
# "Restart & Run All" reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

In [47]:
from sklearn.metrics import pairwise_distances
from random import randint
import random

def _custom_bootstrap(X, y, similarity_function, n_estimators):
    
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    
    for i in range(0,n_estimators):
        instance_X = np.empty(np.shape(X))
        instance_y = np.empty(np.shape(y))
        
        #creating seed for specific training set
        seed = randint(0,len(X)-1)
        
        instance_X[0] = X[seed]
        instance_y[0] = y[seed]       
        
        n_entries = 1
        
        #populating training set
        
        
        if 1-(len(X_sets)+1)/n_estimators>0.8:
            p_thresh = 0.8
        else:
            p_thresh = np.round(1-(len(X_sets)+1)/n_estimators,1)
        
        #print("populating set", i, "with probability acceptance",p_thresh)
        
        
        while n_entries < len(X):
            
            rand = randint(0,len(X)-1)
            
            if _accept_entry(similarity_function[seed,rand],n_entries/(len(X)-1),p_thresh):
                
                instance_X[n_entries] = X[rand]
                instance_y[n_entries] = y[rand]
                n_entries+=1
                
        X_sets.append(instance_X)
        y_sets.append(instance_y)     
    
    return X_sets,y_sets

def _accept_entry(similarity_score, p_ratio, prob_threshold = 0.7):
    #p_ratio gradually shifts the decisional weight from similarity_score to randomness as the test set gets filled up.
    #Specifically, at least least a small number of entries are accepted only through randomness (p_ratio=1)
    #this way it's unlikely for the method to come up with sets with only one class (if not impossible, but I need to check)
    
    #It is also possible to modify the prob_threshold in order to get more or less random values, assuring more variance.
    #I think this tweakability could prove useful to adapt the model to specific instances.
    
    #PLEASE NOTE THAT THIS CODE MAKES SENSE WITH NORMALIZED SIMILARITY SCORES
    #NORMALIZATION FOR FUNCTION THAT DO NOT RETURN [0,1] VALUES STILL NEEDS TO BE IMPLEMENTED.
    #It's not like it doesn't work, but prob calculations take the high road
    
    prob = similarity_score*(1-p_ratio)+ random.random()*p_ratio
    if prob > prob_threshold:
        return True
    else:
        return False
    
def _normalize(X):
    #todo, in case the class needs to accept non-normalized similarity functions
    return X


#Test
# Build 50 resamples of the training split using an RBF similarity matrix
# computed on that same split, so matrix indices line up with the rows.
X_sets, y_sets = _custom_bootstrap(X_train, y_train,pairwise_kernels(X_train, metric='rbf'),50)

#used to measure the effects of p_thresh
# For every resample, report how many of the non-seed rows share the class of
# the seed row (row 0). The class printed is the seed's class.
for i, labels in enumerate(y_sets):
    seed_class = labels[0]
    others = labels[1:]
    # Guard against a one-row set, where there is nothing to compare with.
    share = np.mean(others == seed_class)*100 if len(others) else 0.0
    print ("set",i,":",np.round(share,2),"% of ",seed_class,"class elements, as per seed")
    


set 0 : 87.39 % of  0.0 class elements, as per seed
set 1 : 69.75 % of  0.0 class elements, as per seed
set 2 : 89.08 % of  0.0 class elements, as per seed
set 3 : 52.1 % of  2.0 class elements, as per seed
set 4 : 55.46 % of  0.0 class elements, as per seed
set 5 : 84.03 % of  1.0 class elements, as per seed
set 6 : 63.87 % of  0.0 class elements, as per seed
set 7 : 92.44 % of  2.0 class elements, as per seed
set 8 : 57.98 % of  0.0 class elements, as per seed
set 9 : 68.91 % of  1.0 class elements, as per seed
set 10 : 63.87 % of  2.0 class elements, as per seed
set 11 : 74.79 % of  2.0 class elements, as per seed
set 12 : 73.11 % of  0.0 class elements, as per seed
set 13 : 85.71 % of  0.0 class elements, as per seed
set 14 : 80.67 % of  0.0 class elements, as per seed
set 15 : 59.66 % of  0.0 class elements, as per seed
set 16 : 84.03 % of  0.0 class elements, as per seed
set 17 : 55.46 % of  1.0 class elements, as per seed
set 18 : 49.58 % of  0.0 class elements, as per seed
set 

In [58]:
class CustomEstimator(BaseEstimator):
    """Ensemble classifier whose members vote with similarity-based weights.

    ``fit`` builds ``n_estimators`` similarity-biased resamples of the
    training data with ``_custom_bootstrap`` and trains one clone of
    ``base_estimator`` on each. ``predict_proba`` weights every member's
    class probabilities by the mean kernel similarity between the sample to
    predict and the rows of that member's training set.

    Parameters
    ----------
    n_estimators : int
        Number of resampled training sets / ensemble members.
    base_estimator : estimator, default DecisionTreeClassifier()
        Prototype of the ensemble members. It is cloned before fitting, so
        the object passed in is never fitted itself.
    similarity_function : None, str, callable or 2-D array, default None
        * None: an RBF kernel is computed on the training data in ``fit``.
        * str or callable: used as the ``metric`` of ``pairwise_kernels``
          both in ``fit`` and in ``predict_proba``.
        * 2-D array: a precomputed similarity matrix. It has to be square
          with one row per training sample passed to ``fit``; otherwise a
          warning is issued and the RBF kernel of the training data is used.
    verbose : int, default 0
        0 = silent; 1 = progress messages; 2 = also prints intermediate data.

    Attributes set by ``fit``
    -------------------------
    classes_ : sorted unique labels seen in ``y``.
    n_classes_ : number of entries of ``classes_``.
    estimators_ : list of fitted ensemble members (kept in memory).
    X_sets, y_sets : the resampled training sets.
    X_, y_ : the training data as passed to ``fit`` (as arrays).
    """

    def __init__(self, n_estimators,
                 base_estimator=DecisionTreeClassifier(),
                 similarity_function=None,
                 verbose = 0):

        # Parameters are stored untouched; all validation happens in fit().
        self.n_estimators = n_estimators
        self.base_estimator = base_estimator
        self.similarity_function = similarity_function
        self.verbose = verbose #0 = None; 1 = Messages to confirm that the estimator is alive while working; 2 = spits out data

    def _kernel_metric(self):
        """Return the ``pairwise_kernels`` metric implied by ``similarity_function``."""
        chosen = self.similarity_function
        if isinstance(chosen, str) or callable(chosen):
            return chosen
        # None or a precomputed matrix: fall back to the RBF kernel.
        return 'rbf'

    def _training_similarity(self, X):
        """Return a similarity matrix whose rows/columns match the rows of ``X``."""
        import warnings

        chosen = self.similarity_function
        if chosen is None or isinstance(chosen, str) or callable(chosen):
            return pairwise_kernels(X, metric=self._kernel_metric())

        matrix = np.asarray(chosen)
        expected_shape = (len(X), len(X))
        if matrix.shape != expected_shape:
            # A matrix computed on other data would pair seeds with the
            # similarities of unrelated rows, so it cannot be used as is.
            warnings.warn(
                "similarity_function has shape %s but the training data needs %s; "
                "using an RBF kernel computed on the training data instead."
                % (matrix.shape, expected_shape)
            )
            return pairwise_kernels(X, metric='rbf')
        return matrix

    def fit(self,X,y):
        """Build the resampled training sets and fit one model per set.

        Returns ``self`` so calls can be chained, as scikit-learn expects.
        """
        from sklearn.base import clone

        X = np.asarray(X)
        y = np.asarray(y)

        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        if self.verbose > 0:
            print("Bootstrapping...")
        similarity = self._training_similarity(X)
        self.X_sets, self.y_sets = _custom_bootstrap(X, y, similarity, self.n_estimators)

        if self.verbose > 0:
            print("Fitting...")

        self.estimators_ = []
        for X_set, y_set in zip(self.X_sets, self.y_sets):
            # Keep the label dtype of y so every member's classes_ can be
            # matched against self.classes_.
            labels = np.asarray(y_set).astype(y.dtype)
            # A fresh clone per set: members stay independent and the
            # prototype passed by the caller is left unfitted.
            model = clone(self.base_estimator)
            model.fit(X_set, labels)
            self.estimators_.append(model)

        self.X_ = X
        self.y_ = y

        if self.verbose > 0:
            print("Done!")
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        predicted_probability = self.predict_proba(X)
        return self.classes_[np.argmax(predicted_probability, axis=1)]

    def predict_proba(self, X):
        """Return similarity-weighted class probabilities.

        The result has one row per row of ``X`` and one column per entry of
        ``classes_``. For every sample the members' weights sum to 1.
        """
        if self.verbose > 0:
            print("Predicting...")

        X = np.asarray(X)
        n_samples = len(X)
        n_models = len(self.estimators_)
        metric = self._kernel_metric()

        #compute similarity between instance and each training set
        if self.verbose > 0:
            print("Computing models similarity to instance...")

        # sim_means[j, i]: mean similarity of sample i to training set j.
        sim_means = np.zeros((n_models, n_samples))
        for j in range(n_models):
            sim_means[j] = pairwise_kernels(X, self.X_sets[j], metric=metric).mean(axis=1)

        if self.verbose > 1:
            print(sim_means)

        if self.verbose > 0:
            print("Making predictions with each trained model...")

        # votes[j, i, c]: probability member j assigns to class c for sample i.
        votes = np.zeros((n_models, n_samples, self.n_classes_))
        for j, model in enumerate(self.estimators_):
            # A member trained on a set that lacks some class reports fewer
            # columns; place its columns where they belong.
            member_classes = getattr(model, "classes_", self.classes_)
            columns = np.searchsorted(self.classes_, member_classes)
            member_votes = votes[j]
            member_votes[:, columns] = model.predict_proba(X)

        # Normalise over ALL members at once so the weights of a sample sum to 1.
        totals = sim_means.sum(axis=0)
        degenerate = totals == 0
        voting_power = sim_means / np.where(degenerate, 1.0, totals)
        if n_models:
            # No member is similar at all: let every member count equally.
            voting_power[:, degenerate] = 1.0 / n_models

        if self.verbose > 1:
            print("voting power:",voting_power)

        out = (voting_power[:, :, None] * votes).sum(axis=0)

        if self.verbose > 0:
            print("Done!")

        return out

In [56]:
# Test instance 
from sklearn.model_selection import train_test_split

# Seeded so the reported accuracy can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

# The similarity matrix is indexed by training-row position during
# bootstrapping, so it has to be computed on X_train (the data given to fit),
# not on the full X.
clf = CustomEstimator(n_estimators=50,
                             base_estimator=DecisionTreeClassifier(),
                             similarity_function=pairwise_kernels(X_train, metric='rbf'),verbose=1)

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print("Accuracy =",metrics.accuracy_score(y_test, y_pred))

Bootstrapping...
Fitting...
Done!
Predicting...
Computing models similarity to instance...
Making predictions with each trained model...
Done!
Accuracy = 1.0


In [None]:
# Comparative tests

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Seeded so every classifier is compared on the same, reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)

svc_clf = SVC(probability=True)

tree_clf = DecisionTreeClassifier(max_depth=16)

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

# The similarity matrix must describe the rows handed to fit(), so it is built
# from X_train explicitly instead of relying on the class default.
# verbose is an integer level (0, 1 or 2), hence 1 rather than True.
custom_clf = CustomEstimator(n_estimators=50,base_estimator=DecisionTreeClassifier(),
                             similarity_function=pairwise_kernels(X_train, metric='rbf'),verbose=1)

print("Accuracy scores:")
for clf in (rnd_clf, svc_clf, tree_clf, bag_clf, custom_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,":",metrics.accuracy_score(y_test, y_pred))
    
#     results, names = list(), list()
#     scores = evaluate_model(clf, X, y)
#     results.append(scores)
#     names.append(clf.__class__.__name__)
#     print('%s %.3f (%.3f)' % ("Cross validation score:", np.mean(scores), np.std(scores)))



In [None]:
# Scratch cell for exploring the kernels offered by pairwise_kernels.
# check_pairwise_arrays is imported here but not used in this cell.
from sklearn.metrics.pairwise import check_pairwise_arrays

# Rebinds the notebook-wide X and y to the iris data loaded earlier.
X = iris.data
y = iris.target

# Laplacian kernel of X against itself (Y=None); as the last expression of the
# cell its value is shown as the cell output.
pairwise_kernels(X, Y=None, metric="laplacian")

# Kernel names tried with pairwise_kernels:
#['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf', 'laplacian', 'sigmoid', 'cosine']

# Normalized metrics: rbf, laplacian, cosine.
# Cosine doesn't make much sense here, I guess.
# NOTE(review): cosine similarity can be negative for data with mixed signs - confirm
# it really fits the [0, 1] assumption made by _accept_entry before using it.