In [1]:
import numpy as np
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics.pairwise import pairwise_kernels

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
from sklearn.datasets import make_classification
X, y = make_classification(n_classes=3, n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=2)

In [10]:

class CustomEstimator(BaseEstimator):
    
    def __init__(self, n_estimators,
                 base_estimator=DecisionTreeClassifier(),
                 similarity_function=pairwise_kernels(X, X, metric='rbf')):
        
        self.n_estimators = n_estimators
        self.base_estimator = base_estimator
        self.similarity_function = similarity_function
        
    
    def fit(self,X,y):
        self.n_classes = np.max(y)+1
        
        print("Creating bagged training sets...")
        self.X_sets, self.y_sets = custom_bagging(X, y, self.similarity_function, self.n_estimators)
        
        #support lists to compose the estimator list that can be accepted by the voting clf
        est_clf = []
        est_names = []
        
        print("Fitting models...")
        for i in range(self.n_estimators):           
            est_clf.append(self.base_estimator.fit(self.X_sets[i], self.y_sets[i]))
            est_names.append(("model % s" % i))
            
        #self.value_ = list(zip(est_names,est_clf))
        self.value_ = est_clf
        
        return self
            
    
    def predict(self, X):
        Z=[]
        weighted_predictions=np.empty(shape=(self.n_estimators,2))
        out=np.empty(shape=(len(X)))
        
        
        for i in range (0,self.n_estimators):
            Z.append(pairwise_kernels(X, self.X_sets[i], metric='rbf'))
            
#         print(np.shape(Z))
#         print(self.n_estimators,len(X),len(self.X_sets[0]))
#         print(Z[0])
            
        for i in range(0,len(X)):
            print("evaluating sample",i)
            votes=np.zeros(shape=(self.n_classes))
            for j in range (0,self.n_estimators):
                weighted_predictions[j][0]=(self.value_[j].predict(X[i].reshape(1, -1)))
                weighted_predictions[j][1]=(np.mean(Z[j][i]))
                #print("Estimator",j,"predicts",weighted_predictions[j,0],"having a similarity score of",weighted_predictions[j,1])
                votes[int(weighted_predictions[j][0])] += weighted_predictions[j][1]
                
                if np.max(votes) == votes[int(weighted_predictions[j][0])]:
                    out[i] = weighted_predictions[j,0]
            print("Votes result:",votes)

        
        #out = self.value_.predict(X)
        
        print(out)
        return out
    
    def predict_proba(self, X):
        out = self.value_.predict_proba(X)
        return out

In [25]:
# Test instance 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

custom_clf = CustomEstimator(n_estimators=5,
                             base_estimator=tree_clf,
                             similarity_function=pairwise_kernels(X, metric='rbf'))

custom_clf.fit(X_train, y_train)
y_pred = custom_clf.predict(X_test)
print(custom_clf.__class__.__name__,"=",metrics.accuracy_score(y_test, y_pred))

#Commenting this here as I won't do it again: 
#tested 7/18/22 version with make_classification with 4 classes, 500 estimators
#Accuracy score ~0.635
#Somehow each estimator predicts the same thing for each sample

Creating bagged training sets...
populating set 0
populating set 1
populating set 2
populating set 3
populating set 4
Fitting models...
evaluating sample 0
Votes result: [0.         0.00362218 0.        ]
evaluating sample 1
Votes result: [0.        0.0002602 0.       ]
evaluating sample 2
Votes result: [0.         0.         0.00258056]
evaluating sample 3
Votes result: [0.         0.         0.00017691]
evaluating sample 4
Votes result: [0.         0.00752516 0.        ]
evaluating sample 5
Votes result: [0.         0.00258659 0.        ]
evaluating sample 6
Votes result: [0.         0.         0.00749409]
evaluating sample 7
Votes result: [0.         0.         0.00038151]
evaluating sample 8
Votes result: [0.         0.00130242 0.        ]
evaluating sample 9
Votes result: [0.         0.00322474 0.        ]
evaluating sample 10
Votes result: [0.         0.00169732 0.        ]
evaluating sample 11
Votes result: [0.         0.00437191 0.        ]
evaluating sample 12
Votes result: [0

In [24]:
from sklearn.metrics import pairwise_distances
from random import randint
import random

def custom_bagging(X, y, similarity_function, n_estimators):
    
    #Initializing list of random training sets
    X_sets = []
    y_sets = []
    
    for i in range(0,n_estimators):
        print("populating set", i)
        instance_X = np.empty(np.shape(X))
        instance_y = np.empty(np.shape(y))
        
        #creating seed for specific training set
        seed = randint(0,len(X)-1)
        
        instance_X[0] = X[seed]
        instance_y[0] = y[seed]       
        
        n_entries = 1
        
        #populating training set
        while n_entries < len(X):
            rand = randint(0,len(X)-1)
            if accept_entry(similarity_function[seed,rand],n_entries/(len(X)-1)):
                instance_X[n_entries] = X[rand]
                instance_y[n_entries] = y[rand]
                n_entries+=1
        X_sets.append(instance_X)
        y_sets.append(instance_y)     
    
    return X_sets,y_sets

def accept_entry(similarity_score, p_ratio, prob_threshold = 0.5):
    #p_ratio gradually shifts the decisional weight from similarity_score to randomness as the test set gets filled up.
    #Specifically, at least least a small number of entries are accepted only through randomness (p_ratio=1)
    #this way it's unlikely for the method to come up with sets with only one class (if not impossible, but I need to check)
    
    #It is also possible to modify the prob_threshold in order to get more or less random values, assuring more variance.
    #I think this tweakability could prove useful to adapt the model to specific instances.
    
    #PLEASE NOTE THAT THIS CODE MAKES SENSE WITH NORMALIZED SIMILARITY SCORES
    #NORMALIZATION FOR FUNCTION THAT DO NOT RETURN [0,1] VALUES STILL NEEDS TO BE IMPLEMENTED.
    #It's not like it doesn't work, but prob calculations take the high road
    
    prob = similarity_score*(1-p_ratio)+ random.random()*p_ratio
    if prob > prob_threshold:
        return True
    else:
        return False
    
def normalize(X):
    #todo, in case the class needs to accept non-normalized similarity functions
    return X


#Test
X_sets, y_sets = custom_bagging(X, y,pairwise_kernels(X, metric='rbf'),5)

for i in range(len(X_sets)):
    print("set",i,":")
    for j in range (0,len(X_sets[0][:,0])):
        if j==0:
            print ("seed: ",y_sets[i][j])
            s = y_sets[i][j]
            g=0
        else:
            if y_sets[i][j] == s:
                g+=1
    print (g,"/",j,"elements with same seed value (",g/j*100,"%)")
    


populating set 0
populating set 1
populating set 2
populating set 3
populating set 4
set 0 :
seed:  1.0
472 / 999 elements with same seed value ( 47.247247247247245 %)
set 1 :
seed:  2.0
472 / 999 elements with same seed value ( 47.247247247247245 %)
set 2 :
seed:  0.0
480 / 999 elements with same seed value ( 48.048048048048045 %)
set 3 :
seed:  0.0
462 / 999 elements with same seed value ( 46.246246246246244 %)
set 4 :
seed:  1.0
454 / 999 elements with same seed value ( 45.44544544544545 %)


In [8]:
# Comparative tests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

log_clf = LogisticRegression()

svc_clf = SVC(probability=True)

tree_clf = DecisionTreeClassifier(max_depth=2)

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)

custom_clf = CustomEstimator(n_estimators=5,base_estimator=svc_clf)

from sklearn import metrics

for clf in (rnd_clf, log_clf, svc_clf, tree_clf, bag_clf, ada_clf, custom_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,"=",metrics.accuracy_score(y_test, y_pred))



RandomForestClassifier = 0.805
LogisticRegression = 0.845
SVC = 0.9
DecisionTreeClassifier = 0.545
BaggingClassifier = 0.83
AdaBoostClassifier = 0.74
fitting model...
evaluating sample 0
Votes result: [0.00034781 0.         0.        ]
evaluating sample 1
Votes result: [0.         0.00164938 0.        ]
evaluating sample 2
Votes result: [0.00031312 0.         0.        ]
evaluating sample 3
Votes result: [0.         0.         0.01930342]
evaluating sample 4
Votes result: [0.         0.00026846 0.        ]
evaluating sample 5
Votes result: [0.        0.        0.0007126]
evaluating sample 6
Votes result: [0.0036796 0.        0.       ]
evaluating sample 7
Votes result: [0.         0.         0.00083389]
evaluating sample 8
Votes result: [0.00083023 0.         0.        ]
evaluating sample 9
Votes result: [0.         0.00793976 0.        ]
evaluating sample 10
Votes result: [0.        0.0010145 0.       ]
evaluating sample 11
Votes result: [0.00115879 0.         0.        ]
evaluating s

Votes result: [0.         0.00036045 0.        ]
evaluating sample 198
Votes result: [0.00000000e+00 5.36846012e-05 0.00000000e+00]
evaluating sample 199
Votes result: [0.00413549 0.         0.        ]
[0. 1. 0. 2. 1. 2. 0. 2. 0. 1. 1. 0. 1. 0. 2. 1. 1. 1. 2. 2. 1. 1. 0. 1.
 0. 1. 1. 2. 2. 2. 2. 2. 1. 0. 2. 0. 0. 0. 0. 2. 1. 0. 0. 2. 1. 1. 2. 1.
 2. 2. 2. 1. 2. 1. 0. 0. 0. 0. 2. 0. 0. 2. 2. 2. 0. 0. 2. 1. 2. 0. 1. 0.
 1. 1. 2. 1. 2. 2. 2. 0. 2. 1. 0. 0. 1. 2. 0. 2. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 1. 2. 1. 0. 0. 1. 2. 1. 1. 1. 2. 0. 1. 0. 0. 0. 1. 1. 1. 1. 2. 1. 1.
 1. 0. 1. 2. 0. 2. 0. 2. 2. 1. 0. 1. 0. 1. 1. 2. 1. 1. 2. 1. 0. 2. 0. 1.
 1. 2. 0. 2. 1. 1. 2. 2. 0. 0. 2. 0. 2. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 2. 0. 2. 1. 2. 1. 1. 2. 0. 1. 1. 2. 0. 0. 2. 0. 2. 2. 0. 1. 1. 1. 1. 0.
 0. 2. 2. 0. 2. 1. 1. 0.]
CustomEstimator = 0.835
