### SemiBoost implementation, adapted by Alexandra Sklokin from papabloblo's GitHub repository.

#### References: 

papabloblo, SemiBoost, (2018), GitHub repository, https://github.com/papabloblo/semi_boost

In [5]:
import numpy as np
from sklearn import neighbors
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
from scipy.spatial.distance import pdist,squareform
from sklearn.metrics.pairwise import rbf_kernel
import pandas as pd
import time
import math

class SemiBoostClassifier():
    """Semi-supervised boosting wrapper around a binary base classifier.

    Labels passed to ``fit`` use 1 for the positive class and 0 for the
    negative class; any other value marks a row as unlabeled. Base models
    are always trained on that same 0/1 encoding, while the boosting
    arithmetic uses +1/-1 votes internally.

    Attributes set by ``fit``:
        S: sparse pairwise matrix built by the chosen kernel.
        models: fitted copies of the base model, one per boosting round.
        weights: the weight given to each entry of ``models``.
    """

    #Note that convergence of the model may depend on the BaseModel. This is the model which will be boosted.
    def __init__(self, base_model=None):
        """Store the estimator to boost.

        Args:
            base_model: object exposing ``fit(X, y)``, ``predict(X)`` and
                ``predict_proba(X)``. When omitted, a fresh
                ``SVC(probability=True)`` is created for this instance.
        """
        # A default built in the signature is evaluated once and would be
        # shared (and refitted) by every instance of the class.
        if base_model is None:
            base_model = SVC(probability=True)
        self.BaseModel = base_model

    @staticmethod
    def _as_signs(predictions):
        """Map base-model output to votes: label 1 -> +1, anything else -> -1."""
        return np.where(np.asarray(predictions) == 1, 1, -1)

    #Function to build the classifier model. Iteratively boost the BaseModel by pseudolabeling.
    def fit(self, X, y1,
            n_neighbors=4, n_jobs = 1,
            max_models = 15,
            sample_percent = 0.5,
            sigma_percentile = 90,
            similarity_kernel = 'rbf',
            verbose = False):
        """Fit the ensemble by repeatedly pseudo-labeling unlabeled rows.

        Args:
            X: feature table; rows are selected with ``X.iloc``.
            y1: labels aligned with ``X``. 1 = positive, 0 = negative, any
                other value = unlabeled. The input is not modified.
            n_neighbors: neighbours per row for the ``'knn'`` kernel.
            n_jobs: parallel jobs for the ``'knn'`` kernel.
            max_models: maximum number of boosting rounds.
            sample_percent: fraction of the remaining unlabeled rows that is
                pseudo-labeled in each round (at least one row is drawn).
            sigma_percentile: percentile of the log-similarities used as the
                bandwidth of the ``'rbf'`` kernel.
            similarity_kernel: ``'rbf'`` or ``'knn'``.
            verbose: print progress messages.

        Raises:
            ValueError: if ``similarity_kernel`` is not ``'rbf'`` or ``'knn'``.
        """
        import copy  # function-scope so the class needs no new file-level import

        # np.array copies, so pseudo-labels never leak into the caller's data.
        y = np.array(y1)

        # Localize labeled data
        labeled_mask = np.isin(y, [0, 1])
        idx_label = np.flatnonzero(labeled_mask)
        idx_not_label = np.flatnonzero(~labeled_mask)

        # Internal +1/-1 view of the labels (0 = still unlabeled). Without it
        # the labeled negatives (encoded 0) never matched the "== -1" test
        # and were ignored by the negative-confidence term.
        signs = np.zeros(y.size)
        signs[y == 1] = 1
        signs[y == 0] = -1

        if verbose:
            print('There are still ', idx_not_label.shape[0], ' unlabeled observations')

        # NOTE(review): the SemiBoost paper scales the unlabeled-unlabeled
        # terms by C/2 (C = labeled/unlabeled ratio). The original code
        # computed C but never applied it; that behaviour is kept - confirm.

        # First we need to create the similarity matrix
        if similarity_kernel == 'knn':
            # NOTE(review): mode='distance' stores distances, not
            # similarities - confirm this is intended.
            self.S = neighbors.kneighbors_graph(X,
                                                n_neighbors=n_neighbors,
                                                mode='distance',
                                                include_self=True,
                                                n_jobs=n_jobs)
            self.S = sparse.csr_matrix(self.S)

        elif similarity_kernel == 'rbf':
            # log of sqrt(exp(-d^2)) is -d^2/2. Working in log space avoids
            # the exp -> log round trip, which underflows to log(0) = -inf
            # for distant pairs.
            features = np.asarray(X, dtype=float)
            log_s = -0.5 * squareform(pdist(features, metric='sqeuclidean'))
            # Bandwidth taken from the requested percentile of log-similarity.
            sigma = np.percentile(log_s, sigma_percentile)
            if sigma == 0:
                # Limit of S ** (1 / sigma**2): only coincident points stay similar.
                dense = (log_s == 0).astype(float)
            else:
                dense = np.exp(log_s / sigma ** 2)
            # Matrix to sparse
            self.S = sparse.csr_matrix(dense)

        else:
            raise ValueError('No kernel type ' + str(similarity_kernel))

        # If there is no unlabelled data, simply train the classifier on all of the data (supervised learning).
        if idx_not_label.size == 0:
            clf = copy.deepcopy(self.BaseModel)
            clf.fit(X, y)
            self.models = [clf]
            # A unit weight keeps the weighted predict paths usable.
            self.weights = [1.0]
            return

        #=============================================================
        # Initialise variables
        #=============================================================
        self.models = []
        self.weights = []
        H = np.zeros(idx_not_label.size)

        # Loop for adding sequential models
        for t in range(max_models):

            #=============================================================
            # Calculate p_i and q_i for every unlabeled sample
            #=============================================================
            rows = self.S[idx_not_label]
            to_labeled = rows[:, idx_label]
            to_unlabeled = rows[:, idx_not_label]
            labeled_signs = signs[idx_label]

            p = ((to_labeled @ (labeled_signs == 1).astype(float)) * np.exp(-2 * H)
                 + (to_unlabeled @ np.exp(H)) * np.exp(-H))
            q = ((to_labeled @ (labeled_signs == -1).astype(float)) * np.exp(2 * H)
                 + (to_unlabeled @ np.exp(-H)) * np.exp(H))

            #=============================================================
            # Compute predicted label z_i
            #=============================================================
            z = np.sign(p - q)
            z_conf = np.abs(p - q)

            #=============================================================
            # Sample sample_percent most confident predictions
            #=============================================================
            total_conf = np.sum(z_conf)
            # "not > 0" also catches NaN; dividing by a zero total would
            # yield NaN weights that slip past a "!= 0" test.
            if not total_conf > 0:
                print('No similar unlabeled observations left.')
                break
            sample_weights = z_conf / total_conf

            # Sampling without replacement cannot return more rows than have
            # a non-zero weight; always draw at least one so rounds progress.
            n_candidates = int(np.count_nonzero(sample_weights))
            n_draw = min(max(1, int(sample_percent * idx_not_label.size)), n_candidates)
            idx_aux = np.random.choice(z.size, size=n_draw, replace=False, p=sample_weights)
            idx_sample = idx_not_label[idx_aux]

            # Record the pseudo-labels in both encodings.
            signs[idx_sample] = z[idx_aux]
            y[idx_sample] = np.where(z[idx_aux] > 0, 1, 0)

            # Create new X_t, y_t
            idx_label = np.concatenate([idx_label, idx_sample])
            X_t = X.iloc[idx_label]
            y_t = y[idx_label]

            #=============================================================
            # Fit BaseModel to samples using predicted labels
            #=============================================================
            # Each round gets its own copy; refitting one shared object would
            # make every entry of self.models the same estimator.
            clf = copy.deepcopy(self.BaseModel)
            clf.fit(X_t, y_t)
            # Votes for the rows that were unlabeled when p and q were computed.
            h = self._as_signs(clf.predict(X.iloc[idx_not_label]))

            # Refresh indexes
            idx_not_label = np.setdiff1d(idx_not_label, idx_sample)

            #=============================================================
            # Compute weight (a) for the BaseModel as in (12)
            #=============================================================
            e = (np.dot(p, h == -1) + np.dot(q, h == 1)) / np.sum(p + q)
            if e == 0:
                # Avoid log(1/0); with this floor the weight is always finite.
                e = 0.00001
            a = 0.25 * np.log((1 - e) / e)

            #=============================================================
            # Update final model
            #=============================================================
            # If a<0 the model is not converging
            if a < 0:
                if verbose:
                    print('Problematic convergence of the model. a<0')
                break

            # Save model and its weight. This happens before the
            # "everything is labeled" exit so the last fitted model is kept.
            self.models.append(clf)
            self.weights.append(a)

            # If no samples are left without label, break
            if idx_not_label.size == 0:
                if verbose:
                    print('All observations have been labeled')
                    print('Number of iterations: ', t + 1)
                break
            if verbose:
                print('There are still ', idx_not_label.shape[0], ' unlabeled observations')

            # Update the ensemble score of the rows that are still unlabeled.
            X_rest = X.iloc[idx_not_label]
            H = np.zeros(idx_not_label.size)
            for weight, model in zip(self.weights, self.models):
                H += weight * self._as_signs(model.predict(X_rest))

            # Maximum number of models reached
            if verbose and t == max_models - 1:
                print('Maximum number of models reached')

        if verbose:
            print('\n The model weights are \n')
            print(self.weights)

    def predict(self, X, semi):
        """Predict class labels for ``X``.

        Args:
            X: feature table accepted by the base models.
            semi: when true, return the weighted vote of all stored models
                encoded as +1 / -1; otherwise return the raw output of the
                last stored model.
        """
        if semi:
            estimate = np.zeros(X.shape[0])
            # Predict weighting each model
            for weight, model in zip(self.weights, self.models):
                estimate += weight * self._as_signs(model.predict(X))
            return np.where(estimate > 0, 1, -1).astype(int)
        return self.models[-1].predict(X)

    def predict_proba(self, X, semi):
        """Return the second ``predict_proba`` column for ``X``.

        Args:
            X: feature table accepted by the base models.
            semi: when true, return the weight-normalised average over all
                stored models; otherwise use only the last stored model.
        """
        if semi:
            w = np.sum(self.weights)
            estimate = np.zeros(X.shape[0])
            # Predict weighting each model
            for weight, model in zip(self.weights, self.models):
                estimate += weight * model.predict_proba(X)[:, 1] / w
            return estimate
        return self.models[-1].predict_proba(X)[:, 1]


# Testing the Code

In [2]:
# Dataset stem shared by the undersampled-train and test CSV file names.
filename1 = 'online_shoppers_intentions'

# Column that holds the label to predict.
target1 = 'Revenue'

# Categorical feature columns (an empty list would disable them).
categorical_features1 = [
    "OperatingSystems", "Browser", "Region", "TrafficType",
    "VisitorType", "Weekend", "Month",
]

# The numerical columns are listed and then immediately replaced by an empty
# list, so only the categorical columns are in effect below.
numerical_features1 = [
    "Administrative", "Administrative_Duration",
    "Informational", "Informational_Duration",
    "ProductRelated", "ProductRelated_Duration",
    "BounceRates", "ExitRates", "PageValues", "SpecialDay",
]
numerical_features1 = []

# Every frame uses the first CSV column as its index.
df1r10 = pd.read_csv("../data/train/noresampling/test.csv", index_col=0)
df1u10 = pd.read_csv(f"../data/train/undersampled/{filename1}_10.csv", index_col=0)
df1t = pd.read_csv(f"../data/test/{filename1}.csv", index_col=0)

In [3]:
def SemiBoost_Test(dftrain, dftest, categorical_features, numerical_features, target, model, semi):
    """Fit ``model`` on the training frame and time a predict round on the test frame.

    Args:
        dftrain: training frame; missing values in ``target`` mark unlabeled rows.
        dftest: test frame holding the same feature columns.
        categorical_features: column names to cast to the ``category`` dtype.
        numerical_features: column names used as-is.
        target: name of the label column in ``dftrain``.
        model: object exposing ``fit(X, y, verbose=..., similarity_kernel=...)``,
            ``predict(X, semi)`` and ``predict_proba(X, semi)``.
        semi: flag forwarded to ``predict`` and ``predict_proba``.

    Returns:
        ``[execution, y_pred, y_pred_prob]`` where ``execution`` is the elapsed
        wall-clock time in seconds for fitting plus both prediction calls.
    """
    start_time = time.time()

    feature_columns = numerical_features + categorical_features

    # Explicit copies: the dtype casts below must not touch the caller's
    # frames, and assigning into a bare column selection triggers pandas'
    # chained-assignment warning.
    X_train = dftrain[feature_columns].copy()
    X_test = dftest[feature_columns].copy()

    # Rows without a target are unlabeled; they are passed to the model as -1.
    pseudo = dftrain[target].fillna(-1)

    # Nothing to cast when no categorical columns were requested.
    if categorical_features:
        X_train[categorical_features] = X_train[categorical_features].astype('category')
        X_test[categorical_features] = X_test[categorical_features].astype('category')

    model.fit(X_train, pseudo, verbose=True, similarity_kernel = 'rbf')
    y_pred = model.predict(X_test, semi)
    y_pred_prob = model.predict_proba(X_test, semi)

    execution = time.time() - start_time

    return [execution, y_pred, y_pred_prob]

In [4]:
#Sanity Check

#[t, p, pp] = SemiBoost_Test(df1u10, df1t, categorical_features1, numerical_features1, target1, SemiBoostClassifier(RandomForestClassifier()), False)

#pp
#p