In [1]:
from   sklearn.base            import BaseEstimator, ClassifierMixin
from   sklearn                 import datasets
import numpy                   as     np
from   sklearn.neighbors       import KernelDensity
from   sklearn.model_selection import GridSearchCV
from   sklearn.model_selection import KFold
from   sklearn.model_selection import train_test_split
from   sklearn.metrics         import roc_auc_score,roc_curve,auc
import utils
import multiprocessing         as mp

In [2]:
#https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html

In [76]:
class KDEClassifier(BaseEstimator, ClassifierMixin):
    
    """Bayesian generative classification based on KDE
    
    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity
    """
    
    def __init__(self, bandwidth=1.0,k=5,kernel='gaussian'):
        
        self.bandwidth = bandwidth
        self.kernel    = kernel
        self.k         = 5
        
    def fit(self, X, y):
        
        self.classes_   = np.sort(np.unique(y))
        training_sets   = [X[y == yi] for yi in self.classes_]
        print(len(training_sets))
        self.models_    = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                           for Xi in training_sets]
        
        #self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
        #                   for Xi in training_sets]

        return self
    
    def each_prior_prob(self,x,X,y):
        
        
        dist          = [np.linalg.norm(x-xi) for xi in X]
        anchor_k      = sorted(dist)[self.k]
        dist_adj_k    = dist/anchor_k
        weight_k      = list(map(lambda x: 0 if x >= 1 else 1- x,dist_adj_k))
        indicator     = [y==i for i in set(y)]

        n             = len(set(y))
        indic_weight  = [indicator[i]*weight_k for i in range(n)]
        new           = np.hstack([np.expand_dims(indic_weight[0], axis=1), np.expand_dims(indic_weight[1], axis=1)])

        if n >2:
            for i in range(2,n):
                new       = np.hstack([new, np.expand_dims(indic_weight[i], axis=1)])
                
        prob_weight       = [sum(new[:,i]) for i in range(n)]
        weight_sum        = sum(weight_k)
        log_prob_prior    = np.log(list(map(lambda x: 0+0.000001 if x==0 else x/weight_sum,prob_weight)))
        
        return log_prob_prior
    
    
    def prior_prob_fit(self,X_test,X_train,y):
    
        n                 = len(set(y))
        prior_prob        = np.zeros((X_test.shape[0],n),dtype=np.float32)
        
        #kfold             = 50
        #mp_pool           = mp.Pool(kfold)
        
        
        
        for i in range(X_test.shape[0]):
            prior_prob[i] = self.each_prior_prob(X_test[i],X_train,y)
        
        #prior_prob        = []
        #prior_prob.extend(mp_pool.starmap(self.each_prior_prob, zip(parent_data.values(),np.repeat(ss,no_of_targets),parent_data.keys(),np.repeat(error_type,no_of_targets))))
        
        #mp_pool.close()
        #mp_pool.join()
        
        return prior_prob
    
    def predict_proba(self,X_valid,X_train,y_train):
        

        logprobs        = np.array([model.score_samples(X_valid)
                             for model in self.models_]).T
        self.logpriors_ = self.prior_prob_fit(X_valid,X_train,y_train)
        result          = np.exp(logprobs + self.logpriors_)
        
        
        print(logprobs)
        print(self.logpriors_)
        print(result)
        return result/result.sum(1, keepdims=True)
        
    def predict(self, X_valid,X_train,y_train):
        return self.classes_[np.argmax(self.predict_proba(X_valid,X_train,y_train), 1)]

In [70]:
def test_dataset():
    ris    = datasets.load_iris()
    df_train, df_test, target_train, target_test = train_test_split(iris.data, iris.target, test_size=0.33, random_state=42)

In [71]:
iris    = datasets.load_iris()
df_train, df_test, target_train, target_test = train_test_split(iris.data, iris.target, test_size=0.33, random_state=42)

In [72]:
def init_param():
    
    hyper_parameter               = {}
    hyper_parameter['bandwidths'] = 10 ** np.linspace(0, 2, 2)
    hyper_parameter['k']          = np.arange(5,6)
    
    # Compute ROC curve and ROC area for each class
    
    fpr                           = dict()
    tpr                           = dict()
    roc_auc                       = dict()
    
    return hyper_parameter,fpr,tpr,roc_auc

In [73]:
def GridSearchCV_w_bayes(df_train,target_train,nFolds = 5,random_state = 0,shuffle=True,hyper_parameter=None):
    
    bandwidths = hyper_parameter['bandwidths']
    k          = hyper_parameter['k']
    
    for bandwidth in bandwidths:
        for k_iter in k:
            
            kf           = KFold(n_splits=nFolds, random_state=random_state, shuffle=shuffle)
            roc_auc_iter = 0
            
            for train_index, valid_index in kf.split(df_train):
                
                X_train, X_valid = df_train[train_index],df_train[valid_index]
                y_train, y_valid = target_train[train_index], target_train[valid_index]
                model            = KDEClassifier(bandwidth=bandwidth,k=k_iter)
                model.fit(X_train,y_train)
                predicted        = model.predict_proba(X_valid,X_train,y_train)
                n_classes        = len(set(y_train))
                y_valid_iter     = np.zeros((y_valid.shape[0],n_classes))
                
                print(y_valid)
                print("predicted")
                print(predicted)
                for i in range(n_classes):
                    y_valid_iter[:,i]    =  y_valid==i
                    fpr[i], tpr[i], _    =  roc_curve(y_valid_iter[:,i], predicted[:, i])
                    roc_auc_iter         += auc(fpr[i], tpr[i])/n_classes

            roc_auc_iter                 = roc_auc_iter/nFolds        
            roc_auc[(bandwidth,k_iter)]  = roc_auc_iter
            
    best_bandwidth,best_k = sorted(roc_auc.items(),key = lambda k:-k[1])[0][0],sorted(roc_auc.items(),key = lambda k:-k[1])[0][1]
    return best_bandwidth,best_k

In [65]:
hyper_parameter,fpr,tpr,roc_auc = init_param()

In [74]:
#best_bandwidth,best_k           = GridSearchCV_w_bayes(df_train=df_train,target_train = target_train,nFolds = 5,random_state = 0,shuffle=True,hyper_parameter=hyper_parameter)

In [81]:
train_path        = '../data/input/input_pkl/train/'
train             = utils.read_pickles(train_path)
#train             = train[0:20000]  
target            = train.target.values
train.drop(['target','ID_code'], axis=1, inplace=True)

train             = train.values

100%|██████████| 5/5 [00:00<00:00, 27.91it/s]


In [82]:
test_path         = '../data/input/input_pkl/test/'
test              = utils.read_pickles(test_path) 
test              = test.values 

100%|██████████| 5/5 [00:00<00:00, 30.29it/s]


In [61]:
train.shape, target.shape, test.shape

((200, 200), (200,), (100, 201))

In [83]:
best_bandwidth,best_k   = GridSearchCV_w_bayes(df_train=train,target_train = target,nFolds = 5,random_state = 0,shuffle=True,hyper_parameter=hyper_parameter)

2
[[-4413.04678523 -4659.47607688]
 [-3921.75970032  -220.87706755]
 [-3390.67368014 -3623.2277422 ]
 ...
 [-4229.41926446 -4060.09136068]
 [ -219.88069938 -4645.77518829]
 [ -223.03574787 -4629.54773583]]
[[  0.         -13.815511  ]
 [  0.         -13.815511  ]
 [  0.         -13.815511  ]
 ...
 [ -0.14748399  -1.9868715 ]
 [  0.         -13.815511  ]
 [  0.         -13.815511  ]]
[[0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 1.18661081e-102]
 [0.00000000e+000 0.00000000e+000]
 ...
 [0.00000000e+000 0.00000000e+000]
 [3.21384985e-096 0.00000000e+000]
 [1.37026774e-097 0.00000000e+000]]
[0 0 0 ... 0 0 0]
predicted
[[nan nan]
 [ 0.  1.]
 [nan nan]
 ...
 [nan nan]
 [ 1.  0.]
 [ 1.  0.]]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').