In [1]:
# Smoke test: prints a fixed message to confirm the kernel executes cells.
print ("Kernel is Alive!")

Kernel is Alive!


In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from hep_ml.speedup import LookupClassifier

In [85]:
from sklearn.base import ClassifierMixin, BaseEstimator

__author__ = 'Adam Dendek'


class DummyClassifier(BaseEstimator, ClassifierMixin):
    """Deterministic stand-in estimator with a hard-coded decision rule.

    Nothing is learned from data: every prediction comes from comparing the
    input element-wise against ``self.threshold``. In this notebook it is
    wrapped by ``LookupClassifier`` so the resulting lookup table is easy to
    check by hand.
    """

    def __init__(self):
        # Inputs strictly below this value take the first branch in predict_proba.
        self.threshold = 0.5

    def fit(self, X, y, sample_weight=None):
        """Do nothing (there is no trainable state) and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention so
        that calls can be chained.
        """
        return self

    def predict_proba(self, X):
        """Return fixed pseudo-probabilities chosen element-wise from ``X``.

        ``X`` is expected to have two columns so that the two-element
        constants broadcast along the last axis. For each element, the value
        is taken from ``[0.1, 0.9]`` (by column) when ``X < threshold`` and
        from ``[0.9, 0.1]`` otherwise. Rows are therefore not guaranteed to
        sum to one; this is a test double, not a real model.
        """
        # Echo the input so the notebook output shows what the wrapper passes in.
        print(X)
        # Use the `np` alias: the notebook never imports the bare name `numpy`.
        probas = np.where(X < self.threshold, [0.1, 0.9], [0.9, 0.1])
        return probas


### Create data

In [86]:
# Four integer points: every low/high (1 or 9) combination of the two features.
int_pair = np.dtype([('x', '<i4'), ('y', '<i4')])
corner_points = [(1,1), (1,9), (9,1), (9,9)]
data_dict = np.array(corner_points, dtype=int_pair)

# The structured array's field names become the DataFrame columns.
data = pd.DataFrame(data_dict, columns=list(int_pair.names))
data.head()

Unnamed: 0,x,y
0,1,1
1,1,9
2,9,1
3,9,9


In [87]:
# Attach the class labels, then split the frame into the target column and
# the float-valued feature matrix.
data["Label"] = [0,0,1,1]
target = data.Label
# Use the `np` alias imported at the top of the notebook; the bare name
# `numpy` is never imported, so `numpy.float64` fails on a fresh kernel.
features = data.drop("Label", axis=1).astype(np.float64)

In [88]:
# Two bins per feature keeps the lookup table small enough to read by eye.
n_bins = 2
base_classifier = DummyClassifier()

# Wrap the dummy estimator; the trained copy is not kept on the wrapper.
classifier = LookupClassifier(
    base_estimator=base_classifier,
    n_bins=n_bins,
    keep_trained_estimator=False,
)
classifier.fit(features, target)

[[0 0]
 [0 1]
 [1 0]
 [1 1]]


LookupClassifier(base_estimator=DummyClassifier(),
         keep_trained_estimator=False, max_cells=500000000, n_bins=2)

In [89]:
# Show the lookup table stored on the fitted classifier (private attribute).
print(classifier._lookup_table)

[[ 0.1  0.9]
 [ 0.1  0.1]
 [ 0.9  0.9]
 [ 0.9  0.1]]


In [103]:
# A single float32 test point with x=1 and y=9.
float_pair = np.dtype([('x', '<f4'), ('y', '<f4')])
testX_dict = np.array([(1,9)], dtype=float_pair)

# Column names are taken from the structured array's field names.
testX = pd.DataFrame(testX_dict, columns=list(float_pair.names))

testX.head()

Unnamed: 0,x,y
0,1,9


In [105]:
# First column of the predicted probabilities for the single test point.
classifier.predict_proba(testX)[:,0]

array([ 0.1])

# Generate C++ test class

In [109]:
def generate_header():
    """Return the text of the generated C++ header for ``PatBBDTSeedClassifier``.

    The header declares the public constructor, ``initialize()`` and
    ``getMvaValue()``, plus the private helpers and members used by the
    generated source file. ``initialize()`` is declared here because the
    generated ``.cpp`` defines it and calls it from the constructor.

    Returns:
        str: complete header contents, ready to be written to disk.
    """
    return """    
#pragma once

#include <string>
#include <utility>
#include <vector>
#include <map>


class PatBBDTSeedClassifier 
{
public:
    PatBBDTSeedClassifier();

    void initialize();
    double getMvaValue(const std::vector<double>& parametersVector );
    

private:
    //initialization phase
    std::vector <std::pair<std::string, std::vector<double>>> initBinEdgeMaps();
    std::vector<double> initLookupTable();

    // model prediction phase
    std::vector<int> getBinIndices(const std::vector<double>& parametersVector);
    int convertBinIndicesToLookupIndex(std::vector<int>& binIndices);
    double getBBDTPrediction(int lookupIndex);


    std::vector <std::pair<std::string, std::vector<double>>> m_binsEdgeMap;
    std::vector<double> m_lookupTable;
};

"""

In [110]:
def generate_source_file():
    """Return the fixed part of the generated ``PatBBDTSeedClassifier.cpp``.

    The returned C++ defines the constructor, ``initialize()``,
    ``getMvaValue()`` and the three prediction helpers. The two data-dependent
    functions (``initBinEdgeMaps`` and ``initLookupTable``) are generated
    separately and appended by the caller.

    Binning logic in the emitted code:

    * ``getBinIndices`` emits exactly one index per feature, equal to the
      number of bin edges the value is strictly greater than (for ascending
      edges this is the position the value would be inserted at).
    * ``convertBinIndicesToLookupIndex`` combines the per-feature indices in
      mixed radix, where a feature with ``k`` edges contributes ``k + 1``
      bins. NOTE(review): this is meant to mirror
      ``LookupClassifier.convert_bins_to_lookup_index`` in hep_ml - confirm
      against the installed hep_ml version.

    Returns:
        str: C++ source text.
    """
    return """
#include "PatBBDTSeedClassifier.h"
#include <stdlib.h>
#include <stdio.h>


double PatBBDTSeedClassifier::getMvaValue(const std::vector<double>& parametersVector )
{
    auto binIndices = getBinIndices(parametersVector);
    int lookupTableIndex = convertBinIndicesToLookupIndex(binIndices);
    return getBBDTPrediction(lookupTableIndex);
}



PatBBDTSeedClassifier::PatBBDTSeedClassifier( )
{
    initialize();
}

void PatBBDTSeedClassifier::initialize()
{

    m_binsEdgeMap = initBinEdgeMaps();
    m_lookupTable = initLookupTable();
}

std::vector<int> PatBBDTSeedClassifier::getBinIndices(const std::vector<double>& parametersVector)
{
    std::vector<int> binIndicesMap;
    int actualFeature = 0;
    for (const auto& featurePair: m_binsEdgeMap){
        // bin index = number of edges lying strictly below the value,
        // so every feature always contributes exactly one index
        int binNumber = 0;
        for(const auto& binEdge : featurePair.second) {
            if (parametersVector[actualFeature] > binEdge) {
                binNumber++;
            }
        }
        binIndicesMap.push_back(binNumber);
        actualFeature++;
    }
    return binIndicesMap;
}
/**
 * C++ adaptation of the python method
 *  hep_ml.speedup.LookupClassifier.convert_bins_to_lookup_index()
 */


int PatBBDTSeedClassifier::convertBinIndicesToLookupIndex(std::vector<int> &binIndices)
{
    int index = 0;

    for (size_t feature = 0; feature < binIndices.size(); ++feature){
        // a feature described by k edges is split into k + 1 bins
        const int binsInFeature = static_cast<int>(m_binsEdgeMap[feature].second.size()) + 1;
        index *= binsInFeature;
        index += binIndices[feature];
    }
    return index;
}

double PatBBDTSeedClassifier::getBBDTPrediction(int lookupIndex)
{
    return m_lookupTable[lookupIndex];
}
    """

In [111]:
def generate_InitBinEdgeMap_function(bin_edges=None):
    """Return the C++ definition of ``PatBBDTSeedClassifier::initBinEdgeMaps``.

    The generated function returns a vector of ``(feature name, bin edges)``
    pairs written out as a brace initializer.

    Args:
        bin_edges: mapping of feature name to an iterable of numeric bin
            edges. Iteration order is preserved in the output and matters,
            because the generated prediction code addresses features by
            position. Defaults to ``classifier.bin_edges`` from the notebook
            namespace.

    Returns:
        str: C++ source text for the function.
    """
    if bin_edges is None:
        # Fall back to the classifier fitted earlier in the notebook.
        bin_edges = classifier.bin_edges

    function_body = """
std::vector< std::pair<std::string, std::vector<double> > > PatBBDTSeedClassifier::initBinEdgeMaps()
{
    std::vector< std::pair<std::string, std::vector<double> > >   binMap ={
    """
    # .items() works on both Python 2 and 3 (unlike .iteritems()).
    for feature, bins_edges in bin_edges.items():
        # repr(float(...)) gives a round-trippable literal for plain and numpy
        # floats alike; str() may truncate digits on older interpreters.
        edges_literal = ", ".join(repr(float(bin_edge)) for bin_edge in bins_edges)
        function_body += "\t {\"" + str(feature) + "\", {" + edges_literal + "}},\n"

    function_body += """\n };
return binMap;
}
    """
    return function_body


In [112]:
def generate_initLookupTable_read_binary(element_number=None):
    """Return the C++ definition of ``PatBBDTSeedClassifier::initLookupTable``.

    The generated function reads ``element_number`` doubles from
    ``BBDT_lookuptable_binary.dat`` (opened relative to the process working
    directory) into a vector. It terminates the process with
    ``EXIT_FAILURE`` if the file cannot be opened or holds fewer values than
    expected.

    Args:
        element_number: number of doubles the generated code should read.
            Defaults to the number of rows of ``classifier._lookup_table``
            from the notebook namespace.

    Returns:
        str: C++ source text for the function.
    """
    if element_number is None:
        # Fall back to the classifier fitted earlier in the notebook.
        element_number = len(classifier._lookup_table[:,0])

    function_body = """

/** right now I am using the simplest idea of importing lookup table.
 *  More sophisticated method is described in 
 *  http://stackoverflow.com/questions/39529799/initialization-of-very-big-vector-in-c/39531749#39531749
 *   thread. Don't know if will be implemented. 
 */

std::vector<double> PatBBDTSeedClassifier::initLookupTable()
{
    FILE *inFile;
    const int elementNumber = """
    function_body +=  str(int(element_number)) +";\n"
    function_body += """
    std::vector<double> lookup_table(elementNumber);
    if (!(inFile = fopen("BBDT_lookuptable_binary.dat", "rb")))
        exit(EXIT_FAILURE);

    // the file is read in native byte order; a short read means the table is unusable
    const size_t readCount = fread(lookup_table.data(), sizeof(double), elementNumber, inFile);
    fclose(inFile);
    if (readCount != static_cast<size_t>(elementNumber))
        exit(EXIT_FAILURE);

    return lookup_table;
}
    """
    return function_body

In [113]:
# Write the generated header. The context manager closes the file even if
# generation or writing raises.
with open("../Source/PatBBDTSeedClassifier.h", 'w') as header_file:
    header_file.write(generate_header())

# Assemble the whole .cpp text before opening the target, so a failure in
# one of the generators does not leave a truncated file behind.
file_content = generate_source_file()
file_content += generate_InitBinEdgeMap_function()
file_content += generate_initLookupTable_read_binary()

with open("../Source/PatBBDTSeedClassifier.cpp", 'w') as source_file:
    source_file.write(file_content)

In [114]:
from struct import pack
# Destination of the binary lookup table read by the generated C++ code.
file_name ="../Source/BBDT_lookuptable_binary.dat"
with open(file_name, 'wb') as file_out:
    # Only the first column of the table is stored; each value is packed as
    # a little-endian double and the values are written back to back.
    file_out.write(
        b"".join(pack(b'<d', value) for value in classifier._lookup_table[:,0])
    )