In [86]:
import time
from micromlgen import port
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Classifiers Load
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import datasets
from sklearn import tree

### 1. Train using Iris Flowers dataset and generate C version of DT, RF, and XGBoost Classifiers

In [12]:
# Iris Datasets load and train DT RF and XG
from sklearn.datasets import load_iris

iris = datasets.load_iris()
x = iris.data
y = iris.target
d = [{"sepal_length":row[0], 
      "sepal_width":row[1], 
      "petal_length":row[2], 
      "petal_width":row[3]} for row in x]
df = pd.DataFrame(d) # construct dataframe
df["types"] = y # assign types
df = df.sample(frac=1.0) # random shuffle rows
df.head()
# train test split, ratio = 0.8
features = df[["sepal_length","sepal_width","petal_length","petal_width"]]
types = df["types"]
train_features, test_features, train_types, test_types = train_test_split(features,types,train_size=0.8, random_state=1)
train_features_x, test_features_x, train_types_x, test_types_x = train_test_split(features,types,train_size=0.998, random_state=1)
train_features_x, test_features_100x, train_types_x, test_types_x = train_test_split(features,types,train_size=0.334, random_state=1)

In [13]:
print(test_features_x.shape)
print(test_features_100x.shape)

(1, 4)
(100, 4)


### 1.1 Iris Decision Tree 

In [14]:
# Iris DT
clf_DT = tree.DecisionTreeClassifier()
clf_DT = clf_DT.fit(train_features, train_types)


start = time.time()
prediction = clf_DT.predict(test_features_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = clf_DT.predict(test_features_100x)
stop = time.time()
print(f"Inference time for 100 samples: {stop - start}s")

prediction = clf_DT.predict(test_features)
#Copy paste the port() output in DT_iris.h
print(classification_report(test_types, prediction))
print(port(clf_DT))

f = open("./Trained_classifiers/DT_iris.h", "w")
f.write(port(clf_DT))
f.close()

Unit Inference time: 0.0008769035339355469s
Inference time for 100 samples: 0.0010919570922851562s
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.75      1.00      0.86         6
           2       1.00      0.71      0.83         7

    accuracy                           0.93        30
   macro avg       0.92      0.90      0.90        30
weighted avg       0.95      0.93      0.93        30

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[3] <= 0.800000011920929) {
                            return 0;
                        }

                        else {
                            if (x[3] <= 1.75) {
                  

### 1.2 Iris Random Forest

In [5]:
# Iris RF
clf_RF = RandomForestClassifier()
clf_RF.fit(train_features, train_types)

start = time.time()
prediction = clf_RF.predict(test_features_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = clf_RF.predict(test_features_100x)
stop = time.time()
print(f"Inference time for 100 samples: {stop - start}s")

prediction = clf_RF.predict(test_features)

#Copy paste the port() output in RF_iris.h
print(classification_report(test_types, prediction, target_names=["type0","type1","type2"]))
print(port(clf_DT))

f = open("./Trained_classifiers/RF_iris.h", "w")
f.write(port(clf_RF))
f.close()

Unit Inference time: 0.0020134449005126953s
Inference time for 100 samples: 0.0032079219818115234s
              precision    recall  f1-score   support

       type0       1.00      1.00      1.00        13
       type1       0.92      1.00      0.96        11
       type2       1.00      0.83      0.91         6

    accuracy                           0.97        30
   macro avg       0.97      0.94      0.96        30
weighted avg       0.97      0.97      0.97        30

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[3] <= 0.75) {
                            return 0;
                        }

                        else {
                            if (x[2] <= 4.8500001430511475) {
                 

## 2. Train using Heart Disease dataset and generate C version of DT, RF, and XGBoost Classifiers

In [88]:
from sklearn.preprocessing import StandardScaler

dataset = pd.read_csv('dataset.csv')
# dataset = pd.get_dummies(dataset, columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])
standardScaler = StandardScaler()
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
y = dataset['target']
X = dataset.drop(['target'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
X_train_x, X_test_x, y_train_x, y_test_x = train_test_split(X, y, test_size = 0.001, random_state = 0)
X_train_x, X_test_100x, y_train_x, y_test_x = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [8]:
print(X_test_x.shape)
print(X_test_100x.shape)

(1, 13)
(100, 13)


### 2.1 Heart Disease DT

In [15]:
dt_classifier = tree.DecisionTreeClassifier(max_features = 4, random_state = 0)
dt_classifier.fit(X_train, y_train)

start = time.time()
prediction = dt_classifier.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = dt_classifier.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = dt_classifier.predict(X_test)

#Copy paste the port() output in DT_Heart.h
print(classification_report(y_test, prediction))
print(port(dt_classifier))

f = open("./Trained_classifiers/DT_Heart.h", "w")
f.write(port(dt_classifier))
f.close()

Unit Inference time: 0.0018150806427001953s
Time to infer for 100 samples: 0.0021817684173583984s
              precision    recall  f1-score   support

           1       0.85      0.68      0.75        34
           2       0.68      0.85      0.75        27

    accuracy                           0.75        61
   macro avg       0.76      0.76      0.75        61
weighted avg       0.77      0.75      0.75        61

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[11] <= 0.5) {
                            if (x[7] <= 158.5) {
                                if (x[12] <= 2.5) {
                                    if (x[6] <= 0.5) {
                                        if (x[9] <= 0.10000000149011612) {

### 2.2 Heart Disease RF

In [20]:
rf_classifier = RandomForestClassifier (n_estimators = 10, random_state = 0)
# n_estimators = 500, random_state = 0
rf_classifier.fit(X_train, y_train)


start = time.time()
prediction = rf_classifier.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = rf_classifier.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = rf_classifier.predict(X_test)

#Copy paste the port() output in RF_Heart.h
print(classification_report(y_test, prediction))
print(port(rf_classifier))

f = open("./Trained_classifiers/RF_Heart.h", "w")
f.write(port(rf_classifier))
f.close()

Unit Inference time: 0.002681732177734375s
Time to infer for 100 samples: 0.003768444061279297s
              precision    recall  f1-score   support

           1       0.86      0.91      0.89        34
           2       0.88      0.81      0.85        27

    accuracy                           0.87        61
   macro avg       0.87      0.86      0.87        61
weighted avg       0.87      0.87      0.87        61

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class RandomForest {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        uint8_t votes[2] = { 0 };
                        // tree #1
                        if (x[9] <= 0.75) {
                            if (x[4] <= 318.5) {
                                if (x[4] <= 228.5) {
                                    if (x[0] 

## 3. Train using Breast Cancer dataset and generate C version of DT, RF, and XGBoost Classifiers

In [82]:
from sklearn.datasets import load_breast_cancer;

cancer = load_breast_cancer();
X = cancer.data;
y = cancer.target;

from sklearn.model_selection import train_test_split;

X_train , X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     train_size=0.8,
                                                     random_state = 0);
X_train_xx , X_test_x, y_train_xx, y_test_xx = train_test_split(X,
                                                     y,
                                                     train_size=0.999,
                                                     random_state = 0);
X_train_xx , X_test_100x, y_train_xx, y_test_xx = train_test_split(X,
                                                     y,
                                                     train_size=0.825,
                                                     random_state = 0);

In [83]:
print(X_test_x.shape)
print(X_test_100x.shape)

(1, 30)
(100, 30)


### 3.1 Breast Cancer DT

In [25]:
from sklearn import tree
B_tree = tree.DecisionTreeClassifier(criterion    =  'entropy',
                              max_depth    =  3,
                              random_state =  0 );
B_tree.fit(X_train, y_train)

start = time.time()
prediction = B_tree.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = B_tree.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = B_tree.predict(X_test)

#Copy paste the port() output in DT_Cancer.h
print(classification_report(y_test, prediction))
print(port(B_tree))

f = open("./Trained_classifiers/DT_Cancer.h", "w")
f.write(port(B_tree))
f.close()

Unit Inference time: 0.0003592967987060547s
Time to infer for 100 samples: 0.0005981922149658203s
              precision    recall  f1-score   support

           0       0.90      0.98      0.94        47
           1       0.98      0.93      0.95        67

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.95       114
weighted avg       0.95      0.95      0.95       114

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[27] <= 0.1423499956727028) {
                            if (x[22] <= 107.75) {
                                if (x[18] <= 0.016490000300109386) {
                                    return 1;
                                }

              

### 3.2 Breast Cancer RF

In [28]:
rf_classifier = RandomForestClassifier (n_estimators = 10, random_state = 0)
rf_classifier.fit(X_train, y_train)

start = time.time()
prediction = rf_classifier.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = rf_classifier.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = rf_classifier.predict(X_test)

#Copy paste the port() output in RF_Cancer.h
print(classification_report(y_test, prediction))
print(port(rf_classifier))

f = open("./Trained_classifiers/RF_Cancer.h", "w")
f.write(port(rf_classifier))
f.close()

Unit Inference time: 0.0025169849395751953s
Time to infer for 100 samples: 0.0031867027282714844s
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        47
           1       0.97      0.97      0.97        67

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class RandomForest {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        uint8_t votes[2] = { 0 };
                        // tree #1
                        if (x[6] <= 0.07226499915122986) {
                            if (x[26] <= 0.2620000094175339) {
                                if (x[1] <= 21.574999809265137) {
 

## 4. Train using Handwritten Digits dataset and generate C version of DT, RF, and XGBoost Classifiers

In [80]:
import idx2numpy

X_train_3D = idx2numpy.convert_from_file('train-images.idx3-ubyte')
X_train = X_train_3D.flatten().reshape(60000,784)

y_train = idx2numpy.convert_from_file('train-labels.idx1-ubyte')

X_test_3D = idx2numpy.convert_from_file('t10k-images.idx3-ubyte')
X_test =  X_test_3D.flatten().reshape(10000,784)

y_test = idx2numpy.convert_from_file('t10k-labels.idx1-ubyte')


X_train_xx , X_test_x, y_train_xx, y_test_xx = train_test_split(X_train,
                                                     y_train,
                                                     train_size=0.999999999,
                                                     random_state = 0);
X_train_xx , X_test_100x, y_train_xx, y_test_xx = train_test_split(X_train,
                                                     y_train,
                                                     train_size=0.99834,
                                                     random_state = 0);

In [30]:
print(X_test_x.shape)
print(X_test_100x.shape)

(1, 784)
(100, 784)


### 4.1 Handwritten Digits DT

In [31]:
from sklearn import tree
dt_classifier_digits = tree.DecisionTreeClassifier(max_depth = 10)
dt_classifier_digits.fit(X_train, y_train)

start = time.time()
prediction = dt_classifier_digits.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = dt_classifier_digits.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = dt_classifier_digits.predict(X_test)

#Copy paste the port() output in DT_digits.h
print(classification_report(y_test, prediction))
print(port(dt_classifier_digits))

f = open("./Trained_classifiers/DT_digits.h", "w")
f.write(port(dt_classifier_digits))
f.close()

Unit Inference time: 0.0002486705780029297s
Time to infer for 100 samples: 0.0003638267517089844s
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       980
           1       0.95      0.96      0.95      1135
           2       0.86      0.84      0.85      1032
           3       0.82      0.84      0.83      1010
           4       0.86      0.85      0.86       982
           5       0.84      0.80      0.82       892
           6       0.91      0.87      0.89       958
           7       0.90      0.88      0.89      1028
           8       0.80      0.82      0.81       974
           9       0.81      0.86      0.83      1009

    accuracy                           0.87     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.87      0.87     10000

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
             

### 4.2 Handwritten Digits RF

In [32]:
rf_classifier = RandomForestClassifier (n_estimators = 1, random_state = 0)
rf_classifier.fit(X_train, y_train)

start = time.time()
prediction = rf_classifier.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = rf_classifier.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = rf_classifier.predict(X_test)

#Copy paste the port() output in RF_Digits.h
print(classification_report(y_test, prediction))
print(port(rf_classifier))

f = open("./Trained_classifiers/RF_Digits.h", "w")
f.write(port(rf_classifier))
f.close()

Unit Inference time: 0.0006115436553955078s
Time to infer for 100 samples: 0.0008363723754882812s
              precision    recall  f1-score   support

           0       0.88      0.90      0.89       980
           1       0.93      0.95      0.94      1135
           2       0.81      0.80      0.80      1032
           3       0.79      0.78      0.79      1010
           4       0.78      0.79      0.79       982
           5       0.75      0.74      0.74       892
           6       0.85      0.83      0.84       958
           7       0.85      0.83      0.84      1028
           8       0.77      0.72      0.74       974
           9       0.75      0.79      0.77      1009

    accuracy                           0.82     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.82      0.82      0.82     10000

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class RandomForest {
             

## 5. Train using Banknote Authentication dataset and generate C version of DT, RF, and XGBoost Classifiers

In [67]:
#reading the data
data = pd.read_csv('./bank_notes.csv')
data.head()

#missing value counts in each of these columns
miss = data.isnull().sum()/len(data)
miss = miss[miss > 0]
miss.sort_values(inplace=True)
miss

X = data[["variance","skewness","curtosis","entropy"]]
y = data["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_1, X_test_x, y_train_1, y_test_1 = train_test_split(X, y, test_size=0.0001, random_state=42)
X_train_1, X_test_100x, y_train_1, y_test_1 = train_test_split(X, y, test_size=0.0723, random_state=42)

In [68]:
print(X_test_x.shape)
print(X_test_100x.shape)

(1, 4)
(100, 4)


### 5.1 Banknote DT

In [37]:
from sklearn import tree

Bank_DT = tree.DecisionTreeClassifier(criterion    =  'entropy',
                              max_depth    =  3,
                              random_state =  0 );
Bank_DT.fit(X_train, y_train)

start = time.time()
prediction = Bank_DT.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = Bank_DT.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

prediction = Bank_DT.predict(X_test)

#Copy paste the port() output in DT_Banknote.h
print(classification_report(y_test, prediction))
print(port(Bank_DT))

f = open("./Trained_classifiers/DT_Banknote.h", "w")
f.write(port(Bank_DT))
f.close()

Unit Inference time: 0.0014922618865966797s
Time to infer for 100 samples: 0.001779794692993164s
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       148
           1       0.95      0.94      0.94       127

    accuracy                           0.95       275
   macro avg       0.95      0.95      0.95       275
weighted avg       0.95      0.95      0.95       275

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[0] <= 0.3201649934053421) {
                            if (x[1] <= 5.865350008010864) {
                                if (x[2] <= 3.0642000436782837) {
                                    return 1;
                                }

         

### 5.2 Banknote RF

In [39]:
rndF = RandomForestClassifier(max_depth=5, random_state=0)
rndF.fit(X_train, y_train)

start = time.time()
prediction = rndF.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = rndF.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

RndF_pred = rndF.predict(X_test)
print(classification_report(y_test, RndF_pred))
print(port(rndF))

#Copy paste the port() output in RF_Banknote.h
f = open("./Trained_classifiers/RF_Banknote.h", "w")
f.write(port(rndF))
f.close()

Unit Inference time: 0.0032367706298828125s
Time to infer for 100 samples: 0.005101442337036133s
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       148
           1       0.97      0.98      0.98       127

    accuracy                           0.98       275
   macro avg       0.98      0.98      0.98       275
weighted avg       0.98      0.98      0.98       275

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class RandomForest {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        uint8_t votes[2] = { 0 };
                        // tree #1
                        if (x[0] <= 0.3201649934053421) {
                            if (x[1] <= 5.518749952316284) {
                                if (x[0] <= -1.787850022315979) {
     

## 6. Train using Haberman’s Survival dataset and generate C version of DT, RF, and XGBoost Classifiers 

In [64]:
url = "haberman.data"
names = ['Age', 'Year operation', 'Axillary nodes detected', 'Survival status']
dataset = pd.read_csv(url, names=names)
array = dataset.values
X = array[:,:3]
y = array[:,3]
random_state = 4
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=random_state)
_, X_test_x, _, _ = train_test_split(X, y, test_size=0.001,
                                                        random_state=random_state)
_, X_test_100x, _, _ = train_test_split(X, y, test_size=0.324,
                                                        random_state=random_state)

In [65]:
print(X_test_x.shape)
print(X_test_100x.shape)

(1, 3)
(100, 3)


### 6.1 Haberman’s Survival DT

In [45]:
DT_clf = tree.DecisionTreeClassifier(criterion    =  'entropy',
                              max_depth    =  3,
                              random_state =  0 );
DT_clf.fit(X_train, y_train)

start = time.time()
prediction = DT_clf.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")


start = time.time()
prediction = DT_clf.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

#Copy paste the port() output in DT_Survival.h
DT_clf_pred = DT_clf.predict(X_test)
print(classification_report(y_test, DT_clf_pred))
print(port(DT_clf))

f = open("./Trained_classifiers/DT_Survival.h", "w")
f.write(port(DT_clf))
f.close()

Unit Inference time: 0.0003256797790527344s
Time to infer for 100 samples: 0.00035119056701660156s
              precision    recall  f1-score   support

           1       0.79      1.00      0.88        46
           2       1.00      0.25      0.40        16

    accuracy                           0.81        62
   macro avg       0.90      0.62      0.64        62
weighted avg       0.85      0.81      0.76        62

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[2] <= 2.5) {
                            if (x[0] <= 76.0) {
                                if (x[0] <= 38.5) {
                                    return 0;
                                }

                                else {
          

### 6.2 Haberman’s Survival RF

In [46]:
Hab_rndF = RandomForestClassifier(max_depth=5, random_state=0)
Hab_rndF.fit(X_train, y_train)

start = time.time()
prediction = Hab_rndF.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = Hab_rndF.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

#Copy paste the port() output in RF_Survival.h
Hab_RndF_pred = Hab_rndF.predict(X_test)
print(classification_report(y_test, Hab_RndF_pred))
print(port(Hab_rndF))

f = open("./Trained_classifiers/RF_Survival.h", "w")
f.write(port(Hab_rndF))
f.close()

Unit Inference time: 0.0014619827270507812s
Time to infer for 100 samples: 0.002106904983520508s
              precision    recall  f1-score   support

           1       0.76      0.98      0.86        46
           2       0.67      0.12      0.21        16

    accuracy                           0.76        62
   macro avg       0.71      0.55      0.53        62
weighted avg       0.74      0.76      0.69        62

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class RandomForest {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        uint8_t votes[2] = { 0 };
                        // tree #1
                        if (x[2] <= 2.5) {
                            if (x[0] <= 71.5) {
                                if (x[0] <= 47.5) {
                                    if (x[1] <=

## 7. Train using Titanic dataset and generate C version of DT, RF, and XGBoost Classifiers 

In [59]:
import numpy as np

def setAgeBoundaries (  ):
 for dataset in combine:
     dataset.loc[ dataset['Age'] <= 5, 'Age'] = 0
     dataset.loc[(dataset['Age'] > 5 ) & (dataset['Age'] <= 16), 'Age'] = 1
     dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 2
     dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 3
     dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 4
     dataset.loc[ dataset['Age'] > 64, 'Age'] = 5

def normalizeFamily( ):
 for dataset in combine:
   dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

def pivotingData ( data, entry1, entry2, groupBy, sortBy ):
 return data[[ entry1 , entry2 ]].groupby([groupBy], as_index=False).mean().sort_values(by=sortBy, ascending=False)

def printPivotedData( data ):
 #only categorical values
 print ( pivotingData ( data, 'Pclass','Survived','Pclass','Survived' ) )
 print ( pivotingData ( data, 'Sex','Survived','Sex','Survived' ) )
 print ( pivotingData ( data, 'SibSp','Survived','SibSp','Survived' ) )
 print ( pivotingData ( data, 'Parch','Survived','Parch','Survived' ) )


def normalizeSex ( ):
 for dataset in combine:
   dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

def normalizeAges ( ):
 guess_ages = np.zeros((2,3))
 for dataset in combine:
   for i in range(0, 2):
     for j in range(0, 3):
       guess_df = dataset[(dataset['Sex'] == i) & (dataset['Pclass'] == j+1)]['Age'].dropna()
       age_guess = guess_df.median()
       guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

   for i in range(0, 2):
     for j in range(0, 3):
       dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),'Age'] = guess_ages[i,j]

   dataset['Age'] = dataset['Age'].astype(int)

def normalizeEmbarked( ):
 freq_port = train_df.Embarked.dropna().mode()[0]

 for dataset in combine:
   dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

 for dataset in combine:
   dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

def normalizeFare():

 for dataset in combine:
   dataset.loc[(dataset['Fare'] < 9), 'Fare'] = 0
   dataset.loc[(dataset['Fare'] >= 9) & (dataset ['Fare'] < 12), 'Fare'] = 1
   dataset.loc[(dataset['Fare'] >= 12) & (dataset ['Fare'] < 15), 'Fare'] = 2
   dataset.loc[(dataset['Fare'] >= 15) & (dataset ['Fare'] < 20), 'Fare'] = 3
   dataset.loc[(dataset['Fare'] >= 20) & (dataset ['Fare'] < 30), 'Fare'] = 4
   dataset.loc[(dataset['Fare'] >= 30) & (dataset ['Fare'] < 55), 'Fare'] = 5
   dataset.loc[(dataset['Fare'] >= 55) & (dataset ['Fare'] < 95), 'Fare'] = 6
   dataset.loc[(dataset['Fare'] >= 95),'Fare'] = 7
   dataset['Fare'] = dataset['Fare'].astype(int)

def normalizeAgeClass( ):
 for dataset in combine:
   dataset['Age*Class*Fare'] = dataset.Age * dataset.Pclass * dataset.Fare
   dataset['Age*Class'] = dataset.Age * dataset.Pclass
   dataset['Age*Fare'] = dataset.Age * dataset.Fare


def normalizeData( ):
 normalizeSex ( )
 normalizeAges( )
 setAgeBoundaries( )
 normalizeFamily( )
 normalizeEmbarked( )
 normalizeFare( )
 normalizeAgeClass( ) 


def getFareClass(data,cat):
 return data.loc[data['Fare'] == cat]
          
def main ( ):
 global train_df
 global test_df
 global combine

 # Training and Testing Data
train_df = pd.read_csv('train_titanic.csv')
test_df = pd.read_csv('test_titanic.csv')
# Drop Useless Features
train_df = train_df.drop(['Ticket', 'Cabin', 'Name'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin', 'Name'], axis=1)

test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)
# Normalize both data sets
combine = [train_df, test_df]
normalizeData( )
train_df = train_df.drop(['Parch', 'SibSp'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp'], axis=1)
train_df = train_df.drop(['FareBand'], axis=1)
 
#visualizeNumericalCorrelation(getFareClass(train_df,0),'Age','Survived')
combine = [train_df, test_df]

# Setting up data
X_train = train_df.drop(["Survived","PassengerId","Fare","Age","Pclass"], axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop(["PassengerId","Fare","Age","Pclass"], axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

X = X_train
y = Y_train

random_state = 0
#    X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=random_state)

_, X_test_x, _, _ = train_test_split(X, y, test_size=0.001,random_state=random_state)
_, X_test_100x, _, _ = train_test_split(X, y, test_size=0.1119,random_state=random_state)

In [48]:
print(X_test_x.shape)
print(X_test_100x.shape)

(1, 6)
(100, 6)


### 7.1 Titanic DT

In [50]:
Titanic_DT_clf = tree.DecisionTreeClassifier(criterion    =  'entropy',
                              max_depth    =  3,
                              random_state =  0 );
Titanic_DT_clf.fit(X_train, y_train)

start = time.time()
prediction = Titanic_DT_clf.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = Titanic_DT_clf.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

#Copy paste the port() output in DT_Titanic.h
Titanic_DT_clf_pred = Titanic_DT_clf.predict(X_test)
print(classification_report(y_test, Titanic_DT_clf_pred))
print(port(Titanic_DT_clf))

f = open("./Trained_classifiers/DT_Titanic.h", "w")
f.write(port(Titanic_DT_clf))
f.close()

Unit Inference time: 0.0016279220581054688s
Time to infer for 100 samples: 0.0018639564514160156s
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       110
           1       0.79      0.70      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class DecisionTree {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        if (x[0] <= 0.5) {
                            if (x[4] <= 3.5) {
                                if (x[2] <= 4.5) {
                                    return 0;
                                }

                                else {
             

### 7.2 Titanic RF

In [51]:
Titanic_rndF = RandomForestClassifier(max_depth=5, random_state=0)
Titanic_rndF.fit(X_train, y_train)

start = time.time()
prediction = Titanic_rndF.predict(X_test_x)
stop = time.time()
print(f"Unit Inference time: {stop - start}s")

start = time.time()
prediction = Titanic_rndF.predict(X_test_100x)
stop = time.time()
print(f"Time to infer for 100 samples: {stop - start}s")

#Copy paste the port() output in RF_Titanic.h
Titanic_rndF_pred = Titanic_rndF.predict(X_test)
print(classification_report(y_test, Titanic_rndF_pred))
print(port(Titanic_rndF))

f = open("./Trained_classifiers/RF_Titanic.h", "w")
f.write(port(Titanic_rndF))
f.close()

Unit Inference time: 0.002201080322265625s
Time to infer for 100 samples: 0.0037064552307128906s
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       110
           1       0.78      0.78      0.78        69

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

#pragma once
#include <cstdarg>
namespace Eloquent {
    namespace ML {
        namespace Port {
            class RandomForest {
                public:
                    /**
                    * Predict class for features vector
                    */
                    int predict(float *x) {
                        uint8_t votes[2] = { 0 };
                        // tree #1
                        if (x[0] <= 0.5) {
                            if (x[4] <= 3.5) {
                                if (x[2] <= 4.5) {
                                    if (x[2] <= 2