Model missings #10

Merged · 6 commits · Mar 16, 2023
9 changes: 5 additions & 4 deletions binarybeech/binarybeech.py
@@ -9,7 +9,8 @@
import pandas as pd
import scipy.optimize as opt

-import binarybeech.utils as utils
+from binarybeech.extra import k_fold_split
+import binarybeech.math as math
from binarybeech.datahandler import data_handler_factory
from binarybeech.metrics import metrics_factory
from binarybeech.reporter import Reporter
@@ -131,7 +132,7 @@ def train(self, k=5, plot=True, slack=1.0):
        beta = self._beta(pres["alpha"])
        qual_cv = np.zeros((len(beta), k))
        # split df for k-fold cross-validation
-        sets = utils.k_fold_split(df, k)
+        sets = k_fold_split(df, k)
        for i, data in enumerate(sets):
            c = CART(
                data[0],
@@ -459,8 +460,8 @@ def _opt_fun(self, tree):

        def fun(gamma):
            y_ = y_hat + gamma * delta
-            p = utils.logistic(y_)
-            return utils.logistic_loss(y, p)
+            p = math.logistic(y_)
+            return math.logistic_loss(y, p)

        return fun

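For context, `fun(gamma)` above appears to be a line-search objective: it evaluates the logistic loss of the current raw predictions `y_hat` nudged by `gamma * delta`. A minimal standalone sketch of that idea, with invented arrays for illustration and `minimize_scalar` standing in for whatever optimizer the caller actually uses:

import numpy as np
import scipy.optimize as opt

from binarybeech.math import logistic, logistic_loss

# hypothetical data: binary labels, current raw scores, and a proposed update
y = np.array([0.0, 1.0, 1.0, 0.0, 1.0])
y_hat = np.zeros(5)
delta = np.array([-1.0, 0.5, 1.0, -0.5, 0.8])

def fun(gamma):
    # loss of the prediction after taking a step of size gamma along delta
    return logistic_loss(y, logistic(y_hat + gamma * delta))

res = opt.minimize_scalar(fun)
print(res.x)  # step size that minimizes the logistic loss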
22 changes: 22 additions & 0 deletions binarybeech/extra.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# coding: utf-8
import numpy as np

def k_fold_split(df, k=1, frac=None, random=False, shuffle=True, replace=True):
    if shuffle:
        df = df.sample(frac=1.0, replace=False)

    if frac is None:
        frac = 1.0 - 1.0 / (k + 1.0)

    N = len(df.index)
    n = int(np.ceil(N / k))
    sets = []
    for i in reversed(range(k)):
        if random:
            test = df.sample(frac=1.0 - frac, replace=replace)
        else:
            test = df.iloc[i * n : min(N, (i + 1) * n), :]
        training = df.loc[df.index.difference(test.index), :]
        sets.append((training, test))
    return sets
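A quick sketch of how the relocated `k_fold_split` might be used; the toy DataFrame is made up for illustration:

import numpy as np
import pandas as pd

from binarybeech.extra import k_fold_split

# hypothetical toy data: ten rows, one feature, one binary label
df = pd.DataFrame({"x": np.arange(10), "y": np.arange(10) % 2})

# five (training, test) pairs; each test slice holds about len(df) / k rows
for training, test in k_fold_split(df, k=5):
    print(len(training), len(test))  # expected: 8 2, five times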
90 changes: 90 additions & 0 deletions binarybeech/math.py
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# coding: utf-8
import numpy as np

def gini_impurity(x):
    unique, counts = np.unique(x, return_counts=True)
    N = x.size
    p = counts / N
    return 1.0 - np.sum(p**2)


def shannon_entropy(x):
    unique, counts = np.unique(x, return_counts=True)
    N = x.size
    p = counts / N
    return -np.sum(p * np.log2(p))


def misclassification_cost(x):
    unique, counts = np.unique(x, return_counts=True)
    N = x.size
    p = np.max(counts) / N
    return 1.0 - p


def logistic_loss(y, p):
    p = np.clip(p, 1e-12, 1.0 - 1e-12)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))


def mean_squared_error(y, y_hat):
    e = y - y_hat
    return 1 / e.size * (e.T @ e)


def r_squared(y, y_hat):
    e = y - y_hat
    sse = e.T @ e
    sst = np.sum((y - np.nanmean(y)) ** 2)
    return 1 - sse / sst


def majority_class(x):
    unique, counts = np.unique(x, return_counts=True)
    ind_max = np.argmax(counts)
    return unique[ind_max]


def odds(x):
    unique, counts = np.unique(x, return_counts=True)
    d = {0: 0, 1: 0}
    for i, u in enumerate(unique):
        d[u] = counts[i]
    if d[0] == 0:
        return np.Inf
    odds = d[1] / d[0]
    return odds


def log_odds(x):
    o = odds(x)
    o = np.clip(o, 1e-12, 1e12)
    logodds = np.log(o)
    return logodds


def probability(x):
    if x == np.Inf:
        return 1.0
    return x / (1 + x)


def logistic(x):
    return 1.0 / (1.0 + np.exp(-x))


def precision(m):
    return np.diag(m) / np.sum(m, axis=1)


def recall(m):
    return np.diag(m) / np.sum(m, axis=0)


def F1(P, R):
    return 2 * P * R / (P + R)


def accuracy(m):
    return np.sum(np.diag(m)) / np.sum(np.sum(m))
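A short sketch exercising a few of these helpers. Note that `precision` divides the diagonal by row sums and `recall` by column sums, so which is which depends on whether the confusion matrix puts actual classes along rows or columns; the values below are made up:

import numpy as np

from binarybeech import math as bbmath

x = np.array([0, 0, 1, 1, 1])
print(bbmath.gini_impurity(x))    # 1 - (0.4**2 + 0.6**2) = 0.48
print(bbmath.shannon_entropy(x))  # about 0.971 bits
print(bbmath.majority_class(x))   # 1

m = np.array([[50, 10], [5, 35]])  # hypothetical 2x2 confusion matrix
P = bbmath.precision(m)            # per class: diagonal / row sums
R = bbmath.recall(m)               # per class: diagonal / column sums
print(bbmath.F1(P, R))             # per-class harmonic mean of P and R
print(bbmath.accuracy(m))          # 85 / 100 = 0.85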
21 changes: 11 additions & 10 deletions binarybeech/metrics.py
@@ -6,7 +6,8 @@
import numpy as np
import pandas as pd

-import binarybeech.utils as utils
+#import binarybeech.utils as utils
+import binarybeech.math as math


class Metrics(ABC):
@@ -18,42 +19,42 @@ def _y(self, df):

    def _gini_impurity(self, df):
        y = self._y(df)
-        return utils.gini_impurity(y)
+        return math.gini_impurity(y)

    def _shannon_entropy(self, df):
        y = self._y(df)
-        return utils.shannon_entropy(y)
+        return math.shannon_entropy(y)

    def _misclassification_cost(self, df):
        y = self._y(df)
-        return utils.misclassification_cost(y)
+        return math.misclassification_cost(y)

    def _logistic_loss(self, df):
        y = self._y(df)
        p_ = self.node_value(df)
        p = np.ones_like(y) * p_
-        return utils.logistic_loss(y, p)
+        return math.logistic_loss(y, p)

    def _mean_squared_error(self, df):
        y = self._y(df)
        y_hat = self.node_value(df)
-        return utils.mean_squared_error(y, y_hat)
+        return math.mean_squared_error(y, y_hat)

    def _r_squared(self, y_hat, df):
        y = self._y(df)
-        return utils.r_squared(y, y_hat)
+        return math.r_squared(y, y_hat)

    def _mean(self, df):
        y = self._y(df)
        return np.nanmean(y)

    def _majority_class(self, df):
        y = self._y(df)
-        return utils.majority_class(y)
+        return math.majority_class(y)

    def _odds(self, df):
        y = self._y(df)
-        return utils.odds(y)
+        return math.odds(y)

    def _log_odds(self, df):
        odds = self._odds(df)
@@ -209,7 +210,7 @@ def goodness_of_fit(self, y_hat, data):

    @staticmethod
    def output_transform(arr):
-        return utils.logistic(arr)
+        return math.logistic(arr)

    @staticmethod
    def check_data_type(arr):
114 changes: 27 additions & 87 deletions binarybeech/utils.py
@@ -2,7 +2,9 @@
# coding: utf-8
import numpy as np
import treelib
+from binarybeech.binarybeech import CART

+from binarybeech.extra import k_fold_split

def print_bars(d, max_width=70):
    max_val = max(d.values())
@@ -52,91 +54,29 @@ def k_fold_split(df, k=1, frac=None, random=False, shuffle=True, replace=True):
        training = df.loc[df.index.difference(test.index), :]
        sets.append((training, test))
    return sets

-def gini_impurity(x):
-    unique, counts = np.unique(x, return_counts=True)
-    N = x.size
-    p = counts / N
-    return 1.0 - np.sum(p**2)
-
-
-def shannon_entropy(x):
-    unique, counts = np.unique(x, return_counts=True)
-    N = x.size
-    p = counts / N
-    return -np.sum(p * np.log2(p))
-
-
-def misclassification_cost(x):
-    unique, counts = np.unique(x, return_counts=True)
-    N = x.size
-    p = np.max(counts) / N
-    return 1.0 - p
-
-
-def logistic_loss(y, p):
-    p = np.clip(p, 1e-12, 1.0 - 1e-12)
-    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
-
-
-def mean_squared_error(y, y_hat):
-    e = y - y_hat
-    return 1 / e.size * (e.T @ e)
-
-
-def r_squared(y, y_hat):
-    e = y - y_hat
-    sse = e.T @ e
-    sst = np.sum((y - np.nanmean(y)) ** 2)
-    return 1 - sse / sst
-
-
-def majority_class(x):
-    unique, counts = np.unique(x, return_counts=True)
-    ind_max = np.argmax(counts)
-    return unique[ind_max]
-
-
-def odds(x):
-    unique, counts = np.unique(x, return_counts=True)
-    d = {0: 0, 1: 0}
-    for i, u in enumerate(unique):
-        d[u] = counts[i]
-    if d[0] == 0:
-        return np.Inf
-    odds = d[1] / d[0]
-    return odds
-
-
-def log_odds(x):
-    o = odds(x)
-    o = np.clip(o, 1e-12, 1e12)
-    logodds = np.log(o)
-    return logodds
-
-
-def probability(x):
-    if x == np.Inf:
-        return 1.0
-    return x / (1 + x)
-
-
-def logistic(x):
-    return 1.0 / (1.0 + np.exp(-x))
-
-
-def precision(m):
-    return np.diag(m) / np.sum(m, axis=1)
-
-
-def recall(m):
-    return np.diag(m) / np.sum(m, axis=0)
-
-
-def F1(P, R):
-    return 2 * P * R / (P + R)
-
-
-def accuracy(m):
-    return np.sum(np.diag(m)) / np.sum(np.sum(m))
+def model_missings(df, y_name, X_names=None, cart_settings={}):
+    if X_names is None:
+        X_names = [n for n in df.columns]
+        X_names.remove(y_name)
+    df_ = df.copy()
+    has_missings = df.isnull().any()
+    for x_name in X_names:
+
+        if not has_missings[x_name]:
+            continue
+
+        m_X_names = [n for n in df.columns]
+        m_X_names.remove(x_name)
+        m_X_names.remove(y_name)
+        kwargs = dict(
+            max_depth=3,
+            min_leaf_samples=5,
+            min_split_samples=4,
+        )
+        kwargs = {**kwargs, **cart_settings}
+        mod = CART(df[~df[x_name].isnull()], x_name, X_names=m_X_names, **kwargs)
+        mod.create_tree()
+        df_.loc[df[x_name].isnull(), x_name] = mod.predict(df[df[x_name].isnull()])
+
+    return df_
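The intended use of `model_missings`, sketched against the Titanic data referenced by the test below: rows with a missing value in a column are filled with predictions from a shallow CART fitted on the rows where that value is present.

import pandas as pd

from binarybeech.utils import model_missings

df = pd.read_csv("data/titanic.csv")  # path as used in tests/test_utils.py
print(df["Age"].isnull().sum())       # some passengers lack an Age entry

# fit one shallow tree per incomplete column and fill its gaps
df_filled = model_missings(df, "Survived", X_names=["Age"])
print(df_filled["Age"].isnull().sum())  # expected: 0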
12 changes: 12 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,12 @@
import pandas as pd

from binarybeech.utils import model_missings


def test_model_missings():
    df_titanic = pd.read_csv("data/titanic.csv")

    df_new = model_missings(df_titanic, "Survived", X_names=["Age"])

    has_missings = df_new.isnull().any()
    assert not has_missings["Age"]