Model missings #10

Merged · 6 commits · Mar 16, 2023
9 changes: 5 additions & 4 deletions binarybeech/binarybeech.py
@@ -9,7 +9,8 @@
import pandas as pd
import scipy.optimize as opt

-import binarybeech.utils as utils
+from binarybeech.extra import k_fold_split
+import binarybeech.math as math
from binarybeech.datahandler import data_handler_factory
from binarybeech.metrics import metrics_factory
from binarybeech.reporter import Reporter
@@ -131,7 +132,7 @@ def train(self, k=5, plot=True, slack=1.0):
        beta = self._beta(pres["alpha"])
        qual_cv = np.zeros((len(beta), k))
        # split df for k-fold cross-validation
-        sets = utils.k_fold_split(df, k)
+        sets = k_fold_split(df, k)
        for i, data in enumerate(sets):
            c = CART(
                data[0],
@@ -459,8 +460,8 @@ def _opt_fun(self, tree):

        def fun(gamma):
            y_ = y_hat + gamma * delta
-            p = utils.logistic(y_)
-            return utils.logistic_loss(y, p)
+            p = math.logistic(y_)
+            return math.logistic_loss(y, p)

        return fun

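For context, `fun(gamma)` above appears to be a line-search objective: it evaluates the logistic loss of the current raw predictions `y_hat` nudged by `gamma * delta`. A minimal standalone sketch of that idea, with invented arrays for illustration and `minimize_scalar` standing in for whatever optimizer the caller actually uses:

import numpy as np
import scipy.optimize as opt

from binarybeech.math import logistic, logistic_loss

# hypothetical data: binary labels, current raw scores, and a proposed update
y = np.array([0.0, 1.0, 1.0, 0.0, 1.0])
y_hat = np.zeros(5)
delta = np.array([-1.0, 0.5, 1.0, -0.5, 0.8])

def fun(gamma):
    # loss of the prediction after taking a step of size gamma along delta
    return logistic_loss(y, logistic(y_hat + gamma * delta))

res = opt.minimize_scalar(fun)
print(res.x)  # step size that minimizes the logistic loss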
22 changes: 22 additions & 0 deletions binarybeech/extra.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# coding: utf-8
import numpy as np

def k_fold_split(df, k=1, frac=None, random=False, shuffle=True, replace=True):
    if shuffle:
        df = df.sample(frac=1.0, replace=False)

    if frac is None:
        frac = 1.0 - 1.0 / (k + 1.0)

    N = len(df.index)
    n = int(np.ceil(N / k))
    sets = []
    for i in reversed(range(k)):
        if random:
            test = df.sample(frac=1.0 - frac, replace=replace)
        else:
            test = df.iloc[i * n : min(N, (i + 1) * n), :]
        training = df.loc[df.index.difference(test.index), :]
        sets.append((training, test))
    return sets
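A quick sketch of how the relocated `k_fold_split` might be used; the toy DataFrame is made up for illustration:

import numpy as np
import pandas as pd

from binarybeech.extra import k_fold_split

# hypothetical toy data: ten rows, one feature, one binary label
df = pd.DataFrame({"x": np.arange(10), "y": np.arange(10) % 2})

# five (training, test) pairs; each test slice holds about len(df) / k rows
for training, test in k_fold_split(df, k=5):
    print(len(training), len(test))  # expected: 8 2, five times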
90 changes: 90 additions & 0 deletions binarybeech/math.py
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# coding: utf-8
import numpy as np

def gini_impurity(x):
    unique, counts = np.unique(x, return_counts=True)
    N = x.size
    p = counts / N
    return 1.0 - np.sum(p**2)


def shannon_entropy(x):
    unique, counts = np.unique(x, return_counts=True)
    N = x.size
    p = counts / N
    return -np.sum(p * np.log2(p))


def misclassification_cost(x):
    unique, counts = np.unique(x, return_counts=True)
    N = x.size
    p = np.max(counts) / N
    return 1.0 - p


def logistic_loss(y, p):
    p = np.clip(p, 1e-12, 1.0 - 1e-12)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))


def mean_squared_error(y, y_hat):
    e = y - y_hat
    return 1 / e.size * (e.T @ e)


def r_squared(y, y_hat):
    e = y - y_hat
    sse = e.T @ e
    sst = np.sum((y - np.nanmean(y)) ** 2)
    return 1 - sse / sst


def majority_class(x):
    unique, counts = np.unique(x, return_counts=True)
    ind_max = np.argmax(counts)
    return unique[ind_max]


def odds(x):
    unique, counts = np.unique(x, return_counts=True)
    d = {0: 0, 1: 0}
    for i, u in enumerate(unique):
        d[u] = counts[i]
    if d[0] == 0:
        return np.Inf
    odds = d[1] / d[0]
    return odds


def log_odds(x):
    o = odds(x)
    o = np.clip(o, 1e-12, 1e12)
    logodds = np.log(o)
    return logodds


def probability(x):
    if x == np.Inf:
        return 1.0
    return x / (1 + x)


def logistic(x):
    return 1.0 / (1.0 + np.exp(-x))


def precision(m):
    return np.diag(m) / np.sum(m, axis=1)


def recall(m):
    return np.diag(m) / np.sum(m, axis=0)


def F1(P, R):
    return 2 * P * R / (P + R)


def accuracy(m):
    return np.sum(np.diag(m)) / np.sum(np.sum(m))
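A short sketch exercising a few of these helpers. Note that `precision` divides the diagonal by row sums and `recall` by column sums, so which is which depends on whether the confusion matrix puts actual classes along rows or columns; the values below are made up:

import numpy as np

from binarybeech import math as bbmath

x = np.array([0, 0, 1, 1, 1])
print(bbmath.gini_impurity(x))    # 1 - (0.4**2 + 0.6**2) = 0.48
print(bbmath.shannon_entropy(x))  # about 0.971 bits
print(bbmath.majority_class(x))   # 1

m = np.array([[50, 10], [5, 35]])  # hypothetical 2x2 confusion matrix
P = bbmath.precision(m)            # per class: diagonal / row sums
R = bbmath.recall(m)               # per class: diagonal / column sums
print(bbmath.F1(P, R))             # per-class harmonic mean of P and R
print(bbmath.accuracy(m))          # 85 / 100 = 0.85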
21 changes: 11 additions & 10 deletions binarybeech/metrics.py
@@ -6,7 +6,8 @@
import numpy as np
import pandas as pd

-import binarybeech.utils as utils
+#import binarybeech.utils as utils
+import binarybeech.math as math


class Metrics(ABC):
@@ -18,42 +19,42 @@ def _y(self, df):

    def _gini_impurity(self, df):
        y = self._y(df)
-        return utils.gini_impurity(y)
+        return math.gini_impurity(y)

    def _shannon_entropy(self, df):
        y = self._y(df)
-        return utils.shannon_entropy(y)
+        return math.shannon_entropy(y)

    def _misclassification_cost(self, df):
        y = self._y(df)
-        return utils.misclassification_cost(y)
+        return math.misclassification_cost(y)

    def _logistic_loss(self, df):
        y = self._y(df)
        p_ = self.node_value(df)
        p = np.ones_like(y) * p_
-        return utils.logistic_loss(y, p)
+        return math.logistic_loss(y, p)

    def _mean_squared_error(self, df):
        y = self._y(df)
        y_hat = self.node_value(df)
-        return utils.mean_squared_error(y, y_hat)
+        return math.mean_squared_error(y, y_hat)

    def _r_squared(self, y_hat, df):
        y = self._y(df)
-        return utils.r_squared(y, y_hat)
+        return math.r_squared(y, y_hat)

    def _mean(self, df):
        y = self._y(df)
        return np.nanmean(y)

    def _majority_class(self, df):
        y = self._y(df)
-        return utils.majority_class(y)
+        return math.majority_class(y)

    def _odds(self, df):
        y = self._y(df)
-        return utils.odds(y)
+        return math.odds(y)

    def _log_odds(self, df):
        odds = self._odds(df)
@@ -209,7 +210,7 @@ def goodness_of_fit(self, y_hat, data):

    @staticmethod
    def output_transform(arr):
-        return utils.logistic(arr)
+        return math.logistic(arr)

    @staticmethod
    def check_data_type(arr):
114 changes: 27 additions & 87 deletions binarybeech/utils.py
@@ -2,7 +2,9 @@
# coding: utf-8
import numpy as np
import treelib
+from binarybeech.binarybeech import CART

+from binarybeech.extra import k_fold_split

def print_bars(d, max_width=70):
    max_val = max(d.values())
@@ -52,91 +54,29 @@ def k_fold_split(df, k=1, frac=None, random=False, shuffle=True, replace=True):
        training = df.loc[df.index.difference(test.index), :]
        sets.append((training, test))
    return sets

-def gini_impurity(x):
-    unique, counts = np.unique(x, return_counts=True)
-    N = x.size
-    p = counts / N
-    return 1.0 - np.sum(p**2)
-
-
-def shannon_entropy(x):
-    unique, counts = np.unique(x, return_counts=True)
-    N = x.size
-    p = counts / N
-    return -np.sum(p * np.log2(p))
-
-
-def misclassification_cost(x):
-    unique, counts = np.unique(x, return_counts=True)
-    N = x.size
-    p = np.max(counts) / N
-    return 1.0 - p
-
-
-def logistic_loss(y, p):
-    p = np.clip(p, 1e-12, 1.0 - 1e-12)
-    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
-
-
-def mean_squared_error(y, y_hat):
-    e = y - y_hat
-    return 1 / e.size * (e.T @ e)
-
-
-def r_squared(y, y_hat):
-    e = y - y_hat
-    sse = e.T @ e
-    sst = np.sum((y - np.nanmean(y)) ** 2)
-    return 1 - sse / sst
-
-
-def majority_class(x):
-    unique, counts = np.unique(x, return_counts=True)
-    ind_max = np.argmax(counts)
-    return unique[ind_max]
-
-
-def odds(x):
-    unique, counts = np.unique(x, return_counts=True)
-    d = {0: 0, 1: 0}
-    for i, u in enumerate(unique):
-        d[u] = counts[i]
-    if d[0] == 0:
-        return np.Inf
-    odds = d[1] / d[0]
-    return odds
-
-
-def log_odds(x):
-    o = odds(x)
-    o = np.clip(o, 1e-12, 1e12)
-    logodds = np.log(o)
-    return logodds
-
-
-def probability(x):
-    if x == np.Inf:
-        return 1.0
-    return x / (1 + x)
-
-
-def logistic(x):
-    return 1.0 / (1.0 + np.exp(-x))
-
-
-def precision(m):
-    return np.diag(m) / np.sum(m, axis=1)
-
-
-def recall(m):
-    return np.diag(m) / np.sum(m, axis=0)
-
-
-def F1(P, R):
-    return 2 * P * R / (P + R)
-
-
-def accuracy(m):
-    return np.sum(np.diag(m)) / np.sum(np.sum(m))
+def model_missings(df, y_name, X_names=None, cart_settings={}):
+    if X_names is None:
+        X_names = [n for n in df.columns]
+        X_names.remove(y_name)
+    df_ = df.copy()
+    has_missings = df.isnull().any()
+    for x_name in X_names:
+
+        if not has_missings[x_name]:
+            continue
+
+        m_X_names = [n for n in df.columns]
+        m_X_names.remove(x_name)
+        m_X_names.remove(y_name)
+        kwargs = dict(
+            max_depth=3,
+            min_leaf_samples=5,
+            min_split_samples=4,
+        )
+        kwargs = {**kwargs, **cart_settings}
+        mod = CART(df[~df[x_name].isnull()], x_name, X_names=m_X_names, **kwargs)
+        mod.create_tree()
+        df_.loc[df[x_name].isnull(), x_name] = mod.predict(df[df[x_name].isnull()])
+
+    return df_
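The intended use of `model_missings`, sketched against the Titanic data referenced by the test below: rows with a missing value in a column are filled with predictions from a shallow CART fitted on the rows where that value is present.

import pandas as pd

from binarybeech.utils import model_missings

df = pd.read_csv("data/titanic.csv")  # path as used in tests/test_utils.py
print(df["Age"].isnull().sum())       # some passengers lack an Age entry

# fit one shallow tree per incomplete column and fill its gaps
df_filled = model_missings(df, "Survived", X_names=["Age"])
print(df_filled["Age"].isnull().sum())  # expected: 0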
12 changes: 12 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,12 @@
import pandas as pd

from binarybeech.utils import model_missings


def test_model_missings():
    df_titanic = pd.read_csv("data/titanic.csv")

    df_new = model_missings(df_titanic, "Survived", X_names=["Age"])

    has_missings = df_new.isnull().any()
    assert not has_missings["Age"]