Commit

Merge pull request #11 from arminwitte/metrics_refactoring
Metrics refactoring
arminwitte committed Mar 21, 2023
2 parents 0566cd6 + 24636d7 commit 42ea2ea
Showing 4 changed files with 98 additions and 146 deletions.

binarybeech/binarybeech.py: 44 changes (25 additions, 19 deletions)

@@ -29,9 +29,9 @@ def __init__(
         self.X_names = X_names

         if metrics_type is None:
-            metrics_type, metrics = metrics_factory.from_data(df, y_name)
+            metrics_type, metrics = metrics_factory.from_data(df[self.y_name])
         else:
-            metrics = metrics_factory.create_metrics(metrics_type, y_name)
+            metrics = metrics_factory.create_metrics(metrics_type)
         self.metrics_type = metrics_type
         self.metrics = metrics

@@ -70,13 +70,15 @@ def validate(self, df=None):
         if df is None:
             df = self.df
         y_hat = self.predict(df)
-        return self.metrics.validate(y_hat, df)
+        y = df[self.y_name]
+        return self.metrics.validate(y, y_hat)

     def goodness_of_fit(self, df=None):
         if df is None:
             df = self.df
         y_hat = self.predict(df)
-        return self.metrics.goodness_of_fit(y_hat, df)
+        y = df[self.y_name]
+        return self.metrics.goodness_of_fit(y, y_hat)

 class CART(Model):
     def __init__(
@@ -195,7 +197,9 @@ def create_tree(self, leaf_loss_threshold=1e-12):
         return self.tree

     def _node_or_leaf(self, df):
-        loss_parent = self.metrics.loss(df)
+        y = df[self.y_name]
+        y_hat = self.metrics.node_value(y)
+        loss_parent = self.metrics.loss(y, y_hat)
         # p = self._probability(df)
         if (
             loss_parent < self.leaf_loss_threshold
@@ -204,11 +208,11 @@
             or len(df.index) < self.min_leaf_samples
             or self.depth >= self.max_depth
         ):
-            return self._leaf(df)
+            return self._leaf(y, y_hat)

         loss_best, split_df, split_threshold, split_name = self._loss_best(df)
         if not split_df:
-            return self._leaf(df)
+            return self._leaf(y, y_hat)
         # print(
         #     f"Computed split:\nloss: {loss_best:.2f} (parent: {loss_parent:.2f})\nattribute: {split_name}\nthreshold: {split_threshold}\ncount: {[len(df_.index) for df_ in split_df]}"
         # )
@@ -219,8 +223,8 @@
             for i in range(2):
                 branches.append(self._node_or_leaf(split_df[i]))
             self.depth -= 1
-            unique, counts = np.unique(df[self.y_name], return_counts=True)
-            value = self.metrics.node_value(df)
+            # unique, counts = np.unique(df[self.y_name], return_counts=True)
+            value = y_hat
             item = Node(
                 branches=branches,
                 attribute=split_name,
@@ -229,19 +233,18 @@
                 decision_fun=self.data_handlers[split_name].decide,
             )
             item.pinfo["N"] = len(df.index)
-            item.pinfo["r"] = self.metrics.loss_prune(df)
+            item.pinfo["r"] = self.metrics.loss_prune(y, y_hat)
             item.pinfo["R"] = item.pinfo["N"] / len(self.df.index) * item.pinfo["r"]
         else:
-            item = self._leaf(df)
+            item = self._leaf(y, y_hat)

         return item

-    def _leaf(self, df):
-        value = self.metrics.node_value(df)
-        leaf = Node(value=value)
+    def _leaf(self, y, y_hat):
+        leaf = Node(value=y_hat)

-        leaf.pinfo["N"] = len(df.index)
-        leaf.pinfo["r"] = self.metrics.loss_prune(df)
+        leaf.pinfo["N"] = y.size
+        leaf.pinfo["r"] = self.metrics.loss_prune(y, y_hat)
         leaf.pinfo["R"] = leaf.pinfo["N"] / len(self.df.index) * leaf.pinfo["r"]
         return leaf

@@ -469,7 +472,8 @@ def validate(self, df=None):
         if df is None:
             df = self.df
         y_hat = self.predict(df)
-        return self.metrics.validate(y_hat, df)
+        y = df[self.y_name]
+        return self.metrics.validate(y, y_hat)


 class RandomForest(Model):
@@ -555,7 +559,8 @@ def validate_oob(self):
             df.loc[index, "majority_vote"] = unique[idx_max]
         df = df.dropna(subset=["majority_vote"])
         df = df.astype({"majority_vote": "int"})
-        return self.metrics.validate(df["majority_vote"].values, df)
+        y = df[self.y_name]
+        return self.metrics.validate(y, df["majority_vote"].values)

     def _oob_predict(self, df):
         for i, t in enumerate(self.trees):
@@ -577,7 +582,8 @@ def validate(self, df=None):
         if df is None:
             df = self.df
         y_hat = self.predict(df)
-        return self.metrics.validate(y_hat, df)
+        y = df[self.y_name]
+        return self.metrics.validate(y, y_hat)

     def variable_importance(self):
         d = {}
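
The hunks above move the metrics interface from DataFrame-based calls (`loss(df)`, `node_value(df)`, `validate(y_hat, df)`) to array-based calls that take the target column explicitly: `node_value(y)` yields the node prediction, while `loss(y, y_hat)`, `loss_prune(y, y_hat)`, and `validate(y, y_hat)` compare targets against a prediction. A minimal sketch of a metrics class matching these call sites (the class name `RegressionMetrics` and the mean/MSE bodies are illustrative assumptions, not part of this commit):

```python
import numpy as np


class RegressionMetrics:
    """Illustrative stand-in; only the method signatures are inferred from the diff."""

    def node_value(self, y):
        # Prediction assigned to a node, e.g. the mean of its targets.
        return float(np.mean(y))

    def loss(self, y, y_hat):
        # Loss of predicting the scalar y_hat for every sample in y (assumed: MSE).
        return float(np.mean((np.asarray(y, dtype=float) - y_hat) ** 2))

    def loss_prune(self, y, y_hat):
        # Pruning loss; this sketch simply reuses the splitting loss.
        return self.loss(y, y_hat)

    def validate(self, y, y_hat):
        # Per-sample predictions vs. targets, reported as RMSE here.
        err = np.asarray(y, dtype=float) - np.asarray(y_hat, dtype=float)
        return {"rmse": float(np.sqrt(np.mean(err**2)))}
```
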
binarybeech/datahandler.py: 15 changes (9 additions, 6 deletions)

@@ -76,9 +76,10 @@ def split(self, df):
         ]
         N = len(df.index)
         n = [len(df_.index) for df_ in split_df]
-        loss = n[0] / N * self.metrics.loss(split_df[0]) + n[
+        val = [self.metrics.node_value(df_[self.y_name]) for df_ in split_df]
+        loss = n[0] / N * self.metrics.loss(split_df[0][self.y_name], val[0]) + n[
             1
-        ] / N * self.metrics.loss(split_df[1])
+        ] / N * self.metrics.loss(split_df[1][self.y_name], val[1])
         if loss < self.loss:
             success = True
             self.loss = loss
@@ -139,9 +140,10 @@ def split(self, df):
         ]
         N = len(df.index)
         n = [len(df_.index) for df_ in self.split_df]
-        self.loss = n[0] / N * self.metrics.loss(self.split_df[0]) + n[
+        val = [self.metrics.node_value(df_[self.y_name]) for df_ in self.split_df]
+        self.loss = n[0] / N * self.metrics.loss(self.split_df[0][self.y_name], val[0]) + n[
             1
-        ] / N * self.metrics.loss(self.split_df[1])
+        ] / N * self.metrics.loss(self.split_df[1][self.y_name], val[1])

         return success

@@ -210,9 +212,10 @@ def _opt_fun(self, df):
         def fun(x):
             split_df = [df[df[split_name] < x], df[df[split_name] >= x]]
             n = [len(df_.index) for df_ in split_df]
-            return n[0] / N * self.metrics.loss(split_df[0]) + n[
+            val = [self.metrics.node_value(df_[self.y_name]) for df_ in split_df]
+            return n[0] / N * self.metrics.loss(split_df[0][self.y_name], val[0]) + n[
                 1
-            ] / N * self.metrics.loss(split_df[1])
+            ] / N * self.metrics.loss(split_df[1][self.y_name], val[1])

         return fun

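
All three datahandler hunks apply the same pattern: each half of a candidate split is reduced to its target column, a node value is computed per half, and the split loss is the sample-weighted sum of the per-half losses, loss = n0/N * L(y0, yhat0) + n1/N * L(y1, yhat1). A sketch of that computation, reusing the `RegressionMetrics` sketch above (the helper name `weighted_split_loss` and the boolean-mask interface are assumptions for illustration):

```python
import pandas as pd


def weighted_split_loss(df, y_name, mask, metrics):
    # The two halves of the candidate split, defined by a boolean mask.
    split_df = [df[mask], df[~mask]]
    N = len(df.index)
    n = [len(df_.index) for df_ in split_df]
    # Node value (prediction) for each half, computed from its own targets.
    val = [metrics.node_value(df_[y_name]) for df_ in split_df]
    # Sample-weighted sum of the per-half losses, as in the hunks above.
    return (
        n[0] / N * metrics.loss(split_df[0][y_name], val[0])
        + n[1] / N * metrics.loss(split_df[1][y_name], val[1])
    )


# Example: split a toy frame on x < 2.5.
df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 1.2, 3.0, 3.1]})
print(weighted_split_loss(df, "y", df["x"] < 2.5, RegressionMetrics()))  # 0.00625
```
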
binarybeech/math.py: 8 changes (6 additions, 2 deletions)

@@ -65,9 +65,13 @@ def log_odds(x):


 def probability(x):
-    if x == np.Inf:
-        return 1.0
+    # if x == np.Inf:
+    #     return 1.0
     return x / (1 + x)
+
+def max_probability(x):
+    unique, counts = np.unique(x, return_counts=True)
+    return np.max(counts) / x.size


 def logistic(x):
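
The new `max_probability` helper returns the relative frequency of the most common value in an array, i.e. the empirical probability of the majority class at a node. Note also that with the `np.Inf` guard commented out, `probability(np.inf)` now evaluates `inf / (1 + inf)` and returns `nan` rather than `1.0`. A quick usage example of the new helper:

```python
import numpy as np

from binarybeech.math import max_probability  # added in this commit

x = np.array([0, 1, 1, 1, 2])
print(max_probability(x))  # 0.6: the majority class (1) covers 3 of 5 samples
```
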
