Skip to content

Commit

Permalink
Merge pull request #14 from arminwitte/trainingdata
Browse files Browse the repository at this point in the history
Trainingdata
  • Loading branch information
arminwitte committed May 1, 2023
2 parents 8ff0bbe + 3152347 commit 81f45df
Show file tree
Hide file tree
Showing 17 changed files with 809 additions and 448 deletions.
72 changes: 12 additions & 60 deletions binarybeech/attributehandler.py
Expand Up @@ -101,19 +101,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
    """Return whether column ``x`` should be treated as a nominal attribute.

    The decision is delegated to ``math.check_nominal`` so that every
    attribute handler shares one implementation of the type heuristics.

    NOTE(review): ``math`` is presumably the project's helper module, not the
    standard-library ``math`` (which has no ``check_nominal``); its import is
    outside this view -- confirm.

    The inline logic this call replaces dropped NaN entries, rejected the
    column when more than 20 % of the remaining values were unique, and
    accepted it only for a non-numeric dtype with more than two distinct
    values. The helper is assumed to keep that contract with these arguments
    -- confirm against its definition.

    Parameters
    ----------
    x : column of attribute values (the replaced code read ``x.values.dtype``,
        so presumably a pandas Series).

    Returns
    -------
    Whatever ``math.check_nominal`` returns (the replaced code returned bool).
    """
    return math.check_nominal(x, max_unique_fraction=0.2, exclude_dichotomous=True)


class DichotomousAttributeHandler(AttributeHandlerBase):
Expand Down Expand Up @@ -163,19 +151,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
    """Return whether column ``x`` should be treated as a dichotomous attribute.

    The decision is delegated to ``math.check_dichotomous``.

    NOTE(review): ``math`` is presumably the project's helper module, not the
    standard-library ``math``; its import is outside this view -- confirm.

    The inline logic this call replaces dropped NaN entries, rejected the
    column when more than 20 % of the remaining values were unique, and
    accepted it only when exactly two distinct values remained. The helper
    is assumed to implement the same rule -- confirm against its definition.

    Parameters
    ----------
    x : column of attribute values (presumably a pandas Series).

    Returns
    -------
    Whatever ``math.check_dichotomous`` returns (the replaced code returned
    bool).
    """
    return math.check_dichotomous(x)


class IntervalAttributeHandler(AttributeHandlerBase):
Expand Down Expand Up @@ -232,15 +208,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
    """Return whether column ``x`` should be treated as an interval attribute.

    The decision is delegated to ``math.check_interval``.

    NOTE(review): ``math`` is presumably the project's helper module, not the
    standard-library ``math``; its import is outside this view -- confirm.

    The inline logic this call replaces dropped NaN entries and accepted the
    column only for a numeric dtype with more than two distinct values. The
    helper is assumed to implement the same rule -- confirm against its
    definition.

    Parameters
    ----------
    x : column of attribute values (presumably a pandas Series).

    Returns
    -------
    Whatever ``math.check_interval`` returns (the replaced code returned bool).
    """
    return math.check_interval(x)


class NullAttributeHandler(AttributeHandlerBase):
Expand Down Expand Up @@ -312,16 +280,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
    """Return whether column ``x`` should be treated as an interval attribute.

    The decision is delegated to ``math.check_interval``.

    NOTE(review): ``math`` is presumably the project's helper module, not the
    standard-library ``math``; its import is outside this view -- confirm.

    The inline logic this call replaces dropped NaN entries and accepted the
    column only for a numeric dtype with more than two distinct values. The
    helper is assumed to implement the same rule -- confirm against its
    definition.

    Parameters
    ----------
    x : column of attribute values (presumably a pandas Series).

    Returns
    -------
    Whatever ``math.check_interval`` returns (the replaced code returned bool).
    """
    return math.check_interval(x)


class UnsupervisedNominalAttributeHandler(AttributeHandlerBase):
Expand Down Expand Up @@ -380,19 +339,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
    """Return whether column ``x`` should be treated as a nominal attribute.

    The decision is delegated to ``math.check_nominal``. Unlike the other
    nominal check, ``exclude_dichotomous=False`` is passed here.

    NOTE(review): ``math`` is presumably the project's helper module, not the
    standard-library ``math``; its import is outside this view -- confirm.

    NOTE(review): the inline logic this call replaces required *more than
    two* distinct values, so ``exclude_dichotomous=False`` may widen what is
    accepted (two-valued columns) -- confirm this is intended.

    Parameters
    ----------
    x : column of attribute values (presumably a pandas Series).

    Returns
    -------
    Whatever ``math.check_nominal`` returns (the replaced code returned bool).
    """
    return math.check_nominal(x, max_unique_fraction=0.2, exclude_dichotomous=False)


# =========================
Expand All @@ -415,7 +362,10 @@ def get_attribute_handler_class(self, arr, group_name="default"):

raise ValueError("no data handler class for this type of data")

def create_attribute_handlers(self, df, y_name, X_names, metrics):
def create_attribute_handlers(self, training_data, metrics):
df = training_data.df
y_name = training_data.y_name
X_names = training_data.X_names
dhc = self.get_attribute_handler_class(
df[y_name], group_name=metrics.attribute_handler_group()
)
Expand Down Expand Up @@ -443,4 +393,6 @@ def create_attribute_handlers(self, df, y_name, X_names, metrics):
attribute_handler_factory.register(
"nominal", UnsupervisedNominalAttributeHandler, group_name="unsupervised"
)
# Register the "null" handler for the unsupervised group exactly once.
attribute_handler_factory.register(
    "null", NullAttributeHandler, group_name="unsupervised"
)
112 changes: 75 additions & 37 deletions binarybeech/binarybeech.py
Expand Up @@ -9,29 +9,41 @@
import pandas as pd
import scipy.optimize as opt

from binarybeech.extra import k_fold_split
from binarybeech.datamanager import DataManager
from binarybeech.extra import k_fold_split
from binarybeech.reporter import Reporter
from binarybeech.trainingdata import TrainingData
from binarybeech.tree import Node, Tree


class Model(ABC):
def __init__(
    self,
    training_data,
    df,
    y_name,
    X_names,
    attribute_handlers,
    metrics_type,
    handle_missings,
):
    """Set up the training data, data manager and missing-value handling.

    Exactly one data source is used: ``training_data`` wins when it is a
    ``TrainingData`` instance; otherwise ``df`` must be a pandas DataFrame,
    from which a ``TrainingData`` is built.

    Parameters
    ----------
    training_data : TrainingData or None
        Ready-made training data container.
    df : pandas.DataFrame or None
        Raw data; only used when ``training_data`` is not a ``TrainingData``.
    y_name, X_names :
        Forwarded to ``TrainingData`` when it is built from ``df``. The
        effective values are read back from the container afterwards
        (presumably it fills in defaults -- confirm in ``TrainingData``).
    attribute_handlers :
        Forwarded to ``DataManager``.
    metrics_type :
        Forwarded to ``DataManager`` as its ``method`` argument.
    handle_missings :
        Mode forwarded to ``TrainingData`` and to ``_handle_missings``.

    Raises
    ------
    TypeError
        If neither a ``TrainingData`` nor a pandas DataFrame was supplied.
    """
    if isinstance(training_data, TrainingData):
        self.training_data = training_data
    elif isinstance(df, pd.DataFrame):
        self.training_data = TrainingData(
            df, y_name=y_name, X_names=X_names, handle_missings=handle_missings
        )
    else:
        raise TypeError(
            "Wrong data type. Either pass training_data as a TrainingData object or df as a pandas DataFrame."
        )

    # Always read names back from the container so both construction paths
    # end up with the same values.
    self.y_name = self.training_data.y_name
    self.X_names = self.training_data.X_names

    self.dmgr = DataManager(self.training_data, metrics_type, attribute_handlers)

    # Use the container's frame rather than the ``df`` argument: ``df`` is
    # None when the caller passed ``training_data``, which would make
    # ``_handle_missings`` fail on ``None.dropna``.
    self.training_data.df = self._handle_missings(
        self.training_data.df, handle_missings
    )

def _handle_missings(self, df, mode):
df = df.dropna(subset=[self.y_name])
Expand All @@ -58,14 +70,14 @@ def predict(self, df):

def validate(self, df=None):
    """Predict on ``df`` and score the prediction with the model's metrics.

    Parameters
    ----------
    df : data to validate on; when ``None`` the model's own training data
        (``self.training_data.df``) is used.

    Returns
    -------
    The result of ``self.dmgr.metrics.validate(y, y_hat)``, where ``y`` is
    the ``self.y_name`` column of ``df`` and ``y_hat`` is ``self.predict(df)``.
    """
    if df is None:
        # The training frame lives on the TrainingData container.
        df = self.training_data.df
    y_hat = self.predict(df)
    y = df[self.y_name]
    return self.dmgr.metrics.validate(y, y_hat)

def goodness_of_fit(self, df=None):
    """Predict on ``df`` and compute the metrics' goodness-of-fit value.

    Parameters
    ----------
    df : data to evaluate on; when ``None`` the model's own training data
        (``self.training_data.df``) is used.

    Returns
    -------
    The result of ``self.dmgr.metrics.goodness_of_fit(y, y_hat)``, where
    ``y`` is the ``self.y_name`` column of ``df`` and ``y_hat`` is
    ``self.predict(df)``.
    """
    if df is None:
        # The training frame lives on the TrainingData container.
        df = self.training_data.df
    y_hat = self.predict(df)
    y = df[self.y_name]
    return self.dmgr.metrics.goodness_of_fit(y, y_hat)
Expand All @@ -74,8 +86,9 @@ def goodness_of_fit(self, df=None):
class CART(Model):
def __init__(
self,
df,
y_name,
training_data=None,
df=None,
y_name=None,
X_names=None,
min_leaf_samples=1,
min_split_samples=1,
Expand All @@ -85,7 +98,13 @@ def __init__(
attribute_handlers=None,
):
super().__init__(
df, y_name, X_names, attribute_handlers, metrics_type, handle_missings
training_data,
df,
y_name,
X_names,
attribute_handlers,
metrics_type,
handle_missings,
)
self.tree = None
self.leaf_loss_threshold = 1e-12
Expand Down Expand Up @@ -115,19 +134,20 @@ def train(self, k=5, plot=True, slack=1.0):
train decision tree by k-fold cross-validation
"""
# shuffle dataframe
df = self.df.sample(frac=1.0)
df = self.training_data.df.sample(frac=1.0)

# train tree with full dataset
self.create_tree()
pres = self.prune()
beta = self._beta(pres["alpha"])
qual_cv = np.zeros((len(beta), k))
# split df for k-fold cross-validation
sets = k_fold_split(df, k)
self.training_data.split(k=k)
sets = self.training_data.data_sets
for i, data in enumerate(sets):
c = CART(
data[0],
self.y_name,
df=data[0],
y_name=self.y_name,
X_names=self.X_names,
min_leaf_samples=self.min_leaf_samples,
min_split_samples=self.min_split_samples,
Expand Down Expand Up @@ -179,7 +199,7 @@ def _qualities(self, beta, data):

def create_tree(self, leaf_loss_threshold=1e-12):
self.leaf_loss_threshold = leaf_loss_threshold
root = self._node_or_leaf(self.df)
root = self._node_or_leaf(self.training_data.df)
self.tree = Tree(root)
n_leafs = self.tree.leaf_count()
print(f"A tree with {n_leafs} leafs was created")
Expand Down Expand Up @@ -223,7 +243,9 @@ def _node_or_leaf(self, df):
)
item.pinfo["N"] = len(df.index)
item.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat)
item.pinfo["R"] = item.pinfo["N"] / len(self.df.index) * item.pinfo["r"]
item.pinfo["R"] = (
item.pinfo["N"] / len(self.training_data.df.index) * item.pinfo["r"]
)
else:
item = self._leaf(y, y_hat)

Expand All @@ -234,7 +256,9 @@ def _leaf(self, y, y_hat):

leaf.pinfo["N"] = y.size
leaf.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat)
leaf.pinfo["R"] = leaf.pinfo["N"] / len(self.df.index) * leaf.pinfo["r"]
leaf.pinfo["R"] = (
leaf.pinfo["N"] / len(self.training_data.df.index) * leaf.pinfo["r"]
)
return leaf

def _loss_best(self, df):
Expand Down Expand Up @@ -339,8 +363,9 @@ def _g2(self, node):
class GradientBoostedTree(Model):
def __init__(
self,
df,
y_name,
training_data=None,
df=None,
y_name=None,
X_names=None,
sample_frac=1,
n_attributes=None,
Expand All @@ -352,9 +377,15 @@ def __init__(
attribute_handlers=None,
):
super().__init__(
df, y_name, X_names, attribute_handlers, init_metrics_type, handle_missings
training_data,
df,
y_name,
X_names,
attribute_handlers,
init_metrics_type,
handle_missings,
)
self.df = self.df.copy()
self.df = self.training_data.df.copy()
self.N = len(self.df.index)

self.init_tree = None
Expand All @@ -371,8 +402,8 @@ def __init__(

def _initial_tree(self):
c = CART(
self.df,
self.y_name,
df=self.df,
y_name=self.y_name,
X_names=self.X_names,
max_depth=0,
metrics_type=self.init_metrics_type,
Expand Down Expand Up @@ -423,8 +454,8 @@ def train(self, M):
)
kwargs = {**kwargs, **self.cart_settings}
c = CART(
df.sample(frac=self.sample_frac, replace=True),
"pseudo_residuals",
df=df.sample(frac=self.sample_frac, replace=True),
y_name="pseudo_residuals",
X_names=X_names,
**kwargs,
)
Expand Down Expand Up @@ -466,8 +497,9 @@ def validate(self, df=None):
class RandomForest(Model):
def __init__(
self,
df,
y_name,
training_data=None,
df=None,
y_name=None,
X_names=None,
verbose=False,
sample_frac=1,
Expand All @@ -478,9 +510,15 @@ def __init__(
attribute_handlers=None,
):
super().__init__(
df, y_name, X_names, attribute_handlers, metrics_type, handle_missings
training_data,
df,
y_name,
X_names,
attribute_handlers,
metrics_type,
handle_missings,
)
self.df = self.df.copy()
self.df = self.training_data.df.copy()
self.N = len(self.df.index)

self.trees = []
Expand Down Expand Up @@ -509,7 +547,7 @@ def train(self, M):
attribute_handlers=self.dmgr,
)
kwargs = {**kwargs, **self.cart_settings}
c = CART(df, self.y_name, X_names=X_names, **kwargs)
c = CART(df=df, y_name=self.y_name, X_names=X_names, **kwargs)
c.create_tree()
self.trees.append(c.tree)
self.oob_indices.append(self.df.index.difference(df.index))
Expand Down
16 changes: 9 additions & 7 deletions binarybeech/datamanager.py
Expand Up @@ -3,26 +3,28 @@
from binarybeech.attributehandler import attribute_handler_factory
from binarybeech.metrics import metrics_factory


class DataManager:
    """Hold the metrics object and the attribute handlers used by a model.

    Attributes
    ----------
    method :
        The ``method`` argument as passed in (may be ``None``).
    metrics, metrics_type :
        Metrics object and its type key, either created from ``method`` or
        derived from the target column by ``metrics_factory.from_data``.
    attribute_handlers :
        Mapping-like object of handlers; indexing the manager indexes this.
    items :
        Bound ``items`` method of ``attribute_handlers``.
    """

    def __init__(self, training_data, method, attribute_handlers):
        """Create metrics and attribute handlers for ``training_data``.

        Parameters
        ----------
        training_data :
            Object exposing ``df`` and ``y_name``; also handed to the
            attribute-handler factory.
        method :
            Metrics type key, or ``None`` to let ``metrics_factory`` pick one
            from the target column ``training_data.df[training_data.y_name]``.
        attribute_handlers :
            Pre-built handlers, or ``None`` to create them from
            ``training_data``. Must provide ``items`` and ``__getitem__``.
        """
        self.method = method

        if method is None:
            # The target column is read from the container; DataManager has
            # no ``y_name`` attribute of its own.
            metrics_type, metrics = metrics_factory.from_data(
                training_data.df[training_data.y_name]
            )
        else:
            metrics = metrics_factory.create_metrics(method)
            metrics_type = method
        self.metrics = metrics
        self.metrics_type = metrics_type

        if attribute_handlers is None:
            attribute_handlers = attribute_handler_factory.create_attribute_handlers(
                training_data, self.metrics
            )
        self.attribute_handlers = attribute_handlers
        self.items = self.attribute_handlers.items

    def __getitem__(self, key):
        """Return the attribute handler stored under ``key``."""
        return self.attribute_handlers[key]

0 comments on commit 81f45df

Please sign in to comment.