Trainingdata #14

Merged 14 commits on May 1, 2023
72 changes: 12 additions & 60 deletions binarybeech/attributehandler.py
@@ -101,19 +101,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
x = x[~pd.isna(x)]
unique = np.unique(x)
l = len(unique)

if l / len(x) > 0.2:
return False

dtype = x.values.dtype

if not np.issubdtype(dtype, np.number) and l > 2:
return True

return False
return math.check_nominal(x, max_unique_fraction=0.2, exclude_dichotomous=True)
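
The duplicated dtype and uniqueness logic moves into a shared helper module. A minimal sketch of what math.check_nominal might look like, reconstructed from the removed lines above (the helper itself is not part of this diff, so the exact signature and the exclude_dichotomous handling are assumptions):

    import numpy as np
    import pandas as pd

    def check_nominal(x, max_unique_fraction=0.2, exclude_dichotomous=True):
        # Ignore missing values when counting distinct levels.
        x = x[~pd.isna(x)]
        n_unique = len(np.unique(x))
        # Too many distinct levels relative to the sample size: not nominal.
        if n_unique / len(x) > max_unique_fraction:
            return False
        # Nominal data is non-numeric; numeric columns belong to other handlers.
        if np.issubdtype(x.values.dtype, np.number):
            return False
        # With exclude_dichotomous, the two-level case is left to the
        # dichotomous handler.
        return n_unique > 2 if exclude_dichotomous else n_unique >= 2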


class DichotomousAttributeHandler(AttributeHandlerBase):
@@ -163,19 +151,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
x = x[~pd.isna(x)]
unique = np.unique(x)
l = len(unique)

if l / len(x) > 0.2:
return False

dtype = x.values.dtype

if l == 2:
return True

return False
return math.check_dichotomous(x)
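
A corresponding sketch for math.check_dichotomous, following the removed logic. Note the removed code also rejected columns whose distinct values exceeded 20% of the sample and computed a dtype it never used; whether the helper keeps the fraction guard is not visible in this diff:

    def check_dichotomous(x):
        # Dichotomous means exactly two distinct non-missing values.
        x = x[~pd.isna(x)]
        return len(np.unique(x)) == 2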


class IntervalAttributeHandler(AttributeHandlerBase):
@@ -232,15 +208,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
x = x[~pd.isna(x)]
unique = np.unique(x)
l = len(unique)
dtype = x.values.dtype

if np.issubdtype(dtype, np.number) and l > 2:
return True

return False
return math.check_interval(x)
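
The interval check is an exact extraction of the removed lines; a sketch of math.check_interval under the same naming assumption:

    def check_interval(x):
        # Interval-scaled data is numeric with more than two distinct values.
        x = x[~pd.isna(x)]
        n_unique = len(np.unique(x))
        return np.issubdtype(x.values.dtype, np.number) and n_unique > 2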


class NullAttributeHandler(AttributeHandlerBase):
@@ -312,16 +280,7 @@ def decide(x, threshold):

@staticmethod
def check(x):

x = x[~pd.isna(x)]
unique = np.unique(x)
l = len(unique)
dtype = x.values.dtype

if np.issubdtype(dtype, np.number) and l > 2:
return True

return False
return math.check_interval(x)


class UnsupervisedNominalAttributeHandler(AttributeHandlerBase):
@@ -380,19 +339,7 @@ def decide(x, threshold):

@staticmethod
def check(x):
x = x[~pd.isna(x)]
unique = np.unique(x)
l = len(unique)

if l / len(x) > 0.2:
return False

dtype = x.values.dtype

if not np.issubdtype(dtype, np.number) and l > 2:
return True

return False
return math.check_nominal(x, max_unique_fraction=0.2, exclude_dichotomous=False)


# =========================
@@ -415,7 +362,10 @@ def get_attribute_handler_class(self, arr, group_name="default"):

raise ValueError("no data handler class for this type of data")

def create_attribute_handlers(self, df, y_name, X_names, metrics):
def create_attribute_handlers(self, training_data, metrics):
df = training_data.df
y_name = training_data.y_name
X_names = training_data.X_names
dhc = self.get_attribute_handler_class(
df[y_name], group_name=metrics.attribute_handler_group()
)
@@ -443,4 +393,6 @@ def create_attribute_handlers(self, df, y_name, X_names, metrics):
attribute_handler_factory.register(
"nominal", UnsupervisedNominalAttributeHandler, group_name="unsupervised"
)
attribute_handler_factory.register("null", NullAttributeHandler, group_name="unsupervised")
attribute_handler_factory.register(
"null", NullAttributeHandler, group_name="unsupervised"
)
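
With this refactor the factory consumes a TrainingData bundle instead of three loose arguments. A hypothetical call site, mirroring the TrainingData constructor as it appears later in this diff (the column name is invented for illustration):

    data = TrainingData(df, y_name="species", X_names=None, handle_missings=None)
    handlers = attribute_handler_factory.create_attribute_handlers(data, metrics)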
112 changes: 75 additions & 37 deletions binarybeech/binarybeech.py
@@ -9,29 +9,41 @@
import pandas as pd
import scipy.optimize as opt

from binarybeech.extra import k_fold_split
from binarybeech.datamanager import DataManager
from binarybeech.extra import k_fold_split
from binarybeech.reporter import Reporter
from binarybeech.trainingdata import TrainingData
from binarybeech.tree import Node, Tree


class Model(ABC):
def __init__(
self, df, y_name, X_names, attribute_handlers, metrics_type, handle_missings
self,
training_data,
df,
y_name,
X_names,
attribute_handlers,
metrics_type,
handle_missings,
):
if not y_name:
y_name = "__internal_placeholder_for_y__"
df[y_name] = 0
self.y_name = y_name
if isinstance(training_data, TrainingData):
self.training_data = training_data
elif isinstance(df, pd.DataFrame):
self.training_data = TrainingData(
df, y_name=y_name, X_names=X_names, handle_missings=handle_missings
)
else:
raise TypeError(
"Wrong data type. Either pass training_data as a TrainingData object or df as a pandas DataFrame."
)

if X_names is None:
X_names = list(df.columns)
X_names.remove(self.y_name)
self.X_names = X_names
self.y_name = self.training_data.y_name
self.X_names = self.training_data.X_names

self.dmgr = DataManager(df, y_name, X_names, metrics_type, attribute_handlers)
self.dmgr = DataManager(self.training_data, metrics_type, attribute_handlers)

self.df = self._handle_missings(df, handle_missings)
self.training_data.df = self._handle_missings(df, handle_missings)

def _handle_missings(self, df, mode):
df = df.dropna(subset=[self.y_name])
@@ -58,14 +70,14 @@ def predict(self, df):

def validate(self, df=None):
if df is None:
df = self.df
df = self.training_data.df
y_hat = self.predict(df)
y = df[self.y_name]
return self.dmgr.metrics.validate(y, y_hat)

def goodness_of_fit(self, df=None):
if df is None:
df = self.df
df = self.training_data.df
y_hat = self.predict(df)
y = df[self.y_name]
return self.dmgr.metrics.goodness_of_fit(y, y_hat)
@@ -74,8 +86,9 @@ def goodness_of_fit(self, df=None):
class CART(Model):
def __init__(
self,
df,
y_name,
training_data=None,
df=None,
y_name=None,
X_names=None,
min_leaf_samples=1,
min_split_samples=1,
@@ -85,7 +98,13 @@ def __init__(
attribute_handlers=None,
):
super().__init__(
df, y_name, X_names, attribute_handlers, metrics_type, handle_missings
training_data,
df,
y_name,
X_names,
attribute_handlers,
metrics_type,
handle_missings,
)
self.tree = None
self.leaf_loss_threshold = 1e-12
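
A model can now be built either from a prepared TrainingData object or, as before, from a raw DataFrame; anything else raises the TypeError shown above. A sketch of the intended API (defaults assumed, column name invented):

    # Either pass a prepared TrainingData bundle ...
    data = TrainingData(df, y_name="label", X_names=None, handle_missings=None)
    model = CART(training_data=data)

    # ... or keep the DataFrame route, now via keyword arguments:
    model = CART(df=df, y_name="label")
    model.create_tree()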
@@ -115,19 +134,20 @@ def train(self, k=5, plot=True, slack=1.0):
train decision tree by k-fold cross-validation
"""
# shuffle dataframe
df = self.df.sample(frac=1.0)
df = self.training_data.df.sample(frac=1.0)

# train tree with full dataset
self.create_tree()
pres = self.prune()
beta = self._beta(pres["alpha"])
qual_cv = np.zeros((len(beta), k))
# split df for k-fold cross-validation
sets = k_fold_split(df, k)
self.training_data.split(k=k)
sets = self.training_data.data_sets
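# Folds now come from the TrainingData object itself rather than the
# free k_fold_split function. A plausible shape for TrainingData.split,
# assuming it simply wraps that helper (its implementation is not part
# of this diff):
#
#     def split(self, k=5):
#         self.data_sets = k_fold_split(self.df, k)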
for i, data in enumerate(sets):
c = CART(
data[0],
self.y_name,
df=data[0],
y_name=self.y_name,
X_names=self.X_names,
min_leaf_samples=self.min_leaf_samples,
min_split_samples=self.min_split_samples,
@@ -179,7 +199,7 @@ def _qualities(self, beta, data):

def create_tree(self, leaf_loss_threshold=1e-12):
self.leaf_loss_threshold = leaf_loss_threshold
root = self._node_or_leaf(self.df)
root = self._node_or_leaf(self.training_data.df)
self.tree = Tree(root)
n_leafs = self.tree.leaf_count()
print(f"A tree with {n_leafs} leafs was created")
@@ -223,7 +243,9 @@ def _node_or_leaf(self, df):
)
item.pinfo["N"] = len(df.index)
item.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat)
item.pinfo["R"] = item.pinfo["N"] / len(self.df.index) * item.pinfo["r"]
item.pinfo["R"] = (
item.pinfo["N"] / len(self.training_data.df.index) * item.pinfo["r"]
)
else:
item = self._leaf(y, y_hat)

Expand All @@ -234,7 +256,9 @@ def _leaf(self, y, y_hat):

leaf.pinfo["N"] = y.size
leaf.pinfo["r"] = self.dmgr.metrics.loss_prune(y, y_hat)
leaf.pinfo["R"] = leaf.pinfo["N"] / len(self.df.index) * leaf.pinfo["r"]
leaf.pinfo["R"] = (
leaf.pinfo["N"] / len(self.training_data.df.index) * leaf.pinfo["r"]
)
return leaf

def _loss_best(self, df):
Expand Down Expand Up @@ -339,8 +363,9 @@ def _g2(self, node):
class GradientBoostedTree(Model):
def __init__(
self,
df,
y_name,
training_data=None,
df=None,
y_name=None,
X_names=None,
sample_frac=1,
n_attributes=None,
@@ -352,9 +377,15 @@ def __init__(
attribute_handlers=None,
):
super().__init__(
df, y_name, X_names, attribute_handlers, init_metrics_type, handle_missings
training_data,
df,
y_name,
X_names,
attribute_handlers,
init_metrics_type,
handle_missings,
)
self.df = self.df.copy()
self.df = self.training_data.df.copy()
self.N = len(self.df.index)

self.init_tree = None
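
Construction follows the same dual pattern as CART; a hedged usage example (parameter values illustrative):

    gbt = GradientBoostedTree(df=df, y_name="target", sample_frac=0.8)
    gbt.train(M=50)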
@@ -371,8 +402,8 @@ def _initial_tree(self):

def _initial_tree(self):
c = CART(
self.df,
self.y_name,
df=self.df,
y_name=self.y_name,
X_names=self.X_names,
max_depth=0,
metrics_type=self.init_metrics_type,
@@ -423,8 +454,8 @@ def train(self, M):
)
kwargs = {**kwargs, **self.cart_settings}
c = CART(
df.sample(frac=self.sample_frac, replace=True),
"pseudo_residuals",
df=df.sample(frac=self.sample_frac, replace=True),
y_name="pseudo_residuals",
X_names=X_names,
**kwargs,
)
@@ -466,8 +497,9 @@ def validate(self, df=None):
class RandomForest(Model):
def __init__(
self,
df,
y_name,
training_data=None,
df=None,
y_name=None,
X_names=None,
verbose=False,
sample_frac=1,
@@ -478,9 +510,15 @@ def __init__(
attribute_handlers=None,
):
super().__init__(
df, y_name, X_names, attribute_handlers, metrics_type, handle_missings
training_data,
df,
y_name,
X_names,
attribute_handlers,
metrics_type,
handle_missings,
)
self.df = self.df.copy()
self.df = self.training_data.df.copy()
self.N = len(self.df.index)

self.trees = []
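
RandomForest gets the same constructor treatment; for example (values illustrative):

    rf = RandomForest(df=df, y_name="target", sample_frac=0.8)
    rf.train(M=20)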
@@ -509,7 +547,7 @@ def train(self, M):
attribute_handlers=self.dmgr,
)
kwargs = {**kwargs, **self.cart_settings}
c = CART(df, self.y_name, X_names=X_names, **kwargs)
c = CART(df=df, y_name=self.y_name, X_names=X_names, **kwargs)
c.create_tree()
self.trees.append(c.tree)
self.oob_indices.append(self.df.index.difference(df.index))
16 changes: 9 additions & 7 deletions binarybeech/datamanager.py
@@ -3,26 +3,28 @@
from binarybeech.attributehandler import attribute_handler_factory
from binarybeech.metrics import metrics_factory


class DataManager:
def __init__(self, df, y_name, X_names, method, attribute_handlers):
def __init__(self, training_data, method, attribute_handlers):
self.method = method
self.attribute_handlers = {}

if method is None:
metrics_type, metrics = metrics_factory.from_data(df[self.y_name])
metrics_type, metrics = metrics_factory.from_data(
training_data.df[training_data.y_name]
)
else:
metrics = metrics_factory.create_metrics(method)
metrics_type = method
self.metrics = metrics
self.metrics_type = metrics_type

if attribute_handlers is None:
attribute_handlers = attribute_handler_factory.create_attribute_handlers(
df, y_name, X_names, self.metrics
training_data, self.metrics
)
self.attribute_handlers = attribute_handlers
self.items = self.attribute_handlers.items


def __getitem__(self, key):
return self.attribute_handlers[key]
return self.attribute_handlers[key]
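
DataManager is likewise built from the TrainingData bundle. A hypothetical construction mirroring the call in Model.__init__; passing None for method lets the metrics be inferred from the target column, as the branch above shows:

    data = TrainingData(df, y_name="target", X_names=None, handle_missings=None)
    dmgr = DataManager(data, None, None)  # method and attribute handlers inferred
    handler = dmgr["some_column"]  # __getitem__ delegates to the handlers dict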