## GOSTD

In [9]:
from gosdt import ThresholdGuessBinarizer, GOSDTClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('average_expression_t_cluster.csv')
X = data.drop(columns=['cluster'])
y = data['cluster']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Binarize continuous features
#    "ThresholdGuessBinarizer" uses a gradient boosting approach internally
#    to propose thresholds that transform continuous data into 0/1 features.
binarizer = ThresholdGuessBinarizer(
    n_estimators=40,   
    max_depth=1,       
    random_state=42
)
X_train_bin = binarizer.fit_transform(X_train, y_train)
X_test_bin = binarizer.transform(X_test)

# 2.Warm start for GOSDT
#    Train a simple GBDT on the binarized features to get "warm labels".
warm_start_clf = GradientBoostingClassifier(
    n_estimators=40,
    max_depth=1,
    random_state=42
)
warm_start_clf.fit(X_train_bin, y_train)
warm_labels = warm_start_clf.predict(X_train_bin)

# 3. Fit GOSDT
clf = GOSDTClassifier(
    regularization=0.001,
    #allow_small_reg=True,
    time_limit=60,
    depth_budget=50,
    verbose=True
)

clf.fit(X_train_bin, y_train, y_ref=warm_labels)

# 4. Evaluate GOSDT
train_preds = clf.predict(X_train_bin)
test_preds = clf.predict(X_test_bin)

train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)

print(f"GOSDT Training Accuracy: {train_acc:.4f}")
print(f"GOSDT Test Accuracy:     {test_acc:.4f}")


GOSDT Training Accuracy: 0.9643
GOSDT Test Accuracy:     0.7500




Using Configuration: {
    "cancellation": true,
    "depth_budget": 51,
    "diagnostics": false,
    "feature_transform": true,
    "look_ahead": true,
    "model_limit": 1,
    "non_binary": false,
    "profile": "",
    "reference_LB": false,
    "regularization": 0.0357142873108387,
    "rule_list": false,
    "similar_support": true,
    "time_limit": 60,
    "trace": "",
    "tree": "",
    "upperbound": 0.0,
    "verbose": true,
    "worker_limit": 1
}


Initializing Optimization Framework.
Starting Optimization.
Time: 0, Objective: [0.107143, 0.107143], Boundary: 0, Graph Size: 1, Queue Size: 0
Optimization Complete.
Training Duration: 0
Number of Optimizer Iterations: 0
Size of Problem Graph: 1
Objective Boundary: [0.107143, 0.107143]
Models Generated: 1
Loss: 0.0357143
Complexity: 0.0714286


In [10]:
clf.get_model_string()

AttributeError: 'GOSDTClassifier' object has no attribute 'get_model_string'