# Environment setup

In [None]:
# Clone the chefboost sources from GitHub into a ./chefboost folder under the current working directory.
!git clone https://github.com/serengil/chefboost.git

In [None]:
# Install the released chefboost package with pip.
!pip install chefboost

In [None]:
# Replace the pip-installed chefboost package folder with the version cloned from GitHub.
# NOTE(review): both paths are hard-coded -- the Python 3.10 dist-packages directory and the
# /content clone location (presumably a Google Colab runtime); adjust them for other environments.
!rm -r '/usr/local/lib/python3.10/dist-packages/chefboost/'
!cp -r '/content/chefboost/chefboost' '/usr/local/lib/python3.10/dist-packages/chefboost'

# Functions

## Load precomputed features

In [None]:
import pandas as pd
def load_features(train_path, test_path):
    """
    (Same function in Cluster_feature.ipynb) Read the precomputed training and testing features/embeddings from CSV files

    Both files are comma-separated and must contain a 'Class_label' column, which is converted to strings after loading.

    Args:
        train_path (str): path of the CSV file holding the training corpus features (after dimension reduction). Each row contains the precomputed features and the ground truth class label for one document.
        test_path (str): path of the CSV file holding the testing corpus features (after dimension reduction). Each row contains the precomputed features and the ground truth class label for one document.

    Returns:
        df (Dataframe): training corpus features
        df_test (Dataframe): testing corpus features
    """
    loaded = []
    # The training file is handled first, then the testing file; both go through the same steps.
    for csv_path in (train_path, test_path):
        frame = pd.read_csv(csv_path, sep=',')
        # Store the labels as strings regardless of how pandas parsed them.
        frame['Class_label'] = frame['Class_label'].astype(str)
        loaded.append(frame)

    df, df_test = loaded
    return df, df_test

## DT training and evaluation

In [None]:
import os
from chefboost import Chefboost as chef

def train_dt(output_folder, train_df, dt_algorithm = 'CART', num_cores = 2, max_depth = 5, enableRandomForest=False, num_of_trees=5):
    """
    Train a decision tree (optionally a random forest) with chefboost and save the fitted model

    Note that this function creates output_folder if needed and makes it the current working directory; the working directory is NOT restored afterwards.

    Args:
        output_folder: folder in which the training outputs should be saved. According to the original notes, chefboost creates the subfolders ./outputs/rules/ under this folder and saves the rules of the trained decision tree there as rules.py, rules.json, model.pkl.
        train_df (Dataframe): training corpus features. Each row contains the precomputed features and the ground truth class label for one document (the class label column must be named 'Class_label').
        dt_algorithm (str): training algorithm for the decision tree. Possible values are 'CART', 'C4.5', 'ID3'. Default is 'CART'.
        num_cores (int): number of cpu cores used in parallel during training. Default is 2 for running in Google Colab cloud service as the most compatible option.
        max_depth (int): maximum depth of the decision tree. Default is 5.
        enableRandomForest (bool): when truthy, 'enableRandomForest' and 'num_of_trees' are added to the chefboost configuration. Default is False.
        num_of_trees (int): value passed as 'num_of_trees'; only used when enableRandomForest is truthy. Default is 5.

    Returns:
       model(object): trained model, an object defined by the chefboost library. Note that the rules of the decision tree are saved as if-elif-else python statements during the training process in the output folder where the model is saved.
    """
    # Switch into the output folder before training (presumably because chefboost writes relative to the cwd).
    os.makedirs(output_folder, exist_ok=True)
    os.chdir(output_folder)

    # Settings shared by both modes; the forest-specific keys are only added on request.
    settings = {'algorithm': dt_algorithm, 'num_cores': num_cores, 'max_depth': max_depth}
    if enableRandomForest:
        settings['enableRandomForest'] = enableRandomForest
        settings['num_of_trees'] = num_of_trees

    fitted_model = chef.fit(train_df, config=settings, target_label='Class_label')
    chef.save_model(fitted_model, "model.pkl")
    return fitted_model

In [1]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_df(model, df_test):
    """
    Predict every row of the testing features with the trained decision tree and score the predictions

    The LAST column of df_test is taken as the ground truth label and all preceding columns as the features.

    NOTE(review): f1_score is called with its default arguments (scikit-learn's default is binary averaging) -- confirm that the labels are binary and compatible with that default.

    Args:
        model(object): trained decision tree model
        df_test (Dataframe): testing corpus features. Each row contains the precomputed features and the ground truth class label for one document (the class label is expected in the last column, named 'Class_label').

    Returns:
        acc (float): accuracy of the model
        f1 (float): f1 score of the model
    """
    label_position = len(df_test.columns) - 1

    # Split positionally: everything before the last column is a feature.
    features = df_test.iloc[:, :label_position]
    ground_truth = df_test.iloc[:, label_position]

    # chefboost is queried one row at a time.
    predicted = [
        chef.predict(model, features.iloc[row_idx])
        for row_idx in range(len(df_test.index))
    ]

    return accuracy_score(ground_truth, predicted), f1_score(ground_truth, predicted)

In [None]:
def count_rules(model_py_file_path):
    """
    Count the rules of a trained decision tree from its generated python script

    The count is the number of occurrences of 'return' minus the number of occurrences of 'else' in the file text. Both are plain substring counts, so they also match inside comments or longer words.

    Args:
        model_py_file_path(str): path to the python script where trained decision tree model rules are saved as 'rules.py'

    Returns:
        rule_num (int): number of rules of the given decision tree
    """
    with open(model_py_file_path, 'r') as rules_file:
        source_text = rules_file.read()

    # 'elif' does not contain 'else', so only else branches are subtracted.
    return source_text.count('return') - source_text.count('else')