In [7]:
import pandas as pd
# Notebook display: never truncate wide or long DataFrames.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Vocabulary of every option a pipeline can select, grouped by pipeline stage.
feature_definition_columns = [
    "General_Descriptors",
    "Advanced_Descriptors",
    "Graph_based_Signatures",
    "Toxicophores",
    "Fragments",
]

scaling_columns = [
    "normalizer",
    "robust_scaler",
    "standard_scaler",
    "no_scaling",
    "MinMaxScaler",
]

selection_columns = [
    "variance_threshold",
    "select_percentile",
    "selectfpr",
    "selectfwe",
    "selectfdr",
    "select_rfe",
    "no_feature_selection",
]

# NOTE(review): "random_rorest" looks like a typo of "random_forest" — confirm
# against the non-terminal name used in the pipelines before renaming it.
ml_algorithm_columns = [
    "neural_networks",
    "adaboost",
    "decision_tree",
    "extra_tree",
    "random_rorest",
    "extra_trees",
    "gradient_boosting",
    "xgboost",
    "svm",
    "nu_svm",
]

# Column order of the binary vector: feature definitions, scalers, selectors, algorithms.
all_columns = [
    *feature_definition_columns,
    *scaling_columns,
    *selection_columns,
    *ml_algorithm_columns,
]

# Function to transform a single pipeline into a binary presence-absence vector
def transform_single_pipeline_binary(pipeline, feature_defs, scalings, selections, ml_algorithms):
    """
    Transforms a single AutoML pipeline into a binary vector indicating the presence (1)
    or absence (0) of a feature definition, scaler, selection method, or ML algorithm.

    The output columns are built from the four option lists passed in (in that
    order), so the function does not depend on any module-level variable.

    Args:
    - pipeline (dict): Pipeline configuration. Its "<start>" entry is a list whose
      dict items are sections such as {"<feature_scaling>": [...]}; non-dict items
      (e.g. the '#' separators) are skipped.
    - feature_defs (list): Feature definition names; each one found in the
      "<feature_definition>" section is marked.
    - scalings (list): Known scaling method names.
    - selections (list): Known selection method names.
    - ml_algorithms (list): Known ML algorithm names.

    Returns:
    - pd.DataFrame: One row with a column per known option, holding 1 if the
      pipeline uses it and 0 otherwise.

    Raises:
    - KeyError: If `pipeline` has no "<start>" entry.
    """
    feature_defs = list(feature_defs)
    method_columns = list(scalings) + list(selections) + list(ml_algorithms)
    row = {col: 0 for col in feature_defs + method_columns}  # Initialize all columns with 0

    for section in pipeline["<start>"]:
        if not isinstance(section, dict):
            continue  # e.g. '#' separators between sections

        for key, value in section.items():
            col_name = key.replace("<", "").replace(">", "")  # Normalize section names

            # Handle feature definitions (several may be selected at once)
            if col_name == "feature_definition":
                for feature in value:
                    if feature in feature_defs:
                        row[feature] = 1

            # Handle feature scaling, selection, and ML algorithms (first option only)
            elif col_name in ("feature_scaling", "feature_selection", "ml_algorithms"):
                if not value:
                    continue  # Empty section: nothing selected

                chosen = value[0]
                if isinstance(chosen, dict) and chosen:
                    # Nested form, e.g. {'<selectfpr>': ['SelectFpr', ...]}
                    method_name = next(iter(chosen)).replace("<", "").replace(">", "")
                elif isinstance(chosen, str):
                    # Flat form, e.g. 'MinMaxScaler'
                    method_name = chosen
                else:
                    continue

                # Only mark known options so every row keeps the same fixed set
                # of columns; unknown names are ignored instead of adding columns.
                if method_name in method_columns:
                    row[method_name] = 1

    return pd.DataFrame([row])



In [8]:
# Sample pipeline: sections are separated by '#'; scaling uses the flat string form.
example_pipeline = {
    "<start>": [
        {"<feature_definition>": ["Toxicophores"]},
        "#",
        {"<feature_scaling>": ["MinMaxScaler"]},
        "#",
        {
            "<feature_selection>": [
                {
                    "<selectfpr>": [
                        "SelectFpr",
                        {"<value_rand_1>": ["0.40"]},
                        {"<score_function>": ["chi2"]},
                    ]
                }
            ]
        },
        "#",
        {
            "<ml_algorithms>": [
                {
                    "<extra_trees>": [
                        "ExtraTreesClassifier",
                        {"<n_estimators>": ["75"]},
                        {"<criterion>": ["gini"]},
                        {"<max_depth>": ["12"]},
                        {"<min_samples_split>": ["12"]},
                        {"<min_samples_leaf>": ["8"]},
                        {"<max_features>": ["log2"]},
                        {"<class_weight_rf>": ["None"]},
                    ]
                }
            ]
        },
    ]
}

# Encode the sample pipeline as a one-row binary DataFrame
df_example_pipeline_binary = transform_single_pipeline_binary(
    pipeline=example_pipeline,
    feature_defs=feature_definition_columns,
    scalings=scaling_columns,
    selections=selection_columns,
    ml_algorithms=ml_algorithm_columns,
)



In [9]:
# Bare last expression so the notebook renders the one-row binary encoding.
df_example_pipeline_binary

Unnamed: 0,General_Descriptors,Advanced_Descriptors,Graph_based_Signatures,Toxicophores,Fragments,normalizer,robust_scaler,standard_scaler,no_scaling,MinMaxScaler,variance_threshold,select_percentile,selectfpr,selectfwe,selectfdr,select_rfe,no_feature_selection,neural_networks,adaboost,decision_tree,extra_tree,random_rorest,extra_trees,gradient_boosting,xgboost,svm,nu_svm
0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [11]:
import pandas as pd

# Vocabulary of every option a pipeline can select, grouped by pipeline stage.
feature_definition_columns = [
    "General_Descriptors",
    "Advanced_Descriptors",
    "Graph_based_Signatures",
    "Toxicophores",
    "Fragments",
]

scaling_columns = [
    "normalizer",
    "minmax_scaler",
    "maxabs_scaler",
    "robust_scaler",
    "standard_scaler",
    "no_scaling",
]

selection_columns = [
    "variance_threshold",
    "select_percentile",
    "selectfpr",
    "selectfwe",
    "selectfdr",
    "select_rfe",
    "no_feature_selection",
]

# NOTE(review): "random_rorest" looks like a typo of "random_forest" — confirm
# against the non-terminal name used in the pipelines before renaming it.
ml_algorithm_columns = [
    "neural_networks",
    "adaboost",
    "decision_tree",
    "extra_tree",
    "random_rorest",
    "extra_trees",
    "gradient_boosting",
    "xgboost",
    "svm",
    "nu_svm",
]

# Column order of the binary vector: feature definitions, scalers, selectors, algorithms.
all_columns = [
    *feature_definition_columns,
    *scaling_columns,
    *selection_columns,
    *ml_algorithm_columns,
]

# Function to transform a single pipeline into a binary presence-absence vector
def transform_single_pipeline_binary(pipeline, feature_defs, scalings, selections, ml_algorithms):
    """
    Transforms a single AutoML pipeline into a binary vector indicating the presence (1)
    or absence (0) of a feature definition, scaler, selection method, or ML algorithm.

    The output columns are built from the four option lists passed in (in that
    order), so the function does not depend on any module-level variable.

    Args:
    - pipeline (dict): Pipeline configuration. Its "<start>" entry is a list whose
      dict items are sections such as {"<feature_scaling>": [...]}; non-dict items
      (e.g. the '#' separators) are skipped.
    - feature_defs (list): Feature definition names; each one found in the
      "<feature_definition>" section is marked.
    - scalings (list): Known scaling method names.
    - selections (list): Known selection method names.
    - ml_algorithms (list): Known ML algorithm names.

    Returns:
    - pd.DataFrame: One row with a column per known option, holding 1 if the
      pipeline uses it and 0 otherwise. Options not present in the given lists
      are ignored.

    Raises:
    - KeyError: If `pipeline` has no "<start>" entry.
    """
    feature_defs = list(feature_defs)
    method_columns = list(scalings) + list(selections) + list(ml_algorithms)
    row = {col: 0 for col in feature_defs + method_columns}  # Initialize all columns with 0

    for section in pipeline["<start>"]:
        if not isinstance(section, dict):
            continue  # e.g. '#' separators between sections

        for key, value in section.items():
            col_name = key.replace("<", "").replace(">", "")  # Normalize section names

            # Check and set feature definitions (several may be selected at once)
            if col_name == "feature_definition":
                for feature in value:
                    if feature in feature_defs:
                        row[feature] = 1

            # Check and set the first option for scaling, selection, and ML algorithms
            elif col_name in ("feature_scaling", "feature_selection", "ml_algorithms"):
                if not value:
                    continue  # Empty section: nothing selected

                chosen = value[0]
                if isinstance(chosen, dict) and chosen:
                    # Nested form, e.g. {'<minmax_scaler>': ['MinMaxScaler']}
                    method_name = next(iter(chosen)).replace("<", "").replace(">", "")
                elif isinstance(chosen, str):
                    # Flat form: the option is given directly as a string, so
                    # calling .keys() on it would fail.
                    method_name = chosen.replace("<", "").replace(">", "")
                else:
                    continue

                if method_name in method_columns:
                    row[method_name] = 1

    return pd.DataFrame([row])




In [12]:
# Sample pipeline: sections are separated by '#'; every stage uses the nested dict form.
example_pipeline = {
    "<start>": [
        {"<feature_definition>": ["Toxicophores"]},
        "#",
        {"<feature_scaling>": [{"<minmax_scaler>": ["MinMaxScaler"]}]},
        "#",
        {
            "<feature_selection>": [
                {
                    "<selectfdr>": [
                        "SelectFdr",
                        {"<value_rand_1>": ["0.40"]},
                        {"<score_function>": ["chi2"]},
                    ]
                }
            ]
        },
        "#",
        {
            "<ml_algorithms>": [
                {
                    "<extra_trees>": [
                        "ExtraTreesClassifier",
                        {"<n_estimators>": ["75"]},
                        {"<criterion>": ["gini"]},
                        {"<max_depth>": ["12"]},
                        {"<min_samples_split>": ["12"]},
                        {"<min_samples_leaf>": ["8"]},
                        {"<max_features>": ["log2"]},
                        {"<class_weight_rf>": ["None"]},
                    ]
                }
            ]
        },
    ]
}

# Encode the sample pipeline as a one-row binary DataFrame
df_example_pipeline_binary = transform_single_pipeline_binary(
    pipeline=example_pipeline,
    feature_defs=feature_definition_columns,
    scalings=scaling_columns,
    selections=selection_columns,
    ml_algorithms=ml_algorithm_columns,
)

# The resulting DataFrame is displayed in the next cell.


In [13]:
# Bare last expression so the notebook renders the one-row binary encoding.
df_example_pipeline_binary

Unnamed: 0,General_Descriptors,Advanced_Descriptors,Graph_based_Signatures,Toxicophores,Fragments,normalizer,minmax_scaler,maxabs_scaler,robust_scaler,standard_scaler,no_scaling,variance_threshold,select_percentile,selectfpr,selectfwe,selectfdr,select_rfe,no_feature_selection,neural_networks,adaboost,decision_tree,extra_tree,random_rorest,extra_trees,gradient_boosting,xgboost,svm,nu_svm
0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
