In [None]:
pip install pillow reportlab matplotlib --quiet

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys, os

# Drive folder that holds the local pipeline modules imported below.
LIB_DIR = "/content/drive/MyDrive/Automata AI/Sensor Pipeline"

lib_dir_exists = os.path.isdir(LIB_DIR)
print("Exists?", lib_dir_exists)
if not lib_dir_exists:
    # Fail fast with an actionable message instead of the bare traceback that
    # os.listdir() would raise a few lines further down.
    raise FileNotFoundError(
        f"LIB_DIR not found: {LIB_DIR!r}. Is Google Drive mounted and the folder name correct?"
    )

# List the folder once and reuse the result for both diagnostics.
lib_files = os.listdir(LIB_DIR)
print("Files:", lib_files[:20])

# Make the folder importable; guarded so re-running the cell does not add duplicates.
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

print("sys.path contains LIB_DIR?", LIB_DIR in sys.path)

# show python files only
print("PY files:", [f for f in lib_files if f.endswith(".py")])

from automata_preprocessing import run_preprocessing, PreprocessConfig


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Tiny hand-written dataset used to smoke-test the pipeline.
df = pd.DataFrame({
    "reading_text": [
        "temperature 22C humidity 40 comfortable",
        "temperature 31C humidity 75 uncomfortable hot humid",
        "temperature 18C humidity 35 comfortable cool dry",
        "temperature 29C humidity 60 uncomfortable warm",
        "temperature 24C humidity 45 comfortable normal",
        "temperature 34C humidity 80 uncomfortable very hot humid",
        "temperature 20C humidity 30 uncomfortable cold dry",
        "temperature 26C humidity 50 comfortable warm",
    ],
    "label": ["Comfortable","Uncomfortable","Comfortable","Uncomfortable","Comfortable","Uncomfortable","Uncomfortable","Comfortable"]
})

X = df[["reading_text"]]
y = df["label"]

logo_path = "/content/drive/MyDrive/Automata AI/Automata_AI_Logo.png"

cfg = PreprocessConfig(
    report=True,
    report_path="/content/drive/MyDrive/Automata AI/Sensor Pipeline/sensor_preprocessing_report.pdf",
    project_name="Sensor Pipeline – Preprocessing",
    logo_path=logo_path,
    verbose=True
)

# X / y are rebound to the preprocessed versions; `df` keeps the raw data.
X, y = run_preprocessing(X, y, cfg)

# Hold-out fraction for the train/test split. This name was referenced but never
# defined, which raised NameError on a fresh kernel ("Restart & Run All").
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

# Make aliases for meta-feature code
data, target = X, y

print("\n✅ Data cleaning complete!")
# Report the shape of the *preprocessed* features, not of the raw input frame.
# NOTE(review): assumes run_preprocessing returns an array-like; falls back to len().
print(f"Final Shape: {getattr(X, 'shape', (len(X),))}")
print(f"Train/Test Split: {len(X_train)} train / {len(X_test)} test")


In [None]:
def build_meta_entries_for_all_models_from_preprocessed(
    X,
    y,
    task_id: int = 0,
    dataset_id: int = 0,
    dataset_name: str = "unknown",
) -> dict:
    """Build one flat meta-feature record per candidate model for a dataset.

    Every record combines, in the fixed ``KEY_ORDER`` schema:

    * the identifiers passed in (``Task_id``, ``dataset_id``, ``dataset_name``),
    * dataset-level meta-features computed from ``X`` / ``y``,
    * five cheap "landmark" scores computed on a row subsample,
    * a static capability description of the model,
    * performance / resource fields, which are 0.0 placeholders here.

    The computation is best-effort: a step that fails emits a ``RuntimeWarning``
    and falls back to its default value instead of aborting the whole call.
    scikit-learn is imported only inside the steps that need it (PCA and the
    three classifier landmarks), so the remaining features are still produced
    when it is unavailable.

    Args:
        X: Feature table, normally a ``pandas.DataFrame``. Other array-likes are
            accepted but yield mostly default values.
        y: Class labels, one per row of ``X``. Any label type is accepted; labels
            are re-encoded internally to contiguous integer codes.
        task_id: Stored as ``Task_id`` (cast with ``int``).
        dataset_id: Stored as ``dataset_id`` (cast with ``int``).
        dataset_name: Stored as ``dataset_name`` (cast with ``str``).

    Returns:
        dict mapping model name (``"logreg"``, ``"rf"``, ``"xgboost"``,
        ``"cnn1d"``, ``"tiny_rnn"``, ``"mlp"``, ``"tinyconv"``) to a dict whose
        keys follow ``KEY_ORDER``. Missing values become ``"unknown"`` for
        string fields and ``0.0`` otherwise.
    """
    import warnings

    import numpy as np
    import pandas as pd
    from pandas.api.types import is_numeric_dtype

    LANDMARK_KEYS = [
        "landmark_lr_accuracy",
        "landmark_dt_depth3_accuracy",
        "landmark_knn3_accuracy",
        "landmark_random_noise_accuracy",
        "fisher_discriminant_ratio",
    ]

    # Fraction of the landmark subsample held out for scoring the classifiers.
    HOLDOUT_FRACTION = 0.2

    def _warn_fallback(step: str, exc: Exception) -> None:
        """Report a failed best-effort step without interrupting the computation."""
        warnings.warn(
            f"meta-features: {step} failed ({type(exc).__name__}: {exc}); "
            "using default value(s).",
            RuntimeWarning,
            stacklevel=2,
        )

    def _encode_labels(labels) -> np.ndarray:
        """Map arbitrary labels to contiguous integer codes 0..k-1 (sorted order).

        A plain ``astype(int)`` would truncate float labels (0.2 and 0.7 both
        become 0) and keep negative labels, which ``np.bincount`` rejects.
        """
        arr = np.asarray(labels)
        if arr.ndim > 1:
            arr = arr.ravel()
        try:
            _, codes = np.unique(arr, return_inverse=True)
        except TypeError:
            # Mixed, non-orderable label types: compare their string forms instead.
            _, codes = np.unique(arr.astype(str), return_inverse=True)
        return np.asarray(codes, dtype=np.int64).reshape(-1)

    # -------------------------------------------------------------------------
    # Helper: dataset-level meta-features
    # -------------------------------------------------------------------------
    def compute_dataset_features(X_df: pd.DataFrame, y_arr) -> dict:
        """Compute shape, dtype, class-balance and distribution statistics."""
        meta = {}

        # Basic counts
        try:
            n_samples, n_features = X_df.shape
        except Exception as exc:
            _warn_fallback("shape detection", exc)
            n_samples, n_features = None, None
        meta["n_samples"] = int(n_samples) if n_samples is not None else None
        meta["n_features"] = int(n_features) if n_features is not None else None

        # Numeric / categorical / binary features
        numeric_mask = None
        try:
            dtypes = X_df.dtypes
            numeric_mask = [is_numeric_dtype(dt) for dt in dtypes]
            n_numeric = int(np.sum(numeric_mask))
            # A column is "binary" when it has exactly two distinct non-null values.
            n_binary = sum(
                1
                for col in X_df.columns
                if len(pd.Series(X_df[col]).dropna().unique()) == 2
            )
            meta["n_numeric_features"] = n_numeric
            meta["n_categorical_features"] = int(len(dtypes) - n_numeric)
            meta["n_binary_features"] = int(n_binary)
        except Exception as exc:
            _warn_fallback("column type counts", exc)
            numeric_mask = None
            meta["n_numeric_features"] = None
            meta["n_categorical_features"] = None
            meta["n_binary_features"] = None

        # Class stats
        try:
            y_enc = _encode_labels(y_arr)
            classes, counts = np.unique(y_enc, return_counts=True)
            n_classes = len(classes)
            probs = counts / counts.sum()
            meta["n_classes"] = int(n_classes)
            meta["class_balance_std"] = float(probs.std()) if n_classes > 0 else None
            # The epsilon keeps log2 finite; it cannot be hit by observed classes.
            meta["class_entropy"] = (
                float(-(probs * np.log2(probs + 1e-12)).sum()) if n_classes > 0 else None
            )
        except Exception as exc:
            _warn_fallback("class statistics", exc)
            meta["n_classes"] = None
            meta["class_balance_std"] = None
            meta["class_entropy"] = None

        # Numeric feature variance and correlations
        try:
            num_cols = X_df.select_dtypes(include=[np.number])

            mean_var = 0.0
            med_var = 0.0
            mean_corr = 0.0
            max_corr = 0.0

            if num_cols.shape[1] > 0 and num_cols.shape[0] > 1:
                vars_ = num_cols.var(axis=0, ddof=1).values
                if np.isfinite(vars_).sum() > 0:
                    mean_var = float(np.nanmean(vars_))
                    med_var = float(np.nanmedian(vars_))

                # Only the first 50 numeric columns are correlated, to bound the cost.
                max_corr_features = min(num_cols.shape[1], 50)
                corr = num_cols.iloc[:, :max_corr_features].corr().abs().values
                upper = corr[np.triu_indices_from(corr, k=1)]
                finite_upper = upper[np.isfinite(upper)]
                if finite_upper.size > 0:
                    mean_corr = float(finite_upper.mean())
                    max_corr = float(finite_upper.max())

            meta["mean_feature_variance"] = mean_var
            meta["median_feature_variance"] = med_var
            meta["mean_corr_abs"] = mean_corr
            meta["max_corr_abs"] = max_corr
        except Exception as exc:
            _warn_fallback("variance / correlation statistics", exc)
            meta["mean_feature_variance"] = 0.0
            meta["median_feature_variance"] = 0.0
            meta["mean_corr_abs"] = 0.0
            meta["max_corr_abs"] = 0.0

        # Extra dataset-level features
        try:
            num_cols = X_df.select_dtypes(include=[np.number])
            has_numeric = num_cols.shape[1] > 0

            # 1) feature_skewness_mean
            feature_skewness_mean = 0.0
            if has_numeric:
                skews = num_cols.skew(axis=0, skipna=True)
                skews = skews.replace([np.inf, -np.inf], np.nan)
                if not skews.isna().all():
                    feature_skewness_mean = float(skews.mean(skipna=True))
            meta["feature_skewness_mean"] = feature_skewness_mean

            # 2) feature_kurtosis_mean
            feature_kurtosis_mean = 0.0
            if has_numeric:
                kurts = num_cols.kurt(axis=0, skipna=True)
                kurts = kurts.replace([np.inf, -np.inf], np.nan)
                if not kurts.isna().all():
                    feature_kurtosis_mean = float(kurts.mean(skipna=True))
            meta["feature_kurtosis_mean"] = feature_kurtosis_mean

            # 3) missing_percentage (a fraction in [0, 1] despite the name)
            missing_percentage = 0.0
            if (
                n_samples is not None
                and n_features is not None
                and n_samples > 0
                and n_features > 0
            ):
                total_cells = float(n_samples * n_features)
                missing_percentage = float(X_df.isna().sum().sum()) / total_cells
            meta["missing_percentage"] = float(missing_percentage)

            # 4) avg_cardinality_categorical
            avg_card = 0.0
            if numeric_mask is not None:
                cards = []
                for col, isnum in zip(X_df.columns, numeric_mask):
                    if isnum:
                        continue
                    try:
                        cards.append(X_df[col].nunique(dropna=True))
                    except Exception:
                        # e.g. unhashable cell values; skip just this column
                        continue
                if cards:
                    avg_card = float(np.mean(cards))
            meta["avg_cardinality_categorical"] = avg_card

            # 5) complexity_ratio (features per sample)
            complexity_ratio = 0.0
            if n_samples is not None and n_features is not None and n_samples > 0:
                complexity_ratio = float(n_features) / float(n_samples)
            meta["complexity_ratio"] = complexity_ratio

            # 6) intrinsic_dim_estimate: components needed for 95% explained variance
            intrinsic_dim = 0.0
            try:
                if num_cols.shape[1] >= 2 and num_cols.shape[0] >= 5:
                    from sklearn.decomposition import PCA

                    # copy=True: the imputation below writes in place and must
                    # never touch the caller's data through a shared buffer.
                    X_pca = num_cols.to_numpy(dtype=np.float32, copy=True)
                    col_means = np.nanmean(X_pca, axis=0)
                    inds = np.where(np.isnan(X_pca))
                    if inds[0].size > 0:
                        X_pca[inds] = np.take(col_means, inds[1])

                    n_components = min(X_pca.shape[0], X_pca.shape[1])
                    if n_components >= 1:
                        pca = PCA(n_components=n_components)
                        pca.fit(X_pca)
                        cumsum = np.cumsum(pca.explained_variance_ratio_)
                        k = int(np.searchsorted(cumsum, 0.95) + 1)
                        intrinsic_dim = float(max(1, min(k, n_components)))
            except Exception as exc:
                _warn_fallback("intrinsic dimension estimate", exc)
                intrinsic_dim = 0.0

            meta["intrinsic_dim_estimate"] = intrinsic_dim
        except Exception as exc:
            _warn_fallback("extra dataset features", exc)
            # Keep whatever was computed before the failure; default the rest.
            for key in (
                "feature_skewness_mean",
                "feature_kurtosis_mean",
                "missing_percentage",
                "avg_cardinality_categorical",
                "complexity_ratio",
                "intrinsic_dim_estimate",
            ):
                meta.setdefault(key, 0.0)

        return meta

    def _safe_stratify_or_none(y_enc: np.ndarray, test_fraction: float = HOLDOUT_FRACTION):
        """Return ``y_enc`` if a stratified split is feasible, otherwise ``None``.

        Stratification needs at least two samples of every class, and both the
        train and the test part must be able to hold one sample per class.
        """
        if y_enc.size == 0:
            return None
        _, counts = np.unique(y_enc, return_counts=True)
        n_test = int(np.ceil(test_fraction * y_enc.size))
        n_train = y_enc.size - n_test
        if counts.min() < 2 or n_test < counts.size or n_train < counts.size:
            return None
        return y_enc

    # -------------------------------------------------------------------------
    # Helper: compute landmarks
    # -------------------------------------------------------------------------
    def compute_landmarks(X_train: pd.DataFrame, y_train) -> dict:
        """Score cheap baseline learners on a row subsample of the data.

        Returns a dict with every key of ``LANDMARK_KEYS``; a landmark that cannot
        be computed keeps its default of 0.0.
        """
        landmarks = {key: 0.0 for key in LANDMARK_KEYS}

        y_arr = _encode_labels(y_train)

        if X_train.shape[0] < 5 or len(np.unique(y_arr)) < 2:
            return landmarks

        LANDMARK_SUBSAMPLE_FRACTION = 0.15
        LANDMARK_MAX_ROWS = 1000
        LANDMARK_MIN_ROWS = 20

        if isinstance(X_train, pd.DataFrame):
            X_num = X_train.select_dtypes(include=[np.number]).copy()

            if X_num.shape[1] == 0:
                # No numeric column at all: fall back to integer-coding every column.
                X_num = pd.DataFrame(index=X_train.index)
                for col in X_train.columns:
                    s = X_train[col]
                    if is_numeric_dtype(s):
                        X_num[col] = pd.to_numeric(s, errors="coerce").fillna(0)
                    else:
                        # Replace missing values BEFORE the string cast; afterwards
                        # NaN is the literal "nan" and there is nothing left to fill.
                        filled = s.astype(object).where(s.notna(), "__NA__").astype(str)
                        _, codes = np.unique(filled.to_numpy(), return_inverse=True)
                        X_num[col] = np.asarray(codes).reshape(-1)
        else:
            X_num = pd.DataFrame(X_train)

        if X_num.shape[1] == 0:
            return landmarks

        # Work on a float32 matrix; non-finite cells are imputed with the column
        # mean (or 0.0 when the whole column is non-finite).
        X_num = X_num.replace([np.inf, -np.inf], np.nan)
        vals = X_num.to_numpy(dtype=np.float32, copy=True)
        bad = ~np.isfinite(vals)
        if bad.any():
            vals[bad] = np.nan
            col_means = np.nanmean(vals, axis=0)
            col_means = np.where(np.isnan(col_means), 0.0, col_means)
            inds = np.where(np.isnan(vals))
            vals[inds] = np.take(col_means, inds[1])
        X_num = pd.DataFrame(vals, columns=X_num.columns)

        # --- SUBSAMPLE rows
        n_rows = X_num.shape[0]
        RNG = np.random.RandomState(42)

        n_sub = min(LANDMARK_MAX_ROWS, int(LANDMARK_SUBSAMPLE_FRACTION * n_rows))
        n_sub = max(n_sub, LANDMARK_MIN_ROWS)
        n_sub = min(n_sub, n_rows)

        idx = RNG.choice(n_rows, size=n_sub, replace=False)
        X_num_sub = X_num.iloc[idx].reset_index(drop=True).astype(np.float32)
        y_sub = y_arr[idx]

        if len(np.unique(y_sub)) < 2:
            return landmarks

        strat_labels = _safe_stratify_or_none(y_sub)

        # KNN uses at most 30 randomly chosen features. The draw is made here,
        # unconditionally, so the RNG stream seen by the random-noise baseline
        # below does not depend on whether the classifiers could be trained.
        X_knn = X_num_sub
        if X_knn.shape[1] > 30:
            cols = RNG.choice(X_knn.shape[1], size=30, replace=False)
            X_knn = X_knn.iloc[:, cols]

        # 1-3) Hold-out accuracy of three cheap classifiers. All three share the
        # same split, so its row indices are computed once.
        split = None
        try:
            from sklearn.linear_model import LogisticRegression
            from sklearn.metrics import accuracy_score
            from sklearn.model_selection import train_test_split
            from sklearn.neighbors import KNeighborsClassifier
            from sklearn.tree import DecisionTreeClassifier

            split = train_test_split(
                np.arange(n_sub),
                test_size=HOLDOUT_FRACTION,
                random_state=42,
                stratify=strat_labels,
            )
        except Exception as exc:
            _warn_fallback("landmark hold-out split", exc)

        if split is not None:
            train_idx, test_idx = split

            def holdout_accuracy(estimator, features) -> float:
                """Fit on the train rows and return accuracy on the held-out rows."""
                estimator.fit(features.iloc[train_idx], y_sub[train_idx])
                predicted = estimator.predict(features.iloc[test_idx])
                return float(accuracy_score(y_sub[test_idx], predicted))

            classifier_landmarks = (
                (
                    "landmark_lr_accuracy",
                    LogisticRegression(max_iter=50, C=0.1, solver="lbfgs"),
                    X_num_sub,
                ),
                (
                    "landmark_dt_depth3_accuracy",
                    DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=42),
                    X_num_sub,
                ),
                (
                    "landmark_knn3_accuracy",
                    KNeighborsClassifier(n_neighbors=3),
                    X_knn,
                ),
            )
            for key, estimator, features in classifier_landmarks:
                try:
                    landmarks[key] = holdout_accuracy(estimator, features)
                except Exception as exc:
                    _warn_fallback(key, exc)

        # 4) Random noise baseline: guess labels from the class prior
        try:
            counts = np.bincount(y_sub)
            probs = counts / counts.sum()
            preds = RNG.choice(np.arange(len(probs)), size=len(y_sub), p=probs)
            landmarks["landmark_random_noise_accuracy"] = float(np.mean(preds == y_sub))
        except Exception as exc:
            _warn_fallback("landmark_random_noise_accuracy", exc)

        # 5) Fisher Discriminant Ratio (on subsample): between-class over
        # within-class scatter, averaged over features with non-zero denominator.
        try:
            class_codes = np.unique(y_sub)
            fdr_values = []
            for j in range(X_num_sub.shape[1]):
                xj = X_num_sub.iloc[:, j].values.astype(float)
                mu = xj.mean()
                num = 0.0
                den = 0.0
                for c in class_codes:
                    xc = xj[y_sub == c]
                    nc = xc.size
                    if nc == 0:
                        continue
                    var_c = xc.var(ddof=1) if nc > 1 else 0.0
                    num += nc * (xc.mean() - mu) ** 2
                    den += nc * var_c
                if den > 0:
                    fdr_values.append(num / (den + 1e-12))
            if fdr_values:
                landmarks["fisher_discriminant_ratio"] = float(np.mean(fdr_values))
        except Exception as exc:
            _warn_fallback("fisher_discriminant_ratio", exc)

        return landmarks

    # -------------------------------------------------------------------------
    # schema
    # -------------------------------------------------------------------------
    KEY_ORDER = [
        "Task_id",
        "dataset_id",
        "dataset_name",
        "n_samples",
        "n_features",
        "n_numeric_features",
        "n_categorical_features",
        "n_binary_features",
        "n_classes",
        "class_balance_std",
        "class_entropy",
        "mean_feature_variance",
        "median_feature_variance",
        "mean_corr_abs",
        "max_corr_abs",
        "feature_skewness_mean",
        "feature_kurtosis_mean",
        "missing_percentage",
        "avg_cardinality_categorical",
        "complexity_ratio",
        "intrinsic_dim_estimate",
        "landmark_lr_accuracy",
        "landmark_dt_depth3_accuracy",
        "landmark_knn3_accuracy",
        "landmark_random_noise_accuracy",
        "fisher_discriminant_ratio",
        "model_id",
        "model_name",
        "model_family",
        "is_deep_learning",
        "is_tree_based",
        "is_linear",
        "parameterization_type",
        "complexity_training_big_o",
        "complexity_inference_big_o",
        "is_probabilistic",
        "is_ensemble_model",
        "regularization_supported",
        "supports_multiclass_natively",
        "supports_online_learning",
        "supports_multiple_trees",
        "tree_growth_strategy",
        "default_max_depth",
        "supports_pruning",
        "splitting_criterion",
        "architecture_type",
        "supports_dropout",
        "supports_batchnorm",
        "default_activation",
        "supports_cuda_acceleration",
        "supports_non_linearity",
        "supports_categorical_directly",
        "supports_missing_values",
        "supports_gpu",
        "n_estimators",
        "avg_tree_depth",
        "max_tree_depth",
        "n_leaves_mean",
        "n_layers",
        "hidden_units_mean",
        "dropout_rate_mean",
        "activation_type",
        "batch_size",
        "epochs",
        "accuracy",
        "f1_macro",
        "precision_macro",
        "trained_model_size_kb",
        "inference_speed_ms",
        "static_usage_ram_kb",
        "dynamic_usage_ram_kb",
        "full_ram_usage_kb",
        "model_n_parameters",
    ]

    # Fields whose "missing" value is the string "unknown" rather than 0.0.
    STRING_KEYS = {
        "dataset_name",
        "model_name",
        "model_family",
        "parameterization_type",
        "complexity_training_big_o",
        "complexity_inference_big_o",
        "regularization_supported",
        "tree_growth_strategy",
        "splitting_criterion",
        "architecture_type",
        "default_activation",
        "activation_type",
    }

    # -------------------------------------------------------------------------
    # Compute dataset meta + landmarks
    # -------------------------------------------------------------------------
    try:
        ds_meta = compute_dataset_features(X, y)
    except Exception as exc:
        _warn_fallback("dataset meta-features", exc)
        ds_meta = {}

    try:
        lm = compute_landmarks(X, y)
    except Exception as exc:
        _warn_fallback("landmarks", exc)
        lm = {}

    # Ensure all landmark keys exist
    for k in LANDMARK_KEYS:
        if lm.get(k) is None:
            lm[k] = 0.0

    meta_common = {**ds_meta, **lm}

    # IDs from arguments
    base_ids = {
        "Task_id": int(task_id),
        "dataset_id": int(dataset_id),
        "dataset_name": str(dataset_name),
    }

    # Performance/resource placeholders (nothing is trained or measured here)
    perf_placeholders = {
        "accuracy": 0.0,
        "f1_macro": 0.0,
        "precision_macro": 0.0,
        "trained_model_size_kb": 0.0,
        "inference_speed_ms": 0.0,
        "static_usage_ram_kb": 0.0,
        "dynamic_usage_ram_kb": 0.0,
        "full_ram_usage_kb": 0.0,
        "model_n_parameters": 0.0,
    }

    MODEL_IDS = {
        "logreg": 1,
        "rf": 2,
        "xgboost": 3,
        "cnn1d": 4,
        "tiny_rnn": 5,
        "mlp": 6,
        "tinyconv": 7,
    }

    DL_MAX_EPOCHS = 20
    DL_BATCH_SIZE = 128

    # -------------------------------------------------------------------------
    # Static model capabilities: shared defaults + per-model overrides
    # -------------------------------------------------------------------------
    CAPABILITY_DEFAULTS = {
        "is_deep_learning": False,
        "is_tree_based": False,
        "is_linear": False,
        "parameterization_type": "linear-in-features",
        "is_probabilistic": True,
        "is_ensemble_model": False,
        "regularization_supported": "L2",
        "supports_multiclass_natively": True,
        "supports_online_learning": False,
        "supports_multiple_trees": False,
        "tree_growth_strategy": "none",
        "default_max_depth": 0,
        "supports_pruning": False,
        "splitting_criterion": "none",
        "architecture_type": "none",
        "supports_dropout": False,
        "supports_batchnorm": False,
        "default_activation": "none",
        "supports_cuda_acceleration": False,
        "supports_non_linearity": False,
        "supports_categorical_directly": False,
        "supports_missing_values": False,
        "supports_gpu": False,
        "n_estimators": 0,
        "avg_tree_depth": 0.0,
        "max_tree_depth": 0,
        "n_leaves_mean": 0.0,
        "n_layers": 0,
        "hidden_units_mean": 0.0,
        "dropout_rate_mean": 0.0,
        "activation_type": "none",
        "batch_size": 0,
        "epochs": 0,
    }

    # Traits shared by the two tree ensembles.
    TREE_ENSEMBLE_TRAITS = {
        "is_tree_based": True,
        "parameterization_type": "fixed-per-estimator",
        "complexity_training_big_o": "O(n · log n · trees)",
        "complexity_inference_big_o": "O(trees · depth)",
        "is_ensemble_model": True,
        "supports_multiple_trees": True,
        "supports_non_linearity": True,
        "n_estimators": 200,
    }

    # Traits shared by all neural models.
    DEEP_LEARNING_TRAITS = {
        "is_deep_learning": True,
        "supports_cuda_acceleration": True,
        "supports_non_linearity": True,
        "supports_gpu": True,
        "batch_size": DL_BATCH_SIZE,
        "epochs": DL_MAX_EPOCHS,
    }

    MODEL_OVERRIDES = {
        "logreg": {
            "is_linear": True,
            "model_family": "Linear",
            "complexity_training_big_o": "O(n · d)",
            "complexity_inference_big_o": "O(d)",
        },
        "rf": {
            **TREE_ENSEMBLE_TRAITS,
            "model_family": "TreeEnsemble",
            "regularization_supported": "None",
            "tree_growth_strategy": "depth-based",
            "splitting_criterion": "gini",
        },
        "xgboost": {
            **TREE_ENSEMBLE_TRAITS,
            "model_family": "BoostedTrees",
            "regularization_supported": "L1/L2",
            "tree_growth_strategy": "leaf-based",
            "default_max_depth": 6,
            "supports_pruning": True,
            "splitting_criterion": "gain",
            "supports_cuda_acceleration": True,
            "supports_missing_values": True,
            "supports_gpu": True,
            "avg_tree_depth": 6.0,
            "max_tree_depth": 6,
        },
        "cnn1d": {
            **DEEP_LEARNING_TRAITS,
            "model_family": "CNN",
            "complexity_training_big_o": "O(n · d · epochs)",
            "complexity_inference_big_o": "O(d · filters)",
            "architecture_type": "CNN1D",
            "default_activation": "relu",
            "n_layers": 2,
            "hidden_units_mean": 8.0,
            "activation_type": "relu",
        },
        "tiny_rnn": {
            **DEEP_LEARNING_TRAITS,
            "model_family": "RNN",
            "complexity_training_big_o": "O(n · d · hidden_dim · epochs)",
            "complexity_inference_big_o": "O(d · hidden_dim)",
            "architecture_type": "RNN-GRU",
            "default_activation": "tanh",
            "n_layers": 2,
            "hidden_units_mean": 32.0,
            "activation_type": "tanh",
        },
        "mlp": {
            **DEEP_LEARNING_TRAITS,
            "model_family": "MLP",
            "complexity_training_big_o": "O(n · Σ(layer_dims) · epochs)",
            "complexity_inference_big_o": "O(Σ(layer_dims))",
            "architecture_type": "MLP",
            "default_activation": "relu",
            "n_layers": 3,
            "hidden_units_mean": (128.0 + 64.0) / 2.0,
            "activation_type": "relu",
        },
        "tinyconv": {
            **DEEP_LEARNING_TRAITS,
            "model_family": "CNN",
            "complexity_training_big_o": "O(n · d · epochs)",
            "complexity_inference_big_o": "O(d · filters)",
            "architecture_type": "CNN1D",
            "default_activation": "relu",
            "n_layers": 2,
            "hidden_units_mean": 4.0,
            "activation_type": "relu",
        },
    }

    # -------------------------------------------------------------------------
    # Assemble one ordered record per model
    # -------------------------------------------------------------------------
    entries_by_model = {}
    for model_name, overrides in MODEL_OVERRIDES.items():
        caps = {
            "model_id": MODEL_IDS[model_name],
            "model_name": model_name,
            **CAPABILITY_DEFAULTS,
            **overrides,
        }

        # Later sources win: ids < dataset meta / landmarks < model capabilities.
        combined = {**base_ids, **meta_common, **caps}
        for k, v in perf_placeholders.items():
            combined.setdefault(k, v)

        ordered_entry = {}
        for key in KEY_ORDER:
            val = combined.get(key, None)
            if val is None:
                val = "unknown" if key in STRING_KEYS else 0.0
            ordered_entry[key] = val

        entries_by_model[model_name] = ordered_entry

    return entries_by_model


In [None]:
# Build the per-model meta-feature records for the preprocessed data; the ids and
# the dataset name keep their defaults (0, 0, "unknown").
candidates = build_meta_entries_for_all_models_from_preprocessed(X=data, y=target)
candidates

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn

# You must define these somewhere in your code:
# MAX_THREADS = ...
# RANDOM_SEED = ...


def make_logreg():
    """Create the baseline logistic-regression estimator (L-BFGS, up to 800 iterations)."""
    settings = {
        "solver": "lbfgs",
        "max_iter": 800,
    }
    return LogisticRegression(**settings)

def make_rf():
    """Create the baseline random forest: 200 unbounded-depth trees, fixed seed 42."""
    settings = {
        "n_estimators": 200,
        "max_depth": None,   # grow each tree until leaves are pure / min-split reached
        "random_state": 42,
    }
    return RandomForestClassifier(**settings)

def make_xgboost():
    """Create the baseline XGBoost classifier, or return None when xgboost is not installed.

    Callers must handle the ``None`` return (optional dependency missing).
    """
    try:
        import xgboost as xgb
    except ImportError:
        return None

    # IMPORTANT: do NOT hardcode objective/num_class here.
    # We'll set objective/num_class later in train_and_eval_model based on n_classes.
    hyperparams = dict(
        tree_method="hist",
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",
    )
    return xgb.XGBClassifier(**hyperparams)

# ---------- Simple PyTorch models ----------

class MLPNet(nn.Module):
    """Fully connected classifier: input_dim -> 128 -> 64 -> n_classes, ReLU between layers.

    ``forward`` returns the raw output of the final linear layer (no softmax applied).
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        widths = (input_dim, 128, 64)
        stack = []
        # Hidden layers: Linear followed by ReLU, built in order so the
        # Sequential indices (0, 2, 4 for the Linear layers) stay the same.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], n_classes))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.net(x)
        return logits


class TinyConv1DNet(nn.Module):
    """
    Main 1D-CNN model (for 'cnn1d').

    The feature vector is treated as a single-channel sequence of length D:
    Conv1d(1 -> 8, k=3, pad=1) -> ReLU -> AdaptiveMaxPool1d(16) -> Linear(128 -> n_classes).
    ``input_dim`` is accepted for a uniform factory signature but is not needed,
    because the adaptive pooling fixes the flattened size.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        channels, pooled_len = 8, 16
        self.conv = nn.Conv1d(1, channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(pooled_len)
        self.fc = nn.Linear(channels * pooled_len, n_classes)

    def forward(self, x):
        # [B, D] -> [B, 1, D]: add the channel axis expected by Conv1d
        seq = x.unsqueeze(1)
        # [B, 1, D] -> [B, 8, D] -> [B, 8, 16]
        pooled = self.pool(self.relu(self.conv(seq)))
        # [B, 8, 16] -> [B, 128]
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


class TinyConvNet(nn.Module):
    """
    Separate smaller conv model for 'tinyconv' (not just an alias of cnn1d).

    Conv1d(1 -> 4, k=3, pad=1) -> ReLU -> AdaptiveMaxPool1d(8) -> Linear(32 -> n_classes).
    ``input_dim`` is accepted for a uniform factory signature but is not needed,
    because the adaptive pooling fixes the flattened size.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        # even smaller than cnn1d: fewer channels, smaller pooled length
        channels, pooled_len = 4, 8
        self.conv = nn.Conv1d(1, channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(pooled_len)
        self.fc = nn.Linear(channels * pooled_len, n_classes)

    def forward(self, x):
        # [B, D] -> [B, 1, D]: add the channel axis expected by Conv1d
        seq = x.unsqueeze(1)
        # [B, 1, D] -> [B, 4, D] -> [B, 4, 8]
        pooled = self.pool(self.relu(self.conv(seq)))
        # [B, 4, 8] -> [B, 32]
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


class TinyRNNNet(nn.Module):
    """Single-layer GRU classifier that reads the feature vector as a sequence.

    Each of the D features is one timestep of dimension 1; the final hidden
    state (size 32) is fed to a linear layer producing ``n_classes`` outputs.
    ``input_dim`` is accepted for a uniform factory signature but is not used.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        self.hidden_dim = 32
        self.rnn = nn.GRU(
            input_size=1,              # one scalar feature per timestep
            hidden_size=self.hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        # [B, D] -> [B, D, 1]: D timesteps, one value each
        steps = x.unsqueeze(-1)
        _, hidden = self.rnn(steps)
        # hidden is [num_layers, B, H]; the last layer's final state feeds the head
        last_state = hidden[-1]
        return self.fc(last_state)


def make_cnn1d(input_dim, n_classes):
    """Factory for the 'cnn1d' entry: build a fresh TinyConv1DNet."""
    return TinyConv1DNet(input_dim, n_classes)


def make_tinyconv(input_dim, n_classes):
    """Factory for the 'tinyconv' entry: build a fresh TinyConvNet.

    This is a genuinely smaller conv model, separate from cnn1d.
    """
    return TinyConvNet(input_dim, n_classes)


def make_tiny_rnn(input_dim, n_classes):
    """Factory for the 'tiny_rnn' entry: build a fresh TinyRNNNet."""
    return TinyRNNNet(input_dim, n_classes)


def make_mlp(input_dim, n_classes):
    """Factory for the 'mlp' entry: build a fresh MLPNet."""
    return MLPNet(input_dim, n_classes)


# Registry of candidate models: name -> (family, factory).
# "classic" factories take no arguments; "deep" factories take (input_dim, n_classes).
MODELS = dict(
    logreg=("classic", make_logreg),
    rf=("classic", make_rf),
    xgboost=("classic", make_xgboost),
    cnn1d=("deep", make_cnn1d),
    tiny_rnn=("deep", make_tiny_rnn),
    mlp=("deep", make_mlp),
    tinyconv=("deep", make_tinyconv),
)


In [None]:
import pickle
import tempfile
import os
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyperparameter search spaces

# Grid-search spaces for the classic estimators, keyed by model name.
# Every value list is tried exhaustively by GridSearchCV.
CLASSIC_PARAM_GRIDS = dict(
    logreg=dict(
        C=[0.01, 0.1, 1.0, 10.0],
        penalty=["l2"],
        solver=["lbfgs"],
        max_iter=[200, 500],
    ),
    rf=dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    xgboost=dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.2],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
    ),
)

# Search space for deep models
# Candidate training configurations for the deep models, keyed by model name.
# Each config is tried once; the list order is the order in which they are tried.
DEEP_PARAM_CONFIGS = dict(
    cnn1d=[
        dict(lr=1e-3, batch_size=32, epochs=15),
        dict(lr=1e-4, batch_size=64, epochs=20),
    ],
    tiny_rnn=[
        dict(lr=1e-3, batch_size=32, epochs=20),
        dict(lr=5e-4, batch_size=64, epochs=25),
    ],
    mlp=[
        dict(lr=1e-3, batch_size=32, epochs=20),
        dict(lr=1e-4, batch_size=64, epochs=30),
    ],
    tinyconv=[
        dict(lr=1e-3, batch_size=32, epochs=20),
        dict(lr=5e-4, batch_size=64, epochs=25),
    ],
)

# Utility: model size

def get_model_size_mb(model, is_deep=False):
    """Return the serialized size of ``model`` in megabytes (1 MB = 1024**2 bytes).

    Parameters
    ----------
    model : object
        A classic (picklable) estimator or a deep model.
    is_deep : bool, default False
        ``False``: size is the length of ``pickle.dumps(model)``.
        ``True``: PyTorch modules (anything exposing ``state_dict``) are sized
        by serializing their ``state_dict`` with ``torch.save`` into memory.
        Any other object falls back to the Keras-style
        ``model.save(path, include_optimizer=True)`` temp-file measurement.

    Returns
    -------
    float
        Serialized size in MB.
    """
    if not is_deep:
        # sklearn / classic
        size_bytes = len(pickle.dumps(model))
    elif hasattr(model, "state_dict"):
        # PyTorch: the deep models registered in this notebook are nn.Module
        # instances, which have no Keras-style ``.save`` method.
        import io  # local import: only this branch needs it

        buffer = io.BytesIO()
        torch.save(model.state_dict(), buffer)
        size_bytes = buffer.getbuffer().nbytes
    else:
        # Keras-style fallback: write to a temp file and measure it.
        with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            model.save(tmp_path, include_optimizer=True)
            size_bytes = os.path.getsize(tmp_path)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return size_bytes / (1024 ** 2)  # MB

# Classic model training + tuning

def train_and_tune_classic(model_name, make_fn, X_train, y_train, X_val, y_val):
    """Grid-search a classic estimator and score the best one on the validation set.

    Parameters
    ----------
    model_name : str
        Key into ``CLASSIC_PARAM_GRIDS``.
    make_fn : callable
        Zero-argument factory returning an unfitted estimator, or ``None`` when
        the estimator's optional dependency is missing.
    X_train, y_train : training features / labels used for 3-fold grid search.
    X_val, y_val : held-out features / labels used for the reported accuracy.

    Returns
    -------
    (best_model, val_acc, model_size_mb)

    Raises
    ------
    ValueError
        If no parameter grid is defined for ``model_name``.
    RuntimeError
        If ``make_fn`` returns ``None`` (model unavailable).
    """
    if model_name not in CLASSIC_PARAM_GRIDS:
        raise ValueError(f"No param grid defined for classic model: {model_name}")

    base_model = make_fn()
    if base_model is None:
        # e.g. make_xgboost() returns None when xgboost cannot be imported;
        # fail here with a clear message instead of deep inside GridSearchCV.
        raise RuntimeError(
            f"Model factory for '{model_name}' returned None; "
            "its optional dependency is probably not installed."
        )
    param_grid = CLASSIC_PARAM_GRIDS[model_name]

    grid = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        scoring="accuracy",
        cv=3
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_

    # Evaluate on validation set
    y_pred = best_model.predict(X_val)
    val_acc = accuracy_score(y_val, y_pred)

    model_size_mb = get_model_size_mb(best_model, is_deep=False)

    print(f"[{model_name}] best params: {grid.best_params_}")
    print(f"[{model_name}] validation accuracy: {val_acc:.4f}")
    print(f"[{model_name}] serialized size: {model_size_mb:.4f} MB")

    return best_model, val_acc, model_size_mb

# Deep model training + tuning

def _to_float_tensor(X):
    """Convert a dense or scipy-sparse 2-D feature container to a float32 tensor."""
    if hasattr(X, "toarray"):
        # scipy sparse matrices must be densified before conversion
        X = X.toarray()
    arr = np.asarray(X, dtype=np.float32)
    if arr.ndim == 1:
        arr = arr.reshape(-1, 1)
    # torch.tensor copies, so read-only / non-contiguous inputs are safe
    return torch.tensor(arr, dtype=torch.float32)


def _fit_torch_classifier(model, X_tr, y_tr, X_va, y_va, lr, batch_size, epochs):
    """Train ``model`` with Adam + cross-entropy and return its best validation accuracy.

    Accuracy is measured after every epoch; the weights of the best-scoring
    epoch are restored before returning, so the model matches the reported score.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    n_train = X_tr.shape[0]
    n_val = X_va.shape[0]

    best_epoch_acc = 0.0
    best_state = None

    for _ in range(epochs):
        model.train()
        order = torch.randperm(n_train)  # reshuffle every epoch
        for start in range(0, n_train, batch_size):
            idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_tr[idx]), y_tr[idx])
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            if n_val > 0:
                preds = model(X_va).argmax(dim=1)
                epoch_acc = (preds == y_va).sum().item() / n_val
            else:
                epoch_acc = 0.0

        if best_state is None or epoch_acc > best_epoch_acc:
            best_epoch_acc = epoch_acc
            # clone: state_dict() returns references to the live parameters
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    return best_epoch_acc


def train_and_tune_deep(model_name, make_fn, X_train, y_train, X_val, y_val):
    """Train one PyTorch model per config in ``DEEP_PARAM_CONFIGS`` and keep the best.

    Parameters
    ----------
    model_name : str
        Key into ``DEEP_PARAM_CONFIGS``.
    make_fn : callable
        Factory ``make_fn(input_dim, n_classes) -> nn.Module`` (the signature
        used by the deep factories registered in ``MODELS``).
    X_train, X_val : 2-D numeric features (array, DataFrame or scipy sparse).
    y_train, y_val : class labels of any sortable type; they are encoded to
        integer indices internally.

    Returns
    -------
    (best_model, best_acc, model_size_mb)
        ``best_model`` is in eval mode, carries the weights of its best epoch
        and exposes ``classes_`` (index -> original label). ``best_acc`` is the
        best validation accuracy over epochs of the winning config.

    Raises
    ------
    ValueError
        If ``model_name`` has no configs (missing key or empty list).
    """
    if model_name not in DEEP_PARAM_CONFIGS:
        raise ValueError(f"No deep param configs defined for: {model_name}")

    configs = DEEP_PARAM_CONFIGS[model_name]
    if not configs:
        raise ValueError(f"No deep param configs defined for: {model_name}")

    X_train_t = _to_float_tensor(X_train)
    X_val_t = _to_float_tensor(X_val)

    # Encode labels as 0..n_classes-1 (CrossEntropyLoss needs class indices).
    # Validation labels are included so an unseen label cannot index out of range.
    y_train_arr = np.asarray(y_train).ravel()
    y_val_arr = np.asarray(y_val).ravel()
    classes = np.unique(np.concatenate([y_train_arr, y_val_arr]))
    y_train_t = torch.as_tensor(np.searchsorted(classes, y_train_arr), dtype=torch.long)
    y_val_t = torch.as_tensor(np.searchsorted(classes, y_val_arr), dtype=torch.long)

    input_dim = X_train_t.shape[1]
    n_classes = len(classes)

    best_acc = -np.inf
    best_model = None
    best_cfg = None

    for cfg in configs:
        print(f"\n[{model_name}] Trying config: {cfg}")

        model = make_fn(input_dim, n_classes)

        # Use best val accuracy across epochs
        val_acc = _fit_torch_classifier(
            model,
            X_train_t, y_train_t,
            X_val_t, y_val_t,
            lr=cfg["lr"],
            batch_size=cfg["batch_size"],
            epochs=cfg["epochs"],
        )

        print(f"[{model_name}] val_accuracy (best over epochs): {val_acc:.4f}")

        if val_acc > best_acc:
            best_acc = val_acc
            best_model = model
            best_cfg = cfg

    # Let callers map predicted indices back to the original labels.
    best_model.classes_ = classes

    model_size_mb = get_model_size_mb(best_model, is_deep=True)

    print(f"\n[{model_name}] best config: {best_cfg}")
    print(f"[{model_name}] best validation accuracy: {best_acc:.4f}")
    print(f"[{model_name}] serialized size: {model_size_mb:.4f} MB")

    return best_model, best_acc, model_size_mb

# Main: use best candidate name

def run_best_candidate(best_model_name, X_train, y_train, X_val, y_val):
    """
    Train and tune the model registered under ``best_model_name``.

    ``best_model_name`` is a key of MODELS ('logreg', 'rf', 'xgboost',
    'cnn1d', ...). The registry entry supplies the model family and factory;
    the matching tuning routine is then run and its result returned unchanged.

    Raises ValueError for an unregistered name or an unrecognised family.
    """
    entry = MODELS.get(best_model_name)
    if entry is None:
        raise ValueError(f"Unknown model name: {best_model_name}")
    model_type, make_fn = entry

    # family -> tuning routine
    trainers = {
        "classic": train_and_tune_classic,
        "deep": train_and_tune_deep,
    }
    trainer = trainers.get(model_type)
    if trainer is None:
        raise ValueError(f"Unknown model type for {best_model_name}: {model_type}")

    return trainer(best_model_name, make_fn, X_train, y_train, X_val, y_val)

In [None]:
def preprocessing_logic(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns excluded from the meta-feature set.

    Columns that are not present are skipped silently; the input frame is not
    modified and the remaining columns keep their original order.
    """
    excluded = (
        # first group of columns to drop
        "accuracy",
        "f1_macro",
        "precision_macro",
        "trained_model_size_kb",
        "inference_speed_ms",
        "model_name",
        "Task_id",
        "dataset_id",
        "model_id",
        "dataset_name",
        "static_usage_ram_kb",
        "dynamic_usage_ram_kb",
        "full_ram_usage_kb",
        "mean_feature_variance",
        "median_feature_variance",
        # zero-variance columns
        "n_leaves_mean",
        "dropout_rate_mean",
        # dropped separately in the original pipeline
        "regularization_supported",
    )
    present = [col for col in excluded if col in df.columns]
    return df.drop(columns=present)

In [None]:
import joblib

# Load the pre-trained meta-model that scores candidate entries.
# The path is relative, so the file must sit in the kernel's working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
loaded_model = joblib.load("xgb_pipeline_model.pkl")

# Running best (name, score); these globals are read by the next cell.
best_name = None
best_score = float("-inf")

for name, features in candidates.items():
    # One-row frame holding this candidate's meta-features.
    # NOTE(review): preprocessing_logic() is not applied here; presumably the
    # pickled pipeline drops the excluded columns itself. Confirm.
    df_entry = pd.DataFrame([features])

    # predict() returns one value per row; take the single row's score.
    score = loaded_model.predict(df_entry)[0]

    print(f"{name} -> predicted score: {score}")

    # Strict '>' keeps the first candidate on ties.
    if score > best_score:
        best_score = score
        best_name = name

print("Best candidate name:", best_name)
print("Best predicted score:", best_score)

In [None]:
# Train and tune the top-ranked candidate.
# NOTE(review): the test split is passed as the validation set (X_val / y_val),
# so the reported accuracy is measured on the same data used to pick the best
# deep-model config. Consider a separate validation split.
best_model, best_acc, best_size = run_best_candidate(
    best_name,
    X_train, y_train,
    X_test, y_test
)