In [1]:
#### IMPORTS AND GLOBALS ####

# Standard Libraries
import os
import warnings
import logging
import time
import re

# Data Science
import numpy as np
import pandas as pd
import kagglehub
from scipy import stats

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import (
    f_regression,      # for regression
    f_classif,         # for classification
    SelectKBest,
    mutual_info_regression,
    mutual_info_classif,
)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Global Settings
warnings.filterwarnings("ignore", category=UserWarning)

RANDOM_STATE = 42
random_state = RANDOM_STATE
N_ROWS = 10_000

# No scientific notation in pandas display
pd.set_option("display.float_format", lambda x: f"{x:.6f}")

# Reproducibility for NumPy-based randomness
np.random.seed(RANDOM_STATE)


In [2]:
# =============================
# Small Helpers (updated)
# =============================
import pandas as pd
import numpy as np
from scipy import stats

import pandas as pd
import numpy as np
from scipy import stats

def robust_eda(df):
    """
    Performs robust Exploratory Data Analysis (EDA) on a pandas DataFrame and prints the results.
    
    This function includes:
    - DataFrame shape and data types
    - Missing values and duplicates (with handling for unhashable types)
    - Head of the DataFrame
    - Numerical summaries with robust percentiles (5%, 25%, 50%, 75%, 95%)
    - Skewness and kurtosis for numerical columns
    - Outlier counts using IQR method
    - Correlation matrix for numerical columns
    - Value counts and unique counts for categorical columns (with handling for unhashable types)
    
    Parameters:
    df (pd.DataFrame): The input DataFrame to analyze.
    """
    print("=== Robust EDA Report ===")
    
    # Basic Info
    print("\nDataFrame Shape:", df.shape)
    print("\nData Types:\n", df.dtypes)
    print("\nMissing Values:\n", df.isnull().sum())
    
    # Duplicate Rows with try-except for unhashable types
    try:
        print("\nDuplicate Rows:", df.duplicated().sum())
    except TypeError:
        print("\nDuplicate Rows: Cannot compute due to unhashable types in DataFrame (e.g., lists in columns).")
    
    print("\nHead of DataFrame:\n", df.head())
    
    # Separate numerical and categorical columns
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()
    
    # Numerical Analysis
    if num_cols:
        print("\n=== Numerical Columns Analysis ===")
        # Robust summary with additional percentiles for outlier insight
        print("\nSummary Statistics (with 5%, 25%, 50%, 75%, 95% percentiles):\n", 
              df[num_cols].describe(percentiles=[.05, .25, .5, .75, .95]))
        
        # Skewness and Kurtosis
        print("\nSkewness:\n", df[num_cols].skew())
        print("\nKurtosis:\n", df[num_cols].kurtosis())
        
        # Outlier Detection using IQR (robust to outliers)
        Q1 = df[num_cols].quantile(0.25)
        Q3 = df[num_cols].quantile(0.75)
        IQR = Q3 - Q1
        outliers = ((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).sum()
        print("\nOutlier Counts (IQR method):\n", outliers)
        
        # Correlation Matrix (using Pearson; consider Spearman for non-normal data if needed)
        if len(num_cols) > 1:
            print("\nCorrelation Matrix:\n", df[num_cols].corr())
    
    # Categorical Analysis
    if cat_cols:
        print("\n=== Categorical Columns Analysis ===")
        for col in cat_cols:
            try:
                print(f"\nValue Counts for '{col}':\n", df[col].value_counts())
                print(f"Unique Values in '{col}': {df[col].nunique()}")
                print(f"Most Common Value in '{col}': {df[col].mode()[0] if not df[col].mode().empty else 'N/A'}")
            except TypeError:
                print(f"\nCannot analyze '{col}' due to unhashable types (e.g., lists).")
    
    print("\n=== End of EDA Report ===")

def is_sparse_dtype(dtype):
    # check if a dtype is pandas sparse
    return pd.api.types.is_sparse(dtype)

def dollar_format(x, pos=None):
    # format money like $12,345
    return f"${x:,.0f}"

def format_hms(seconds):
    # format seconds to H:M:S and handle >24 hours
    seconds = int(seconds)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    secs = seconds % 60
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

def try_read_csv(folder_path, file_name, **kwargs):
    # try to read a csv; file_name can be a full path
    full_path = os.path.join(folder_path, file_name) if folder_path else file_name
    if full_path and os.path.exists(full_path):
        try:
            return pd.read_csv(full_path, **kwargs)
        except Exception:
            return None
    return None

def list_csvs(folder_path):
    # list csv files (sorted)
    if not folder_path or not os.path.exists(folder_path):
        return []
    return sorted([f for f in os.listdir(folder_path) if f.lower().endswith(".csv")])

def simple_random_sample(data_frame, n_rows=None, frac=None, random_state=42):
    # sample without replacement
    if data_frame is None:
        raise ValueError("data_frame is None")
    total_rows = len(data_frame)
    if (n_rows is None) == (frac is None):
        raise ValueError("pass exactly one of n_rows or frac")

    if frac is not None:
        if not (0 < frac <= 1):
            raise ValueError("frac must be between 0 and 1")
        pick_rows = int(np.floor(frac * total_rows))
    else:
        if int(n_rows) <= 0:
            raise ValueError("n_rows must be > 0")
        pick_rows = min(int(n_rows), total_rows)

    if pick_rows >= total_rows:
        print("simple_random_sample: taking all rows")
        return data_frame.copy()

    start = time.perf_counter()
    rng = np.random.default_rng(random_state)
    pick_index = rng.choice(total_rows, size=pick_rows, replace=False)
    pick_index = np.sort(pick_index)  # keep original order
    out_df = data_frame.iloc[pick_index].copy()
    end = time.perf_counter()
    print(f"simple_random_sample: picked {len(out_df)} of {total_rows} rows in {round(end - start, 3)} sec")
    return out_df

def stratified_sample(data_frame, y, n_rows=None, frac=None, random_state=42):
    # stratified sample on labels y
    if data_frame is None:
        raise ValueError("data_frame is None")

    y_array = data_frame[y].to_numpy() if isinstance(y, str) else np.asarray(y)
    total_rows = len(data_frame)
    if len(y_array) != total_rows:
        raise ValueError("X and y length mismatch")

    # prefer n_rows if both given
    if n_rows is not None and frac is not None:
        frac = None
    if n_rows is None and frac is None:
        raise ValueError("provide n_rows or frac")

    if frac is not None:
        if not (0 < frac <= 1):
            raise ValueError("frac must be between 0 and 1")
        test_size = float(frac)
        use_frac, use_n = frac, None
    else:
        if int(n_rows) <= 0:
            raise ValueError("n_rows must be > 0")
        test_size = min(float(n_rows) / total_rows, 1.0)
        use_frac, use_n = None, int(n_rows)

    if test_size >= 1.0:
        print("stratified_sample: taking all rows")
        return data_frame.copy()

    _, counts = np.unique(y_array, return_counts=True)
    min_count = counts.min()

    # need at least 1 per class in both splits
    if min_count < 2 or (min_count * test_size < 1) or (min_count * (1.0 - test_size) < 1):
        print("stratified_sample: class counts too small for requested size, falling back to simple sample")
        return simple_random_sample(data_frame, n_rows=use_n, frac=use_frac, random_state=random_state)

    start = time.perf_counter()
    index_array = np.arange(total_rows)
    _, test_idx, _, _ = train_test_split(
        index_array,
        y_array,
        test_size=test_size,
        stratify=y_array,
        random_state=random_state
    )
    out_df = data_frame.iloc[np.sort(test_idx)].copy()  # keep original order
    end = time.perf_counter()
    print(f"stratified_sample: picked {len(out_df)} of {total_rows} rows in {round(end - start, 3)} sec")
    return out_df

def safe_kaggle_download(dataset_name):
    # download from kaggle with timing and errors
    print(f"download: starting {dataset_name}")
    start = time.perf_counter()
    try:
        path = kagglehub.dataset_download(dataset_name)
        end = time.perf_counter()
        print(f"download: done {dataset_name} -> {path} in {round(end - start, 3)} sec")
        return path
    except Exception as e:
        end = time.perf_counter()
        print(f"download: error {dataset_name} -> {str(e)} in {round(end - start, 3)} sec")
        return None

def coerce_datetime_columns(df):
    # convert likely date/time columns if they are strings
    if df is None:
        return None
    print("dates: converting possible date/time columns")
    for col_name in df.columns:
        lower = col_name.lower()
        if ("date" in lower) or ("time" in lower):
            s = df[col_name]
            try:
                if pd.api.types.is_object_dtype(s) or pd.api.types.is_string_dtype(s):
                    df[col_name] = pd.to_datetime(s, errors="coerce")
            except Exception:
                pass
    return df

def float_range(start, stop, step):
    # float range with guards and tolerance
    if step == 0:
        raise ValueError("step must not be 0")
    values = []
    value = float(start)
    tolerance = abs(step) / 1_000_000
    if step > 0:
        while value <= stop + tolerance:
            values.append(round(value, 12))
            value += step
    else:
        while value >= stop - tolerance:
            values.append(round(value, 12))
            value += step
    return values

In [3]:
# =============================
# Steam Loader
# =============================
def load_steam_dataset(base_path, n_rows=100_000, seed=42):
    print("steam: start")
    if base_path is None:
        print("steam: skip because base_path is None")
        return None

    games = try_read_csv(base_path, "games.csv", low_memory=False)
    users = try_read_csv(base_path, "users.csv", low_memory=False)
    recommendations = try_read_csv(base_path, "recommendations.csv", low_memory=False)

    metadata = None
    meta_path = os.path.join(base_path, "games_metadata.json")
    if os.path.exists(meta_path):
        try:
            metadata = pd.read_json(meta_path, lines=True)
        except Exception as e:
            print(f"steam: metadata read error -> {str(e)}")

    print(
        f"steam: shapes games={None if games is None else games.shape}, "
        f"users={None if users is None else users.shape}, "
        f"recs={None if recommendations is None else recommendations.shape}, "
        f"meta={None if metadata is None else metadata.shape}"
    )

    steam_table = None
    if recommendations is not None:
        if "is_recommended" in recommendations.columns:
            recs_sample = stratified_sample(recommendations, y="is_recommended", n_rows=n_rows, random_state=seed)
        else:
            recs_sample = simple_random_sample(recommendations, n_rows=n_rows, random_state=seed)

        games_plus = games
        if (
            metadata is not None
            and games is not None
            and "app_id" in metadata.columns
            and "app_id" in games.columns
        ):
            print("steam: merge games with metadata")
            games_plus = games.merge(metadata, on="app_id", how="left", suffixes=("", "_meta"))

        steam_table = recs_sample
        if games_plus is not None and "app_id" in recs_sample.columns and "app_id" in games_plus.columns:
            print("steam: merge recommendations with games")
            steam_table = steam_table.merge(games_plus, on="app_id", how="left", suffixes=("", "_game"))

        if users is not None and "user_id" in steam_table.columns and "user_id" in users.columns:
            print("steam: merge with users")
            steam_table = steam_table.merge(users, on="user_id", how="left", suffixes=("", "_user"))

        steam_table = coerce_datetime_columns(steam_table)
        print(f"steam: done shape={None if steam_table is None else steam_table.shape}")
    else:
        print("steam: skip because recommendations.csv is missing")

    return steam_table


# =============================
# Olist Loader
# =============================
def load_olist_dataset(base_path, n_rows=1_000_000, seed=42):
    print("olist: start")
    if base_path is None:
        print("olist: skip because base_path is None")
        return None

    olist_customers = try_read_csv(base_path, "olist_customers_dataset.csv", low_memory=False)
    olist_geolocation = try_read_csv(base_path, "olist_geolocation_dataset.csv", low_memory=False)
    olist_items = try_read_csv(base_path, "olist_order_items_dataset.csv", low_memory=False)
    olist_payments = try_read_csv(base_path, "olist_order_payments_dataset.csv", low_memory=False)
    olist_reviews = try_read_csv(base_path, "olist_order_reviews_dataset.csv", low_memory=False)
    olist_orders = try_read_csv(base_path, "olist_orders_dataset.csv", low_memory=False)
    olist_products = try_read_csv(base_path, "olist_products_dataset.csv", low_memory=False)
    olist_sellers = try_read_csv(base_path, "olist_sellers_dataset.csv", low_memory=False)
    olist_cat_trans = try_read_csv(base_path, "product_category_name_translation.csv", low_memory=False)

    print(
        "olist: shapes "
        f"customers={None if olist_customers is None else olist_customers.shape}, "
        f"geolocation={None if olist_geolocation is None else olist_geolocation.shape}, "
        f"items={None if olist_items is None else olist_items.shape}, "
        f"payments={None if olist_payments is None else olist_payments.shape}, "
        f"reviews={None if olist_reviews is None else olist_reviews.shape}, "
        f"orders={None if olist_orders is None else olist_orders.shape}, "
        f"products={None if olist_products is None else olist_products.shape}, "
        f"sellers={None if olist_sellers is None else olist_sellers.shape}, "
        f"cat_trans={None if olist_cat_trans is None else olist_cat_trans.shape}"
    )

    if not all(x is not None for x in [olist_orders, olist_items, olist_products, olist_sellers, olist_customers]):
        print("olist: skip because core tables are missing")
        return None

    print("olist: sample orders")
    orders_small = simple_random_sample(olist_orders, n_rows=min(n_rows, len(olist_orders)), random_state=seed)

    print("olist: filter items for sampled orders")
    items_small = olist_items[olist_items["order_id"].isin(orders_small["order_id"])].copy()

    if olist_cat_trans is not None and "product_category_name" in olist_products.columns:
        print("olist: merge category translation")
        products_en = olist_products.merge(olist_cat_trans, on="product_category_name", how="left")
    else:
        products_en = olist_products

    if olist_reviews is not None:
        print("olist: build product review stats")
        product_reviews = (
            items_small[["order_id", "product_id"]]
            .merge(olist_reviews[["order_id", "review_score"]], on="order_id", how="inner")
        )
        product_reviews = product_reviews.drop_duplicates(["order_id", "product_id"])
        product_stats = (
            product_reviews.groupby("product_id", as_index=False)
            .agg(
                review_count_product=("review_score", "count"),
                review_score_mean_product=("review_score", "mean"),
            )
        )
    else:
        product_stats = None

    print("olist: merge items, products, and sellers")
    items_ext = (
        items_small.merge(products_en, on="product_id", how="left")
        .merge(olist_sellers, on="seller_id", how="left", suffixes=("", "_seller"))
    )

    if olist_geolocation is not None:
        print("olist: build basic zip geo")
        geo_zip = (
            olist_geolocation.groupby("geolocation_zip_code_prefix", as_index=False).agg(
                geolocation_lat=("geolocation_lat", "mean"),
                geolocation_lng=("geolocation_lng", "mean"),
                geo_points=("geolocation_city", "count"),
            )
        )
        print("olist: merge customers with geo")
        customers_geo = (
            olist_customers.merge(
                geo_zip,
                left_on="customer_zip_code_prefix",
                right_on="geolocation_zip_code_prefix",
                how="left",
            )
            .drop(columns=["geolocation_zip_code_prefix"])
        )
    else:
        customers_geo = olist_customers

    if olist_payments is not None:
        print("olist: aggregate payments")
        payments_agg = (
            olist_payments.groupby("order_id", as_index=False).agg(
                payment_value_total=("payment_value", "sum"),
                payment_installments_max=("payment_installments", "max"),
                payment_count=("payment_type", "count"),
            )
        )
    else:
        payments_agg = None

    print("olist: assemble main table")
    olist_full = (
        orders_small.merge(customers_geo, on="customer_id", how="left")
        .merge(items_ext, on="order_id", how="left")
    )

    if payments_agg is not None:
        print("olist: merge payments")
        olist_full = olist_full.merge(payments_agg, on="order_id", how="left")

    if product_stats is not None:
        print("olist: merge product stats")
        olist_full = olist_full.merge(product_stats, on="product_id", how="left")

    olist_full = coerce_datetime_columns(olist_full)

    print(f"olist: shape after assemble {olist_full.shape}")
    print("olist: done")
    return olist_full


# =============================
# VG2019 Loader
# =============================
def load_vg2019_dataset(base_path, n_rows=1_000_000, seed=42):
    print("vg2019: start")
    if base_path is None:
        print("vg2019: skip because base_path is None")
        return None

    csv_files = list_csvs(base_path)
    pick = None
    for f in csv_files:
        if "vgsales" in f.lower():
            pick = f
            break
    target_csv = pick if pick else (csv_files[0] if csv_files else None)

    if target_csv is None:
        print("vg2019: skip because no csv found")
        return None

    full_path = os.path.join(base_path, target_csv)
    try:
        sales = pd.read_csv(full_path, low_memory=False)
    except Exception as e:
        print(f"vg2019: read error -> {str(e)}")
        return None

    print(f"vg2019: loaded {target_csv} with shape {sales.shape}")

    if "Genre" in sales.columns:
        print("vg2019: stratified sample by Genre")
        sales = stratified_sample(sales, y="Genre", n_rows=n_rows, random_state=seed)
    else:
        print("vg2019: simple random sample")
        sales = simple_random_sample(sales, n_rows=n_rows, random_state=seed)

    print(f"vg2019: done shape={sales.shape}")
    return sales


In [4]:
# =============================
# Helpers
# =============================
from sklearn.base import BaseEstimator, TransformerMixin

class KeepTrainColumns(BaseEstimator, TransformerMixin):
    # remembers training columns and reindexes any input to match
    def fit(self, X, y=None):
        if hasattr(X, "columns"):
            self.keep_columns_ = list(X.columns)
        else:
            self.keep_columns_ = None
        return self

    def transform(self, X):
        if self.keep_columns_ is None:
            return X
        if hasattr(X, "reindex"):
            return X.reindex(columns=self.keep_columns_, fill_value=0)
        return X


def predict_with_threshold(model, X, threshold=0.5):
    # turn scores into 0/1 using a chosen threshold
    import numpy as np
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        raw = model.decision_function(X)
        raw_min, raw_max = float(raw.min()), float(raw.max())
        scores = (raw - raw_min) / (raw_max - raw_min + 1e-9)
    else:
        scores = model.predict(X).astype(float)
    return (scores >= threshold).astype(int)


# =============================
# Model builder + tuner (with oversampling + threshold tuning)
# =============================
def build_and_tune_models(
    X_train, y_train,
    task_type,
    num_folds,
    num_iterations,
    oversample=False,
    oversample_method="random"
):
    import math
    import numpy as np
    import pandas as pd
    import warnings

    from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, RandomizedSearchCV, cross_val_predict
    from sklearn.pipeline import Pipeline as SKPipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_selection import SelectKBest, f_regression, f_classif, VarianceThreshold
    from sklearn.impute import SimpleImputer
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.metrics import f1_score, confusion_matrix

    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
    from sklearn.svm import LinearSVC
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.dummy import DummyClassifier, DummyRegressor

    # optional oversampling tools
    ImbPipeline = None
    RandomOverSampler = None
    SMOTE = None
    if oversample and str(task_type).strip().lower() == "classification":
        try:
            from imblearn.pipeline import Pipeline as ImbPipeline
            from imblearn.over_sampling import RandomOverSampler, SMOTE
        except Exception:
            print("imblearn not available. Oversampling disabled.")
            oversample = False

    # neat prints
    np.set_printoptions(suppress=True)
    pd.options.display.float_format = lambda x: f"{x:.6f}"
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # drop constant columns once on train
    non_constant_columns = X_train.columns[X_train.nunique(dropna=False) > 1]
    if len(non_constant_columns) < X_train.shape[1]:
        dropped = X_train.shape[1] - len(non_constant_columns)
        print(f"Removed {dropped} constant feature(s).")
        X_train = X_train[non_constant_columns]

    # task settings
    task = str(task_type).strip().lower()
    if task == "classification":
        scoring = "f1_macro"
        selector_score_func = f_classif
        min_class = int(y_train.value_counts().min())
        eff_folds = max(2, min(int(num_folds), min_class))
        baseline_cv = StratifiedKFold(n_splits=eff_folds, shuffle=True, random_state=42)
        search_cv = StratifiedKFold(n_splits=eff_folds, shuffle=True, random_state=42)

        sampler_obj = None
        if oversample:
            if oversample_method == "smote":
                k_neighbors_for_smote = max(1, min(5, min_class - 1))
                if k_neighbors_for_smote < 1:
                    print("SMOTE not possible (minority class too small). Using RandomOverSampler.")
                    sampler_obj = RandomOverSampler(random_state=42)
                else:
                    sampler_obj = SMOTE(random_state=42, k_neighbors=k_neighbors_for_smote)
            else:
                sampler_obj = RandomOverSampler(random_state=42)

        class_weight_choice = None if oversample else "balanced"

        model_space = {
            "GBT": GradientBoostingClassifier(random_state=42),
            "RandomForest": RandomForestClassifier(random_state=42, class_weight=class_weight_choice, n_jobs=-1),
            "DecisionTree": DecisionTreeClassifier(random_state=42, class_weight=class_weight_choice),
            "LogisticRegression": LogisticRegression(solver="saga", max_iter=5000, class_weight=class_weight_choice),
            "LinearSVM": LinearSVC(max_iter=5000, class_weight=class_weight_choice),
            "NaiveBayes": GaussianNB(),
            "KNN": KNeighborsClassifier(),
            "Dummy": DummyClassifier(strategy="most_frequent", random_state=42),
        }
        metric_name = "F1_macro"
        higher_is_better = True
    elif task == "regression":
        scoring = "neg_mean_absolute_error"
        selector_score_func = f_regression
        eff_folds = max(2, int(num_folds))
        baseline_cv = KFold(n_splits=eff_folds, shuffle=True, random_state=42)
        search_cv = KFold(n_splits=eff_folds, shuffle=True, random_state=42)

        model_space = {
            "GBT": GradientBoostingRegressor(random_state=42),
            "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
            "DecisionTree": DecisionTreeRegressor(random_state=42),
            "LinearRegression": LinearRegression(),
            "Ridge": Ridge(max_iter=5000),
            "Lasso": Lasso(max_iter=5000),
            "ElasticNet": ElasticNet(max_iter=5000),
            "KNN": KNeighborsRegressor(),
            "Dummy": DummyRegressor(strategy="mean"),
        }
        metric_name = "CV_MAE"
        higher_is_better = False
        sampler_obj = None
    else:
        raise ValueError('task_type must be "classification" or "regression"')

    total_features = X_train.shape[1]
    feature_fractions = [0.10, 0.25, 0.50, 0.75, 1.00]

    # which models need scaling and selection
    needs_scaling = {"LogisticRegression", "LinearSVM", "KNN", "LinearRegression", "Ridge", "Lasso", "ElasticNet", "NaiveBayes"}
    skip_selection = {"Dummy"}
    tree_like = {"RandomForest", "DecisionTree", "GBT"}

    def k_from_fraction(frac, total_cols):
        if frac >= 1.0:
            return "all"
        k = int(max(1, math.ceil(frac * total_cols)))
        return min(k, total_cols)

    # dynamic KNN neighbors cap
    per_fold_train = int(len(X_train) * (eff_folds - 1) / eff_folds)
    max_knn_k = max(3, min(101, per_fold_train - 1))
    knn_ks = list(range(3, max_knn_k + 1, 2))

    def logspace_list(low_exp, high_exp, num):
        return list(np.logspace(low_exp, high_exp, num))

    def linspace_list(low_val, high_val, num):
        return list(np.linspace(low_val, high_val, num))

    param_spaces_classification = {
        "GBT": {
            "model__n_estimators": [100, 200, 300, 500],
            "model__learning_rate": logspace_list(-3, 0, 12),
            "model__max_depth": [2, 3, 4, 5],
            "model__subsample": linspace_list(0.6, 1.0, 5),
        },
        "RandomForest": {
            "model__n_estimators": [200, 400, 700],
            "model__max_depth": [None, 20, 40],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
            "model__max_features": ["sqrt", "log2", None],
        },
        "DecisionTree": {
            "model__max_depth": [None, 10, 20, 40],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
            "model__splitter": ["best", "random"],
        },
        "LogisticRegression": {
            "model__C": logspace_list(-3, 3, 20),
            "model__penalty": ["l1", "l2"],
            "model__solver": ["saga"],
        },
        "LinearSVM": {
            "model__C": logspace_list(-3, 3, 20),
            "model__loss": ["hinge", "squared_hinge"],
        },
        "NaiveBayes": {
            "model__var_smoothing": list(10 ** np.linspace(-11, -7, 9))
        },
        "KNN": {
            "model__n_neighbors": knn_ks,
            "model__weights": ["uniform", "distance"],
            "model__p": [1, 2],
            "model__leaf_size": list(range(10, 61, 10)),
        },
        "Dummy": {"model__strategy": ["most_frequent", "stratified", "uniform"]},
    }

    param_spaces_regression = {
        "GBT": {
            "model__n_estimators": [100, 200, 300, 500],
            "model__learning_rate": logspace_list(-3, 0, 12),
            "model__max_depth": [2, 3, 4, 5],
            "model__subsample": linspace_list(0.6, 1.0, 5),
        },
        "RandomForest": {
            "model__n_estimators": [200, 400, 700],
            "model__max_depth": [None, 20, 40],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
            "model__max_features": ["sqrt", "log2", None],
        },
        "DecisionTree": {
            "model__max_depth": [None, 10, 20, 40],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
            "model__splitter": ["best", "random"],
        },
        "LinearRegression": {},
        "Ridge": {"model__alpha": logspace_list(-3, 3, 20), "model__fit_intercept": [True, False]},
        "Lasso": {"model__alpha": logspace_list(-4, 1, 20), "model__fit_intercept": [True, False]},
        "ElasticNet": {"model__alpha": logspace_list(-4, 1, 20), "model__l1_ratio": linspace_list(0.1, 0.9, 9), "model__fit_intercept": [True, False]},
        "KNN": {
            "model__n_neighbors": knn_ks,
            "model__weights": ["uniform", "distance"],
            "model__p": [1, 2],
            "model__leaf_size": list(range(10, 61, 10)),
        },
        "Dummy": {"model__strategy": ["mean", "median"]},
    }

    # build a pipeline for a given k
    def make_pipeline_for_k(model_name, model_obj, k_value):
        # order: align -> impute -> variance -> select -> scale -> sampler -> model
        align_step = ("align", KeepTrainColumns())
        impute_step = ("impute", SimpleImputer(strategy="median"))
        variance_step = ("variance", VarianceThreshold(threshold=0.0))

        if model_name in skip_selection or model_name in tree_like:
            select_step = ("select", "passthrough")
        else:
            select_step = ("select", SelectKBest(score_func=selector_score_func, k=k_value))

        scale_step = ("scale", StandardScaler() if model_name in needs_scaling else "passthrough")

        steps = [align_step, impute_step, variance_step, select_step, scale_step]

        if task == "classification" and oversample and sampler_obj is not None:
            steps.append(("sampler", sampler_obj))

        steps.append(("model", model_obj))

        if oversample and ImbPipeline is not None and task == "classification":
            return ImbPipeline(steps)
        else:
            return SKPipeline(steps)

    # baseline sweep across models × k
    rows = []
    total_steps = len(model_space) * len(feature_fractions)
    step = 0
    print("Streaming results (each line is one model × feature count):")
    for model_name, model_obj in model_space.items():
        for frac in feature_fractions:
            step += 1
            k_val = k_from_fraction(frac, total_features)
            k_print = total_features if k_val == "all" else int(k_val)
            pipeline = make_pipeline_for_k(model_name, model_obj, k_val)
            scores = cross_val_score(pipeline, X_train, y_train, cv=baseline_cv, scoring=scoring, n_jobs=1)
            mean_score = float(np.mean(scores))
            std_score = float(np.std(scores))
            if task == "regression":
                display_mean = -mean_score
                display_std = float(np.std(-scores))
            else:
                display_mean = mean_score
                display_std = std_score
            rows.append({"Model": model_name, "K_features": k_print, "MeanScore": display_mean, "StdDev": display_std, "Metric": metric_name})
            print(f"[{step}/{total_steps}] {model_name} | k={k_print} | {metric_name}={display_mean:.6f} ± {display_std:.6f}", flush=True)

    results_df = pd.DataFrame(rows)
    if task == "classification":
        results_df = results_df.sort_values(by=["MeanScore", "Model"], ascending=[False, True]).reset_index(drop=True)
    else:
        results_df = results_df.sort_values(by=["MeanScore", "Model"], ascending=[True, True]).reset_index(drop=True)

    print("\n=== Baseline results (CV) ===")
    print(results_df[["Model", "K_features", "MeanScore", "StdDev", "Metric"]])

    best_row = results_df.iloc[0]
    best_model_name = str(best_row["Model"])
    best_k = int(best_row["K_features"])
    best_model_obj = model_space[best_model_name]
    k_val_for_search = "all" if best_k >= total_features else best_k
    best_pipeline = make_pipeline_for_k(best_model_name, best_model_obj, k_val_for_search)

    # pick search space
    search_space = (param_spaces_classification if task == "classification" else param_spaces_regression).get(best_model_name, {})
    if len(search_space) == 0:
        best_pipeline.fit(X_train, y_train)
        try:
            best_pipeline.input_columns_ = list(X_train.columns)
        except Exception:
            pass
        # threshold tuning only for classification
        if task == "classification":
            try:
                _tune_threshold_inplace(best_pipeline, X_train, y_train, search_cv)
            except Exception as e:
                print(f"[warn] threshold tuning failed: {e}")
        print("\nBest model had no tunable params. Returning fitted pipeline.")
        return best_pipeline

    # hyperparameter search
    search = RandomizedSearchCV(
        estimator=best_pipeline,
        param_distributions=search_space,
        n_iter=int(max(1, num_iterations)),
        scoring=scoring,
        cv=search_cv,
        random_state=42,
        n_jobs=-1,
        verbose=2
    )
    search.fit(X_train, y_train)

    # print tuned CV result
    if task == "regression":
        tuned_score_display = -float(search.best_score_)
        tuned_metric_name = "CV MAE"
    else:
        tuned_score_display = float(search.best_score_)
        tuned_metric_name = "F1 macro"

    print("\n=== Best model after randomized search ===")
    print(f"Model name: {best_model_name}")
    print(f"Number of features: {best_k}")
    print(f"Best hyperparameters: {search.best_params_}")
    print(f"Best CV score ({tuned_metric_name}): {tuned_score_display:.6f}")

    # remember training columns
    try:
        search.best_estimator_.input_columns_ = list(X_train.columns)
    except Exception:
        pass

    # threshold tuning only for classification
    if task == "classification":
        try:
            _tune_threshold_inplace(search.best_estimator_, X_train, y_train, search_cv)
        except Exception as e:
            print(f"[warn] threshold tuning failed: {e}")

    return search.best_estimator_


def _tune_threshold_inplace(fitted_estimator, X, y, cv):
    """
    Finds a good decision threshold using out-of-fold scores on the training set.
    Stores results on the estimator as .best_threshold_ and .best_threshold_cv_f1_.
    """
    import numpy as np
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import f1_score, confusion_matrix

    # try probabilities first
    scores = None
    try:
        proba_oof = cross_val_predict(fitted_estimator, X, y, cv=cv, method="predict_proba", n_jobs=1)  # shape (n, 2)
        scores = proba_oof[:, 1]
    except Exception:
        pass

    # fallback to decision function
    if scores is None:
        try:
            decision_oof = cross_val_predict(fitted_estimator, X, y, cv=cv, method="decision_function", n_jobs=1)
            dec_min, dec_max = float(decision_oof.min()), float(decision_oof.max())
            scores = (decision_oof - dec_min) / (dec_max - dec_min + 1e-9)
        except Exception:
            pass

    # if no scores available, keep default threshold
    if scores is None:
        fitted_estimator.best_threshold_ = 0.5
        fitted_estimator.best_threshold_cv_f1_ = None
        print("[info] model does not expose scores for thresholding. Using 0.5.")
        return

    # sweep thresholds
    best_threshold = 0.5
    best_f1_macro = -1.0
    thresholds_to_try = np.linspace(0.05, 0.95, 19)

    for t in thresholds_to_try:
        y_hat = (scores >= t).astype(int)
        f1_macro_val = float(f1_score(y, y_hat, average="macro"))
        if f1_macro_val > best_f1_macro:
            best_f1_macro = f1_macro_val
            best_threshold = float(t)

    # show OOF result at best threshold
    y_hat_final = (scores >= best_threshold).astype(int)
    print("\n=== Threshold tuning (OOF on train) ===")
    print(f"Best threshold: {best_threshold:.2f} | F1_macro: {best_f1_macro:.6f}")
    print("Confusion matrix at best threshold:")
    print(confusion_matrix(y, y_hat_final))

    # store on estimator
    fitted_estimator.best_threshold_ = best_threshold
    fitted_estimator.best_threshold_cv_f1_ = best_f1_macro


# =============================
# Holdout evaluation helper (uses tuned threshold if available)
# =============================
def evaluate_on_holdout(model, X_test, y_test, task_type, threshold=None):
    import pandas as pd
    from sklearn.metrics import f1_score, mean_absolute_error, confusion_matrix

    # align columns to what the model saw at fit
    try:
        if hasattr(model, "input_columns_") and hasattr(X_test, "reindex"):
            X_test = X_test.reindex(columns=model.input_columns_, fill_value=0)
        elif hasattr(model, "named_steps") and "align" in getattr(model, "named_steps", {}):
            keep_cols = getattr(model.named_steps["align"], "keep_columns_", None)
            if keep_cols is not None and hasattr(X_test, "reindex"):
                X_test = X_test.reindex(columns=list(keep_cols), fill_value=0)
    except Exception as e:
        print(f"[warn] could not align columns: {e}")

    # choose prediction path
    if str(task_type).strip().lower() == "classification":
        final_threshold = threshold
        if final_threshold is None and hasattr(model, "best_threshold_"):
            final_threshold = float(model.best_threshold_)
        if final_threshold is not None:
            y_pred = predict_with_threshold(model, X_test, threshold=final_threshold)
        else:
            y_pred = model.predict(X_test)
    else:
        y_pred = model.predict(X_test)

    print("\n=== Holdout (time split) ===")
    if str(task_type).strip().lower() == "classification":
        f1 = float(f1_score(y_test, y_pred, average="macro"))
        print(f"F1 macro: {f1:.6f}")
        print("Confusion matrix:")
        print(confusion_matrix(y_test, y_pred))
        return f1
    else:
        mae = float(mean_absolute_error(y_test, y_pred))
        print(f"MAE: {mae:.6f}")
        return mae


In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.metrics import f1_score, confusion_matrix, mean_absolute_error
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# ---------------------------------
# config
# ---------------------------------
FEATURE_SELECTION_OPTIONS = {"none", "tree", "forward", "mutual_info"}

# ---------------------------------
# small helpers
# ---------------------------------
def _scale_fit(method, X_train_num):
    if method == "standard":
        return StandardScaler().fit(X_train_num)
    if method == "minmax":
        return MinMaxScaler().fit(X_train_num)
    return None

def _select_features(method, max_features, task_type, random_state, X_train, y_train):
    if method is None or method == "none":
        return X_train.columns.tolist()
    k = min(max_features, X_train.shape[1]) if max_features else X_train.shape[1]
    if k < 1:
        return X_train.columns.tolist()

    if method == "tree":
        est = RandomForestClassifier(random_state=random_state) if task_type == "classification" else RandomForestRegressor(random_state=random_state)
        est.fit(X_train, y_train)
        imp = pd.Series(est.feature_importances_, index=X_train.columns)
        return imp.nlargest(k).index.tolist()

    if method == "forward":
        if task_type == "classification":
            est = RandomForestClassifier(random_state=random_state)
            scoring = "f1_macro"
        else:
            est = RandomForestRegressor(random_state=random_state)
            scoring = "neg_mean_absolute_error"
        sfs = SequentialFeatureSelector(est, n_features_to_select=k, direction="forward", scoring=scoring, cv=5, n_jobs=-1)
        sfs.fit(X_train, y_train)
        return X_train.columns[sfs.get_support()].tolist()

    if method == "mutual_info":
        sel = SelectKBest(mutual_info_classif if task_type == "classification" else mutual_info_regression, k=k)
        sel.fit(X_train, y_train)
        return X_train.columns[sel.get_support()].tolist()

    return X_train.columns.tolist()

def _cap_fit(X_train, q=0.95):
    caps = {}
    for c in X_train.select_dtypes(include=["number"]).columns:
        caps[c] = np.nanpercentile(X_train[c], q * 100.0)
    return caps

def _cap_apply(X, caps):
    for c, v in caps.items():
        if c in X.columns:
            X[c] = np.clip(X[c], None, v)
    return X

def _dummify_fit(X_train, cols):
    if not cols:
        return X_train.columns
    Xd = pd.get_dummies(X_train, columns=cols, drop_first=True)
    return Xd.columns

def _dummify_apply(X, cols, schema_cols):
    if cols:
        Xd = pd.get_dummies(X, columns=cols, drop_first=True)
    else:
        Xd = X.copy()
    Xd = Xd.reindex(columns=schema_cols, fill_value=0)
    return Xd

def _datetimes_to_numeric_inplace(X):
    # convert datetimes to float seconds since epoch
    for c in X.columns:
        if np.issubdtype(X[c].dtype, np.datetime64):
            mask = X[c].isna()
            arr = X[c].astype("int64").astype("float64") / 1e9
            if mask.any():
                arr[mask.values] = np.nan
            X[c] = arr
    return X

def _freq_encode_objects_inplace(X_train, X_test):
    # replace object/category columns with frequency ratios
    obj_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
    for c in obj_cols:
        vc = X_train[c].value_counts(dropna=False)
        total = vc.sum() if vc.sum() > 0 else 1.0
        mapping = (vc / total).to_dict()
        default_val = 0.0
        X_train[c] = X_train[c].map(mapping).fillna(default_val)
        X_test[c] = X_test[c].map(mapping).fillna(default_val)
    return X_train, X_test

def _scale_numeric_only(X_train, X_test, scale_method):
    # scale numeric columns only
    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
    if not num_cols:
        return X_train, X_test
    scaler = _scale_fit(scale_method, X_train[num_cols])
    if scaler is None:
        return X_train, X_test
    X_train[num_cols] = scaler.transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])
    return X_train, X_test

def _find_time_col(X):
    # try a few common time columns
    preferred = [
        "date",
        "order_purchase_timestamp",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
        "date_release",
        "Year",
        "year",
    ]
    for c in preferred:
        if c in X.columns:
            return c
    # fallback: use index order
    return None

def _sort_by_time(X, y):
    tcol = _find_time_col(X)
    if tcol is None:
        return X.reset_index(drop=True), y.reset_index(drop=True), None
    key = X[tcol].values
    order = np.argsort(key)
    return X.iloc[order].reset_index(drop=True), y.iloc[order].reset_index(drop=True), tcol

def _tune_f1_threshold(y_true, y_prob, grid=None):
    if grid is None:
        grid = np.linspace(0.05, 0.95, 19)
    best_t, best_f1 = 0.5, -1.0
    for t in grid:
        y_pred = (y_prob >= t).astype(int)
        f1 = f1_score(y_true, y_pred, average="macro")
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t, best_f1

# ---------------------------------
# main prep
# ---------------------------------
def prepare_data(steam_df, olist_df, sales_df, test_size, random_state,
                 feature_selection, max_features, task_type, scale_method):
    feature_selection = (feature_selection or "none").lower()
    if feature_selection not in FEATURE_SELECTION_OPTIONS:
        feature_selection = "none"

    outputs = {}

    # ===================== STEAM =====================
    steam = steam_df.copy()
    if task_type == "classification":
        steam["target"] = (steam["positive_ratio"] >= 80).astype(int)
    else:
        steam["target"] = steam["positive_ratio"]
    steam.dropna(subset=["target"], inplace=True)

    # drop only clear leakage and IDs
    steam.drop(columns=["app_id", "user_id", "review_id", "title", "description", "tags", "positive_ratio", "rating"], errors="ignore", inplace=True)

    # keep dates as features and add days_since_release
    if {"date", "date_release"}.issubset(steam.columns):
        steam["days_since_release"] = (steam["date"] - steam["date_release"]).dt.days

    # keep these binary flags
    if "is_recommended" in steam.columns:
        steam["is_recommended"] = steam["is_recommended"].astype(int)
    if "mac" in steam.columns:
        steam["mac"] = steam["mac"].astype(int)
    if "linux" in steam.columns:
        steam["linux"] = steam["linux"].astype(int)

    # basic feature ideas
    if "hours" in steam.columns:
        steam["log_hours"] = np.log1p(steam["hours"])
    if {"hours", "user_reviews"}.issubset(steam.columns):
        steam["reviews_per_hour"] = steam["user_reviews"] / (steam["hours"] + 1e-9)

    X = steam.drop(columns=["target"])
    y = steam["target"]
    strat = y if task_type == "classification" else None
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=strat)

    _datetimes_to_numeric_inplace(X_train)
    _datetimes_to_numeric_inplace(X_test)
    X_train, X_test = _freq_encode_objects_inplace(X_train, X_test)

    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
    if num_cols:
        simp = SimpleImputer(strategy="median").fit(X_train[num_cols])
        X_train[num_cols] = simp.transform(X_train[num_cols])
        X_test[num_cols] = simp.transform(X_test[num_cols])

    caps = _cap_fit(X_train.select_dtypes(include=["number"]), q=0.95)
    X_train = _cap_apply(X_train, caps)
    X_test = _cap_apply(X_test, caps)

    # no oversampling here; it happens inside CV if you enable it

    X_train, X_test = _scale_numeric_only(X_train, X_test, scale_method)

    keep_cols = _select_features(feature_selection, max_features, task_type, random_state, X_train, y_train)
    X_train = X_train[keep_cols]
    X_test = X_test[keep_cols]
    outputs["steam"] = (X_train, X_test, y_train, y_test)

    # ===================== OLIST =====================
    olist = olist_df.copy()
    if task_type == "classification":
        olist["target"] = (olist["review_score_mean_product"] >= 4.0).astype(int)
    else:
        olist["target"] = olist["review_score_mean_product"]
    olist.dropna(subset=["target"], inplace=True)

    # drop only clear IDs
    olist.drop(columns=["order_id", "customer_id", "customer_unique_id"], errors="ignore", inplace=True)

    # keep order_status (not direct leakage)
    if {"order_purchase_timestamp", "order_estimated_delivery_date"}.issubset(olist.columns):
        olist["delivery_delay"] = (olist["order_estimated_delivery_date"] - olist["order_purchase_timestamp"]).dt.days

    if {"payment_value_total", "payment_installments_max"}.issubset(olist.columns):
        olist["avg_installment"] = olist["payment_value_total"] / olist["payment_installments_max"].replace(0, 1)

    X = olist.drop(columns=["review_score_mean_product", "target"], errors="ignore")
    y = olist["target"]
    strat = y if task_type == "classification" else None
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=strat)

    _datetimes_to_numeric_inplace(X_train)
    _datetimes_to_numeric_inplace(X_test)
    X_train, X_test = _freq_encode_objects_inplace(X_train, X_test)

    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
    if num_cols:
        simp = SimpleImputer(strategy="median").fit(X_train[num_cols])
        X_train[num_cols] = simp.transform(X_train[num_cols])
        X_test[num_cols] = simp.transform(X_test[num_cols])

    dims = ["product_length_cm", "product_height_cm", "product_width_cm"]
    if all(d in X_train.columns for d in dims):
        X_train["volume"] = X_train["product_length_cm"] * X_train["product_height_cm"] * X_train["product_width_cm"]
        X_test["volume"] = X_test["product_length_cm"] * X_test["product_height_cm"] * X_test["product_width_cm"]
        X_train.drop(columns=dims, inplace=True, errors="ignore")
        X_test.drop(columns=dims, inplace=True, errors="ignore")

    small_cats = []  # none after freq-encode
    schema_cols = _dummify_fit(X_train.copy(), small_cats)
    X_train = _dummify_apply(X_train, small_cats, schema_cols)
    X_test = _dummify_apply(X_test, small_cats, schema_cols)

    caps = _cap_fit(X_train.select_dtypes(include=["number"]), q=0.95)
    X_train = _cap_apply(X_train, caps)
    X_test = _cap_apply(X_test, caps)

    # no oversampling here

    X_train, X_test = _scale_numeric_only(X_train, X_test, scale_method)

    keep_cols = _select_features(feature_selection, max_features, task_type, random_state, X_train, y_train)
    X_train = X_train[keep_cols]
    X_test = X_test[keep_cols]
    outputs["olist"] = (X_train, X_test, y_train, y_test)

    # ===================== SALES =====================
    sales = sales_df.copy()
    if task_type == "classification":
        sales["target"] = (sales["Critic_Score"] >= 8.0).astype(int)
    else:
        sales["target"] = sales["Critic_Score"]
    sales.dropna(subset=["target"], inplace=True)

    # drop only obvious leakage and outcomes not usable at predict time
    sales.drop(columns=["Rank", "Name", "Publisher", "Developer", "Total_Shipped", "Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales"], errors="ignore", inplace=True)

    for c in ["ESRB_Rating", "Genre", "Platform"]:
        if c in sales.columns:
            sales[c] = sales[c].fillna("Unknown")

    X = sales.drop(columns=["target", "Critic_Score"], errors="ignore")
    y = sales["target"]
    strat = y if task_type == "classification" else None
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=strat)

    _datetimes_to_numeric_inplace(X_train)
    _datetimes_to_numeric_inplace(X_test)
    X_train, X_test = _freq_encode_objects_inplace(X_train, X_test)

    num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
    if num_cols:
        simp = SimpleImputer(strategy="median").fit(X_train[num_cols])
        X_train[num_cols] = simp.transform(X_train[num_cols])
        X_test[num_cols] = simp.transform(X_test[num_cols])

    caps = _cap_fit(X_train.select_dtypes(include=["number"]), q=0.95)
    X_train = _cap_apply(X_train, caps)
    X_test = _cap_apply(X_test, caps)

    # no oversampling here

    X_train, X_test = _scale_numeric_only(X_train, X_test, scale_method)

    keep_cols = _select_features(feature_selection, max_features, task_type, random_state, X_train, y_train)
    X_train = X_train[keep_cols]
    X_test = X_test[keep_cols]
    outputs["sales"] = (X_train, X_test, y_train, y_test)

    return outputs

In [7]:
# =============================
# Download Paths
# =============================
print("main: start downloads")
steam_path = safe_kaggle_download("antonkozyriev/game-recommendations-on-steam")
olist_path = safe_kaggle_download("olistbr/brazilian-ecommerce")
vg2019_path = safe_kaggle_download("ashaheedq/video-games-sales-2019")
print("main: downloads finished")

# =============================
# Load All
# =============================
start_total = time.perf_counter()
steam = load_steam_dataset(steam_path, n_rows=N_ROWS, seed=random_state)
olist = load_olist_dataset(olist_path, n_rows=N_ROWS, seed=random_state)
sales = load_vg2019_dataset(vg2019_path, n_rows=N_ROWS, seed=random_state)
end_total = time.perf_counter()
print(f"main: load all done in {round(end_total - start_total, 3)} sec ({format_hms(end_total - start_total)})")

# =============================
# Download Shapes
# =============================
print("download: shapes summary")
print(f"download: steam shape = {None if steam is None else steam.shape}")
print(f"download: olist shape = {None if olist is None else olist.shape}")
print(f"download: sales shape = {None if sales is None else sales.shape}")

main: start downloads
download: starting antonkozyriev/game-recommendations-on-steam
download: done antonkozyriev/game-recommendations-on-steam -> /Users/chandlercampbell/.cache/kagglehub/datasets/antonkozyriev/game-recommendations-on-steam/versions/28 in 0.361 sec
download: starting olistbr/brazilian-ecommerce
download: done olistbr/brazilian-ecommerce -> /Users/chandlercampbell/.cache/kagglehub/datasets/olistbr/brazilian-ecommerce/versions/2 in 0.217 sec
download: starting ashaheedq/video-games-sales-2019
download: done ashaheedq/video-games-sales-2019 -> /Users/chandlercampbell/.cache/kagglehub/datasets/ashaheedq/video-games-sales-2019/versions/2 in 0.186 sec
main: downloads finished
steam: start
steam: shapes games=(50872, 13), users=(14306064, 3), recs=(41154794, 8), meta=(50872, 3)
stratified_sample: picked 10000 of 41154794 rows in 5.885 sec
steam: merge games with metadata
steam: merge recommendations with games
steam: merge with users
dates: converting possible date/time colum

In [8]:
print("ROBUST EDA ON STEAM")
robust_eda(steam)
print("END OF ROBUST EDA ON STEAM\n")
print("ROBUST EDA ON OLIST")
robust_eda(olist)
print("END OF ROBUST EDA ON OLIST\n")
print("ROBUST EDA ON SALES")
robust_eda(sales)
print("END OF ROBUST EDA ON SALES\n")

ROBUST EDA ON STEAM
=== Robust EDA Report ===

DataFrame Shape: (10000, 24)

Data Types:
 app_id                     int64
helpful                    int64
funny                      int64
date              datetime64[ns]
is_recommended              bool
hours                    float64
user_id                    int64
review_id                  int64
title                     object
date_release      datetime64[ns]
win                         bool
mac                         bool
linux                       bool
rating                    object
positive_ratio             int64
user_reviews               int64
price_final              float64
price_original           float64
discount                 float64
steam_deck                  bool
description               object
tags                      object
products                   int64
reviews                    int64
dtype: object

Missing Values:
 app_id            0
helpful           0
funny             0
date              0
is_rec

# Classification

In [9]:

# Classification call
splits = prepare_data(
    steam_df=steam,
    olist_df=olist,
    sales_df=sales,
    test_size=0.2,
    random_state=random_state,
    feature_selection='tree',
    max_features=100,
    task_type='classification',
    scale_method='standard'
)

X_train_steam, X_test_steam, y_train_steam, y_test_steam = splits["steam"]
X_train_olist, X_test_olist, y_train_olist, y_test_olist = splits["olist"]
X_train_sales, X_test_sales, y_train_sales, y_test_sales = splits["sales"]

print("\n=== STEAM Dataset ===")
best_steam_model = build_and_tune_models(
    X_train_steam, y_train_steam,
    task_type="classification",
    num_folds=3,
    num_iterations=20,
    oversample=True
)

score_steam = evaluate_on_holdout(best_steam_model, X_test_steam, y_test_steam, task_type="classification")

print("steam threshold:", getattr(best_steam_model, "best_threshold_", None))

print("\n=== OLIST Dataset ===")
best_olist_model = build_and_tune_models(
    X_train_olist, y_train_olist,
    task_type="classification",
    num_folds=3,
    num_iterations=20,
    oversample=True
)

score_olist = evaluate_on_holdout(best_olist_model, X_test_olist, y_test_olist, task_type="classification")

print("olist threshold:", getattr(best_olist_model, "best_threshold_", None))

print("\n=== SALES Dataset ===")
best_sales_model = build_and_tune_models(
    X_train_sales, y_train_sales,
    task_type="classification",
    num_folds=3,
    num_iterations=20,
    oversample=True
) 

score_sales = evaluate_on_holdout(best_sales_model, X_test_sales, y_test_sales, task_type="classification")

print("sales threshold:", getattr(best_sales_model, "best_threshold_", None))



=== STEAM Dataset ===
Removed 1 constant feature(s).
Streaming results (each line is one model × feature count):
[1/40] GBT | k=2 | F1_macro=0.668788 ± 0.016833
[2/40] GBT | k=5 | F1_macro=0.668788 ± 0.016833
[3/40] GBT | k=9 | F1_macro=0.668788 ± 0.016833
[4/40] GBT | k=14 | F1_macro=0.668788 ± 0.016833
[5/40] GBT | k=18 | F1_macro=0.668788 ± 0.016833
[6/40] RandomForest | k=2 | F1_macro=0.674434 ± 0.019483
[7/40] RandomForest | k=5 | F1_macro=0.674434 ± 0.019483
[8/40] RandomForest | k=9 | F1_macro=0.674434 ± 0.019483
[9/40] RandomForest | k=14 | F1_macro=0.674434 ± 0.019483
[10/40] RandomForest | k=18 | F1_macro=0.674434 ± 0.019483
[11/40] DecisionTree | k=2 | F1_macro=0.663213 ± 0.015647
[12/40] DecisionTree | k=5 | F1_macro=0.663213 ± 0.015647
[13/40] DecisionTree | k=9 | F1_macro=0.663213 ± 0.015647
[14/40] DecisionTree | k=14 | F1_macro=0.663213 ± 0.015647
[15/40] DecisionTree | k=18 | F1_macro=0.663213 ± 0.015647
[16/40] LogisticRegression | k=2 | F1_macro=0.456025 ± 0.019314


# Regression

In [10]:

# Regression call
splits = prepare_all(
    steam, olist, sales,
    task_type="regression",
    test_size=0.2
)

X_train_steam, X_test_steam, y_train_steam, y_test_steam = splits["steam"]
X_train_olist, X_test_olist, y_train_olist, y_test_olist = splits["olist"]
X_train_sales, X_test_sales, y_train_sales, y_test_sales = splits["sales"]

best_steam_model = build_and_tune_models(
    X_train_steam, y_train_steam,
    task_type="regression",
    num_folds=3,
    num_iterations=20
)

best_olist_model = build_and_tune_models(
    X_train_olist, y_train_olist,
    task_type="regression",
    num_folds=3,
    num_iterations=20
)

best_sales_model = build_and_tune_models(
    X_train_sales, y_train_sales,
    task_type="regression",
    num_folds=3,
    num_iterations=20
)

NameError: name 'prepare_all' is not defined