In [5]:
# %cd ..
import numpy as np
import pandas as pd
from sklearn import (
    ensemble,
    feature_selection,
    model_selection,
    pipeline,
    preprocessing
)

In [2]:
# Tab-separated table indexed by the "genome" column; the remaining columns
# (g0, g1, ...) are used as features by the search further down.
pangenome = pd.read_csv(
    "data/pangenome/pangenome.full.tsv",
    index_col="genome",
    sep="\t",
)
pangenome

Unnamed: 0_level_0,g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,...,g63923,g63924,g63925,g63926,g63927,g63928,g63929,g63930,g63931,g63932
genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000149955.2,27,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
GCA_000222805.1,27,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
GCA_000259975.2,31,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
GCA_000260175.2,27,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
GCA_000260215.2,29,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCA_032878545.1,30,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
GCA_032991405.1,28,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
GCA_034509825.1,30,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
GCA_036785135.1,32,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Class labels: the "fsp" column of the accession sheet, keyed by genome.
target = pd.read_csv(
    "accessions.tsv",
    index_col="genome",
    sep="\t",
)["fsp"]
target

genome
GCA_000149955.2     lycopersici
GCA_000222805.1    conglutinans
GCA_000259975.2     lycopersici
GCA_000260175.2     vasinfectum
GCA_000260215.2    conglutinans
                       ...     
GCA_032878545.1     vasinfectum
GCA_032991405.1     vasinfectum
GCA_034509825.1         cubense
GCA_036785135.1    conglutinans
GCA_038050555.1     vasinfectum
Name: fsp, Length: 242, dtype: object

In [4]:
def logstep(start: int, end: int, step: float):
    """Return integers spaced geometrically from ``start`` up to ``end``.

    The values are ``round(start ** (1 + k * step))`` for ``k = 0, 1, 2, ...``
    while they stay below ``end``; ``end`` itself is always the last element.

    Example: ``logstep(10, 1200, 1) -> [10, 100, 1000, 1200]``.

    Parameters
    ----------
    start : first value and base of the exponentiation.
    end : upper bound, always appended as the final element.
    step : increment applied to the exponent on every iteration.

    Returns
    -------
    list
        Strictly increasing values ending with ``end``. When ``start >= end``
        the result is ``[end]``.

    Raises
    ------
    ValueError
        If ``start < end`` but ``start <= 1`` or ``step <= 0``: the sequence
        would never reach ``end`` and the loop would not terminate.
    """
    if start >= end:
        return [end]
    if start <= 1 or step <= 0:
        raise ValueError(
            "logstep needs start > 1 and step > 0 to reach end, "
            f"got start={start!r}, step={step!r}"
        )
    steps = [start]
    power = 1 + step
    current = round(start**power)
    while current < end:
        # Rounding can map neighbouring exponents onto the same integer when
        # the step is small; keep each value only once.
        if current != steps[-1]:
            steps.append(current)
        power += step
        current = round(start**power)
    steps.append(end)
    return steps

In [81]:
def random_feature_names(
    transformer: preprocessing.FunctionTransformer,
    input_features: np.ndarray
):
    """Return the input feature names picked by the random column draw.

    The draw is rebuilt from the transformer's ``kw_args`` (falling back to
    ``seed=0`` and ``size=10`` for anything not supplied) using the same
    generator seed, ``seed + size``, that ``random_selector`` uses.
    """
    settings = {"seed": 0, "size": 10}
    overrides = transformer.get_params()["kw_args"]
    if overrides is not None:
        settings.update(overrides)
    count = settings["size"]
    rng = np.random.default_rng(settings["seed"] + count)
    picked = rng.choice(input_features.shape[0], count, replace=False)
    return input_features[picked]

def random_selector(X: np.ndarray, seed: int = 0, size: int = 10):
    """Keep ``size`` columns of ``X`` drawn at random without replacement.

    The generator is seeded with ``seed + size``; the columns are returned
    in the order they were drawn.
    """
    rng = np.random.default_rng(seed + size)
    n_columns = X.shape[1]
    picked = rng.choice(n_columns, size, replace=False)
    return X[:, picked]

# Transformer wrapping random_selector. Only "seed" is supplied here, so the
# selector's own default for "size" applies until kw_args is replaced.
RandomSelector = preprocessing.FunctionTransformer(
    func=random_selector,
    feature_names_out=random_feature_names,
    kw_args=dict(seed=10),
    validate=True,
)
RandomSelector

In [82]:
def run_search(
    dataframe: pd.DataFrame,
    target: pd.Series,
    model,
    grid: dict,
    n_jobs: int = 10,
    n_seeds: int = 1,
):
    """Grid-search ``model`` behind a random column selector.

    A two-step pipeline (``RandomSelector`` then ``model``) is evaluated with
    a shuffled 4-fold ``StratifiedKFold`` (``random_state=0``) and the
    ``f1_weighted`` score. The selector's ``kw_args`` are searched over every
    combination of seed ``0 .. n_seeds - 1`` and the sizes produced by
    ``logstep(10, dataframe.shape[1], 0.5)``.

    Parameters
    ----------
    dataframe : feature table, one row per sample.
    target : labels. When its index differs from ``dataframe.index`` but
        holds a unique label for every row of ``dataframe``, it is reordered
        to match; otherwise it is passed through unchanged.
    model : estimator placed in the pipeline's ``"model"`` step.
    grid : parameter grid for ``model``; keys are given without the
        ``model__`` prefix, which is added here.
    n_jobs : forwarded to ``GridSearchCV``.
    n_seeds : number of selector seeds to try.

    Returns
    -------
    pd.DataFrame
        ``cv_results_`` without the ``params`` column, sorted by
        ``rank_test_score``.
    """
    # scikit-learn pairs rows of X and y by position and ignores pandas
    # indices, so put the labels into the row order of the features first.
    if (
        isinstance(target, pd.Series)
        and not target.index.equals(dataframe.index)
        and target.index.is_unique
        and dataframe.index.isin(target.index).all()
    ):
        target = target.loc[dataframe.index]

    cv = model_selection.StratifiedKFold(4, shuffle=True, random_state=0)
    estimator = pipeline.Pipeline([
        ("select", RandomSelector),
        ("model", model)
    ])
    sizes = logstep(10, dataframe.shape[1], 0.5)
    seeds = range(n_seeds)
    param_grid = {
        "select__kw_args": [
            {"seed": seed, "size": size} for seed in seeds for size in sizes
        ],
        **{f"model__{key}": value for key, value in grid.items()}
    }
    search = model_selection.GridSearchCV(
        estimator,
        param_grid,
        scoring="f1_weighted",
        n_jobs=n_jobs,
        verbose=1,
        cv=cv
    ).fit(dataframe, target)
    return (
        pd.DataFrame(search.cv_results_)
        .drop(columns="params")
        .sort_values("rank_test_score", ignore_index=True)
    )

In [None]:
# NOTE(review): this grid holds 3 * 3 * 3 * 2 = 54 forest settings, each
# combined with every selector size; re-run the cell before relying on any
# saved output shown below it.
run_search(
    dataframe=pangenome,
    target=target,
    model=ensemble.RandomForestClassifier(),
    grid={
        "n_estimators": [100, 200, 300],
        "criterion": ["gini", "entropy", "log_loss"],
        "max_features": ["sqrt", "log2", None],
        "bootstrap": [True, False],
        "random_state": [0],
    },
)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__random_state,param_select__kw_args,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.934417,0.005343,0.281176,0.014201,0,"{'seed': 0, 'size': 31623}",0.878566,0.91588,0.901528,0.924935,0.905227,0.01751,1
1,0.629553,0.025359,0.180193,0.004812,0,"{'seed': 0, 'size': 63933}",0.849747,0.91588,0.932778,0.891757,0.89754,0.031208,2
2,0.806263,0.00959,0.316725,0.003523,0,"{'seed': 0, 'size': 1000}",0.843577,0.881289,0.934247,0.889276,0.887097,0.032232,3
3,0.487562,0.023571,0.181101,0.005359,0,"{'seed': 0, 'size': 3162}",0.817537,0.891914,0.872572,0.910651,0.873168,0.034827,4
4,0.848131,0.023863,0.30047,0.00298,0,"{'seed': 0, 'size': 10000}",0.833769,0.869864,0.863052,0.911773,0.869614,0.027863,5
5,0.782227,0.020853,0.318229,0.00282,0,"{'seed': 0, 'size': 316}",0.800154,0.815922,0.836684,0.773271,0.806508,0.023153,6
6,0.554752,0.138555,0.215893,0.055075,0,"{'seed': 0, 'size': 100}",0.613115,0.750306,0.603228,0.615596,0.645561,0.060651,7
7,0.816892,0.011541,0.311043,0.001113,0,"{'seed': 0, 'size': 32}",0.343588,0.433506,0.450342,0.397601,0.406259,0.040891,8
8,0.731628,0.138291,0.282054,0.061681,0,"{'seed': 0, 'size': 10}",0.155824,0.174551,0.276935,0.17188,0.194798,0.04796,9
