# PAAC Feature evaluation

# Imports

In [None]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [None]:
# Protein identifiers (presumably UniProt accessions — confirm) handed to
# create_dataset through its `outliers` argument.
outliers = [
    "Q9HBR0",
    "Q07837",
    "O81775",
    "Q9SW07",
    "Q9FHH5",
    "Q8S8A0",
    "Q3E965",
    "Q3EAV6",
    "Q3E8L0",
]

# Taxonomy IDs of the organisms in this dataset, with display names.
# The keys also serve as the taxonomy filter below, so both stay in sync.
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    559292: "Yeast",
}

# Build the amino-acid vs. sugar transporter dataset from the Swiss-Prot export.
df = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    tax_ids_filter=list(taxid_to_organism),
    multi_substrate="integrate",
    outliers=outliers,
    sequence_clustering=70,
    output_log="../logs/meta_amino_sugar_dataset.log",
    verbose=True,
)

# Add a human-readable organism column derived from the taxonomy ID.
df = df.assign(organism=df["organism_id"].map(taxid_to_organism))


# Feature generation

In [None]:
# Class labels are taken from the dataset's `keywords_transport` column.
labels = df["keywords_transport"]
# Show how many samples each class has.
labels.value_counts()

In [None]:
# Compute the PAAC feature table from the protein sequences.
df_paac = calculate_paac(df["sequence"])
# Display the resulting feature table.
df_paac

## Independent test set

In [None]:
# Convert the feature table and labels into model inputs, keeping the feature
# and sample names for later inspection.
X, y, feature_names, sample_names = preprocess_pandas(
    df_paac,
    labels,
    return_names=True,
)

# Hold out 20% of the samples (test_size=0.2) as an independent test set; the
# sample names are split alongside the data.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(
        X,
        y,
        sample_names=sample_names,
        test_size=0.2,
    )
)

## Model comparison



In [None]:
# Quick comparison of candidate models on the training split only; the
# independent test set is not touched here.
models_quick_compare(X_train, y_train)

## Parameter tuning

The RBF kernel delivers better results than the linear kernel when all features are used.

In [None]:
# Linear kernel, all features: tune C on the training split.
# The search result is kept under both names used elsewhere in the notebook.
best_estimator_lsvc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    C=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)

In [None]:
# RBF kernel, all features: tune C on the training split.
# The search result is kept under both names used elsewhere in the notebook.
best_estimator_svc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    C=[0.1, 1, 10, 100],
)

## Dimensionality reduction

### Linear kernel

PCA does not lead to improvements:

In [None]:
# Linear kernel with PCA as the dimensionality-reduction step: tune C on the
# training split. The search result is kept under both names.
best_estimator_lsvc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction="pca", C=[1, 0.01, 0.1, 10]
)

KBest feature selection performs worse with the linear kernel:

In [None]:
# Linear kernel with k-best feature selection: tune C on the training split,
# with zero-variance removal enabled and select_k_steps=20.
# The search result is kept under both names.
best_estimator_lsvc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    C=[0.1, 1, 10],
    dim_reduction="kbest",
    select_k_steps=20,
    remove_zero_var=True,
)

### RBF kernel

In [None]:
# RBF kernel with PCA as the dimensionality-reduction step: tune C on the
# training split. The search result is kept under both names.
# gamma is not searched here; a disabled candidate grid was:
#   [1e-0, 1e-1, 1e-2, 1e-3, "scale"]
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", C=[0.1, 1, 10, 100]
)

With the RBF kernel, the KBest model performs best. It removes only nine features.

In [None]:
# RBF kernel with k-best feature selection: tune C on the training split,
# with zero-variance removal enabled and select_k_steps=10.
# The search result is kept under both names.
best_estimator_svc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    C=[0.1, 1, 10, 100],
    dim_reduction="kbest",
    select_k_steps=10,
    remove_zero_var=True,
)

In [None]:
# Names of the features NOT kept by the "selectkbest" step of the tuned RBF
# model: get_support() is negated, so the selection picks the dropped features.
# NOTE(review): assumes get_support() returns a boolean mask aligned with
# feature_names — confirm.
feature_names[~best_estimator_svc_kbest.best_estimator_["selectkbest"].get_support()]

The linear kernel with feature selection shows the best result here.

## Validation

RBF with PCA delivers the best results on the test set:

### Linear kernel

In [None]:
# Linear kernel, all features: confusion matrix, then classification report,
# both computed on the independent test set.
for report_fn in (get_confusion_matrix, get_classification_report):
    display(report_fn(X_test, y_test, best_estimator_lsvc, labels=labels))

In [None]:
# Linear kernel with k-best selection: confusion matrix, then classification
# report, both computed on the independent test set.
for report_fn in (get_confusion_matrix, get_classification_report):
    display(report_fn(X_test, y_test, best_estimator_lsvc_kbest, labels=labels))

In [None]:
# Linear kernel with PCA: confusion matrix, then classification report, both
# computed on the independent test set.
for report_fn in (get_confusion_matrix, get_classification_report):
    display(report_fn(X_test, y_test, best_estimator_lsvc_pca, labels=labels))

### RBF kernel

In [None]:
# RBF kernel, all features: confusion matrix, then classification report,
# both computed on the independent test set.
for report_fn in (get_confusion_matrix, get_classification_report):
    display(report_fn(X_test, y_test, best_estimator_svc, labels=labels))

In [None]:
# RBF kernel with k-best selection: confusion matrix, then classification
# report, both computed on the independent test set.
for report_fn in (get_confusion_matrix, get_classification_report):
    display(report_fn(X_test, y_test, best_estimator_svc_kbest, labels=labels))

In [None]:
# RBF kernel with PCA: confusion matrix, then classification report, both
# computed on the independent test set.
for report_fn in (get_confusion_matrix, get_classification_report):
    display(report_fn(X_test, y_test, best_estimator_svc_pca, labels=labels))

## Conclusion

The models achieve F1 scores of around 0.80 with only PAAC. This is relatively consistent between training set and test set, and between the two substrates.

## Estimating validation variance 


In [None]:
# Repeat the full evaluation 10 times with the RBF kernel and PCA to see how
# much the scores vary between runs.
# NOTE(review): select_k_steps is passed although dim_reduction is "pca" —
# confirm whether full_test uses it in this configuration.
df_scores, df_params = full_test(
    df_paac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    remove_zero_var=True,
    select_k_steps=20,
)

# Aggregate the F1 scores per (label, dataset) pair and show the mean and the
# standard deviation as label x dataset tables.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
f1_pivot = dict(index="label", columns="dataset", values="F1 score")
display(df_scores_gr.mean().pivot(**f1_pivot))
display(df_scores_gr.std().pivot(**f1_pivot))

# Display the parameter table returned by full_test.
df_params

For the meta-organism, the AAC actually performs better than the PAAC, possibly because the PAAC introduces more noise. Only a small subset of PAAC features has high frequencies. The AAC better captures the fact that H and G are among the most important features.