# AAC feature evaluation (only Eukaryotes)

During the dataset evaluation, we found that the E. coli transporters form their own cluster in the PCA plot. How does the model perform without the E. coli transporters?

# Imports

In [None]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_aac
from subpred.plots import pca_plot_2d

# Dataset

In [None]:
# Accessions passed to create_dataset through its `outliers` argument.
outliers = [
    "Q9HBR0",
    "Q07837",
    "O81775",
    "Q9SW07",
    "Q9FHH5",
    "Q8S8A0",
    "Q3E965",
    "Q3EAV6",
    "Q3E8L0",
]

# Organisms included in this notebook. The keys (taxonomy IDs) are also used
# as the taxonomy filter below, so the filter and the display names cannot
# drift apart.
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    559292: "Yeast",
}

df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=list(taxid_to_organism),
    output_log="../logs/meta_amino_sugar_dataset.log",
    outliers=outliers,
    sequence_clustering=70,
)

# Add a human-readable organism column derived from the taxonomy ID.
df = df.assign(organism=df["organism_id"].map(taxid_to_organism))


In [None]:
# Show every entry still in the dataset whose gene names contain "GDU".
# NOTE(review): presumably a check on the outlier filtering above - confirm.
#
# na=False counts entries without gene names as non-matches. Without it a
# missing value puts NaN into the mask and the boolean indexing raises.
# regex=False because "GDU" is a literal substring, not a pattern.
df[df.gene_names.str.contains("GDU", na=False, regex=False)]

# Feature generation

## Labels

In [None]:
# The class labels are the values of the `keywords_transport` column.
labels = df["keywords_transport"]
# Number of samples per class.
labels.value_counts()

## AAC

In [None]:
# Build the AAC feature table from the sequence column, then display it.
df_aac = calculate_aac(df["sequence"])
df_aac

In [None]:
# 2D PCA plot of the AAC features; the organism and label columns are passed
# along so the plot can distinguish samples by both.
pca_plot_2d(df_aac, df["organism"], df["keywords_transport"])

# Display the dataset itself as the cell output.
df

### Independent test set

In [None]:
# Convert the feature table and labels into arrays, keeping the feature and
# sample names so results can be traced back to columns and proteins.
X, y, feature_names, sample_names = preprocess_pandas(
    df_aac, labels, return_names=True
)

# Hold out 20% of the samples as an independent test set.
# NOTE(review): no seed is passed here - confirm that
# get_independent_test_set fixes one internally if the split must be
# reproducible.
split = get_independent_test_set(
    X, y, sample_names=sample_names, test_size=0.2
)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = split

### Model selection


In [None]:
# Compare candidate models on the training split only, so the held-out test
# set is not touched during model selection.
models_quick_compare(X_train, y_train)

### Parameter tuning

The RBF kernel delivers slightly better results than the linear kernel:

In [None]:
# Hyperparameter search with a linear kernel on the training split.
# The result is only inspected here; gsearch is overwritten by later cells.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear")

In [None]:
# RBF-kernel search over an explicit C grid. The result is also stored under
# its own name because gsearch is reassigned by the following cells.
# NOTE(review): despite the name, this presumably holds the search object
# rather than the fitted estimator - confirm.
best_estimator_svc = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", C=[0.1, 1, 10, 100]
)

### Dimensionality reduction

Kbest performs slightly worse, and only removes one feature:

In [None]:
# RBF-kernel search with "kbest" dimensionality reduction, using the default
# parameter grid of optimize_hyperparams.
# Alternative narrower grid kept for reference: C=[1,10,100], gamma=["scale"]
best_estimator_svc_kbest = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="kbest"
)

In [None]:
# Take the "selectkbest" step out of the best pipeline and list the features
# it did NOT keep (get_support() marks the kept ones, hence the negation).
kbest_step = best_estimator_svc_kbest.best_estimator_["selectkbest"]
feature_names[~kbest_step.get_support()]

The amino acid whose removal improves the model is Glu.

PCA performs worse, which might be caused by the fact that it removes more information to reach 99% of variance explained:

In [None]:
# Grid explored for the PCA variant: regularisation strength and RBF gamma.
pca_search_grid = {
    "C": [0.1, 1, 10, 100],
    "gamma": ["scale", 1, 0.1, 0.01, 0.001],
}

# RBF-kernel search with PCA as the dimensionality reduction step.
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", **pca_search_grid
)

### Validation

Kbest had a slightly worse performance on the training set than PCA, and a much better performance on the test set. The performance of the model without feature selection is between that of PCA and Kbest.

#### Kbest

In [None]:
# Confusion matrix of the Kbest search result on the held-out test split.
# `labels` is the full label Series of the dataset.
# NOTE(review): presumably used for the class names - confirm.
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

In [None]:
# Classification report of the Kbest search result on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

#### No feature selection:

In [None]:
# Confusion matrix on the held-out test split for the search result obtained
# without any dimensionality reduction.
get_confusion_matrix(X_test, y_test, best_estimator_svc, labels=labels)

In [None]:
# Classification report on the held-out test split for the search result
# obtained without any dimensionality reduction.
get_classification_report(X_test, y_test, best_estimator_svc, labels=labels)

#### PCA

In [None]:
# Confusion matrix of the PCA search result on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

In [None]:
# Classification report of the PCA search result on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

## Checking dependence on train test split


### Kbest model

On average, the model performs about as well as on the split above:

In [None]:
# Repeat the whole evaluation 10 times with the Kbest + RBF configuration to
# see how much the scores depend on the particular train/test split.
df_scores, df_params = full_test(
    df_aac, labels, dim_reduction="kbest", kernel="rbf", repetitions=10
)

# Aggregate the repetitions per (label, dataset) pair, then lay the F1 scores
# out as a label x dataset table.
scores_by_group = df_scores.groupby(["label", "dataset"], as_index=False)
f1_table_layout = {"index": "label", "columns": "dataset", "values": "F1 score"}

# Mean F1 score across repetitions.
display(scores_by_group.mean().pivot(**f1_table_layout))
# Standard deviation of the F1 score across repetitions.
display(scores_by_group.std().pivot(**f1_table_layout))

### PCA model

As before, the PCA model delivers worse results than Kbest:

In [None]:
# Repeat the whole evaluation 10 times with the PCA + RBF configuration.
df_scores, df_params = full_test(
    df_aac, labels, dim_reduction="pca", kernel="rbf", repetitions=10
)

# Aggregate the repetitions per (label, dataset) pair, then lay the F1 scores
# out as a label x dataset table.
scores_by_group = df_scores.groupby(["label", "dataset"], as_index=False)
f1_table_layout = {"index": "label", "columns": "dataset", "values": "F1 score"}

# Mean F1 score across repetitions.
display(scores_by_group.mean().pivot(**f1_table_layout))
# Standard deviation of the F1 score across repetitions.
display(scores_by_group.std().pivot(**f1_table_layout))

### All features

Interestingly, the average performance is actually better without feature selection. The improvement from removing Glu might have been specific to the particular split used above.

In [None]:
# Repeat the whole evaluation 10 times with the RBF kernel and no
# dimensionality reduction argument.
df_scores, df_params = full_test(df_aac, labels, kernel="rbf", repetitions=10)

# Aggregate the repetitions per (label, dataset) pair, then lay the F1 scores
# out as a label x dataset table.
scores_by_group = df_scores.groupby(["label", "dataset"], as_index=False)
f1_table_layout = {"index": "label", "columns": "dataset", "values": "F1 score"}

# Mean F1 score across repetitions.
display(scores_by_group.mean().pivot(**f1_table_layout))
# Standard deviation of the F1 score across repetitions.
display(scores_by_group.std().pivot(**f1_table_layout))

# Show the second result of full_test as the cell output.
df_params

## Conclusion

