# PSSM Feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test,
)
from subpred.pssm import calculate_pssms_notebook

# Dataset

In [2]:
outliers = (
    ["Q9HBR0", "Q07837"]
    + ["P76773", "Q47706", "P02943", "P75733", "P69856", "P64550"]
    + ["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0",],
)
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[3702, 9606, 83333, 559292],
    output_log="../logs/meta_amino_sugar_dataset.log",
    outliers=outliers,
    sequence_clustering=70
)
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    83333: "E. coli",
    559292: "Yeast",
}
df = df.assign(organism=df.organism_id.map(taxid_to_organism))

cd-hit: clustered 428 sequences into 362 clusters at threshold 70


# Feature generation

In [3]:
labels = df.keywords_transport
labels.value_counts()

Sugar transport         186
Amino-acid transport    176
Name: keywords_transport, dtype: int64

In [4]:
df_pssm = calculate_pssms_notebook(df.sequence)
df_pssm

Unnamed: 0_level_0,AA_50_1,AR_50_1,AN_50_1,AD_50_1,AC_50_1,AQ_50_1,AE_50_1,AG_50_1,AH_50_1,AI_50_1,...,VL_90_3,VK_90_3,VM_90_3,VF_90_3,VP_90_3,VS_90_3,VT_90_3,VW_90_3,VY_90_3,VV_90_3
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.873913,0.156522,0.273913,0.108696,0.421739,0.295652,0.178261,0.513043,0.134783,0.408696,...,0.496933,0.487730,0.503067,0.558282,0.450920,0.472393,0.475460,0.490798,0.592025,0.481595
Q9SFG0,0.784223,0.252900,0.327146,0.238979,0.394432,0.350348,0.276102,0.545244,0.227378,0.317865,...,0.434307,0.381387,0.421533,0.578467,0.357664,0.390511,0.392336,0.512774,0.656934,0.417883
Q08986,0.734091,0.259091,0.313636,0.220455,0.393182,0.295455,0.234091,0.529545,0.265909,0.415909,...,0.425047,0.345351,0.402277,0.584440,0.282732,0.351044,0.351044,0.605313,0.759013,0.387097
Q9BRV3,0.676768,0.488215,0.508418,0.464646,0.602694,0.511785,0.478114,0.565657,0.511785,0.612795,...,0.484375,0.403125,0.471875,0.706250,0.368750,0.443750,0.440625,0.568750,0.856250,0.478125
Q84WN3,0.664740,0.416185,0.462428,0.427746,0.624277,0.445087,0.456647,0.526012,0.479769,0.543353,...,0.383260,0.264317,0.374449,0.726872,0.215859,0.286344,0.312775,0.493392,0.982379,0.352423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9FHH5,0.685484,0.193548,0.241935,0.217742,0.266129,0.266129,0.250000,0.314516,0.209677,0.266129,...,0.339130,0.295652,0.304348,0.634783,0.373913,0.365217,0.339130,0.513043,0.747826,0.278261
Q8S8A0,0.898305,0.186441,0.254237,0.262712,0.288136,0.279661,0.322034,0.381356,0.262712,0.322034,...,0.500000,0.532787,0.508197,0.598361,0.483607,0.516393,0.516393,0.557377,0.704918,0.500000
Q3E965,0.880342,0.273504,0.333333,0.264957,0.367521,0.316239,0.307692,0.401709,0.290598,0.324786,...,0.505155,0.556701,0.525773,0.608247,0.505155,0.536082,0.515464,0.587629,0.742268,0.484536
Q3EAV6,0.611511,0.294964,0.309353,0.316547,0.330935,0.323741,0.323741,0.374101,0.302158,0.338129,...,0.444444,0.444444,0.487179,0.538462,0.435897,0.487179,0.461538,0.538462,0.606838,0.461538


## Independent test set

In [5]:
X, y, feature_names, sample_names = preprocess_pandas(
    df_pssm, labels, return_names=True
)
(
    X_train,
    X_test,
    y_train,
    y_test,
    sample_names_train,
    sample_names_test,
) = get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)



## Model comparison

PSSM seems to work better than the sequence-based features. SVC looks the most promising.

In [7]:
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.619,0.651,0.635,0.523,0.613,0.608,0.05
KNeighborsClassifier(),0.791,0.844,0.828,0.792,0.806,0.812,0.023
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.879,0.896,0.861,0.879,0.807,0.865,0.035
"LinearSVC(max_iter=1000000.0, random_state=0)",0.879,0.896,0.861,0.879,0.807,0.865,0.035
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.879,0.826,0.844,0.844,0.86,0.851,0.02
RandomForestClassifier(random_state=0),0.862,0.789,0.879,0.759,0.877,0.833,0.056
SGDClassifier(random_state=0),0.844,0.896,0.914,0.862,0.824,0.868,0.037
"SVC(class_weight='balanced', random_state=0)",0.862,0.914,0.931,0.861,0.93,0.899,0.035
SVC(random_state=0),0.862,0.896,0.931,0.861,0.93,0.896,0.035


## Parameter tuning

#### Custom transformer

Here, we try the multi-pssm feature, which tries all combinations of feature generation parameters, and selects the best ones based on the training set. First without the transformer:

In [8]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction=None,
    C=[0.01, 0.1, 1, 10],
)

{'linearsvc__C': 10, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0}
0.875


The pssmselector increases the scores a bit:

In [9]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction=None,
    feature_transformer="pssm", 
    feature_names = feature_names,
    C=[0.001, 0.01, 0.1, 1]
)

{'linearsvc__C': 0.01, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50}
0.889


The RBF kernel improves the results further:

In [12]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction=None,
    C=[0.1, 1, 10, 100],
)

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.924


Here, the pssmselector chooses to select all pssms, leading to the same model: 

In [15]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction=None,
    C=[0.1, 1, 10, 100],
    feature_transformer="pssm",
    feature_names=feature_names,
)
best_estimator_rbf = gsearch

{'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 'all', 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.924


RBF is the best one so far.

## Dimensionality reduction

In [16]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="pca",
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.95}
0.885


In [17]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="pca",
    C=[10, 1, 0.1, 0.01],
    feature_transformer="pssm",
    feature_names=feature_names,
)
best_estimator_linearsvc_pca = gsearch

{'linearsvc__C': 0.01, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.96, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50}
0.91


In [18]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[0.1, 1, 10, 100],
    # gamma=["scale"],
)

{'pca__n_components': 0.96, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.927


That already looks good, now with the PSSMSelector:

In [19]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    feature_transformer="pssm",
    feature_names=feature_names,
    # C=[1, 0.1, 10],
    # gamma=["scale"],
)
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.96, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 'scale'}
0.934


A good score with default parameters, and 99% of the variance. 

## Validation


### RBF kernel without feature selection

Without lowering dimensions, we get worse performance for Sugar:

In [24]:
get_confusion_matrix(X_test, y_test, best_estimator_rbf, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,33,2
Sugar transport,11,27


In [25]:
get_classification_report(X_test, y_test, best_estimator_rbf, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.75,0.943,0.835,35
Sugar transport,0.931,0.711,0.806,38
macro avg,0.841,0.827,0.821,73
weighted avg,0.844,0.822,0.82,73


### Linear kernel with PCA

Improves the results for sugar transporters.

In [20]:
get_confusion_matrix(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,33,2
Sugar transport,5,33


In [21]:
get_classification_report(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.868,0.943,0.904,35
Sugar transport,0.943,0.868,0.904,38
macro avg,0.906,0.906,0.904,73
weighted avg,0.907,0.904,0.904,73


### RBF + PCA

RBF kernel and pca leads to the best model.


In [26]:
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,33,2
Sugar transport,4,34


In [27]:
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.892,0.943,0.917,35
Sugar transport,0.944,0.895,0.919,38
macro avg,0.918,0.919,0.918,73
weighted avg,0.919,0.918,0.918,73


### Conclusion

PSSM with PCA and RBF-SVC is suitable for stable multi-organism models.

## Estimating validation variance

How much did the result depend on choosing the training and test sets?

Mean and standard deviation for randomly selected training and validation sets.

#### RBF+PCA 

In [28]:
df_scores, df_params = full_test(
    df_pssm,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
print("Mean F1")
display(df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score"))
print("Parameters")
display(df_params)

KeyboardInterrupt: 