# PSSM Feature evaluation

# Imports

In [32]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test,
)
from subpred.plots import pca_plot_2d
from subpred.util import get_feature_score
from subpred.pssm import calculate_pssms_notebook
import pandas as pd
import seaborn as sns

# Dataset

In [33]:
# Build the dataset from the reviewed Swiss-Prot export: proteins of taxonomy
# ID 83333 (E. coli, going by the log file name) that carry the transmembrane
# and transport keywords and are annotated as amino-acid or sugar transporters.
df = create_dataset(
    # Input / logging
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/ecoli_amino_sugar_dataset.log",
    verbose=True,
    # Keyword and organism filters
    tax_ids_filter=[83333],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    # Accessions handed to create_dataset as outliers
    # (presumably removed from the dataset -- confirm in create_dataset).
    outliers=["P76773", "Q47706", "P02943", "P75733", "P69856", "P64550"],
    # cd-hit sequence clustering at threshold 70 (see the cell output).
    sequence_clustering=70,
)

cd-hit: clustered 99 sequences into 98 clusters at threshold 70


# Feature generation

In [34]:
# The transport keyword of each protein is the class label; show the class balance.
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    51
Sugar transport         47
Name: keywords_transport, dtype: int64

In [35]:
# PSSM-based feature table, one row per protein (indexed by UniProt accession).
# Column names look like <residue pair>_<uniref threshold>_<iterations>
# (e.g. AA_50_1, VV_90_3) -- TODO confirm against calculate_pssms_notebook.
df_pssm = calculate_pssms_notebook(df["sequence"])
df_pssm

Unnamed: 0_level_0,AA_50_1,AR_50_1,AN_50_1,AD_50_1,AC_50_1,AQ_50_1,AE_50_1,AG_50_1,AH_50_1,AI_50_1,...,VL_90_3,VK_90_3,VM_90_3,VF_90_3,VP_90_3,VS_90_3,VT_90_3,VW_90_3,VY_90_3,VV_90_3
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.873913,0.156522,0.273913,0.108696,0.421739,0.295652,0.178261,0.513043,0.134783,0.408696,...,0.496933,0.487730,0.503067,0.558282,0.450920,0.472393,0.475460,0.490798,0.592025,0.481595
P36672,0.756944,0.252315,0.358796,0.263889,0.469907,0.321759,0.293981,0.546296,0.261574,0.458333,...,0.436330,0.421348,0.436330,0.556180,0.370787,0.464419,0.423221,0.503745,0.644195,0.436330
P56580,0.658537,0.262195,0.268293,0.256098,0.323171,0.298780,0.301829,0.426829,0.195122,0.286585,...,0.410876,0.389728,0.407855,0.504532,0.371601,0.413897,0.404834,0.444109,0.555891,0.398792
P0AA47,0.930939,0.179558,0.276243,0.193370,0.505525,0.284530,0.237569,0.585635,0.218232,0.522099,...,0.464052,0.416122,0.472767,0.640523,0.350763,0.420479,0.424837,0.570806,0.838780,0.461874
P08722,0.740234,0.191406,0.310547,0.224609,0.300781,0.277344,0.308594,0.427734,0.189453,0.349609,...,0.443131,0.437223,0.454948,0.514032,0.410635,0.438700,0.440177,0.472674,0.584934,0.447563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P19642,0.825949,0.322785,0.398734,0.319620,0.430380,0.452532,0.382911,0.560127,0.401899,0.493671,...,0.480253,0.461295,0.481833,0.532385,0.454976,0.464455,0.464455,0.519747,0.593997,0.473934
P0AAD4,0.837629,0.250000,0.337629,0.255155,0.559278,0.355670,0.319588,0.572165,0.291237,0.556701,...,0.525449,0.494012,0.519461,0.622754,0.479042,0.510479,0.517964,0.574850,0.718563,0.508982
P23173,0.744186,0.284884,0.293605,0.252907,0.514535,0.366279,0.308140,0.482558,0.311047,0.500000,...,0.546237,0.464516,0.548387,0.608602,0.455914,0.516129,0.511828,0.531183,0.752688,0.529032
P33361,0.822430,0.420561,0.370093,0.315888,0.429907,0.400000,0.355140,0.568224,0.351402,0.457944,...,0.547009,0.527066,0.539886,0.574074,0.521368,0.531339,0.532764,0.571225,0.589744,0.544160


## Independent test set

In [36]:
# Convert the feature table and labels into arrays, keeping feature and sample names.
X, y, feature_names, sample_names = preprocess_pandas(df_pssm, labels, return_names=True)

# Hold out 20% of the samples as an independent test set; everything below is
# tuned on the training part only.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)



## Model comparison

Linear SVC looks promising before feature selection, more so than RBF. Could be that the linear kernel prevents overfitting, due to the many features and few samples.

In [37]:
# Quick baseline: score a set of untuned classifiers on the training split.
# The resulting table has five score columns (0-4) plus their mean and std per estimator.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.619,0.686,0.625,0.722,0.583,0.647,0.056
KNeighborsClassifier(),0.867,0.746,0.875,0.598,0.785,0.774,0.112
"LinearSVC(class_weight='balanced', max_iter=1000000.0)",1.0,0.937,0.937,0.661,0.932,0.893,0.133
LinearSVC(max_iter=1000000.0),1.0,0.937,0.937,0.661,0.932,0.893,0.133
RandomForestClassifier(),0.515,0.686,0.676,0.796,0.583,0.651,0.107
RandomForestClassifier(class_weight='balanced'),0.733,0.686,0.812,0.722,0.661,0.723,0.057
SGDClassifier(),0.935,0.873,0.937,0.525,0.866,0.827,0.172
SVC(),0.792,0.746,0.875,0.722,0.661,0.759,0.08
SVC(class_weight='balanced'),0.619,0.746,0.812,0.722,0.661,0.712,0.075


## Parameter tuning

#### Custom transformer

Here, we try the multi-pssm feature, which tries all combinations of feature generation parameters, and selects the best ones based on the training set. First without the transformer:

In [38]:
# Baseline search: linear kernel on all PSSM features, no PSSM selector and no
# dimensionality reduction. C is not passed, so optimize_hyperparams uses its
# own default for it (the explicit grid C=[0.1, 1, 10] is left disabled).
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", dim_reduction=None)

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0}
0.908


The linear kernel already leads to good results. With the transformer:

In [39]:
# Linear kernel with the PSSM selector in the pipeline (needs the feature names
# to pick columns) and a smaller-C grid. The result is kept for the validation
# section under its own name, since `gsearch` is overwritten by later cells.
best_estimator_linear = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    feature_transformer="pssm",
    feature_names=feature_names,
    kernel="linear",
    dim_reduction=None,
    C=[0.01, 0.1, 1],
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 'all'}
0.935


Lower value of C, better results. Does RBF improve anything?

In [40]:
# Same setup as the plain linear search (no PSSM selector, no dimensionality
# reduction), but with the RBF kernel and a larger-C grid.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction=None, C=[1, 10, 100]
)

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.866


In contrast to A. thaliana, we get lower scores with the RBF kernel than with the linear kernel.

In [41]:
# RBF kernel again, now with the PSSM selector added to the pipeline.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    feature_transformer="pssm",
    feature_names=feature_names,
    kernel="rbf",
    dim_reduction=None,
    C=[1, 10, 100],
)

{'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 'all', 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.882


Again, the scores improve a bit with the transformer, but not as high as the linear kernel.

## Dimensionality reduction

With PCA and the linear kernel, we get good results again:

In [42]:
# Linear kernel + PSSM selector + PCA. The C grid reaches down to 0.001 because
# the linear models preferred small C values in the previous searches.
# Stored under its own name for the validation section.
best_estimator_linearsvc_pca = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    feature_transformer="pssm",
    feature_names=feature_names,
    kernel="linear",
    dim_reduction="pca",
    C=[1, 0.1, 0.001, 0.01],
)

{'linearsvc__C': 0.001, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.98, 'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 'all'}
0.908


The linear model wants a very low value for C, which leads to a more generalized decision function. Again, the multi-pssm feature selector improved the results a bit:

In [43]:
# Reference run for the cell above: linear kernel + PCA without the PSSM
# selector. C is not passed, so optimize_hyperparams uses its own default for
# it (the explicit grid C=[0.001, 0.01, 0.1] is left disabled).
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", dim_reduction="pca")

{'linearsvc__C': 0.1, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.99}
0.883


PCA seems to perform the best and is faster, how about the RBF kernel?

In [44]:
# RBF kernel + PCA without the PSSM selector; gamma is pinned to "scale"
# instead of being searched.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    gamma=["scale"],
    C=[1, 10, 100],
    dim_reduction="pca",
)

{'pca__n_components': 0.97, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.882


With the custom transformer, rbf and pca, we get the best training score. Lower values of gamma lead to higher scores on the training set, but possibly also more overfitting. Setting gamma to scale only:

In [45]:
# RBF kernel + PSSM selector + PCA, with gamma pinned to "scale".
# Stored under its own name for the validation section.
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    feature_transformer="pssm",
    feature_names=feature_names,
    kernel="rbf",
    gamma=["scale"],
    C=[1, 0.1, 10],
    dim_reduction="pca",
)

{'pca__n_components': 0.9299999999999999, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 90, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 'scale'}
0.908


## Validation

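The tuned models are now evaluated on the independent test set. None of them reaches a perfect score there: all amino-acid transporters are classified correctly, but two to three of the ten sugar transporters are misclassified, both with and without PCA.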

### Linear kernel without dimensionality reduction

Perfect score for the amino-acid transporters, 7/10 for the sugar transporters.

In [46]:
# Held-out test set, linear model with PSSM selector (no dimensionality reduction).
get_confusion_matrix(
    X_test,
    y_test,
    best_estimator_linear,
    labels=labels,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,0
Sugar transport,3,7


In [47]:
# Per-class precision/recall/F1 on the held-out test set for the same linear model.
get_classification_report(
    X_test,
    y_test,
    best_estimator_linear,
    labels=labels,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.769,1.0,0.87,10
Sugar transport,1.0,0.7,0.824,10
macro avg,0.885,0.85,0.847,20
weighted avg,0.885,0.85,0.847,20


### Linear kernel with PCA

Same with PCA:

In [48]:
# Held-out test set, linear model with PSSM selector and PCA.
get_confusion_matrix(
    X_test,
    y_test,
    best_estimator_linearsvc_pca,
    labels=labels,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,0
Sugar transport,3,7


In [49]:
# Per-class precision/recall/F1 on the held-out test set for the linear + PCA model.
get_classification_report(
    X_test,
    y_test,
    best_estimator_linearsvc_pca,
    labels=labels,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.769,1.0,0.87,10
Sugar transport,1.0,0.7,0.824,10
macro avg,0.885,0.85,0.847,20
weighted avg,0.885,0.85,0.847,20


### RBF + PCA

The RBF model classifies one more sugar transporter correctly:

In [50]:
# Held-out test set, RBF model with PSSM selector and PCA.
get_confusion_matrix(
    X_test,
    y_test,
    best_estimator_svc_pca,
    labels=labels,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,0
Sugar transport,2,8


In [51]:
# Per-class precision/recall/F1 on the held-out test set for the RBF + PCA model.
get_classification_report(
    X_test,
    y_test,
    best_estimator_svc_pca,
    labels=labels,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.833,1.0,0.909,10
Sugar transport,1.0,0.8,0.889,10
macro avg,0.917,0.9,0.899,20
weighted avg,0.917,0.9,0.899,20


## Estimating validation variance

How much did the result depend on choosing the training and test sets?

Mean and standard deviation for randomly selected training and validation sets.

#### RBF+PCA

In [52]:
# Run the whole evaluation 10 times (RBF kernel, PCA, PSSM selector) to see how
# much the scores depend on the particular train/test split. df_params holds
# the selected hyperparameters, one column per repetition.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

# Aggregate only the "F1 score" column. Calling mean()/std() on the whole
# grouped frame also aggregates every other column, which raises under
# pandas >= 2.0 as soon as one of them is non-numeric.
f1_mean = df_scores_gr[["F1 score"]].mean()
f1_std = df_scores_gr[["F1 score"]].std()

print("Mean F1")
display(f1_mean.pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_std.pivot(index="label", columns="dataset", values="F1 score"))
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8652,0.9327
Sugar transport,0.8506,0.9233


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.066866,0.024363
Sugar transport,0.091483,0.027512


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.94,0.91,0.87,0.93,0.92,0.96,0.98,0.9,0.99,0.96
pssmselector__iterations,all,1,1,3,1.0,all,all,1,all,all
pssmselector__uniref_threshold,50,50,all,50,50.0,50,all,90,all,50
svc__C,1,10,1,1,1.0,1,1,1,1,10
svc__class_weight,balanced,balanced,balanced,balanced,,balanced,balanced,balanced,balanced,balanced
svc__gamma,scale,0.01,scale,scale,0.1,0.01,0.01,0.1,0.01,scale


The results are the best so far for E. coli. The scores on the training set are especially good, although the gap to the test scores suggests some overfitting on the training data. What happens if we simplify the model?

#### RBF + PCA (unoptimized gamma)

In [54]:
# Same repeated evaluation as before (RBF kernel, PCA, PSSM selector, 10
# repetitions), but gamma is pinned to "scale" instead of being optimised,
# which makes the model simpler.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    gamma=["scale"],
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

# Aggregate only the "F1 score" column. Calling mean()/std() on the whole
# grouped frame also aggregates every other column, which raises under
# pandas >= 2.0 as soon as one of them is non-numeric.
f1_mean = df_scores_gr[["F1 score"]].mean()
f1_std = df_scores_gr[["F1 score"]].std()

print("Mean F1")
display(f1_mean.pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_std.pivot(index="label", columns="dataset", values="F1 score"))
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8631,0.9289
Sugar transport,0.8561,0.9162


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.077105,0.02384
Sugar transport,0.078146,0.027944


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.94,0.92,0.87,0.93,0.88,0.88,0.98,0.89,0.99,0.96
pssmselector__iterations,all,all,1,3,1,3,3,1,all,all
pssmselector__uniref_threshold,50,90,all,50,90,all,all,all,all,50
svc__C,1,1,1,1,1,1,1,1,10,10
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,scale,scale,scale,scale,scale,scale,scale,scale,scale,scale


Not optimizing gamma already improves the overfitting and instability problems by a good amount. What about the linear kernel?

#### PCA + linear Kernel

In [55]:
# Repeated evaluation (10 repetitions) for the linear kernel with PCA and the
# PSSM selector, for comparison with the RBF runs.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    dim_reduction="pca",
    kernel="linear",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

# Aggregate only the "F1 score" column. Calling mean()/std() on the whole
# grouped frame also aggregates every other column, which raises under
# pandas >= 2.0 as soon as one of them is non-numeric.
f1_mean = df_scores_gr[["F1 score"]].mean()
f1_std = df_scores_gr[["F1 score"]].std()

print("Mean F1")
display(f1_mean.pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_std.pivot(index="label", columns="dataset", values="F1 score"))
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8612,0.919
Sugar transport,0.8669,0.9087


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.073031,0.021505
Sugar transport,0.055969,0.02553


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,0.1,0.1,0.1,10,0.1,0.1,0.1,0.1,1,0.1
linearsvc__class_weight,balanced,balanced,balanced,,balanced,balanced,balanced,balanced,balanced,
linearsvc__dual,True,True,True,False,True,True,True,True,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pca__n_components,0.89,0.91,0.91,0.99,0.87,0.99,0.99,0.95,0.99,0.97
pssmselector__iterations,all,1,1,all,all,3,3,1,3,3
pssmselector__uniref_threshold,50,50,all,50,50,50,all,50,all,50


## Dataset filtering

The dataset does not seem to produce the most stable models so far. Almost all proteins in the dataset come with TCDB annotations. What happens when we remove a particular TCDB class from the dataset, such as group translocators, active transporters, or passive transporters?

First, we tried removing group translocators, all of which are sugar transporters. This led to a sharp drop in F1 score for sugar (to around 0.6), meaning that the problem stems from other proteins, and that the group translocators are important for the model training.

Second, we tried to keep only the passive transporters in the dataset, but this led to even worse scores for sugar (0.58). The reason for this is probably the very low sample count (see notebook 3): during evaluation, the metrics raised warnings about not having enough samples for each class.

What about removing active transporters, and keeping the group translocators and the passive transporters?

In [73]:
# TCDB class distribution within each substrate class.
for title, keyword in (("Sugar TCDB", "Sugar"), ("Amino TCDB", "Amino")):
    print(title)
    is_class = df.keywords_transport.str.contains(keyword)
    display(df[is_class].tcdb_class.value_counts())

Sugar TCDB


2.A    18
4.A    17
3.A     9
0.0     3
Name: tcdb_class, dtype: int64

Amino TCDB


2.A    36
3.A    14
0.0     1
Name: tcdb_class, dtype: int64

### RBF, PCA, no active transporters

In [70]:
# Drop every protein whose TCDB class starts with "3" (class 3.A in the counts
# above, i.e. the active transporters) and keep the matching feature rows and labels.
# na=False: a missing tcdb_class counts as "not class 3" and the row is kept;
# without it a NaN in the mask would make the `~` negation fail.
df_no_active = df[~df.tcdb_class.str.startswith("3", na=False)]
df_pssm_no_active = df_pssm.loc[df_no_active.index]
labels_no_active = df_no_active.keywords_transport

# Repeated evaluation (10 repetitions) on the reduced dataset: RBF kernel, PCA, PSSM selector.
df_scores, df_params = full_test(
    df_pssm_no_active,
    labels_no_active,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    feature_transformer="pssm",
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

# Aggregate only the "F1 score" column. Calling mean()/std() on the whole
# grouped frame also aggregates every other column, which raises under
# pandas >= 2.0 as soon as one of them is non-numeric.
f1_mean = df_scores_gr[["F1 score"]].mean()
f1_std = df_scores_gr[["F1 score"]].std()

print("Mean F1")
display(f1_mean.pivot(index="label", columns="dataset", values="F1 score"))
print("Sdev F1")
display(f1_std.pivot(index="label", columns="dataset", values="F1 score"))
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.9243,0.9507
Sugar transport,0.9279,0.9494


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.038111,0.022196
Sugar transport,0.038725,0.019631


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.99,0.96,0.88,0.98,0.91,0.99,0.99,0.83,0.99,0.94
pssmselector__iterations,3,1,1,1,1,3,3,1,all,all
pssmselector__uniref_threshold,all,all,all,all,all,50,50,all,50,all
svc__C,1,1,1,1,10,1,1,1,1,1
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,0.01,0.1,scale,0.01,scale,0.01,scale,0.1,0.01,scale


This improves the models and the results by quite a bit, both the mean scores and the standard deviations! Interestingly, this was not the case for the PAAC feature, where we also tried removing different TCDB classes from the dataset. We should try the same thing for other organisms and features.