# PSSM Feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test,
)
from subpred.pssm import calculate_pssms_notebook

# Dataset

In [2]:
# Protein identifiers excluded from the dataset (passed to create_dataset as `outliers`).
outliers = [
    "Q9HBR0", "Q07837",
    "P76773", "Q47706", "P02943", "P75733", "P69856", "P64550",
    "O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0",
]

# Taxonomy id -> display name. The keys double as the organism filter below,
# so the filter and the name mapping cannot drift apart.
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    83333: "E. coli",
    559292: "Yeast",
}

df = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    tax_ids_filter=list(taxid_to_organism),
    outliers=outliers,
    sequence_clustering=70,
    verbose=True,
    output_log="../logs/meta_amino_sugar_dataset.log",
)

# Add a human-readable organism column next to the numeric organism_id.
df = df.assign(organism=df["organism_id"].map(taxid_to_organism))

cd-hit: clustered 413 sequences into 347 clusters at threshold 70


# Feature generation

In [3]:
# Target labels come from the keywords_transport column; show the class balance.
labels = df["keywords_transport"]
labels.value_counts()

Sugar transport         181
Amino-acid transport    166
Name: keywords_transport, dtype: int64

In [4]:
# Build the PSSM feature table from the protein sequences (one row per protein).
df_pssm = calculate_pssms_notebook(df["sequence"])
df_pssm

Unnamed: 0_level_0,AA_50_1,AR_50_1,AN_50_1,AD_50_1,AC_50_1,AQ_50_1,AE_50_1,AG_50_1,AH_50_1,AI_50_1,...,VL_90_3,VK_90_3,VM_90_3,VF_90_3,VP_90_3,VS_90_3,VT_90_3,VW_90_3,VY_90_3,VV_90_3
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.873913,0.156522,0.273913,0.108696,0.421739,0.295652,0.178261,0.513043,0.134783,0.408696,...,0.496933,0.487730,0.503067,0.558282,0.450920,0.472393,0.475460,0.490798,0.592025,0.481595
Q9SFG0,0.784223,0.252900,0.327146,0.238979,0.394432,0.350348,0.276102,0.545244,0.227378,0.317865,...,0.434307,0.381387,0.421533,0.578467,0.357664,0.390511,0.392336,0.512774,0.656934,0.417883
Q08986,0.734091,0.259091,0.313636,0.220455,0.393182,0.295455,0.234091,0.529545,0.265909,0.415909,...,0.425047,0.345351,0.402277,0.584440,0.282732,0.351044,0.351044,0.605313,0.759013,0.387097
Q9BRV3,0.676768,0.488215,0.508418,0.464646,0.602694,0.511785,0.478114,0.565657,0.511785,0.612795,...,0.484375,0.403125,0.471875,0.706250,0.368750,0.443750,0.440625,0.568750,0.856250,0.478125
Q84WN3,0.664740,0.416185,0.462428,0.427746,0.624277,0.445087,0.456647,0.526012,0.479769,0.543353,...,0.383260,0.264317,0.374449,0.726872,0.215859,0.286344,0.312775,0.493392,0.982379,0.352423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F4IHS9,0.745981,0.495177,0.520900,0.450161,0.649518,0.520900,0.485531,0.578778,0.469453,0.604502,...,0.533654,0.492788,0.543269,0.639423,0.492788,0.526442,0.524038,0.661058,0.713942,0.533654
Q04162,0.786925,0.305085,0.392252,0.295400,0.513317,0.372881,0.341404,0.624697,0.278450,0.411622,...,0.527721,0.501027,0.503080,0.694045,0.435318,0.525667,0.519507,0.624230,0.837782,0.517454
P33361,0.822430,0.420561,0.370093,0.315888,0.429907,0.400000,0.355140,0.568224,0.351402,0.457944,...,0.547009,0.527066,0.539886,0.574074,0.521368,0.531339,0.532764,0.571225,0.589744,0.544160
P39328,0.871111,0.277778,0.271111,0.186667,0.506667,0.284444,0.226667,0.528889,0.208889,0.402222,...,0.530752,0.528474,0.530752,0.542141,0.526196,0.530752,0.530752,0.539863,0.562642,0.530752


## Independent test set

In [5]:
# Turn the feature frame and labels into X / y, keeping feature and sample names
# so they can be handed to the PSSM selector and traced back later.
X, y, feature_names, sample_names = preprocess_pandas(df_pssm, labels, return_names=True)

# Set aside an independent test set (test_size=0.2); everything up to the
# validation section works on the training part only.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)



## Model comparison

PSSM seems to work better than the sequence-based features. SVC looks the most promising.

In [6]:
# Compare several off-the-shelf classifiers using only the training split.
# The result table lists five per-run scores (columns 0-4) plus mean and std for
# each estimator; presumably these are cross-validation folds - TODO confirm in subpred.eval.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.607,0.588,0.647,0.762,0.56,0.633,0.079
KNeighborsClassifier(),0.856,0.892,0.8,0.834,0.762,0.829,0.05
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.854,0.893,0.873,0.8,0.853,0.855,0.034
"LinearSVC(max_iter=1000000.0, random_state=0)",0.854,0.893,0.873,0.8,0.853,0.855,0.034
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.802,0.875,0.854,0.854,0.873,0.852,0.029
RandomForestClassifier(random_state=0),0.838,0.91,0.817,0.873,0.8,0.848,0.044
SGDClassifier(random_state=0),0.892,0.875,0.836,0.872,0.853,0.866,0.021
"SVC(class_weight='balanced', random_state=0)",0.893,0.964,0.873,0.909,0.818,0.891,0.053
SVC(random_state=0),0.893,0.946,0.854,0.927,0.818,0.888,0.052


## Parameter tuning

#### Custom transformer

Here, we try the multi-pssm feature, which tries all combinations of feature generation parameters, and selects the best ones based on the training set. First without the transformer:

In [7]:
# Baseline: linear kernel on all features, no PCA and no PSSM selector.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    C=[0.1, 1, 10, 100, 1000],
    kernel="linear",
    dim_reduction=None,
)

{'linearsvc__C': 10, 'linearsvc__class_weight': None, 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0}
0.888


Here, the PSSMSelector keeps all PSSMs, leading to the same model:

In [8]:
# Same linear search, now with the PSSM selector in the pipeline; the selector
# needs the feature names to pick columns by PSSM parameter set.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    C=[0.1, 1, 10, 100, 1000],
    kernel="linear",
    dim_reduction=None,
    feature_transformer="pssm",
    feature_names=feature_names,
)

{'linearsvc__C': 10, 'linearsvc__class_weight': None, 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 'all'}
0.888


The RBF kernel improves the results further:

In [9]:
# RBF kernel on all features, no PCA and no PSSM selector.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    C=[0.1, 1, 10, 100],
    kernel="rbf",
    dim_reduction=None,
)

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.909


Slightly better scores with the selector:

In [10]:
# RBF kernel with the PSSM selector, no PCA. The result is kept under a
# dedicated name because it is evaluated on the test set later on.
gsearch = best_estimator_rbf = optimize_hyperparams(
    X_train,
    y_train,
    C=[0.1, 1, 10, 100],
    kernel="rbf",
    dim_reduction=None,
    feature_transformer="pssm",
    feature_names=feature_names,
)

{'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 'all', 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.91


RBF is the best one so far.

## Dimensionality reduction

In [11]:
# Linear kernel on PCA-reduced features; no explicit C grid is passed here.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", dim_reduction="pca")

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.98}
0.895


In [12]:
# Linear kernel with PCA and the PSSM selector, searching a C grid that extends
# towards stronger regularisation. Kept under a dedicated name for the
# test-set validation.
gsearch = best_estimator_linearsvc_pca = optimize_hyperparams(
    X_train,
    y_train,
    C=[10, 1, 0.1, 0.01, 0.001],
    kernel="linear",
    dim_reduction="pca",
    feature_transformer="pssm",
    feature_names=feature_names,
)

{'linearsvc__C': 0.01, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.96, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 'all'}
0.917


In [13]:
# RBF kernel on PCA-reduced features, without the PSSM selector.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    C=[0.1, 1, 10, 100],
    kernel="rbf",
    dim_reduction="pca",
)

{'pca__n_components': 0.97, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.935


That already looks good, now with the PSSMSelector:

In [14]:
# RBF kernel with PCA and the PSSM selector. Kept under a dedicated name for the
# test-set validation.
gsearch = best_estimator_svc_pca = optimize_hyperparams(
    X_train,
    y_train,
    C=[0.1, 1, 10, 100],
    kernel="rbf",
    dim_reduction="pca",
    feature_transformer="pssm",
    feature_names=feature_names,
)

{'pca__n_components': 0.97, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 'all', 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.935


The selector selects all PSSMs, so the parameters and the score are identical to the run without it.

## Validation


### RBF kernel without PCA



In [15]:
# Confusion matrix on the held-out test split. best_estimator_rbf is the result of
# the RBF search that used the PSSM selector and dim_reduction=None.
get_confusion_matrix(X_test, y_test, best_estimator_rbf, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,30,3
Sugar transport,5,32


In [16]:
# Per-class precision / recall / F1 on the held-out test split for the RBF model
# found with the PSSM selector and without PCA.
get_classification_report(X_test, y_test, best_estimator_rbf, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.857,0.909,0.882,33
Sugar transport,0.914,0.865,0.889,37
macro avg,0.886,0.887,0.886,70
weighted avg,0.887,0.886,0.886,70


### Linear kernel with PCA

Slightly better results than the RBF kernel without PCA, with just one more correct classification.

In [17]:
# Confusion matrix on the held-out test split. best_estimator_linearsvc_pca is the
# result of the linear-kernel search with PCA and the PSSM selector.
get_confusion_matrix(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,30,3
Sugar transport,4,33


In [18]:
# Per-class precision / recall / F1 on the held-out test split for the
# linear-kernel model found with PCA and the PSSM selector.
get_classification_report(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.882,0.909,0.896,33
Sugar transport,0.917,0.892,0.904,37
macro avg,0.9,0.9,0.9,70
weighted avg,0.9,0.9,0.9,70


### RBF + PCA

The RBF kernel combined with PCA leads to the best model on the training set.

In [19]:
# Confusion matrix on the held-out test split. best_estimator_svc_pca is the result
# of the RBF search with PCA and the PSSM selector.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,29,4
Sugar transport,2,35


In [20]:
# Per-class precision / recall / F1 on the held-out test split for the RBF model
# found with PCA and the PSSM selector.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.935,0.879,0.906,33
Sugar transport,0.897,0.946,0.921,37
macro avg,0.916,0.912,0.914,70
weighted avg,0.915,0.914,0.914,70


### Conclusion

PSSM with PCA and RBF-SVC is suitable for stable multi-organism models.

## Estimating validation variance

How much did the result depend on choosing the training and test sets?

Mean and standard deviation for randomly selected training and validation sets.

#### RBF+PCA 

In [21]:
# Repeat the whole procedure (split, RBF + PCA search with the PSSM selector,
# evaluation) ten times to see how much the scores depend on the split.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    feature_transformer="pssm",
    repetitions=10,
)

# Aggregate the F1 scores per (label, dataset) pair over all repetitions and
# lay them out as a label x dataset table.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
pivot_f1 = dict(index="label", columns="dataset", values="F1 score")

print("Mean F1")
display(df_scores_gr.mean().pivot(**pivot_f1))
print("Sdev F1")
display(df_scores_gr.std().pivot(**pivot_f1))

# Best hyperparameters chosen in each repetition (one column per repetition).
print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.9188,0.9287
Sugar transport,0.929,0.9338


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.042415,0.010393
Sugar transport,0.037642,0.008548


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.97,0.97,0.86,0.97,0.85,0.99,0.97,0.98,0.97,0.97
pssmselector__iterations,all,3,1.0,all,all,all,all,all,all,all
pssmselector__uniref_threshold,50,50,50.0,all,50,50,50,all,50,all
svc__C,1,1,1.0,10,1,10,10,10,10,10
svc__class_weight,,,,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,scale,scale,0.1,0.01,0.1,0.01,0.01,scale,0.01,0.01


The PSSM feature leads to a good average performance across the 10 random seeds, even better than the split we tested before.