# PSSM Feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test,
)
from subpred.pssm import calculate_pssms_notebook

# Dataset

In [2]:
# Build the transporter dataset from the reviewed SwissProt export, restricted
# to taxonomy id 9606 (the log file name labels this the human amino-acid/sugar
# dataset).
df = create_dataset(
    # Input table and log destination.
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/human_amino_sugar_dataset.log",
    verbose=True,
    # Keyword filters defining the two substrate classes and the protein type.
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    # Organism restriction.
    tax_ids_filter=[9606],
    # Accessions passed as outliers -- presumably excluded from the dataset;
    # TODO confirm against create_dataset.
    outliers=["Q9HBR0", "Q07837"],
    # Clustering threshold; the cell output reports a cd-hit run at threshold 70.
    sequence_clustering=70,
)

cd-hit: clustered 85 sequences into 81 clusters at threshold 70


# Feature generation

In [3]:
# Class labels are taken from the transport keyword column; the last line
# displays the number of samples per class.
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    48
Sugar transport         33
Name: keywords_transport, dtype: int64

In [4]:
# Compute the PSSM feature table from the protein sequences and display it.
# Column names look like <amino-acid pair>_<50|90>_<1|3> -- presumably the
# UniRef threshold and the number of iterations; TODO confirm.
df_pssm = calculate_pssms_notebook(df["sequence"])
df_pssm

Unnamed: 0_level_0,AA_50_1,AR_50_1,AN_50_1,AD_50_1,AC_50_1,AQ_50_1,AE_50_1,AG_50_1,AH_50_1,AI_50_1,...,VL_90_3,VK_90_3,VM_90_3,VF_90_3,VP_90_3,VS_90_3,VT_90_3,VW_90_3,VY_90_3,VV_90_3
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9BRV3,0.676768,0.488215,0.508418,0.464646,0.602694,0.511785,0.478114,0.565657,0.511785,0.612795,...,0.484375,0.403125,0.471875,0.706250,0.368750,0.443750,0.440625,0.568750,0.856250,0.478125
Q5M8T2,0.812672,0.363636,0.407713,0.314050,0.595041,0.451791,0.399449,0.498623,0.407713,0.539945,...,0.522388,0.492537,0.525373,0.659701,0.498507,0.516418,0.513433,0.594030,0.817910,0.498507
Q969S0,0.638989,0.339350,0.418773,0.342960,0.494585,0.404332,0.357401,0.498195,0.382671,0.454874,...,0.492669,0.398827,0.492669,0.568915,0.384164,0.422287,0.416422,0.609971,0.750733,0.472141
O75387,0.698152,0.441478,0.455852,0.418891,0.523614,0.447639,0.439425,0.558522,0.472279,0.474333,...,0.527546,0.510851,0.509182,0.602671,0.477462,0.514190,0.515860,0.544240,0.686144,0.512521
Q9NTN3,0.808333,0.325000,0.350000,0.283333,0.591667,0.416667,0.362500,0.504167,0.412500,0.500000,...,0.450704,0.357746,0.439437,0.645070,0.321127,0.388732,0.397183,0.594366,0.867606,0.450704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q96A29,0.696023,0.485795,0.480114,0.443182,0.511364,0.502841,0.465909,0.559659,0.477273,0.500000,...,0.547126,0.457471,0.531034,0.655172,0.427586,0.487356,0.482759,0.634483,0.749425,0.533333
Q9BYW1,0.786749,0.343685,0.418219,0.316770,0.610766,0.455487,0.383023,0.546584,0.424431,0.498965,...,0.534107,0.502177,0.528302,0.647315,0.489115,0.516691,0.513788,0.579100,0.718433,0.523948
P14672,0.808300,0.316206,0.418972,0.332016,0.600791,0.409091,0.363636,0.551383,0.371542,0.503953,...,0.514116,0.476969,0.505201,0.618128,0.471025,0.490342,0.484398,0.551263,0.716196,0.494799
Q96AA3,0.711697,0.449753,0.462932,0.393740,0.566722,0.481054,0.429984,0.528830,0.434926,0.546952,...,0.571069,0.500629,0.555975,0.640252,0.483019,0.529560,0.515723,0.571069,0.739623,0.544654


## Independent test set

In [5]:
# Convert the feature table and labels into arrays, keeping feature and sample
# names so they can be passed on to later steps.
X, y, feature_names, sample_names = preprocess_pandas(
    df_pssm,
    labels,
    return_names=True,
)

# Set aside an independent test set (test_size=0.2); the remaining samples are
# used for model comparison and hyperparameter search.
split_parts = get_independent_test_set(
    X, y, sample_names=sample_names, test_size=0.2
)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = split_parts



## Model comparison

Linear SVC looks promising before feature selection, more so than RBF. This could be because the linear kernel is less prone to overfitting, given the many features and few samples.

In [6]:
# Quick comparison of several classifiers on the training split only. The
# output table has one row per estimator with columns 0-4 plus mean and std
# (presumably one score per cross-validation fold -- TODO confirm).
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.845,0.606,0.575,0.606,0.748,0.676,0.116
KNeighborsClassifier(),0.921,0.675,0.567,0.675,0.916,0.751,0.159
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.845,0.745,0.819,0.69,0.829,0.786,0.066
"LinearSVC(max_iter=1000000.0, random_state=0)",0.845,0.745,0.819,0.69,0.829,0.786,0.066
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.745,0.606,0.381,0.769,0.833,0.667,0.18
RandomForestClassifier(random_state=0),0.745,0.675,0.381,0.69,0.916,0.682,0.193
SGDClassifier(random_state=0),0.845,0.764,0.707,0.606,0.748,0.734,0.087
"SVC(class_weight='balanced', random_state=0)",0.921,0.606,0.567,0.69,0.916,0.74,0.169
SVC(random_state=0),0.745,0.606,0.381,0.769,0.916,0.683,0.202


## Parameter tuning

#### Custom transformer

Here, we try the multi-pssm feature, which tries all combinations of feature generation parameters, and selects the best ones based on the training set. First without the transformer:

In [7]:
# Baseline grid search: linear kernel on all features, with neither
# dimensionality reduction nor the PSSM feature transformer.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction=None, C=[1, 10, 100]
)

{'linearsvc__C': 10, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0}
0.808


The linear kernel already leads to good results. With the transformer:

In [8]:
# Linear kernel again, now with the PSSM feature transformer in the pipeline
# and a grid of smaller C values.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    feature_transformer="pssm",
    feature_names=feature_names,
    dim_reduction=None,
    C=[0.001, 0.01, 0.1, 1],
)

# Keep this search result under its own name; `gsearch` is reassigned by the
# following cells.
best_estimator_linear = gsearch

{'linearsvc__C': 0.01, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50}
0.823


Lower value of C, better results. Does RBF improve anything?

In [9]:
# RBF kernel on all features: no dimensionality reduction and no PSSM
# feature transformer.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction=None, C=[1, 10, 100]
)

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.766


In contrast to A. thaliana, we get lower scores with the RBF kernel without feature reduction. With the feature selector:

In [10]:
# RBF kernel combined with the PSSM feature transformer, still without
# dimensionality reduction.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    feature_transformer="pssm",
    feature_names=feature_names,
    dim_reduction=None,
    C=[1, 10, 100],
)

{'pssmselector__iterations': 1, 'pssmselector__uniref_threshold': 50, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.903


RBF with feature selector is the best one so far.

## Dimensionality reduction

With PCA and the linear kernel, the results are slightly worse than for the RBF kernel with the feature selector and without PCA:

In [11]:
# Linear kernel with the PSSM feature transformer and PCA as dimensionality
# reduction.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    feature_transformer="pssm",
    feature_names=feature_names,
    dim_reduction="pca",
    C=[1, 0.1, 0.001, 0.01],
)

# Keep this search result under its own name; `gsearch` is reassigned by the
# following cells.
best_estimator_linearsvc_pca = gsearch

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.94, 'pssmselector__iterations': 1, 'pssmselector__uniref_threshold': 50}
0.871


Without the pssm selection, we get lower results:

In [12]:
# Linear kernel with PCA but without the PSSM feature transformer.
# C is not passed here, so optimize_hyperparams falls back to its own default.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction="pca"
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.99}
0.803


PCA seems to perform the best and is faster, how about the RBF kernel?

In [13]:
# RBF kernel with PCA but without the PSSM feature transformer.
# gamma is not passed here, so optimize_hyperparams falls back to its own default.
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", C=[1, 10, 100]
)

{'pca__n_components': 0.92, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.855


With the custom transformer, rbf and pca, we get the best training score. Lower values of gamma lead to higher scores on the training set, but possibly also more overfitting. Setting gamma to scale only:

In [14]:
# RBF kernel with both the PSSM feature transformer and PCA; gamma is
# restricted to "scale" only.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    feature_transformer="pssm",
    feature_names=feature_names,
    dim_reduction="pca",
    C=[1, 0.1, 10],
    gamma=["scale"],
)

# Keep this search result under its own name for the validation cells.
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.9299999999999999, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.936


## Validation


### Linear kernel with PSSM selector, without PCA

With just the linear kernel and the pssmselector, we get a perfect score on the independent test set. That might be due to chance, which we will evaluate later.

In [15]:
# Confusion matrix on the independent test split (X_test, y_test) for the
# linear-kernel search result that used the PSSM feature transformer.
get_confusion_matrix(X_test, y_test, best_estimator_linear, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,0
Sugar transport,0,7


In [16]:
# Per-class precision/recall/F1 on the independent test split for the same
# linear-kernel model.
get_classification_report(X_test, y_test, best_estimator_linear, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,1.0,1.0,10
Sugar transport,1.0,1.0,1.0,7
macro avg,1.0,1.0,1.0,17
weighted avg,1.0,1.0,1.0,17


### Linear kernel with PCA

Adding PCA leads to one wrong classification:

In [17]:
# Confusion matrix on the independent test split for the linear-kernel model
# with PSSM feature transformer and PCA.
get_confusion_matrix(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,9,1
Sugar transport,0,7


In [18]:
# Per-class precision/recall/F1 on the independent test split for the
# linear-kernel + PCA model.
get_classification_report(X_test, y_test, best_estimator_linearsvc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,0.9,0.947,10
Sugar transport,0.875,1.0,0.933,7
macro avg,0.938,0.95,0.94,17
weighted avg,0.949,0.941,0.942,17


### RBF + PCA

The RBF kernel and PCA lead to a perfect score again.


In [19]:
# Confusion matrix on the independent test split for the RBF-kernel model
# with PSSM feature transformer and PCA.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,0
Sugar transport,0,7


In [20]:
# Per-class precision/recall/F1 on the independent test split for the
# RBF-kernel + PCA model.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,1.0,1.0,10
Sugar transport,1.0,1.0,1.0,7
macro avg,1.0,1.0,1.0,17
weighted avg,1.0,1.0,1.0,17


## Estimating validation variance

How much did the result depend on choosing the training and test sets?

Mean and standard deviation for randomly selected training and validation sets.

#### RBF+PCA

In [21]:
# Repeat the full evaluation 10 times for the RBF kernel with PCA and the PSSM
# feature transformer, collecting scores and chosen parameters per repetition.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    feature_transformer="pssm",
    repetitions=10,
)

# Summarise the F1 score per class (rows) and per dataset (columns),
# first as mean, then as standard deviation over the repetitions.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
for caption, aggregate in (
    ("Mean F1", df_scores_gr.mean),
    ("Sdev F1", df_scores_gr.std),
):
    print(caption)
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8752,0.9445
Sugar transport,0.7965,0.9194


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.063822,0.013083
Sugar transport,0.113128,0.021025


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.94,0.92,0.89,0.99,0.92,0.94,0.93,0.95,0.91,0.88
pssmselector__iterations,3,3,all,3,all,all,all,1,all,1
pssmselector__uniref_threshold,50,50,all,all,50,50,50,50,50,90
svc__C,1,10,10,1,1,1,1,1,10,1
svc__class_weight,balanced,balanced,balanced,balanced,,,balanced,,balanced,balanced
svc__gamma,0.1,0.1,0.1,scale,scale,0.1,0.1,scale,scale,0.1


Very good results for the cross validation on the training set, but again not so much on the test set, especially for sugar. This could be caused by the very low sample count.

#### Just the linear kernel and PSSMselector

In [22]:
# Repeat the full evaluation 10 times for the linear kernel with the PSSM
# feature transformer (no dimensionality reduction), using 5-fold cross-validation.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="linear",
    feature_transformer="pssm",
    cross_val_method="5fold",
    repetitions=10,
)

# Summarise the F1 score per class (rows) and per dataset (columns),
# first as mean, then as standard deviation over the repetitions.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
for caption, aggregate in (
    ("Mean F1", df_scores_gr.mean),
    ("Sdev F1", df_scores_gr.std),
):
    print(caption)
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8676,0.893
Sugar transport,0.8127,0.8542


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.056884,0.023636
Sugar transport,0.098688,0.034676


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,10,10,1,10,1,1,1,1,0.1,10
linearsvc__class_weight,,balanced,balanced,,balanced,balanced,balanced,balanced,balanced,balanced
linearsvc__dual,False,False,True,False,True,True,True,True,True,False
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pssmselector__iterations,3,3,all,3,all,all,all,3,1,3
pssmselector__uniref_threshold,all,all,50,all,50,50,50,50,50,50


The linear SVM leads to a more stable model overall. How does the higher number of training samples in LOOCV change the evaluation results?

In [23]:
# Same linear-kernel evaluation as before, but with leave-one-out
# cross-validation instead of 5-fold.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="linear",
    feature_transformer="pssm",
    cross_val_method="loocv",
    repetitions=10,
)

# Summarise the F1 score per class (rows) and per dataset (columns),
# first as mean, then as standard deviation over the repetitions.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
for caption, aggregate in (
    ("Mean F1", df_scores_gr.mean),
    ("Sdev F1", df_scores_gr.std),
):
    print(caption)
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8553,0.8979
Sugar transport,0.8013,0.8582


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.036179,0.021388
Sugar transport,0.059922,0.029484


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,1,1,1,10,1,10,1,10,1,1
linearsvc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,,balanced,balanced
linearsvc__dual,True,True,True,False,True,False,True,False,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pssmselector__iterations,3,all,3,all,3,all,all,3,3,3
pssmselector__uniref_threshold,all,all,50,all,all,90,50,50,50,all


In this case, LOOCV does not make a difference. Does PCA improve the model?

#### PCA + linear Kernel

In [24]:
# Repeat the full evaluation 10 times for the linear kernel with PCA and the
# PSSM feature transformer.
df_scores, df_params = full_test(
    df_pssm,
    labels,
    kernel="linear",
    dim_reduction="pca",
    feature_transformer="pssm",
    repetitions=10,
)

# Summarise the F1 score per class (rows) and per dataset (columns),
# first as mean, then as standard deviation over the repetitions.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
for caption, aggregate in (
    ("Mean F1", df_scores_gr.mean),
    ("Sdev F1", df_scores_gr.std),
):
    print(caption)
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

print("Parameters")
display(df_params)

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8422,0.9049
Sugar transport,0.7601,0.8668


Sdev F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.04008,0.027225
Sugar transport,0.087827,0.03864


Parameters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,0.1,1,0.1,1,0.1,10,10,0.1,0.1,1
linearsvc__class_weight,,balanced,,balanced,balanced,balanced,balanced,balanced,balanced,balanced
linearsvc__dual,True,True,True,True,True,False,True,True,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pca__n_components,0.97,0.99,0.99,0.99,0.91,0.99,0.98,0.98,0.94,0.99
pssmselector__iterations,3,3,all,3,all,1,3,3,3,1
pssmselector__uniref_threshold,all,all,50,all,50,50,50,50,50,50


PCA on the linear kernel leads to a bit more overfitting, not better results.

## Additional dataset filtering

Again, we try what effect removing the Sideroflexins has on the evaluation:

In [25]:
# Boolean mask that is False for proteins whose name starts with "Side"
# (the Sideroflexins) and True otherwise.
# na=False makes a missing protein name count as "not a Sideroflexin"; without
# it a missing name yields NaN in the result and the unary ~ fails.
mask_sideroflexin = ~df.protein_names.str.startswith("Side", na=False)

# Repeat the RBF + PCA + PSSM-transformer evaluation on the filtered samples.
# The mask is applied via .loc, which assumes df, df_pssm and labels share the
# same index -- TODO confirm.
# cross_val_method is not passed, so full_test uses its default.
df_scores, df_params = full_test(
    df_pssm.loc[mask_sideroflexin],
    labels.loc[mask_sideroflexin],
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    feature_transformer="pssm",
)

# Mean F1 score per class (rows) and dataset (columns).
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)
# Standard deviation of the F1 score per class and dataset.
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)
# Parameters selected in each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8684,0.9357
Sugar transport,0.8128,0.9137


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.083295,0.026238
Sugar transport,0.125424,0.035814


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.8,0.93,0.9,0.96,0.91,0.93,0.88,0.99,0.87,0.9
pssmselector__iterations,3.0,3,3,all,1,1,1,3,all,3
pssmselector__uniref_threshold,50.0,50,50,50,50,all,50,all,50,50
svc__C,1.0,10,1,10,10,10,10,1,10,10
svc__class_weight,,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,0.1,scale,scale,scale,0.1,scale,0.1,0.01,scale,0.1


In [26]:
# Boolean mask that is False for proteins whose name starts with "Side"
# (the Sideroflexins) and True otherwise.
# na=False makes a missing protein name count as "not a Sideroflexin"; without
# it a missing name yields NaN in the result and the unary ~ fails.
mask_sideroflexin = ~df.protein_names.str.startswith("Side", na=False)

# Repeat the linear-kernel + PSSM-transformer evaluation on the filtered samples.
# The mask is applied via .loc, which assumes df, df_pssm and labels share the
# same index -- TODO confirm.
# dim_reduction and cross_val_method are not passed, so full_test uses its defaults.
df_scores, df_params = full_test(
    df_pssm.loc[mask_sideroflexin],
    labels.loc[mask_sideroflexin],
    kernel="linear",
    repetitions=10,
    feature_transformer="pssm",
)

# Mean F1 score per class (rows) and dataset (columns).
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)
# Standard deviation of the F1 score per class and dataset.
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)
# Parameters selected in each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.879,0.8742
Sugar transport,0.8383,0.8388


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.068751,0.026549
Sugar transport,0.103488,0.031173


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,10,10,10,0.1,1,1,10,10,1,10
linearsvc__class_weight,balanced,balanced,balanced,,balanced,balanced,balanced,balanced,balanced,balanced
linearsvc__dual,False,False,False,True,True,True,False,False,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pssmselector__iterations,1,1,all,1,1,3,all,1,3,3
pssmselector__uniref_threshold,50,all,50,50,all,all,50,all,all,50


In contrast to the PAAC feature, removing the sideroflexins did not make a difference to the classification performance. That makes sense, considering that these proteins are outliers only in the PAAC PCA, not in that of PSSM and AAC.