# PAAC Feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [2]:
# Build the transporter dataset from the Swiss-Prot export, keeping entries
# for taxonomy id 559292 that match the keyword filters below.
df = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    tax_ids_filter=[559292],
    multi_substrate="integrate",
    # Sequences are clustered with cd-hit at threshold 70 (see the log line printed below).
    sequence_clustering=70,
    verbose=True,
    output_log="../logs/yeast_amino_sugar_dataset.log",
)

cd-hit: clustered 64 sequences into 51 clusters at threshold 70


# Feature generation

In [3]:
# Class labels are the substrate keyword of each protein; show the class balance.
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    34
Sugar transport         17
Name: keywords_transport, dtype: int64

In [4]:
# Compute the PAAC feature table from the protein sequences and display it.
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q08986,0.008532,0.0,0.0,0.006826,0.006826,0.003413,0.001706,0.008532,0.006826,0.015358,...,0.001706,0.001706,0.0,0.001706,0.003413,0.001706,0.0,0.003413,0.003413,0.001706
P38967,0.003384,0.0,0.0,0.005076,0.005076,0.005076,0.001692,0.013536,0.003384,0.006768,...,0.0,0.0,0.0,0.0,0.0,0.003384,0.001692,0.003384,0.003384,0.001692
P38085,0.006472,0.001618,0.001618,0.012945,0.001618,0.008091,0.001618,0.006472,0.003236,0.0,...,0.001618,0.001618,0.001618,0.0,0.001618,0.003236,0.003236,0.001618,0.004854,0.0
P15380,0.00639,0.0,0.00639,0.003195,0.00639,0.007987,0.001597,0.007987,0.001597,0.00639,...,0.0,0.003195,0.001597,0.0,0.0,0.0,0.004792,0.00639,0.004792,0.0
P38206,0.001745,0.001745,0.0,0.0,0.006981,0.001745,0.001745,0.00349,0.001745,0.005236,...,0.0,0.0,0.00349,0.001745,0.001745,0.0,0.0,0.001745,0.001745,0.001745
Q12300,0.007874,0.0,0.001312,0.003937,0.006562,0.003937,0.0,0.010499,0.002625,0.002625,...,0.001312,0.002625,0.001312,0.003937,0.001312,0.0,0.003937,0.005249,0.0,0.003937
Q12010,0.006515,0.003257,0.003257,0.0,0.0,0.006515,0.0,0.0,0.0,0.003257,...,0.0,0.0,0.0,0.0,0.003257,0.0,0.006515,0.003257,0.0,0.003257
Q03697,0.004535,0.0,0.002268,0.0,0.004535,0.002268,0.0,0.0,0.0,0.011338,...,0.0,0.006803,0.002268,0.0,0.0,0.002268,0.004535,0.002268,0.0,0.002268
Q04602,0.001304,0.002608,0.005215,0.001304,0.001304,0.0,0.001304,0.002608,0.002608,0.007823,...,0.001304,0.0,0.0,0.0,0.0,0.002608,0.0,0.003911,0.0,0.0
P10870,0.006795,0.0,0.001133,0.003398,0.003398,0.002265,0.0,0.006795,0.00453,0.007928,...,0.001133,0.002265,0.001133,0.002265,0.001133,0.0,0.001133,0.00453,0.0,0.002265


## Independent test set

In [5]:
# Turn the feature table and labels into model inputs, keeping the feature and
# sample names so that test-set proteins can be looked up again later.
X, y, feature_names, sample_names = preprocess_pandas(df_paac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)

## Model comparison

Before optimization, linear SVC looks best, but there is one fold with worse results. This indicates that a certain combination of training and test set does not work. This could be training on the polysaccharide transporters, and testing on the monosaccharide transporters, for example.

In [6]:
# Baseline comparison of several untuned classifiers on the training split.
# The displayed table has one score per fold plus mean and std for each estimator.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.667,0.795,0.855,0.619,0.333,0.654,0.203
KNeighborsClassifier(),1.0,0.795,0.75,0.365,0.855,0.753,0.236
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.855,0.855,1.0,0.467,0.855,0.806,0.2
"LinearSVC(max_iter=1000000.0, random_state=0)",0.855,0.855,1.0,0.467,0.855,0.806,0.2
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.795,0.795,0.667,0.385,0.385,0.605,0.208
RandomForestClassifier(random_state=0),0.795,0.795,0.385,0.385,0.667,0.605,0.208
SGDClassifier(random_state=0),0.855,0.855,0.873,0.365,0.855,0.76,0.221
"SVC(class_weight='balanced', random_state=0)",1.0,1.0,0.667,0.385,0.385,0.687,0.308
SVC(random_state=0),0.795,0.795,0.385,0.385,0.385,0.549,0.225


## Parameter tuning

Linear kernel performs slightly better, and wants a low value for C, which could be a compensation for overfitting in one of the folds.

In [11]:
# Hyperparameter search for the linear kernel over an explicit grid of C values.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    C=[0.01, 0.1, 1, 10],
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.806


In [9]:
# Hyperparameter search for the RBF kernel; no grid is passed, so the helper's
# own defaults are used.
gsearch = optimize_hyperparams(X_train, y_train, kernel="rbf")

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.687


## Dimensionality reduction

### Linear kernel

Slight improvements with PCA, and now a C value of 1:

In [20]:
# Linear kernel with PCA as dimensionality reduction. The search result is also
# kept under a dedicated name for the validation section.
best_estimator_lsvc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction="pca", C=[1, 0.01, 0.1, 10]
)

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.89}
0.864


Kbest also improves the model a bit:

In [15]:
# Linear kernel with k-best feature selection (zero-variance features removed).
# The search result is also kept under a dedicated name for the validation section.
best_estimator_lsvc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="kbest",
    remove_zero_var=True,
    C=[0.1, 1, 10],
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'selectkbest__k': 185}
0.835


### RBF kernel

In [14]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[1, 10, 100],
    # gamma = [1e-0, 1e-1, 1e-2, 1e-3,"scale"]
)
best_estimator_svc_kbest = gsearch

{'pca__n_components': 0.9299999999999999, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1}
0.913


In [17]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
    remove_zero_var=True,
    C=[0.1, 1, 10],
    # gamma = [1e-0, 1e-1, 1e-2, 1e-3,"scale"]
)
best_estimator_svc_pca = gsearch

{'selectkbest__k': 72, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 0.01}
0.875


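Judging by the cross-validation scores above, the RBF kernel with PCA gives the best result here (0.913), ahead of the RBF kernel with feature selection (0.875), the linear kernel with PCA (0.864) and the linear kernel with feature selection (0.835).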

## Validation

Two out of four sugar transporters are classified correctly, along with all AA transporters.

### Linear kernel

In [24]:
# Independent test set: confusion matrix, then per-class report,
# for the linear-kernel model with k-best feature selection.
for table_fn in (get_confusion_matrix, get_classification_report):
    display(table_fn(X_test, y_test, best_estimator_lsvc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,2,2


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.778,1.0,0.875,7
Sugar transport,1.0,0.5,0.667,4
macro avg,0.889,0.75,0.771,11
weighted avg,0.859,0.818,0.799,11


In [25]:
# Independent test set: confusion matrix, then per-class report,
# for the linear-kernel model with PCA.
for table_fn in (get_confusion_matrix, get_classification_report):
    display(table_fn(X_test, y_test, best_estimator_lsvc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,2,2


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.778,1.0,0.875,7
Sugar transport,1.0,0.5,0.667,4
macro avg,0.889,0.75,0.771,11
weighted avg,0.859,0.818,0.799,11


### RBF kernel

In [26]:
# Independent test set: confusion matrix, then per-class report,
# for the estimator stored as best_estimator_svc_kbest.
for table_fn in (get_confusion_matrix, get_classification_report):
    display(table_fn(X_test, y_test, best_estimator_svc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,3,1


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.7,1.0,0.824,7
Sugar transport,1.0,0.25,0.4,4
macro avg,0.85,0.625,0.612,11
weighted avg,0.809,0.727,0.67,11


In [27]:
# Independent test set: confusion matrix, then per-class report,
# for the estimator stored as best_estimator_svc_pca.
for table_fn in (get_confusion_matrix, get_classification_report):
    display(table_fn(X_test, y_test, best_estimator_svc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,2,2


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.778,1.0,0.875,7
Sugar transport,1.0,0.5,0.667,4
macro avg,0.889,0.75,0.771,11
weighted avg,0.859,0.818,0.799,11


Is there a reason for the poor performance on the sugar transporters? Let's look at the sugar transporters that ended up in the test set:

In [32]:
# Annotation rows of the test-set proteins that are labelled as sugar transporters.
(
    df.loc[sample_names_test]
    .query("keywords_transport == 'Sugar transport'")
)

Unnamed: 0_level_0,keywords_transport,keywords_location,keywords_transport_related,gene_names,protein_names,tcdb_id,tcdb_class,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P38695,Sugar transport,Membrane;Transmembrane,Transport,HXT5 YHR096C,Probable glucose transporter HXT5,,0.0,559292,MSELENAHQGPLEGSATVSTNSNSYNEKSGNSTAPGTAGYNDNLAQ...
P40004,Sugar transport,Endoplasmic reticulum;Membrane;Transmembrane,Transport,YEA4 YEL004W,UDP-N-acetylglucosamine transporter YEA4,2.A.7.10.5,2.A,559292,MWNSLKAFALVFGGCCSNVITFETLMSNETGSINNLITFCQFLFVT...
Q12520,Sugar transport,Endoplasmic reticulum;Membrane;Transmembrane,Transport,HUT1 MLF6 YPL244C,UDP-galactose transporter homolog 1 (Multicopy...,2.A.7.11.6,2.A,559292,MAGSTSSLVICAIGIYATFLTWALVQEPLATRTWPNSMGKFQFPNV...
P40107,Sugar transport,Endoplasmic reticulum;Membrane;Transmembrane,Transport,VRG4 GOG5 LDB3 MCD3 VAN2 VIG4 YGL225W,GDP-mannose transporter 1 (GMT 1) (Low dye-bin...,2.A.7.13.1,2.A,559292,MSELKTGHAGHNPWASVANSGPISILSYCGSSILMTVTNKFVVNLK...


## Conclusion

The sugar test set actually contains three nucleotide-sugar transporters. As can be seen in notebook 2, these are the only ones in the dataset, which means that the SVM could not be trained to classify this type of protein.

## Estimating validation variance with LOOCV


In [37]:
# Repeat the whole evaluation 10 times (RBF kernel, PCA, LOOCV as the
# cross-validation method) to see how much the scores vary between runs.
df_scores, df_params = full_test(
    df_paac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    remove_zero_var=True,
    cross_val_method="LOOCV",
)

# Mean and standard deviation of the F1 score per class label and dataset.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
display(
    df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score")
)
display(
    df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score")
)

# Parameters reported for each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8505,0.9367
Sugar transport,0.6839,0.8431


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.098139,0.018043
Sugar transport,0.171824,0.052346


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.81,0.83,0.83,0.82,0.8,0.86,0.8,0.8,0.8,0.97
svc__C,1,1,1,1,1,1,1,1,1,1
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,scale,0.01,0.1,0.01,scale,0.01,scale,0.1,scale,0.01


The performance for the minority class is still not very good on the independent test set.