# PAAC Feature evaluation

# Imports

In [22]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [23]:
# UniProt accessions handed to create_dataset through its `outliers` argument.
# Grouped on three rows, presumably one per organism — TODO confirm.
outliers = [
    "Q9HBR0", "Q07837",
    "P76773", "Q47706", "P02943", "P75733", "P69856", "P64550",
    "O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0",
]

# Taxonomy id -> short organism name. The keys, in this order, are also the
# taxon filter passed to create_dataset below.
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    83333: "E. coli",
    559292: "Yeast",
}

# Build the amino-acid vs. sugar transporter dataset from the Swiss-Prot dump.
# The cell output reports the cd-hit clustering performed at threshold 70.
df = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/meta_amino_sugar_dataset.log",
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    tax_ids_filter=list(taxid_to_organism),
    multi_substrate="integrate",
    outliers=outliers,
    sequence_clustering=70,
    verbose=True,
)

# Add a human-readable organism column derived from the taxonomy id.
df = df.assign(organism=df["organism_id"].map(taxid_to_organism))

cd-hit: clustered 413 sequences into 347 clusters at threshold 70


# Feature generation

In [24]:
# Class labels: the transport keyword of every protein in the dataset.
# Displaying the counts shows how balanced the two classes are.
labels = df["keywords_transport"]
labels.value_counts()

Sugar transport         181
Amino-acid transport    166
Name: keywords_transport, dtype: int64

In [25]:
# Compute the PAAC feature table from the protein sequences. The displayed
# frame has one row per UniProt accession and one column per amino-acid pair.
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.030189,0.007547,0.007547,0.000000,0.007547,0.030189,0.000000,0.015094,0.000000,0.018868,...,0.000000,0.003774,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9SFG0,0.005929,0.001976,0.000000,0.000000,0.005929,0.005929,0.000000,0.003953,0.003953,0.009881,...,0.001976,0.000000,0.000000,0.000000,0.001976,0.000000,0.000000,0.003953,0.000000,0.000000
Q08986,0.008532,0.000000,0.000000,0.006826,0.006826,0.003413,0.001706,0.008532,0.006826,0.015358,...,0.001706,0.001706,0.000000,0.001706,0.003413,0.001706,0.000000,0.003413,0.003413,0.001706
Q9BRV3,0.004545,0.004545,0.004545,0.000000,0.000000,0.009091,0.000000,0.000000,0.004545,0.009091,...,0.000000,0.000000,0.009091,0.000000,0.000000,0.000000,0.000000,0.000000,0.004545,0.000000
Q84WN3,0.004167,0.000000,0.000000,0.004167,0.004167,0.004167,0.000000,0.008333,0.004167,0.008333,...,0.004167,0.000000,0.000000,0.000000,0.004167,0.000000,0.000000,0.004167,0.000000,0.004167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F4IHS9,0.002933,0.000000,0.000000,0.000000,0.002933,0.002933,0.000000,0.008798,0.002933,0.005865,...,0.000000,0.000000,0.000000,0.008798,0.000000,0.005865,0.002933,0.000000,0.000000,0.000000
Q04162,0.005415,0.001805,0.001805,0.000000,0.007220,0.000000,0.000000,0.014440,0.000000,0.009025,...,0.001805,0.000000,0.003610,0.000000,0.000000,0.003610,0.003610,0.005415,0.000000,0.001805
P33361,0.018229,0.005208,0.000000,0.000000,0.002604,0.007812,0.000000,0.013021,0.000000,0.046875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.002604,0.000000,0.000000,0.000000,0.000000
P39328,0.014706,0.000000,0.005882,0.000000,0.000000,0.023529,0.000000,0.014706,0.002941,0.014706,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002941,0.000000,0.000000


## Independent test set

In [26]:
# Convert the feature table and labels into the arrays used for model fitting,
# keeping the feature and sample names so results can be traced back.
X, y, feature_names, sample_names = preprocess_pandas(df_paac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set.
# NOTE(review): no seed is passed here — confirm get_independent_test_set
# uses a fixed random state, otherwise the split changes between runs.
split = get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = split

## Model comparison



In [27]:
# Compare several untuned classifiers on the training split only; the test set
# stays untouched. The displayed table has one row per estimator with five
# scores (presumably cross-validation folds — confirm) plus their mean and std.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.768,0.857,0.818,0.577,0.854,0.775,0.116
KNeighborsClassifier(),0.785,0.768,0.782,0.781,0.764,0.776,0.01
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.732,0.803,0.778,0.582,0.818,0.743,0.096
"LinearSVC(max_iter=1000000.0, random_state=0)",0.732,0.803,0.778,0.582,0.818,0.743,0.096
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.764,0.804,0.817,0.626,0.78,0.758,0.076
RandomForestClassifier(random_state=0),0.764,0.802,0.833,0.704,0.818,0.784,0.052
SGDClassifier(random_state=0),0.692,0.747,0.741,0.58,0.763,0.705,0.075
"SVC(class_weight='balanced', random_state=0)",0.785,0.857,0.796,0.741,0.704,0.777,0.058
SVC(random_state=0),0.785,0.875,0.776,0.738,0.704,0.776,0.064


## Parameter tuning

RBF kernel delivers better results when using all features. 

In [28]:
# Tune a linear-kernel SVM on all features over a logarithmic grid of C values.
# Despite its name, `best_estimator_lsvc` holds whatever optimize_hyperparams
# returns (the search object, judging by the `.best_estimator_` access made on
# a sibling result elsewhere in this notebook).
best_estimator_lsvc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    C=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)

{'linearsvc__C': 0.001, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.783


In [29]:
# Tune an RBF-kernel SVM on all features; only the C grid is given explicitly.
best_estimator_svc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    C=[0.1, 1, 10, 100],
)

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.791


## Dimensionality reduction

### Linear kernel

PCA leads to only a marginal improvement with the linear kernel (0.790 vs. 0.783 without PCA):

In [30]:
# Linear-kernel SVM with a PCA step in front; the reported best parameters
# include the selected `pca__n_components`.
best_estimator_lsvc_pca = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="pca",
    C=[1, 0.01, 0.1, 10],
)

{'linearsvc__C': 0.01, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.8200000000000001}
0.79


Kbest performs worse with the linear kernel:

In [31]:
# Linear-kernel SVM with k-best feature selection; the reported best parameters
# include the selected `selectkbest__k`.
# select_k_steps presumably sets the step size of the k grid — confirm.
best_estimator_lsvc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="kbest",
    remove_zero_var=True,
    select_k_steps=20,
    C=[0.1, 1, 10],
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'selectkbest__k': 321}
0.757


### RBF kernel

In [32]:
# RBF-kernel SVM with a PCA step in front.
# No gamma grid is passed, so optimize_hyperparams decides which gamma values
# are searched. An explicit grid is kept below for reference, disabled:
#     gamma=[1e-0, 1e-1, 1e-2, 1e-3, "scale"]
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[0.1, 1, 10, 100],
)

{'pca__n_components': 0.87, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.819


With the RBF kernel, the kbest model (0.814) performs almost as well as the PCA model (0.819), and better than the model using all features (0.791). It only removes 29 features:

In [33]:
# RBF-kernel SVM with k-best feature selection; the reported best parameters
# include the selected `selectkbest__k`.
# select_k_steps presumably sets the step size of the k grid — confirm.
best_estimator_svc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
    remove_zero_var=True,
    select_k_steps=10,
    C=[0.1, 1, 10, 100],
)

{'selectkbest__k': 371, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.814


In [34]:
# List the features that the k-best step of the tuned RBF pipeline discarded.
# NOTE(review): indexing feature_names with this mask assumes no earlier
# pipeline step dropped columns (e.g. zero-variance removal) — confirm.
kbest_step = best_estimator_svc_kbest.best_estimator_["selectkbest"]
kept_mask = kbest_step.get_support()
feature_names[~kept_mask]

array(['AD', 'CR', 'DH', 'FM', 'GK', 'GL', 'HD', 'IK', 'IY', 'KF', 'KI',
       'KW', 'LE', 'LS', 'MK', 'NA', 'NH', 'PM', 'PN', 'QD', 'SK', 'TG',
       'TM', 'TQ', 'TW', 'VV', 'WK', 'WS', 'YV'], dtype='<U2')

The RBF kernel with dimensionality reduction shows the best results during parameter tuning (PCA: 0.819, kbest: 0.814).

## Validation

RBF with all features delivers the best results on the test set:

### Linear kernel

In [35]:
# Test-set evaluation of the linear SVM tuned on all features:
# confusion matrix first, then the per-class classification report.
for make_report in (get_confusion_matrix, get_classification_report):
    display(make_report(X_test, y_test, best_estimator_lsvc, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,26,7
Sugar transport,10,27


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.722,0.788,0.754,33
Sugar transport,0.794,0.73,0.761,37
macro avg,0.758,0.759,0.757,70
weighted avg,0.76,0.757,0.757,70


In [36]:
# Test-set evaluation of the linear SVM with k-best feature selection:
# confusion matrix first, then the per-class classification report.
for make_report in (get_confusion_matrix, get_classification_report):
    display(make_report(X_test, y_test, best_estimator_lsvc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,26,7
Sugar transport,12,25


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.684,0.788,0.732,33
Sugar transport,0.781,0.676,0.725,37
macro avg,0.733,0.732,0.729,70
weighted avg,0.736,0.729,0.728,70


In [37]:
# Test-set evaluation of the linear SVM with PCA:
# confusion matrix first, then the per-class classification report.
for make_report in (get_confusion_matrix, get_classification_report):
    display(make_report(X_test, y_test, best_estimator_lsvc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,26,7
Sugar transport,9,28


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.743,0.788,0.765,33
Sugar transport,0.8,0.757,0.778,37
macro avg,0.771,0.772,0.771,70
weighted avg,0.773,0.771,0.772,70


### RBF kernel

In [38]:
# Test-set evaluation of the RBF SVM tuned on all features:
# confusion matrix first, then the per-class classification report.
for make_report in (get_confusion_matrix, get_classification_report):
    display(make_report(X_test, y_test, best_estimator_svc, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,25,8
Sugar transport,6,31


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.806,0.758,0.781,33
Sugar transport,0.795,0.838,0.816,37
macro avg,0.801,0.798,0.799,70
weighted avg,0.8,0.8,0.8,70


In [39]:
# Test-set evaluation of the RBF SVM with k-best feature selection:
# confusion matrix first, then the per-class classification report.
for make_report in (get_confusion_matrix, get_classification_report):
    display(make_report(X_test, y_test, best_estimator_svc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,24,9
Sugar transport,6,31


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.8,0.727,0.762,33
Sugar transport,0.775,0.838,0.805,37
macro avg,0.788,0.783,0.784,70
weighted avg,0.787,0.786,0.785,70


In [40]:
# Test-set evaluation of the RBF SVM with PCA:
# confusion matrix first, then the per-class classification report.
for make_report in (get_confusion_matrix, get_classification_report):
    display(make_report(X_test, y_test, best_estimator_svc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,27,6
Sugar transport,10,27


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.73,0.818,0.771,33
Sugar transport,0.818,0.73,0.771,37
macro avg,0.774,0.774,0.771,70
weighted avg,0.776,0.771,0.771,70


## Conclusion

The models achieve F1 scores of around 0.80 with only PAAC. This is relatively consistent between training set and test set, and between the two substrates.

## Estimating validation variance 


In [41]:
# Run full_test for the RBF + PCA configuration with repetitions=10.
# df_params (displayed last) has one column of selected hyperparameters per
# repetition; df_scores holds an "F1 score" per label and dataset (train/test).
# NOTE(review): select_k_steps is passed although dim_reduction is "pca" —
# presumably ignored in this configuration; confirm.
df_scores, df_params = full_test(
    df_paac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    remove_zero_var=True,
    select_k_steps=20,
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

# Aggregate only the "F1 score" column: it is the only value shown, and
# aggregating every column fails on pandas >= 2.0 as soon as df_scores
# contains a non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.761,0.771
Sugar transport,0.7943,0.7775


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.078019,0.027051
Sugar transport,0.04587,0.027334


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.89,0.81,0.94,0.99,0.9,0.94,0.96,0.87,0.91,0.87
svc__C,1,1,1.0,10,10,1,10,10,1.0,10
svc__class_weight,balanced,balanced,,balanced,balanced,balanced,balanced,balanced,,balanced
svc__gamma,0.01,scale,0.01,scale,scale,0.01,0.01,0.01,0.01,scale


In [42]:
# Run full_test for the RBF configuration without dimensionality reduction,
# with repetitions=10. df_params (displayed last) has one column of selected
# hyperparameters per repetition; df_scores holds an "F1 score" per label and
# dataset (train/test).
# NOTE(review): select_k_steps is passed although no dim_reduction is
# requested — presumably ignored in this configuration; confirm.
df_scores, df_params = full_test(
    df_paac,
    labels,
    kernel="rbf",
    repetitions=10,
    remove_zero_var=True,
    select_k_steps=20,
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

# Aggregate only the "F1 score" column: it is the only value shown, and
# aggregating every column fails on pandas >= 2.0 as soon as df_scores
# contains a non-numeric column besides the grouping keys.
f1_by_group = df_scores_gr["F1 score"]
display(f1_by_group.mean().pivot(index="label", columns="dataset", values="F1 score"))
display(f1_by_group.std().pivot(index="label", columns="dataset", values="F1 score"))
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7718,0.7465
Sugar transport,0.8211,0.7863


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.085864,0.026277
Sugar transport,0.05526,0.018209


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
svc__C,1,1,10,10,10,10,10,1,1,1
svc__class_weight,,,balanced,balanced,balanced,balanced,balanced,,,
svc__gamma,scale,scale,scale,scale,scale,scale,scale,scale,scale,scale


For the meta-organism, the AAC actually performs better than the PAAC, possibly because the PAAC features are noisier. Only a small subset of the PAAC features has high frequencies. The AAC better captures the fact that H and G are among the most important features.