# PAAC Feature evaluation

# Imports

In [3]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [4]:
outliers = (
    ["Q9HBR0", "Q07837"]
    + ["P76773", "Q47706", "P02943", "P75733", "P69856", "P64550"]
    + ["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0",],
)
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[3702, 9606, 83333, 559292],
    output_log="../logs/meta_amino_sugar_dataset.log",
    outliers=outliers,
    sequence_clustering=70
)
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    83333: "E. coli",
    559292: "Yeast",
}
df = df.assign(organism=df.organism_id.map(taxid_to_organism))

cd-hit: clustered 428 sequences into 362 clusters at threshold 70


# Feature generation

In [5]:
labels = df.keywords_transport
labels.value_counts()

Sugar transport         186
Amino-acid transport    176
Name: keywords_transport, dtype: int64

In [6]:
df_paac = calculate_paac(df.sequence)
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.030189,0.007547,0.007547,0.000000,0.007547,0.030189,0.000000,0.015094,0.000000,0.018868,...,0.000000,0.003774,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
Q9SFG0,0.005929,0.001976,0.000000,0.000000,0.005929,0.005929,0.000000,0.003953,0.003953,0.009881,...,0.001976,0.000000,0.000000,0.000000,0.001976,0.000000,0.0,0.003953,0.000000,0.000000
Q08986,0.008532,0.000000,0.000000,0.006826,0.006826,0.003413,0.001706,0.008532,0.006826,0.015358,...,0.001706,0.001706,0.000000,0.001706,0.003413,0.001706,0.0,0.003413,0.003413,0.001706
Q9BRV3,0.004545,0.004545,0.004545,0.000000,0.000000,0.009091,0.000000,0.000000,0.004545,0.009091,...,0.000000,0.000000,0.009091,0.000000,0.000000,0.000000,0.0,0.000000,0.004545,0.000000
Q84WN3,0.004167,0.000000,0.000000,0.004167,0.004167,0.004167,0.000000,0.008333,0.004167,0.008333,...,0.004167,0.000000,0.000000,0.000000,0.004167,0.000000,0.0,0.004167,0.000000,0.004167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9FHH5,0.006803,0.006803,0.000000,0.000000,0.006803,0.006803,0.000000,0.000000,0.000000,0.013605,...,0.000000,0.000000,0.006803,0.000000,0.000000,0.000000,0.0,0.000000,0.006803,0.006803
Q8S8A0,0.025806,0.012903,0.000000,0.000000,0.006452,0.012903,0.000000,0.000000,0.006452,0.006452,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.006452,0.000000
Q3E965,0.015385,0.007692,0.000000,0.000000,0.015385,0.007692,0.000000,0.000000,0.015385,0.007692,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.007692,0.000000
Q3EAV6,0.000000,0.000000,0.000000,0.000000,0.000000,0.009091,0.000000,0.000000,0.009091,0.018182,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


## Independent test set

In [7]:
X, y, feature_names, sample_names = preprocess_pandas(
    df_paac, labels, return_names=True
)
(
    X_train,
    X_test,
    y_train,
    y_test,
    sample_names_train,
    sample_names_test,
) = get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)

## Model comparison



In [8]:
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.723,0.741,0.721,0.586,0.651,0.684,0.065
KNeighborsClassifier(),0.826,0.792,0.789,0.705,0.786,0.78,0.045
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.741,0.741,0.705,0.566,0.7,0.691,0.072
"LinearSVC(max_iter=1000000.0, random_state=0)",0.741,0.741,0.705,0.566,0.7,0.691,0.072
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.757,0.845,0.793,0.707,0.676,0.756,0.067
RandomForestClassifier(random_state=0),0.793,0.827,0.775,0.637,0.696,0.746,0.078
SGDClassifier(random_state=0),0.69,0.741,0.81,0.621,0.676,0.707,0.072
"SVC(class_weight='balanced', random_state=0)",0.827,0.827,0.809,0.706,0.806,0.795,0.051
SVC(random_state=0),0.845,0.81,0.826,0.689,0.823,0.799,0.062


## Parameter tuning

RBF kernel delivers better results when using all features. 

In [28]:
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", C=[0.0001, 0.001, 0.01, 0.1, 1, 10])
best_estimator_lsvc = gsearch

{'linearsvc__C': 0.001, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.753


In [29]:
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", C=[0.1, 1, 10, 100]
)
best_estimator_svc = gsearch

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.802


## Dimensionality reduction

### Linear kernel

PCA does not lead to improvements:

In [22]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="pca",
    C=[1, 0.01, 0.1, 10],
)
best_estimator_lsvc_pca = gsearch

{'linearsvc__C': 0.01, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.8300000000000001}
0.757


Kbest perform worse with the linear kernel:

In [23]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="kbest",
    select_k_steps=20,
    remove_zero_var=True,
    C=[0.1, 1, 10],
)
best_estimator_lsvc_kbest = gsearch

{'linearsvc__C': 10, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0, 'selectkbest__k': 261}
0.691


#### RBF

In [24]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[0.1, 1, 10, 100],
    # gamma = [1e-0, 1e-1, 1e-2, 1e-3,"scale"]
)
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.86, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.788


With the RBF kernel, the kbest model performs the best. It only removes nine features.

In [25]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
    select_k_steps=10,
    remove_zero_var=True,
    C=[0.1, 1, 10, 100],
)
best_estimator_svc_kbest = gsearch

{'selectkbest__k': 391, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.805


In [27]:
feature_names[~best_estimator_svc_kbest.best_estimator_["selectkbest"].get_support()]

array(['EM', 'GN', 'GY', 'LE', 'PV', 'RW', 'SG', 'TR', 'YM'], dtype='<U2')

The linear kernel with feature selection shows the best result here.

## Validation

RBF with PCA delivers the best results on the test set:

### Linear kernel

In [34]:
display(get_confusion_matrix(X_test, y_test, best_estimator_lsvc, labels=labels))
display(get_classification_report(X_test, y_test, best_estimator_lsvc, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,28,7
Sugar transport,9,29


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.757,0.8,0.778,35
Sugar transport,0.806,0.763,0.784,38
macro avg,0.781,0.782,0.781,73
weighted avg,0.782,0.781,0.781,73


In [30]:
display(get_confusion_matrix(X_test, y_test, best_estimator_lsvc_kbest, labels=labels))
display(get_classification_report(X_test, y_test, best_estimator_lsvc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,20,15
Sugar transport,13,25


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.606,0.571,0.588,35
Sugar transport,0.625,0.658,0.641,38
macro avg,0.616,0.615,0.615,73
weighted avg,0.616,0.616,0.616,73


In [31]:
display(get_confusion_matrix(X_test, y_test, best_estimator_lsvc_pca, labels=labels))
display(get_classification_report(X_test, y_test, best_estimator_lsvc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,26,9
Sugar transport,8,30


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.765,0.743,0.754,35
Sugar transport,0.769,0.789,0.779,38
macro avg,0.767,0.766,0.766,73
weighted avg,0.767,0.767,0.767,73


### RBF kernel

In [35]:
display(get_confusion_matrix(X_test, y_test, best_estimator_svc, labels=labels))
display(get_classification_report(X_test, y_test, best_estimator_svc, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,29,6
Sugar transport,10,28


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.744,0.829,0.784,35
Sugar transport,0.824,0.737,0.778,38
macro avg,0.784,0.783,0.781,73
weighted avg,0.785,0.781,0.781,73


In [32]:
display(get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels))
display(get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,28,7
Sugar transport,11,27


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.718,0.8,0.757,35
Sugar transport,0.794,0.711,0.75,38
macro avg,0.756,0.755,0.753,73
weighted avg,0.758,0.753,0.753,73


In [33]:
display(get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels))
display(get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,25,10
Sugar transport,4,34


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.862,0.714,0.781,35
Sugar transport,0.773,0.895,0.829,38
macro avg,0.817,0.805,0.805,73
weighted avg,0.816,0.808,0.806,73


## Conclusion

The models achieve F1 scores of around 0.80 with only PAAC. This is relatively consistent between training set and test set, and between the two substrates.

## Estimating validation variance 


In [37]:
df_scores, df_params = full_test(
    df_paac, labels, dim_reduction="pca", kernel="rbf", repetitions=10, remove_zero_var=True, select_k_steps=20
)
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
display(df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score"))
display(df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score"))
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7412,0.7768
Sugar transport,0.773,0.7865


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.03119,0.017223
Sugar transport,0.038012,0.020695


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.93,0.89,0.92,0.97,0.94,0.81,0.8,0.85,0.9,0.85
svc__C,10,10,1,1,1.0,1,10,1,10,10
svc__class_weight,balanced,balanced,balanced,,,balanced,balanced,,balanced,balanced
svc__gamma,scale,scale,scale,scale,0.01,scale,scale,scale,scale,scale


For the meta-organism, the AAC actually performs better than the PAAC, possibly because of increased noise. Only a small subset of PAAC features have high frequencies. The AAC better captures the fact that H and G are among the most important features.