# AAC feature evaluation

# Imports

In [3]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_aac

# Dataset

In [40]:
outliers = (
    ["Q9HBR0", "Q07837"]
    + ["P76773", "Q47706", "P02943", "P75733", "P69856", "P64550"]
    + ["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0",],
)
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[3702, 9606, 83333, 559292],
    output_log="../logs/meta_amino_sugar_dataset.log",
    outliers=outliers,
    sequence_clustering=70
)
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    83333: "E. coli",
    559292: "Yeast",
}
df = df.assign(organism=df.organism_id.map(taxid_to_organism))


cd-hit: clustered 428 sequences into 362 clusters at threshold 70


# Feature generation

## Labels

In [41]:
labels = df.keywords_transport
labels.value_counts()

Sugar transport         186
Amino-acid transport    176
Name: keywords_transport, dtype: int64

## AAC

In [42]:
df_aac = calculate_aac(df.sequence)
df_aac

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
P69801,0.150376,0.007519,0.030075,0.022556,0.037594,0.093985,0.015038,0.120301,0.011278,0.101504,0.048872,0.052632,0.030075,0.033835,0.018797,0.037594,0.056391,0.105263,0.007519,0.018797
Q9SFG0,0.090730,0.015779,0.033531,0.035503,0.086785,0.088757,0.015779,0.092702,0.033531,0.104536,0.033531,0.033531,0.039448,0.029586,0.039448,0.055227,0.041420,0.086785,0.017751,0.025641
Q08986,0.085179,0.025554,0.027257,0.047700,0.069847,0.078365,0.010221,0.091993,0.044293,0.090290,0.018739,0.035775,0.032368,0.027257,0.035775,0.105622,0.044293,0.074957,0.017036,0.037479
Q9BRV3,0.058824,0.022624,0.031674,0.018100,0.058824,0.072398,0.009050,0.045249,0.022624,0.176471,0.027149,0.031674,0.040724,0.045249,0.045249,0.063348,0.072398,0.072398,0.027149,0.058824
Q84WN3,0.066390,0.004149,0.020747,0.053942,0.070539,0.074689,0.008299,0.070539,0.041494,0.120332,0.029046,0.033195,0.041494,0.016598,0.029046,0.087137,0.058091,0.103734,0.016598,0.053942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9FHH5,0.060811,0.020270,0.087838,0.081081,0.020270,0.094595,0.020270,0.020270,0.040541,0.087838,0.027027,0.033784,0.081081,0.020270,0.047297,0.067568,0.054054,0.081081,0.013514,0.040541
Q8S8A0,0.108974,0.025641,0.051282,0.096154,0.032051,0.089744,0.025641,0.044872,0.057692,0.083333,0.038462,0.019231,0.057692,0.006410,0.057692,0.089744,0.038462,0.051282,0.012821,0.012821
Q3E965,0.083969,0.022901,0.015267,0.106870,0.038168,0.061069,0.007634,0.045802,0.068702,0.091603,0.053435,0.053435,0.045802,0.045802,0.045802,0.053435,0.068702,0.061069,0.015267,0.015267
Q3EAV6,0.045045,0.036036,0.063063,0.036036,0.027027,0.036036,0.018018,0.063063,0.063063,0.117117,0.018018,0.054054,0.108108,0.009009,0.027027,0.072072,0.063063,0.117117,0.009009,0.018018


### Independent test set

In [43]:
X, y, feature_names, sample_names = preprocess_pandas(
    df_aac, labels, return_names=True
)
(
    X_train,
    X_test,
    y_train,
    y_test,
    sample_names_train,
    sample_names_test,
) = get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)

### Model selection


In [44]:
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.724,0.637,0.723,0.586,0.62,0.658,0.062
KNeighborsClassifier(),0.793,0.758,0.844,0.637,0.754,0.757,0.076
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.654,0.757,0.741,0.672,0.608,0.686,0.062
"LinearSVC(max_iter=1000000.0, random_state=0)",0.62,0.739,0.741,0.69,0.64,0.686,0.055
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.774,0.637,0.845,0.655,0.648,0.712,0.093
RandomForestClassifier(random_state=0),0.738,0.688,0.757,0.724,0.648,0.711,0.043
SGDClassifier(random_state=0),0.568,0.741,0.586,0.603,0.578,0.615,0.072
"SVC(class_weight='balanced', random_state=0)",0.758,0.879,0.81,0.706,0.737,0.778,0.068
SVC(random_state=0),0.758,0.914,0.81,0.706,0.737,0.785,0.081


### Parameter tuning

RBF kernel delivers slightly better results

In [45]:
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear")

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.696


In [46]:
gsearch = optimize_hyperparams(X_train, y_train, kernel="rbf", C=[0.1, 1, 10, 100])
best_estimator_svc = gsearch

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1}
0.792


### Dimensionality reduction

Kbest performs slightly worse, and only removes one feature:

In [47]:
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="kbest"#, C=[1,10,100], gamma=["scale"]
)
best_estimator_svc_kbest = gsearch

{'selectkbest__k': 19, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1}
0.788


In [48]:
feature_names[~best_estimator_svc_kbest.best_estimator_["selectkbest"].get_support()]

array(['E'], dtype='<U1')

The amino acid whose removal improves the model is Glu.

PCA performs worse, which might be caused by the fact that it removes more information to reach 99% of variance explained:

In [49]:
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[0.1, 1, 10, 100],
    gamma=["scale", 1, 0.1, 0.01, 0.001],
)
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.86, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 0.1}
0.761


### Validation

Kbest had a slightly worse performance on the training set than PCA, and a much better performance on the test set.The performance of the model without feature selection is between that of PCA and Kbest.

#### Kbest

In [50]:
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,30,5
Sugar transport,7,31


In [51]:
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.811,0.857,0.833,35
Sugar transport,0.861,0.816,0.838,38
macro avg,0.836,0.836,0.836,73
weighted avg,0.837,0.836,0.836,73


#### No feature selection:

In [52]:
get_confusion_matrix(X_test, y_test, best_estimator_svc, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,29,6
Sugar transport,8,30


In [53]:
get_classification_report(X_test, y_test, best_estimator_svc, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.784,0.829,0.806,35
Sugar transport,0.833,0.789,0.811,38
macro avg,0.809,0.809,0.808,73
weighted avg,0.81,0.808,0.808,73


#### PCA

In [54]:
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,24,11
Sugar transport,12,26


In [55]:
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.667,0.686,0.676,35
Sugar transport,0.703,0.684,0.693,38
macro avg,0.685,0.685,0.685,73
weighted avg,0.685,0.685,0.685,73


## Checking dependence on train test split


### Kbest model

On average, the model performs about as good as on the split above:

In [56]:
df_scores, df_params = full_test(
    df_aac, labels, dim_reduction="kbest", kernel="rbf", repetitions=10
)
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7815,0.8302
Sugar transport,0.7932,0.8242


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.053473,0.02493
Sugar transport,0.048237,0.021332


#### PCA

As before, the PCA model delivers worse results than Kbest:

In [57]:
df_scores, df_params = full_test(
    df_aac, labels, dim_reduction="pca", kernel="rbf", repetitions=10
)
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7164,0.7888
Sugar transport,0.7344,0.7901


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.035081,0.03474
Sugar transport,0.032722,0.024969


### All features

Interestingly, the average performance is actually better on average without feature selection. The removal of Glu improving scores might have been something that was only the case in our split.  

In [58]:
df_scores, df_params = full_test(
    df_aac, labels, kernel="rbf", repetitions=10
)
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8015,0.815
Sugar transport,0.8089,0.8098


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.057231,0.019131
Sugar transport,0.050346,0.016206


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
svc__C,1.0,1,10,10,1,1,1.0,10,1.0,1.0
svc__class_weight,,balanced,balanced,balanced,balanced,balanced,,balanced,,
svc__gamma,0.1,scale,0.1,0.1,scale,0.1,0.1,0.1,0.1,0.1


## Conclusion

Using all frequencies of the AAC together with a RBF-SVM delivers the best results for the meta-model. 