# AAC feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_aac

# Dataset

In [2]:
# Organisms covered by this dataset. The keys are reused as the taxonomy filter
# below and the values become the readable `organism` column, so the two cannot
# drift apart.
taxid_to_organism = {
    3702: "A. thaliana",
    9606: "Human",
    83333: "E. coli",
    559292: "Yeast",
}

# Accessions handed to create_dataset through its `outliers` argument.
outliers = [
    "Q9HBR0", "Q07837",
    "P76773", "Q47706", "P02943", "P75733", "P69856", "P64550",
    "O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0",
]

# Build the amino-acid vs. sugar transporter dataset and attach the organism
# name derived from each row's `organism_id`.
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=list(taxid_to_organism),
    output_log="../logs/meta_amino_sugar_dataset.log",
    outliers=outliers,
    sequence_clustering=70,
).assign(organism=lambda frame: frame["organism_id"].map(taxid_to_organism))


cd-hit: clustered 413 sequences into 347 clusters at threshold 70


# Feature generation

## Labels

In [3]:
# Class labels come from the dataset's `keywords_transport` column; show how
# many samples fall into each class.
labels = df["keywords_transport"]
labels.value_counts()

Sugar transport         181
Amino-acid transport    166
Name: keywords_transport, dtype: int64

## AAC

In [4]:
# Compute the AAC feature table from the protein sequences and display it.
df_aac = calculate_aac(df["sequence"])
df_aac

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
P69801,0.150376,0.007519,0.030075,0.022556,0.037594,0.093985,0.015038,0.120301,0.011278,0.101504,0.048872,0.052632,0.030075,0.033835,0.018797,0.037594,0.056391,0.105263,0.007519,0.018797
Q9SFG0,0.090730,0.015779,0.033531,0.035503,0.086785,0.088757,0.015779,0.092702,0.033531,0.104536,0.033531,0.033531,0.039448,0.029586,0.039448,0.055227,0.041420,0.086785,0.017751,0.025641
Q08986,0.085179,0.025554,0.027257,0.047700,0.069847,0.078365,0.010221,0.091993,0.044293,0.090290,0.018739,0.035775,0.032368,0.027257,0.035775,0.105622,0.044293,0.074957,0.017036,0.037479
Q9BRV3,0.058824,0.022624,0.031674,0.018100,0.058824,0.072398,0.009050,0.045249,0.022624,0.176471,0.027149,0.031674,0.040724,0.045249,0.045249,0.063348,0.072398,0.072398,0.027149,0.058824
Q84WN3,0.066390,0.004149,0.020747,0.053942,0.070539,0.074689,0.008299,0.070539,0.041494,0.120332,0.029046,0.033195,0.041494,0.016598,0.029046,0.087137,0.058091,0.103734,0.016598,0.053942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F4IHS9,0.055556,0.017544,0.023392,0.017544,0.076023,0.064327,0.023392,0.067251,0.055556,0.154971,0.026316,0.046784,0.032164,0.035088,0.011696,0.093567,0.081871,0.084795,0.002924,0.029240
Q04162,0.072072,0.018018,0.036036,0.032432,0.063063,0.057658,0.009009,0.095495,0.027027,0.122523,0.030631,0.037838,0.037838,0.019820,0.048649,0.106306,0.070270,0.063063,0.010811,0.041441
P33361,0.135065,0.012987,0.025974,0.007792,0.044156,0.083117,0.012987,0.067532,0.015584,0.197403,0.023377,0.020779,0.046753,0.038961,0.041558,0.046753,0.044156,0.090909,0.028571,0.015584
P39328,0.114370,0.005865,0.029326,0.008798,0.038123,0.102639,0.002933,0.099707,0.020528,0.155425,0.035191,0.032258,0.043988,0.032258,0.041056,0.049853,0.061584,0.105572,0.017595,0.002933


### Independent test set

In [5]:
# Convert the feature table and labels into arrays, keeping feature and sample
# names so that results can be traced back later.
X, y, feature_names, sample_names = preprocess_pandas(df_aac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set.
train_test_parts = get_independent_test_set(
    X, y, sample_names=sample_names, test_size=0.2
)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = train_test_parts

### Model selection


In [6]:
# Compare several classifiers using the training split only; the held-out test
# split is not passed to this call.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.642,0.75,0.633,0.743,0.672,0.688,0.055
KNeighborsClassifier(),0.768,0.819,0.835,0.78,0.836,0.808,0.032
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.696,0.678,0.727,0.763,0.69,0.711,0.034
"LinearSVC(max_iter=1000000.0, random_state=0)",0.677,0.696,0.727,0.745,0.69,0.707,0.028
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.768,0.875,0.764,0.776,0.778,0.792,0.047
RandomForestClassifier(random_state=0),0.768,0.893,0.726,0.778,0.738,0.781,0.066
SGDClassifier(random_state=0),0.607,0.679,0.654,0.673,0.673,0.657,0.03
"SVC(class_weight='balanced', random_state=0)",0.821,0.875,0.836,0.818,0.727,0.815,0.054
SVC(random_state=0),0.804,0.875,0.836,0.815,0.727,0.811,0.054


### Parameter tuning

The RBF kernel delivers better results than the linear kernel (best score 0.841 vs. 0.715):

In [7]:
# Hyperparameter search with kernel="linear" on the training split.
# NOTE: `gsearch` is reassigned by the later search cells and this linear
# result is not kept under any other name.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear")

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.715


In [8]:
# RBF-kernel search over an explicit C grid. The returned search object is kept
# under `best_estimator_svc` for the validation cells; `gsearch` is bound to the
# same object.
best_estimator_svc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    C=[0.1, 1, 10, 100],
)

{'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 0.1}
0.841


### Dimensionality reduction

Kbest performs slightly worse, and only removes one feature:

In [9]:
# RBF-kernel search with dim_reduction="kbest", using the helper's default
# search grid. The result is kept under `best_estimator_svc_kbest`; `gsearch`
# is bound to the same object.
best_estimator_svc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
)

{'selectkbest__k': 19, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 0.1}
0.819


In [10]:
# List the features that the fitted "selectkbest" step did NOT keep.
kbest_step = best_estimator_svc_kbest.best_estimator_["selectkbest"]
kept_mask = kbest_step.get_support()
feature_names[~kept_mask]

array(['N'], dtype='<U1')

The only amino acid removed by Kbest is Asn (N); as noted above, removing it does not improve the score.

PCA performs worse, which might be caused by the fact that it removes more information to reach 99% of variance explained:

In [11]:
# RBF-kernel search with dim_reduction="pca" over an explicit C/gamma grid.
# The result is kept under `best_estimator_svc_pca`; `gsearch` is bound to the
# same object.
pca_search_grid = {
    "C": [0.1, 1, 10, 100],
    "gamma": ["scale", 1, 0.1, 0.01, 0.001],
}
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", **pca_search_grid
)

{'pca__n_components': 0.99, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.804


### Validation


#### Kbest

In [12]:
# Confusion matrix for the Kbest search result, evaluated on the held-out test split.
# NOTE(review): `labels` is the full per-sample label Series, not a list of
# class names — confirm that is what the helper expects.
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,25,8
Sugar transport,11,26


In [13]:
# Classification report for the Kbest search result on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.694,0.758,0.725,33
Sugar transport,0.765,0.703,0.732,37
macro avg,0.73,0.73,0.729,70
weighted avg,0.732,0.729,0.729,70


#### No feature selection:

In [14]:
# Confusion matrix for the search result without dimensionality reduction, on
# the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_svc, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,25,8
Sugar transport,10,27


In [15]:
# Classification report for the search result without dimensionality reduction,
# on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.714,0.758,0.735,33
Sugar transport,0.771,0.73,0.75,37
macro avg,0.743,0.744,0.743,70
weighted avg,0.744,0.743,0.743,70


#### PCA

In [16]:
# Confusion matrix for the PCA search result on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,21,12
Sugar transport,13,24


In [17]:
# Classification report for the PCA search result on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.618,0.636,0.627,33
Sugar transport,0.667,0.649,0.658,37
macro avg,0.642,0.643,0.642,70
weighted avg,0.644,0.643,0.643,70


## Checking dependence on train test split


### Kbest model

On average, the model performs about as well as on the split above:

In [18]:
# Run the full evaluation 10 times with Kbest feature selection and an RBF kernel.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="kbest",
    kernel="rbf",
    repetitions=10,
)

# Show the mean, then the standard deviation, of the F1 score per label and
# dataset (train/test).
scores_by_label_dataset = df_scores.groupby(["label", "dataset"], as_index=False)
for aggregate in (scores_by_label_dataset.mean, scores_by_label_dataset.std):
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7867,0.8261
Sugar transport,0.8031,0.8274


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.063523,0.021163
Sugar transport,0.055671,0.016641


### PCA model

As before, the PCA model delivers worse results than Kbest:

In [19]:
# Run the full evaluation 10 times with PCA and an RBF kernel.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
)

# Show the mean, then the standard deviation, of the F1 score per label and
# dataset (train/test).
scores_by_label_dataset = df_scores.groupby(["label", "dataset"], as_index=False)
for aggregate in (scores_by_label_dataset.mean, scores_by_label_dataset.std):
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7417,0.7869
Sugar transport,0.7697,0.7913


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.042301,0.03115
Sugar transport,0.039004,0.028269


### All features

Interestingly, the average performance is actually better without feature selection. Any improvement in scores from removing Asn might have been specific to our split.

In [20]:
# Run the full evaluation 10 times with an RBF kernel and no dim_reduction argument.
df_scores, df_params = full_test(df_aac, labels, kernel="rbf", repetitions=10)

# Show the mean, then the standard deviation, of the F1 score per label and
# dataset (train/test).
scores_by_label_dataset = df_scores.groupby(["label", "dataset"], as_index=False)
for aggregate in (scores_by_label_dataset.mean, scores_by_label_dataset.std):
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

# Finally, show the parameters returned for each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8108,0.8072
Sugar transport,0.8251,0.8105


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.03944,0.025359
Sugar transport,0.038561,0.016715


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
svc__C,1,10,1,1,10,1,10,1.0,1,1
svc__class_weight,,balanced,balanced,balanced,balanced,balanced,balanced,,balanced,balanced
svc__gamma,scale,0.1,scale,0.1,0.1,0.1,0.1,0.1,0.1,0.1


## Conclusion

Using all frequencies of the AAC together with an RBF-SVM delivers the best results for the meta-model.