# AAC feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_aac

# Dataset

In [2]:
df = create_dataset(
    # Input file and logging
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/yeast_amino_sugar_dataset.log",
    verbose=True,
    # Protein selection: keyword filters plus a taxonomy filter
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    tax_ids_filter=[559292],  # presumably yeast, going by the log file name
    multi_substrate="integrate",
    # Sequence clustering threshold
    sequence_clustering=70,
)

cd-hit: clustered 64 sequences into 51 clusters at threshold 70


# Feature generation

## Labels

In [3]:
# Class label of each protein, followed by the number of samples per class
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    34
Sugar transport         17
Name: keywords_transport, dtype: int64

There will be a very low number of samples available for training and testing, only about 0.8\*0.8\*17 ≈ 11 for sugar. This is likely not enough. We will try nested LOOCV later.

## AAC

In [4]:
# One row per protein, one column per amino acid
df_aac = calculate_aac(df["sequence"])
df_aac

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Q08986,0.085179,0.025554,0.027257,0.0477,0.069847,0.078365,0.010221,0.091993,0.044293,0.09029,0.018739,0.035775,0.032368,0.027257,0.035775,0.105622,0.044293,0.074957,0.017036,0.037479
P38967,0.069257,0.023649,0.035473,0.035473,0.069257,0.089527,0.023649,0.103041,0.050676,0.096284,0.018581,0.035473,0.032095,0.025338,0.032095,0.092905,0.035473,0.076014,0.028716,0.027027
P38085,0.074313,0.022617,0.035541,0.04685,0.06462,0.069467,0.021002,0.075929,0.048465,0.096931,0.019386,0.042003,0.037157,0.030695,0.037157,0.096931,0.040388,0.084006,0.017771,0.038772
P15380,0.07815,0.025518,0.041467,0.035088,0.057416,0.092504,0.019139,0.087719,0.039872,0.087719,0.015949,0.039872,0.041467,0.020734,0.038278,0.079745,0.059011,0.076555,0.028708,0.035088
P38206,0.04878,0.012195,0.029617,0.041812,0.095819,0.045296,0.012195,0.099303,0.057491,0.141115,0.017422,0.052265,0.022648,0.036585,0.038328,0.083624,0.055749,0.054007,0.015679,0.04007
Q12300,0.074705,0.011796,0.040629,0.039318,0.045872,0.072084,0.017038,0.093054,0.034076,0.077326,0.018349,0.076016,0.045872,0.031455,0.031455,0.10616,0.057667,0.073394,0.010485,0.04325
Q12010,0.045455,0.016234,0.045455,0.038961,0.071429,0.055195,0.00974,0.097403,0.029221,0.123377,0.025974,0.068182,0.051948,0.042208,0.025974,0.087662,0.025974,0.077922,0.022727,0.038961
Q03697,0.047511,0.011312,0.033937,0.040724,0.072398,0.061086,0.015837,0.08371,0.047511,0.128959,0.022624,0.056561,0.036199,0.036199,0.027149,0.085973,0.058824,0.08371,0.015837,0.033937
Q04602,0.049479,0.022135,0.039062,0.040365,0.0625,0.052083,0.019531,0.078125,0.05599,0.121094,0.027344,0.042969,0.042969,0.036458,0.033854,0.122396,0.050781,0.0625,0.011719,0.028646
P10870,0.06448,0.007919,0.053167,0.047511,0.054299,0.072398,0.015837,0.074661,0.038462,0.078054,0.022624,0.06448,0.039593,0.036199,0.038462,0.114253,0.072398,0.065611,0.007919,0.031674


### Independent test set

In [5]:
# Convert the feature table and labels into the inputs used by the helpers below
X, y, feature_names, sample_names = preprocess_pandas(df_aac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set.
# NOTE(review): no random seed is passed here -- confirm that
# get_independent_test_set fixes one internally so the split is reproducible.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)

### Model selection


In [6]:
# Quick comparison of several classifiers, using the training split only
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.667,0.564,1.0,0.385,0.564,0.636,0.228
KNeighborsClassifier(),1.0,0.795,0.667,0.385,0.667,0.703,0.224
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.855,0.667,0.873,0.333,0.385,0.622,0.254
"LinearSVC(max_iter=1000000.0, random_state=0)",0.855,0.667,0.873,0.333,0.385,0.622,0.254
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.795,0.795,0.855,0.385,0.385,0.643,0.237
RandomForestClassifier(random_state=0),0.795,0.667,0.855,0.385,0.385,0.617,0.223
SGDClassifier(random_state=0),0.855,0.667,0.873,0.333,0.385,0.622,0.254
"SVC(class_weight='balanced', random_state=0)",1.0,0.795,1.0,0.385,0.667,0.769,0.258
SVC(random_state=0),0.795,0.795,0.855,0.385,0.667,0.699,0.189


### Parameter tuning

The RBF kernel delivers slightly better results than the linear kernel (0.769 vs. 0.689):

In [7]:
# Linear kernel, no dimensionality reduction
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction=None,
)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.689


In [8]:
# RBF kernel, no dimensionality reduction, explicit grid for C
best_estimator_svc = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction=None,
    C=[1, 0.1, 10],
)
# Keep the generic name bound as well; it is reassigned by later searches
gsearch = best_estimator_svc

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.769


### Dimensionality reduction

Kbest performs slightly better:

In [9]:
# RBF kernel with kbest feature selection; no explicit C/gamma grid is passed
best_estimator_svc_kbest = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
)
# Keep the generic name bound as well; it is reassigned by later searches
gsearch = best_estimator_svc_kbest

{'selectkbest__k': 5, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 'scale'}
0.785


PCA does not improve the cross-validation score any further (0.769, compared to 0.785 with kbest):

In [10]:
# RBF kernel with PCA; no explicit C/gamma grid is passed
best_estimator_svc_pca = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca"
)
# Keep the generic name bound as well
gsearch = best_estimator_svc_pca

{'pca__n_components': 0.94, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1}
0.769


### Validation

The test set is only made up of 7 amino-acid and 4 sugar transporters, so a single misclassification can already change the score by quite a large amount. The sample count does not seem to be enough for a good model.

The PCA model seems to lead to less overfitting, having better scores on the test set.

In [11]:
# Confusion matrix of the kbest model on the held-out test set
get_confusion_matrix(
    X_test,
    y_test,
    best_estimator_svc_kbest,
    labels=labels,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,6,1
Sugar transport,3,1


In [12]:
# Per-class precision, recall and F1 of the kbest model on the held-out test set
get_classification_report(
    X_test,
    y_test,
    best_estimator_svc_kbest,
    labels=labels,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.667,0.857,0.75,7
Sugar transport,0.5,0.25,0.333,4
macro avg,0.583,0.554,0.542,11
weighted avg,0.606,0.636,0.598,11


In [13]:
# Confusion matrix of the PCA model on the held-out test set
get_confusion_matrix(
    X_test,
    y_test,
    best_estimator_svc_pca,
    labels=labels,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,3,1


In [14]:
# Per-class precision, recall and F1 of the PCA model on the held-out test set
get_classification_report(
    X_test,
    y_test,
    best_estimator_svc_pca,
    labels=labels,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.7,1.0,0.824,7
Sugar transport,1.0,0.25,0.4,4
macro avg,0.85,0.625,0.612,11
weighted avg,0.809,0.727,0.67,11


## Checking dependence on train test split

The performance on the sugar transporters in the test set is especially poor; this is the class with the fewest samples.

#### PCA model:

The average model performance does not improve with other train test splits:

In [15]:
# Repeat the whole evaluation 10 times for the PCA + RBF-SVM pipeline
df_scores, df_params = full_test(
    df_aac,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    repetitions=10,
)

# Show the mean, then the standard deviation, of the F1 score per class
# (rows) and per dataset (columns). A generator is used so that no helper
# names are left behind in the notebook namespace.
display(
    *(
        getattr(df_scores.groupby(["label", "dataset"], as_index=False), stat)()
        .pivot(index="label", columns="dataset", values="F1 score")
        for stat in ("mean", "std")
    )
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8143,0.8959
Sugar transport,0.5693,0.7593


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.044756,0.032895
Sugar transport,0.142985,0.083758


#### Kbest

The same is true for kbest:

In [16]:
# Repeat the whole evaluation 10 times for the kbest + RBF-SVM pipeline
df_scores, df_params = full_test(
    df_aac,
    labels,
    kernel="rbf",
    dim_reduction="kbest",
    repetitions=10,
)

# Show the mean, then the standard deviation, of the F1 score per class
# (rows) and per dataset (columns). A generator is used so that no helper
# names are left behind in the notebook namespace.
display(
    *(
        getattr(df_scores.groupby(["label", "dataset"], as_index=False), stat)()
        .pivot(index="label", columns="dataset", values="F1 score")
        for stat in ("mean", "std")
    )
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7996,0.926
Sugar transport,0.5391,0.8173


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.070122,0.022371
Sugar transport,0.167901,0.063901


## LOOCV

To increase the number of samples available for training, we carry out a nested leave-one-out cross-validation:

In [17]:
# PCA + RBF-SVM pipeline again, this time with cross_val_method="loocv"
df_scores, df_params = full_test(
    df_aac,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    repetitions=10,
    cross_val_method="loocv",
)

# Show the mean, then the standard deviation, of the F1 score per class
# (rows) and per dataset (columns). A generator is used so that no helper
# names are left behind in the notebook namespace.
display(
    *(
        getattr(df_scores.groupby(["label", "dataset"], as_index=False), stat)()
        .pivot(index="label", columns="dataset", values="F1 score")
        for stat in ("mean", "std")
    )
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8008,0.9122
Sugar transport,0.537,0.7492


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.080168,0.033947
Sugar transport,0.128152,0.113284


In [18]:
# Parameters returned by the preceding full_test call
# (presumably the best hyperparameters of each repetition -- TODO confirm)
df_params

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.93,0.98,0.97,0.94,0.8,0.9,0.83,0.98,0.93,0.85
svc__C,10,10.0,10.0,1,10,1,1,10.0,1,1
svc__class_weight,balanced,,,balanced,balanced,balanced,balanced,,balanced,balanced
svc__gamma,scale,0.01,0.01,scale,scale,scale,scale,0.01,0.1,scale


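With LOOCV, the average test F1 scores of the PCA model do not improve over the repeated train/test splits (0.80 vs. 0.81 for amino-acid and 0.54 vs. 0.57 for sugar transporters); only the standard deviation for the sugar class decreases slightly. It becomes clear that AAC alone is not suitable for this classification task.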

In [19]:
# kbest + RBF-SVM pipeline again, this time with cross_val_method="loocv"
df_scores, df_params = full_test(
    df_aac,
    labels,
    kernel="rbf",
    dim_reduction="kbest",
    repetitions=10,
    cross_val_method="loocv",
)

# Show the mean, then the standard deviation, of the F1 score per class
# (rows) and per dataset (columns). A generator is used so that no helper
# names are left behind in the notebook namespace.
display(
    *(
        getattr(df_scores.groupby(["label", "dataset"], as_index=False), stat)()
        .pivot(index="label", columns="dataset", values="F1 score")
        for stat in ("mean", "std")
    )
)