# AAC feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_aac

# Dataset

In [2]:
# Build the amino-acid vs. sugar transporter dataset from the reviewed
# Swiss-Prot export, restricted to taxonomy id 9606 and to transmembrane
# transport proteins. Two accessions are excluded as outliers, and the
# sequences are clustered at a threshold of 70 (see the cd-hit line in the
# cell output). The filtering log is written to ../logs/.
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[9606],
    output_log="../logs/human_amino_sugar_dataset.log",
    outliers=["Q9HBR0", "Q07837"],
    sequence_clustering=70,
)

cd-hit: clustered 85 sequences into 81 clusters at threshold 70


# Feature generation

## Labels

In [3]:
# Class labels: the transport keyword column of the dataset frame.
labels = df["keywords_transport"]
# Show the class balance (last expression, so the notebook renders it).
labels.value_counts()

Amino-acid transport    48
Sugar transport         33
Name: keywords_transport, dtype: int64

There will be a relatively low number of samples available for training and testing: only about 0.8\*0.8\*33 ≈ 21 sugar transporters remain for training. This is likely not enough, so we will try LOOCV later.

## AAC

In [4]:
# Amino acid composition features computed from the protein sequences; the
# rendered output shows one row per Uniprot accession and one column per
# amino acid letter.
df_aac = calculate_aac(df["sequence"])
# Display the feature table.
df_aac

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Q9BRV3,0.058824,0.022624,0.031674,0.018100,0.058824,0.072398,0.009050,0.045249,0.022624,0.176471,0.027149,0.031674,0.040724,0.045249,0.045249,0.063348,0.072398,0.072398,0.027149,0.058824
Q5M8T2,0.096154,0.024038,0.024038,0.050481,0.038462,0.093750,0.012019,0.052885,0.026442,0.129808,0.021635,0.019231,0.048077,0.028846,0.048077,0.088942,0.062500,0.098558,0.009615,0.026442
Q969S0,0.069486,0.018127,0.015106,0.033233,0.090634,0.060423,0.018127,0.090634,0.045317,0.126888,0.036254,0.045317,0.036254,0.024169,0.030211,0.063444,0.063444,0.075529,0.018127,0.039275
O75387,0.076923,0.026834,0.028623,0.035778,0.064401,0.078712,0.012522,0.046512,0.028623,0.148479,0.035778,0.035778,0.046512,0.041145,0.039356,0.075134,0.076923,0.057245,0.016100,0.028623
Q9NTN3,0.090141,0.011268,0.030986,0.039437,0.076056,0.076056,0.005634,0.061972,0.061972,0.129577,0.030986,0.039437,0.022535,0.025352,0.028169,0.059155,0.076056,0.081690,0.011268,0.042254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q96A29,0.082418,0.024725,0.024725,0.032967,0.054945,0.079670,0.008242,0.038462,0.038462,0.162088,0.027473,0.035714,0.038462,0.021978,0.032967,0.079670,0.074176,0.082418,0.024725,0.035714
Q9BYW1,0.098790,0.032258,0.012097,0.054435,0.048387,0.100806,0.010081,0.064516,0.018145,0.157258,0.032258,0.018145,0.046371,0.034274,0.060484,0.062500,0.044355,0.066532,0.016129,0.022177
P14672,0.086444,0.005894,0.011788,0.051081,0.053045,0.104126,0.007859,0.066798,0.015717,0.155206,0.023576,0.025540,0.058939,0.045187,0.045187,0.070727,0.051081,0.080550,0.013752,0.027505
Q96AA3,0.075786,0.018484,0.018484,0.040665,0.072089,0.070240,0.016636,0.046211,0.031423,0.179298,0.014787,0.027726,0.029575,0.033272,0.046211,0.068392,0.066543,0.090573,0.022181,0.031423


### Independent test set

In [5]:
# Turn the AAC table and the labels into X / y, keeping the feature and
# sample names alongside (return_names=True).
X, y, feature_names, sample_names = preprocess_pandas(df_aac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set; the sample names
# are split in the same way so that test proteins can be identified later.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)

### Model selection


In [6]:
# Compare several classifiers on the training split only. The rendered table
# has one row per estimator, one column per fold (0-4), plus mean and std.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.764,0.764,0.567,0.462,0.486,0.608,0.147
KNeighborsClassifier(),0.615,0.458,0.567,0.513,0.829,0.596,0.143
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.536,0.536,0.675,0.536,0.486,0.554,0.071
"LinearSVC(max_iter=1000000.0, random_state=0)",0.35,0.448,0.745,0.536,0.438,0.503,0.15
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.536,0.511,0.381,0.536,0.556,0.504,0.07
RandomForestClassifier(random_state=0),0.615,0.511,0.381,0.536,0.625,0.534,0.099
SGDClassifier(random_state=0),0.35,0.606,0.639,0.536,0.333,0.493,0.143
"SVC(class_weight='balanced', random_state=0)",0.512,0.575,0.567,0.462,0.667,0.557,0.077
SVC(random_state=0),0.675,0.458,0.567,0.405,0.697,0.561,0.129


### Parameter tuning

The RBF kernel delivers slightly better results than the linear kernel:

In [7]:
# Hyperparameter search for a linear-kernel SVM without dimensionality
# reduction. The selected parameters and the score are printed below.
# NOTE: `gsearch` is reassigned by the following search cells.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", dim_reduction=None)

{'linearsvc__C': 0.1, 'linearsvc__class_weight': None, 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.556


In [8]:
# Hyperparameter search for an RBF-kernel SVM without dimensionality
# reduction, restricted to an explicit list of candidate values for C.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction=None,
    C=[1, 0.1, 10],
)
# Keep the result under its own name, because `gsearch` is overwritten by the
# next search cell.
best_estimator_svc = gsearch

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.624


### Dimensionality reduction

Kbest performs slightly better:

In [9]:
# RBF-kernel SVM preceded by "kbest" feature selection. No C / gamma override
# is passed, so optimize_hyperparams uses its own defaults for those.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
)
# Kept under its own name for the validation cells further down.
best_estimator_svc_kbest = gsearch

{'selectkbest__k': 3, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.664


PCA yields a similar cross-validation score:

In [10]:
# RBF-kernel SVM preceded by PCA. No C / gamma override is passed, so
# optimize_hyperparams uses its own defaults for those.
gsearch = optimize_hyperparams(X_train, y_train, kernel="rbf", dim_reduction="pca")
# Kept under its own name for the validation cells further down.
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.94, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.659


### Validation

The test set is made up of only 10 amino-acid and 7 sugar transporters, so a single misclassification can already change the score by quite a large amount. The sample count does not seem to be enough for a good model.

The PCA model seems to lead to less overfitting, having better scores on the test set.

In [11]:
# Confusion matrix of the Kbest model on the held-out test set
# (rows: observed class, columns: predicted class).
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,6,4
Sugar transport,2,5


In [12]:
# Per-class precision / recall / F1 of the Kbest model on the held-out test set.
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.75,0.6,0.667,10
Sugar transport,0.556,0.714,0.625,7
macro avg,0.653,0.657,0.646,17
weighted avg,0.67,0.647,0.65,17


In [13]:
# Confusion matrix of the PCA model on the held-out test set
# (rows: observed class, columns: predicted class).
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,8,2
Sugar transport,2,5


In [14]:
# Per-class precision / recall / F1 of the PCA model on the held-out test set.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.8,0.8,0.8,10
Sugar transport,0.714,0.714,0.714,7
macro avg,0.757,0.757,0.757,17
weighted avg,0.765,0.765,0.765,17


## Checking dependence on train test split

The performance on the sugar transporter test set in particular is not good; it is the class with the fewest samples.

#### PCA model:

The average model performance does not improve with other train test splits:

In [15]:
# Run the complete PCA + RBF-SVM evaluation with repetitions=10 on the AAC
# features, collecting the scores and the selected hyperparameters.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
)

# One group per (class label, train/test dataset) combination.
scores_grouped = df_scores.groupby(["label", "dataset"], as_index=False)

# Mean F1 score: classes as rows, train/test as columns.
display(scores_grouped.mean().pivot(index="label", columns="dataset", values="F1 score"))
# Standard deviation of the F1 score within each group, same layout.
display(scores_grouped.std().pivot(index="label", columns="dataset", values="F1 score"))

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7273,0.8137
Sugar transport,0.5766,0.7316


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.107377,0.040288
Sugar transport,0.168679,0.082974


#### Kbest

The same is true for kbest:

In [16]:
# Run the complete Kbest + RBF-SVM evaluation with repetitions=10 on the AAC
# features, collecting the scores and the selected hyperparameters.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="kbest",
    kernel="rbf",
    repetitions=10,
)

# One group per (class label, train/test dataset) combination.
scores_grouped = df_scores.groupby(["label", "dataset"], as_index=False)

# Mean F1 score: classes as rows, train/test as columns.
display(scores_grouped.mean().pivot(index="label", columns="dataset", values="F1 score"))
# Standard deviation of the F1 score within each group, same layout.
display(scores_grouped.std().pivot(index="label", columns="dataset", values="F1 score"))

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7324,0.8426
Sugar transport,0.5921,0.7394


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.1023,0.075029
Sugar transport,0.146802,0.101609


## LOOCV

To increase the number of samples available for training, we carry out a nested leave-one-out cross-validation:

In [17]:
# Same PCA + RBF-SVM evaluation as before, but with cross_val_method="loocv"
# (leave-one-out) instead of the default cross-validation scheme.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    cross_val_method="loocv",
)

# One group per (class label, train/test dataset) combination.
scores_grouped = df_scores.groupby(["label", "dataset"], as_index=False)

# Mean F1 score: classes as rows, train/test as columns.
display(scores_grouped.mean().pivot(index="label", columns="dataset", values="F1 score"))
# Standard deviation of the F1 score within each group, same layout.
display(scores_grouped.std().pivot(index="label", columns="dataset", values="F1 score"))

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7184,0.8587
Sugar transport,0.5211,0.7627


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.080949,0.067067
Sugar transport,0.139697,0.11533


In [18]:
# Hyperparameters returned by the preceding full_test call; the rendered table
# has one row per parameter and one column (0-9) per repetition.
df_params

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.8,0.91,0.84,0.96,0.9,0.84,0.84,0.82,0.87,0.87
svc__C,1,1,10,1,10,1,10.0,1,1.0,1
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,,,,balanced
svc__gamma,scale,0.1,0.01,0.01,scale,0.1,0.01,scale,0.1,0.1


With LOOCV, the average F1 scores on the training data improve slightly and the standard deviations of the test scores shrink, but the test F1 scores themselves do not improve. It becomes clear that AAC alone is not suitable for this classification task.

In [19]:
# Kbest + RBF-SVM evaluation with cross_val_method="loocv" (leave-one-out),
# again with repetitions=10 on the AAC features.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="kbest",
    kernel="rbf",
    repetitions=10,
    cross_val_method="loocv",
)

# One group per (class label, train/test dataset) combination.
scores_grouped = df_scores.groupby(["label", "dataset"], as_index=False)

# Mean F1 score: classes as rows, train/test as columns.
display(scores_grouped.mean().pivot(index="label", columns="dataset", values="F1 score"))
# Standard deviation of the F1 score within each group, same layout.
display(scores_grouped.std().pivot(index="label", columns="dataset", values="F1 score"))

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.6965,0.8873
Sugar transport,0.5266,0.8102


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.117582,0.070659
Sugar transport,0.201974,0.100418


## Without Sideroflexin

Does removing the Sideroflexin proteins improve classification performance?

In [20]:
# Boolean mask that keeps every protein whose name does NOT start with "Side"
# (this is how the Sideroflexin entries are dropped). na=False makes a missing
# protein name count as "no match"; without it, str.startswith would return
# NaN for such rows and the `~` inversion would fail on the resulting
# object-dtype Series.
mask_sideroflexin = ~df.protein_names.str.startswith("Side", na=False)

# PCA + RBF-SVM evaluation with leave-one-out cross-validation on the
# remaining proteins; features and labels are filtered with the same mask so
# they stay aligned.
df_scores, df_params = full_test(
    df_aac.loc[mask_sideroflexin],
    labels.loc[mask_sideroflexin],
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    cross_val_method="loocv",
)
# Mean F1 score: classes as rows, train/test as columns.
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)
# Standard deviation of the F1 score within each group, same layout.
display(
    df_scores.groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)
# Last expression: render the hyperparameters returned by full_test.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7428,0.8386
Sugar transport,0.5839,0.7523


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.046621,0.042636
Sugar transport,0.133887,0.074288


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.94,0.98,0.8,0.8,0.93,0.8,0.83,0.89,0.84,0.97
svc__C,10.0,10,1,1,10,1.0,10,10,10,10.0
svc__class_weight,,balanced,,balanced,balanced,,balanced,balanced,balanced,
svc__gamma,0.01,0.01,scale,scale,scale,0.1,0.01,0.01,scale,0.01


No, removing these proteins does not make much of a difference. They only occurred in the PCA of PAAC anyway.