# PAAC Feature evaluation

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test,
    nested_loocv
)
from subpred.compositions import calculate_paac

# Dataset

In [2]:
# Build the working dataset from the reviewed Swiss-Prot export.
# NOTE(review): the exact filter semantics live in subpred.transporter_dataset.create_dataset;
# the per-argument notes below describe only what this call and its output show.
df = create_dataset(
    keywords_substrate_filter = ["Amino-acid transport","Sugar transport"],  # the two classes compared in this notebook
    keywords_component_filter = ["Transmembrane"],
    keywords_transport_filter = ["Transport"],
    input_file = "../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate = "integrate",  # NOTE(review): handling of multi-substrate proteins is defined in create_dataset; confirm
    verbose = True,
    tax_ids_filter = [9606],  # NCBI taxonomy id 9606 (Homo sapiens), matching the "human_" log file name
    output_log = "../logs/human_amino_sugar_dataset.log",
    outliers=["Q9HBR0", "Q07837"],  # accessions passed as outliers; presumably excluded from the result, confirm
    sequence_clustering=70  # cd-hit threshold; the cell output reports 85 sequences -> 81 clusters
)

cd-hit: clustered 85 sequences into 81 clusters at threshold 70


# Feature generation

In [3]:
# Class label of every protein; the displayed counts show the class balance.
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    48
Sugar transport         33
Name: keywords_transport, dtype: int64

In [4]:
# Compute the PAAC feature table from the protein sequences and show it.
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9BRV3,0.004545,0.004545,0.004545,0.000000,0.000000,0.009091,0.000000,0.000000,0.004545,0.009091,...,0.000000,0.000000,0.009091,0.000000,0.000000,0.000000,0.000000,0.000000,0.004545,0.000000
Q5M8T2,0.012048,0.002410,0.002410,0.000000,0.002410,0.014458,0.002410,0.002410,0.002410,0.004819,...,0.002410,0.000000,0.000000,0.002410,0.000000,0.000000,0.000000,0.007229,0.000000,0.000000
Q969S0,0.000000,0.000000,0.003030,0.000000,0.003030,0.003030,0.000000,0.006061,0.003030,0.021212,...,0.000000,0.003030,0.000000,0.000000,0.000000,0.003030,0.006061,0.003030,0.000000,0.003030
O75387,0.005376,0.007168,0.000000,0.003584,0.007168,0.001792,0.000000,0.003584,0.000000,0.010753,...,0.001792,0.000000,0.000000,0.000000,0.003584,0.003584,0.003584,0.000000,0.000000,0.001792
Q9NTN3,0.008475,0.000000,0.002825,0.008475,0.008475,0.005650,0.000000,0.002825,0.002825,0.008475,...,0.000000,0.005650,0.000000,0.000000,0.002825,0.002825,0.000000,0.002825,0.000000,0.002825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q96A29,0.002755,0.008264,0.000000,0.008264,0.002755,0.000000,0.002755,0.005510,0.002755,0.016529,...,0.000000,0.005510,0.000000,0.002755,0.000000,0.000000,0.005510,0.005510,0.002755,0.002755
Q9BYW1,0.010101,0.008081,0.000000,0.000000,0.002020,0.014141,0.000000,0.010101,0.002020,0.018182,...,0.000000,0.002020,0.002020,0.000000,0.000000,0.002020,0.002020,0.002020,0.000000,0.000000
P14672,0.007874,0.000000,0.001969,0.005906,0.003937,0.009843,0.000000,0.007874,0.000000,0.007874,...,0.000000,0.003937,0.000000,0.000000,0.000000,0.003937,0.000000,0.007874,0.000000,0.001969
Q96AA3,0.007407,0.001852,0.000000,0.001852,0.011111,0.003704,0.003704,0.001852,0.003704,0.007407,...,0.000000,0.001852,0.000000,0.000000,0.001852,0.003704,0.001852,0.005556,0.000000,0.001852


## Independent test set

In [5]:
# Convert the feature table and labels into arrays, keeping feature and sample names.
X, y, feature_names, sample_names = preprocess_pandas(df_paac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set; the sample names are
# split alongside the data so every row stays identifiable.
train_test_parts = get_independent_test_set(
    X, y, sample_names=sample_names, test_size=0.2
)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = train_test_parts

## Model comparison

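Before hyperparameter optimization, the linear SVC has the best mean score (0.723), but also the highest variance across folds (std 0.181). This instability could be an indicator of overfitting, which can be caused by the low sample count.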

In [6]:
# Quick baseline of several untuned classifiers on the training split only;
# the output lists five scores per estimator (columns 0-4) plus their mean and std.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.69,0.575,0.511,0.69,0.556,0.605,0.082
KNeighborsClassifier(),0.606,0.639,0.639,0.606,0.733,0.645,0.052
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.921,0.536,0.639,0.606,0.911,0.723,0.181
"LinearSVC(max_iter=1000000.0, random_state=0)",0.921,0.536,0.639,0.606,0.911,0.723,0.181
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.707,0.707,0.567,0.35,0.556,0.577,0.146
RandomForestClassifier(random_state=0),0.707,0.819,0.511,0.511,0.556,0.621,0.137
SGDClassifier(random_state=0),0.675,0.69,0.675,0.536,0.812,0.678,0.098
"SVC(class_weight='balanced', random_state=0)",0.819,0.511,0.381,0.639,0.697,0.61,0.169
SVC(random_state=0),0.381,0.567,0.381,0.35,0.368,0.409,0.089


## Parameter tuning

Results are okay without feature selection or PCA. The linear kernel performs better than the RBF kernel (0.762 vs. 0.61). The linear kernel prefers a relatively high value of C (100), while the RBF kernel chooses the default C of 1.

In [7]:
# Grid search for a linear-kernel SVM on all features (no dim_reduction argument),
# trying the listed C candidates; the best parameters and score are printed below.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", C=[1, 10, 100, 1000])

{'linearsvc__C': 100, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0}
0.762


In [8]:
# Same search with an RBF kernel. No C grid is passed here, so the candidates
# come from optimize_hyperparams itself (the printed best value is C=1).
gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf"
)

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.61


## Dimensionality reduction

### Linear kernel

In [9]:
# Linear kernel with PCA as dimensionality reduction; the C grid is extended
# towards small values (down to 0.001) compared to the search without PCA.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="pca",
    C=[1, 0.01, 0.1, 10, 0.001],
)

{'linearsvc__C': 0.01, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.8}
0.755


In [10]:
# Linear kernel with k-best feature selection. The search result is kept under a
# dedicated name (in addition to `gsearch`) so the validation section can reuse it.
best_estimator_lsvc_kbest = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction="kbest", C=[1, 10, 100]
)

{'linearsvc__C': 10, 'linearsvc__class_weight': None, 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0, 'selectkbest__k': 171}
0.822


### RBF kernel

In [11]:
# RBF kernel with k-best feature selection. No explicit gamma grid is passed;
# the printed best parameters still contain svc__gamma, so the helper supplies
# its own gamma candidates. The result is kept for the validation section.
best_estimator_svc_kbest = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="kbest", C=[1, 10, 100]
)

{'selectkbest__k': 58, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.796


In [12]:
# RBF kernel with PCA as dimensionality reduction. As in the k-best search, no
# explicit gamma grid is passed and the helper's own gamma candidates are used.
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", C=[0.1, 1, 10]
)

{'pca__n_components': 0.8200000000000001, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.77


The linear kernel with feature selection shows the best result here.

## Validation

### Linear kernel

In [13]:
# Confusion matrix of the linear-kernel + k-best search result on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_lsvc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,8,2
Sugar transport,2,5


In [14]:
# Per-class precision / recall / F1 of the linear-kernel + k-best search result on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_lsvc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.8,0.8,0.8,10
Sugar transport,0.714,0.714,0.714,7
macro avg,0.757,0.757,0.757,17
weighted avg,0.765,0.765,0.765,17


### RBF kernel

In [15]:
# Confusion matrix of the RBF-kernel + k-best search result on the held-out test split.
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,9,1
Sugar transport,0,7


In [16]:
# Per-class precision / recall / F1 of the RBF-kernel + k-best search result on the held-out test split.
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,0.9,0.947,10
Sugar transport,0.875,1.0,0.933,7
macro avg,0.938,0.95,0.94,17
weighted avg,0.949,0.941,0.942,17


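The RBF kernel with k-best feature selection works well on the independent test set (macro F1 of 0.94, compared to 0.757 for the linear kernel), although this test set contains only 17 samples.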

## Alternative eval with nested LOOCV

In [17]:
# Nested LOOCV on the full feature table (no separate hold-out split): RBF kernel, no dimensionality reduction.
nested_loocv(df_features=df_paac, labels=labels, dim_reduction=None, kernel="rbf")

Unnamed: 0,train,test
F1 (macro),0.756,0.693


In [18]:
# Nested LOOCV on the full feature table: RBF kernel with PCA as dimensionality reduction.
nested_loocv(df_features=df_paac, labels=labels, dim_reduction="pca", kernel="rbf")

Unnamed: 0,train,test
F1 (macro),0.882,0.839


## Estimating validation variance

Mean and standard deviation for randomly selected training and validation sets.

In [19]:
# Repeat the full evaluation (RBF kernel, k-best feature selection) ten times
# to see how much the scores depend on the split.
df_scores, df_params = full_test(
    df_paac,
    labels,
    dim_reduction="kbest",
    kernel="rbf",
    repetitions=10,
)

# Aggregate the F1 scores per class and per dataset (train / test) over all repetitions.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
f1_mean = df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score")
f1_std = df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score")

display(f1_mean)
display(f1_std)
# Parameters selected in each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7855,0.8579
Sugar transport,0.5691,0.722


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.090347,0.037731
Sugar transport,0.18846,0.087481


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
selectkbest__k,155,16,71,46,86,131,51,98,233,153
svc__C,1,10,10,10,10,10,1,1,1,10
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,,balanced,balanced
svc__gamma,scale,scale,0.01,0.01,0.01,scale,0.01,scale,scale,scale


The performance for the minority class is still not very good on the independent test set.

## Additional outliers

What happens when we remove the Sideroflexins? 

In [20]:
# Drop every protein whose name starts with "Side" (the Sideroflexins) and repeat
# the repeated evaluation with PCA + RBF kernel and LOOCV as cross-validation method.
# na=False keeps the mask strictly boolean even if a protein name is missing;
# without it a missing entry would break the `~` negation or the boolean indexing.
mask_sideroflexin = ~df.protein_names.str.startswith("Side", na=False)
df_scores, df_params = full_test(
    df_paac.loc[mask_sideroflexin],
    labels.loc[mask_sideroflexin],
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
    cross_val_method="loocv",
)
# Group once and reuse the grouping for both summary tables:
# mean and standard deviation of the F1 score per class and dataset (train / test).
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
display(df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score"))
display(df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score"))
# Parameters selected in each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8377,0.952
Sugar transport,0.7959,0.9286


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.102214,0.023195
Sugar transport,0.151627,0.03566


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.99,0.95,0.99,0.99,0.98,0.97,0.97,0.98,0.95,0.99
svc__C,1,1,1,1,1,1,1,1,1,1
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01


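Removing the outliers that stood out in the PAAC PCA plot improves the score by a good amount! Note that, compared to the previous run, this run also switches from k-best to PCA and uses LOOCV, so the gain cannot be attributed to the outlier removal alone. The SVC parameters are the same across all repetitions (only the PCA component fraction varies slightly, between 0.95 and 0.99), which is a good sign for a stable model. Can overfitting be reduced by using a linear kernel?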

In [21]:
# Same evaluation without the Sideroflexins (protein names starting with "Side"),
# but with a linear kernel instead of RBF.
# na=False keeps the mask strictly boolean even if a protein name is missing;
# without it a missing entry would break the `~` negation or the boolean indexing.
mask_sideroflexin = ~df.protein_names.str.startswith("Side", na=False)
df_scores, df_params = full_test(
    df_paac.loc[mask_sideroflexin],
    labels.loc[mask_sideroflexin],
    dim_reduction="pca",
    kernel="linear",
    repetitions=10,
    cross_val_method="loocv",
)
# Group once and reuse the grouping for both summary tables:
# mean and standard deviation of the F1 score per class and dataset (train / test).
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
display(df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score"))
display(df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score"))
# Parameters selected in each repetition.
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8138,0.8673
Sugar transport,0.6587,0.7962


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.062464,0.044592
Sugar transport,0.176582,0.068775


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
linearsvc__C,0.1,1,1,0.1,0.1,1,1,1,1,10
linearsvc__class_weight,balanced,balanced,balanced,,,balanced,balanced,balanced,balanced,balanced
linearsvc__dual,True,True,True,True,True,True,True,True,True,True
linearsvc__max_iter,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0,100000000.0
pca__n_components,0.96,0.97,0.95,0.87,0.93,0.97,0.99,0.8,0.91,0.85


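Partly: the gap between train and test scores shrinks for the amino-acid class (0.867 vs. 0.814), but not for the sugar class (0.796 vs. 0.659), and the scores are lower in general compared to RBF.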