# PAAC Feature evaluation

# Imports

In [2]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [3]:
# Build the amino-acid vs. sugar transporter dataset from the Swiss-Prot export.
df = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/ecoli_amino_sugar_dataset.log",
    # Keyword filters: substrate classes, membrane component, transport annotation.
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    # Taxonomy ID 83333 -- presumably E. coli K-12, consistent with the log file name.
    tax_ids_filter=[83333],
    # Accessions excluded from the dataset as outliers.
    outliers=["P76773", "Q47706", "P02943", "P75733", "P69856", "P64550"],
    # Sequence clustering threshold; the cell output reports the cd-hit result.
    sequence_clustering=70,
    verbose=True,
)

cd-hit: clustered 99 sequences into 98 clusters at threshold 70


# Feature generation

In [4]:
# Class labels come from the `keywords_transport` column; show the class balance.
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    51
Sugar transport         47
Name: keywords_transport, dtype: int64

In [5]:
# Compute the PAAC feature table from the protein sequences (one row per accession).
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.030189,0.007547,0.007547,0.000000,0.007547,0.030189,0.000000,0.015094,0.000000,0.018868,...,0.000000,0.003774,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
P36672,0.006356,0.000000,0.000000,0.000000,0.002119,0.006356,0.002119,0.010593,0.002119,0.016949,...,0.000000,0.000000,0.002119,0.006356,0.004237,0.000000,0.000000,0.000000,0.002119,0.002119
P56580,0.003145,0.003145,0.000000,0.009434,0.003145,0.003145,0.003145,0.009434,0.003145,0.012579,...,0.000000,0.000000,0.003145,0.003145,0.000000,0.000000,0.000000,0.003145,0.000000,0.000000
P0AA47,0.004435,0.002217,0.000000,0.002217,0.011086,0.008869,0.004435,0.011086,0.002217,0.024390,...,0.002217,0.000000,0.002217,0.000000,0.000000,0.000000,0.002217,0.006652,0.000000,0.000000
P08722,0.006410,0.001603,0.006410,0.001603,0.004808,0.014423,0.001603,0.009615,0.001603,0.012821,...,0.000000,0.000000,0.001603,0.001603,0.000000,0.003205,0.001603,0.001603,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P19642,0.007561,0.001890,0.001890,0.000000,0.007561,0.011342,0.000000,0.011342,0.001890,0.015123,...,0.001890,0.001890,0.000000,0.000000,0.000000,0.000000,0.000000,0.003781,0.000000,0.001890
P0AAD4,0.009950,0.000000,0.007463,0.000000,0.002488,0.024876,0.000000,0.007463,0.004975,0.029851,...,0.002488,0.000000,0.002488,0.002488,0.002488,0.000000,0.002488,0.000000,0.000000,0.000000
P23173,0.002415,0.000000,0.002415,0.002415,0.007246,0.009662,0.000000,0.012077,0.000000,0.009662,...,0.000000,0.000000,0.002415,0.000000,0.000000,0.002415,0.000000,0.000000,0.000000,0.000000
P33361,0.018229,0.005208,0.000000,0.000000,0.002604,0.007812,0.000000,0.013021,0.000000,0.046875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.002604,0.000000,0.000000,0.000000,0.000000


## Independent test set

In [6]:
# Convert the feature table and labels into arrays, keeping feature and sample names.
X, y, feature_names, sample_names = preprocess_pandas(df_paac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set.
# NOTE(review): no seed is passed here -- confirm whether the split is deterministic.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)

## Model comparison

In [7]:
# Quick baseline: score a set of default classifiers on the training split only.
# NOTE(review): output columns 0-4 look like per-fold cross-validation scores -- confirm in subpred.eval.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.36,0.686,0.619,0.583,0.641,0.578,0.127
KNeighborsClassifier(),0.561,0.875,0.686,0.354,0.55,0.605,0.192
"LinearSVC(class_weight='balanced', max_iter=1000000.0)",0.873,0.746,0.937,0.533,0.4,0.698,0.227
LinearSVC(max_iter=1000000.0),0.873,0.746,0.937,0.533,0.4,0.698,0.227
RandomForestClassifier(),0.564,0.75,0.873,0.525,0.525,0.647,0.157
RandomForestClassifier(class_weight='balanced'),0.6,0.654,0.812,0.732,0.533,0.666,0.109
SGDClassifier(),0.564,0.812,0.812,0.598,0.498,0.657,0.146
SVC(),0.733,0.873,0.654,0.55,0.525,0.667,0.142
SVC(class_weight='balanced'),0.733,0.875,0.937,0.444,0.533,0.705,0.213


## Parameter tuning

Results are not as good without feature selection or PCA. The RBF kernel performs slightly better than the linear kernel (0.705 vs. 0.698).

In [8]:
# Hyperparameter search for the linear-kernel SVM on the training split.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear")

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.698


In [9]:
# Hyperparameter search for the RBF-kernel SVM on the training split.
gsearch = optimize_hyperparams(X_train, y_train, kernel="rbf")

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.705


## Dimensionality reduction

In [10]:
# RBF-kernel search with PCA as dimensionality reduction and an explicit C / gamma grid.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[0.1, 1, 10],
    gamma=[1.0, 0.1, 0.01, 0.001, "scale"],
)
# Keep a dedicated handle on this result; `gsearch` is reassigned by every search cell.
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.8300000000000001, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1}
0.792


Note: Feature selection with KBest showed slightly lower performance and also raised exceptions due to constant features.

## Validation

### PCA

The model correctly predicts all amino-acid transporters, but only 60% of the sugar transporters. PAAC alone is not a suitable feature for prediction in E. coli.

It is unclear why the results are 0.05 better in the old notebook, as all the parameters are the same. The reason could be an older version of a program in the old conda environment, or the fact that we used the Intel-accelerated version of sklearn in the previous version of the package.

In [11]:
# Confusion matrix of the PCA + RBF search result on the held-out test split.
# NOTE(review): `labels` is the full label Series; presumably only used for class names -- confirm.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,0
Sugar transport,4,6


In [12]:
# Per-class precision, recall and F1 of the PCA + RBF search result on the held-out test split.
# NOTE(review): `labels` is the full label Series; presumably only used for class names -- confirm.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.714,1.0,0.833,10
Sugar transport,1.0,0.6,0.75,10
macro avg,0.857,0.8,0.792,20
weighted avg,0.857,0.8,0.792,20


## Estimating validation variance

Mean and standard deviation for randomly selected training and validation sets.

In [13]:
# Repeat the PCA + RBF pipeline 10 times; returns the scores and the chosen parameters.
df_scores, df_params = full_test(
    df_paac,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    repetitions=10,
)

In [14]:
# Group the scores by class label and dataset split; as_index=False keeps both keys
# as regular columns so the aggregated frame can be pivoted on them afterwards.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

#### Mean F1

The scores do not improve on average, and are worse than for AAC.

In [15]:
# Mean F1 score per class (rows) and dataset split (columns).
df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score")

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7507,0.7667
Sugar transport,0.7111,0.7497


#### Standard deviation F1

In [16]:
# Standard deviation of the F1 score per class (rows) and dataset split (columns).
df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score")

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.079595,0.04601
Sugar transport,0.145187,0.047798


#### Parameters

In [17]:
# Parameters returned by full_test; the output shows one column per repetition.
df_params

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.9,0.86,0.95,0.98,0.8,0.8,0.9,0.94,0.88,0.94
svc__C,1,10,1,1,10,1,10,10,1.0,1
svc__class_weight,balanced,balanced,,,balanced,balanced,balanced,balanced,,
svc__gamma,0.1,0.01,scale,scale,scale,0.1,0.01,0.01,0.01,scale


## Second outlier

What would happen if we removed the second potential outlier (P56579) that was found in notebook 1?

In [18]:
# Drop the additional candidate outlier P56579 and rerun the repeated evaluation.
df_filtered = df.loc[df.index != "P56579"]
labels_filtered = df_filtered["keywords_transport"]
df_paac_filtered = calculate_paac(df_filtered["sequence"])
df_scores, df_params = full_test(
    df_paac_filtered,
    labels_filtered,
    kernel="rbf",
    dim_reduction="pca",
    repetitions=10,
)

In [23]:
# Summarise the rerun: mean and standard deviation of F1 per class and dataset split.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
for title, aggregate in (("Mean F1", df_scores_gr.mean), ("F1 sdev", df_scores_gr.std)):
    print(title)
    display(aggregate().pivot(index="label", columns="dataset", values="F1 score"))

Mean F1


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7848,0.7987
Sugar transport,0.762,0.7943


F1 sdev


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.112179,0.032366
Sugar transport,0.118061,0.038733


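Removing this additional outlier (P56579, the seventh removed overall) increases the average F1 scores on both the training and test sets. On the test set, the standard deviation increases for amino-acid transporters (0.080 → 0.112) but decreases for sugar transporters (0.145 → 0.118); on the training set it decreases for both classes.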