# PAAC Feature evaluation

During the dataset evaluation, we found that the E. coli transporters form their own cluster in the PCA plot. How does the model perform without the E. coli transporters?

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [2]:
# Accessions handed to create_dataset via its `outliers` argument.
outliers = [
    "Q9HBR0",
    "Q07837",
    "O81775",
    "Q9SW07",
    "Q9FHH5",
    "Q8S8A0",
    "Q3E965",
    "Q3EAV6",
    "Q3E8L0",
]

# Taxonomy ID -> display name; the keys double as the organism filter below.
taxid_to_organism = {3702: "A. thaliana", 9606: "Human", 559292: "Yeast"}

# Build the amino-acid/sugar transporter dataset and attach a readable
# organism column in a single chained expression.
df = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/meta_amino_sugar_dataset.log",
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    tax_ids_filter=list(taxid_to_organism),
    multi_substrate="integrate",
    outliers=outliers,
    sequence_clustering=70,
    verbose=True,
).assign(organism=lambda frame: frame.organism_id.map(taxid_to_organism))


cd-hit: clustered 314 sequences into 249 clusters at threshold 70


# Feature generation

In [3]:
# Class labels: one transport keyword per protein; show the class balance.
labels = df["keywords_transport"]
labels.value_counts()

Sugar transport         134
Amino-acid transport    115
Name: keywords_transport, dtype: int64

In [4]:
# PAAC feature table computed from the protein sequences; the rendered
# output shows one row per Uniprot accession and columns "AA" ... "YY".
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9SFG0,0.005929,0.001976,0.000000,0.000000,0.005929,0.005929,0.000000,0.003953,0.003953,0.009881,...,0.001976,0.000000,0.000000,0.000000,0.001976,0.000000,0.000000,0.003953,0.000000,0.000000
Q08986,0.008532,0.000000,0.000000,0.006826,0.006826,0.003413,0.001706,0.008532,0.006826,0.015358,...,0.001706,0.001706,0.000000,0.001706,0.003413,0.001706,0.000000,0.003413,0.003413,0.001706
Q9BRV3,0.004545,0.004545,0.004545,0.000000,0.000000,0.009091,0.000000,0.000000,0.004545,0.009091,...,0.000000,0.000000,0.009091,0.000000,0.000000,0.000000,0.000000,0.000000,0.004545,0.000000
Q84WN3,0.004167,0.000000,0.000000,0.004167,0.004167,0.004167,0.000000,0.008333,0.004167,0.008333,...,0.004167,0.000000,0.000000,0.000000,0.004167,0.000000,0.000000,0.004167,0.000000,0.004167
O04249,0.005859,0.001953,0.000000,0.003906,0.009766,0.011719,0.001953,0.003906,0.001953,0.005859,...,0.000000,0.000000,0.000000,0.001953,0.000000,0.003906,0.000000,0.001953,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q94EI9,0.005900,0.000000,0.005900,0.005900,0.002950,0.008850,0.000000,0.011799,0.000000,0.014749,...,0.000000,0.002950,0.002950,0.002950,0.000000,0.000000,0.000000,0.005900,0.000000,0.002950
Q92536,0.005837,0.003891,0.001946,0.003891,0.005837,0.000000,0.000000,0.011673,0.001946,0.015564,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.003891,0.001946,0.001946,0.001946,0.001946
F4IHS9,0.002933,0.000000,0.000000,0.000000,0.002933,0.002933,0.000000,0.008798,0.002933,0.005865,...,0.000000,0.000000,0.000000,0.008798,0.000000,0.005865,0.002933,0.000000,0.000000,0.000000
Q04162,0.005415,0.001805,0.001805,0.000000,0.007220,0.000000,0.000000,0.014440,0.000000,0.009025,...,0.001805,0.000000,0.003610,0.000000,0.000000,0.003610,0.003610,0.005415,0.000000,0.001805


## Independent test set

In [5]:
# Convert the feature table and labels to arrays, keeping feature and sample names.
X, y, feature_names, sample_names = preprocess_pandas(df_paac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set.
split = get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = split

## Model comparison



In [6]:
# Baseline comparison of several default classifiers on the training split only.
# The rendered table lists five scores per estimator plus their mean and std
# (presumably one score per cross-validation fold -- confirm in subpred.eval).
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.824,0.749,0.774,0.749,0.795,0.778,0.032
KNeighborsClassifier(),0.774,0.635,0.749,0.749,0.715,0.725,0.054
"LinearSVC(class_weight='balanced', max_iter=1000000.0, random_state=0)",0.875,0.749,0.75,0.775,0.845,0.799,0.058
"LinearSVC(max_iter=1000000.0, random_state=0)",0.875,0.749,0.75,0.775,0.845,0.799,0.058
"RandomForestClassifier(class_weight='balanced', random_state=0)",0.819,0.822,0.795,0.768,0.717,0.784,0.044
RandomForestClassifier(random_state=0),0.763,0.822,0.74,0.768,0.743,0.767,0.033
SGDClassifier(random_state=0),0.85,0.67,0.848,0.775,0.82,0.793,0.075
"SVC(class_weight='balanced', random_state=0)",0.899,0.799,0.819,0.898,0.844,0.852,0.045
SVC(random_state=0),0.899,0.824,0.819,0.898,0.816,0.851,0.043


## Parameter tuning

The RBF kernel delivers better results than the linear kernel when using all features with default parameters.

In [7]:
# Linear SVM on all features. Despite its name, `best_estimator_lsvc` holds the
# object returned by optimize_hyperparams, not the fitted estimator itself.
best_estimator_lsvc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    C=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)

{'linearsvc__C': 0.0001, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.834


In [8]:
# RBF-kernel SVM on all features, searching over the regularisation strength C.
best_estimator_svc = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    C=[0.1, 1, 10, 100],
)

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.852


## Dimensionality reduction

### Linear kernel

PCA leads to minor improvements:

In [9]:
# Linear SVM preceded by PCA.
best_estimator_lsvc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="linear", dim_reduction="pca", C=[1, 0.01, 0.1, 10]
)

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.97}
0.838


Feature selection with SelectKBest performs worse with the linear kernel:

In [10]:
# Linear SVM preceded by SelectKBest feature selection, with zero-variance
# feature removal enabled.
best_estimator_lsvc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    C=[0.1, 1, 10],
    dim_reduction="kbest",
    select_k_steps=20,
    remove_zero_var=True,
)

{'linearsvc__C': 10, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0, 'selectkbest__k': 341}
0.809


#### RBF

In [11]:
# RBF-kernel SVM preceded by PCA. No explicit gamma grid is passed: an earlier
# experiment with gamma = [1e-0, 1e-1, 1e-2, 1e-3, "scale"] is disabled, and the
# printed best parameters show that the helper still searches gamma by default.
best_estimator_svc_pca = gsearch = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca", C=[0.1, 1, 10, 100]
)

{'pca__n_components': 0.97, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 0.01}
0.837


With the RBF kernel, the kbest model performs the best. It removes more than half of the features, in contrast to the full dataset, where it only removed 29.

In [12]:
# RBF-kernel SVM preceded by SelectKBest feature selection, with zero-variance
# feature removal enabled.
best_estimator_svc_kbest = gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    C=[0.1, 1, 10, 100],
    dim_reduction="kbest",
    select_k_steps=10,
    remove_zero_var=True,
)

{'selectkbest__k': 141, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.868


In [13]:
# Features that the tuned SelectKBest step did NOT keep (inverse of its support mask).
kbest_step = best_estimator_svc_kbest.best_estimator_["selectkbest"]
feature_names[~kbest_step.get_support()]

array(['AA', 'AC', 'AD', 'AE', 'AG', 'AH', 'AQ', 'AR', 'AS', 'AT', 'AV',
       'CA', 'CC', 'CD', 'CE', 'CG', 'CI', 'CK', 'CL', 'CM', 'CN', 'CP',
       'CQ', 'CR', 'CS', 'CT', 'CW', 'DA', 'DD', 'DE', 'DF', 'DG', 'DH',
       'DK', 'DL', 'DN', 'DQ', 'DR', 'DT', 'DV', 'DY', 'EA', 'EE', 'EG',
       'EK', 'EM', 'EQ', 'ER', 'ES', 'EV', 'EW', 'EY', 'FA', 'FD', 'FE',
       'FF', 'FK', 'FM', 'FN', 'FR', 'FS', 'FT', 'FV', 'FY', 'GG', 'GK',
       'GL', 'GN', 'GP', 'GS', 'GY', 'HA', 'HD', 'HE', 'HK', 'HM', 'HP',
       'HQ', 'HR', 'HS', 'HT', 'HV', 'HW', 'IA', 'IC', 'ID', 'IE', 'IF',
       'IG', 'IK', 'IL', 'IM', 'IN', 'IQ', 'IS', 'IT', 'IY', 'KA', 'KD',
       'KE', 'KF', 'KG', 'KH', 'KI', 'KK', 'KL', 'KM', 'KP', 'KQ', 'KR',
       'KS', 'KV', 'KW', 'LA', 'LC', 'LE', 'LG', 'LH', 'LK', 'LL', 'LN',
       'LR', 'LS', 'LT', 'LV', 'LW', 'MA', 'ME', 'MK', 'ML', 'MN', 'MP',
       'MQ', 'MR', 'MS', 'MV', 'NA', 'NC', 'NE', 'NF', 'NG', 'NH', 'NI',
       'NM', 'NN', 'NQ', 'NR', 'NS', 'NT', 'NV', 'N

The RBF kernel with feature selection shows the best cross-validation result here (0.868), while the linear kernel with feature selection scores lowest (0.809).

## Validation

RBF with PCA delivers the best results on the test set, but in a less balanced way than the other models: six amino-acid transporters are misclassified, compared to only one sugar transporter.

### Linear kernel

In [14]:
# Linear SVM, all features: confusion matrix, then per-class report, on the test split.
for make_table in (get_confusion_matrix, get_classification_report):
    display(make_table(X_test, y_test, best_estimator_lsvc, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,17,6
Sugar transport,6,21


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.739,0.739,0.739,23
Sugar transport,0.778,0.778,0.778,27
macro avg,0.758,0.758,0.758,50
weighted avg,0.76,0.76,0.76,50


In [15]:
# Linear SVM + SelectKBest: confusion matrix, then per-class report, on the test split.
for make_table in (get_confusion_matrix, get_classification_report):
    display(make_table(X_test, y_test, best_estimator_lsvc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,16,7
Sugar transport,4,23


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.8,0.696,0.744,23
Sugar transport,0.767,0.852,0.807,27
macro avg,0.783,0.774,0.776,50
weighted avg,0.782,0.78,0.778,50


In [16]:
# Linear SVM + PCA: confusion matrix, then per-class report, on the test split.
for make_table in (get_confusion_matrix, get_classification_report):
    display(make_table(X_test, y_test, best_estimator_lsvc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,16,7
Sugar transport,3,24


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.842,0.696,0.762,23
Sugar transport,0.774,0.889,0.828,27
macro avg,0.808,0.792,0.795,50
weighted avg,0.805,0.8,0.797,50


### RBF kernel

In [17]:
# RBF SVM, all features: confusion matrix, then per-class report, on the test split.
for make_table in (get_confusion_matrix, get_classification_report):
    display(make_table(X_test, y_test, best_estimator_svc, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,17,6
Sugar transport,5,22


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.773,0.739,0.756,23
Sugar transport,0.786,0.815,0.8,27
macro avg,0.779,0.777,0.778,50
weighted avg,0.78,0.78,0.78,50


In [18]:
# RBF SVM + SelectKBest: confusion matrix, then per-class report, on the test split.
for make_table in (get_confusion_matrix, get_classification_report):
    display(make_table(X_test, y_test, best_estimator_svc_kbest, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,17,6
Sugar transport,5,22


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.773,0.739,0.756,23
Sugar transport,0.786,0.815,0.8,27
macro avg,0.779,0.777,0.778,50
weighted avg,0.78,0.78,0.78,50


In [19]:
# RBF SVM + PCA: confusion matrix, then per-class report, on the test split.
for make_table in (get_confusion_matrix, get_classification_report):
    display(make_table(X_test, y_test, best_estimator_svc_pca, labels=labels))

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,17,6
Sugar transport,1,26


Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.944,0.739,0.829,23
Sugar transport,0.812,0.963,0.881,27
macro avg,0.878,0.851,0.855,50
weighted avg,0.873,0.86,0.857,50


## Conclusion

The models achieve F1 scores of around 0.80 with only PAAC. This is relatively consistent between training set and test set, and between the two substrates.

## Estimating validation variance 


In [20]:
# Repeat the RBF + PCA workflow ten times to estimate how much the scores
# vary between runs.
# NOTE(review): select_k_steps is passed although dim_reduction is "pca";
# presumably it is ignored in that mode -- confirm in subpred.eval.full_test.
df_scores, df_params = full_test(
    df_paac,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    repetitions=10,
    remove_zero_var=True,
    select_k_steps=20,
)

# Mean and standard deviation of the F1 score per label, train vs. test.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)
f1_mean = df_scores_gr.mean().pivot(index="label", columns="dataset", values="F1 score")
f1_std = df_scores_gr.std().pivot(index="label", columns="dataset", values="F1 score")
display(f1_mean)
display(f1_std)
df_params

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8156,0.8393
Sugar transport,0.862,0.8602


dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.106622,0.016853
Sugar transport,0.046731,0.018618


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.86,0.84,0.93,0.97,0.93,0.98,0.99,0.93,0.92,0.94
svc__C,10,10,1.0,10,10,1,10,1,1.0,1.0
svc__class_weight,balanced,balanced,,balanced,balanced,balanced,balanced,balanced,,
svc__gamma,scale,scale,0.01,scale,scale,0.01,0.01,0.01,0.01,0.01


PAAC leads to higher evaluation scores than AAC for the META-features without E. coli. With E. coli in the dataset, AAC outperformed PAAC. In conclusion, removing the prokaryotes from the dataset improved the PAAC features by 0.04-0.05 (avg. F1).