# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features, and of their combination, for A. thaliana.

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_paac

# Dataset

In [2]:
# Build the amino-acid vs. sugar transporter dataset for A. thaliana.
df = create_dataset(
    # Input table and log destination.
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/athaliana_sugar_amino.log",
    verbose=True,
    # Organism and keyword filters (3702 is the NCBI taxonomy ID of A. thaliana).
    tax_ids_filter=[3702],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    # Presumably drops proteins annotated with more than one substrate -- confirm in create_dataset.
    multi_substrate="remove",
    # Accessions passed to the helper as outliers (presumably excluded -- confirm in create_dataset).
    outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
    # cd-hit clustering threshold; the cell output reports the resulting cluster count.
    sequence_clustering=70,
)

cd-hit: clustered 165 sequences into 117 clusters at threshold 70


# Feature generation

In [3]:
# Class labels are read from the dataset's "keywords_transport" column.
labels = df["keywords_transport"]
# Display the number of proteins per class.
labels.value_counts()

Sugar transport         84
Amino-acid transport    33
Name: keywords_transport, dtype: int64

In [4]:
# Compute the PAAC feature table from the protein sequences; the displayed frame
# is indexed by Uniprot accession with one column per amino-acid pair (AA ... YY).
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9SFG0,0.005929,0.001976,0.000000,0.000000,0.005929,0.005929,0.000000,0.003953,0.003953,0.009881,...,0.001976,0.000000,0.000000,0.000000,0.001976,0.000000,0.000000,0.003953,0.000000,0.000000
Q84WN3,0.004167,0.000000,0.000000,0.004167,0.004167,0.004167,0.000000,0.008333,0.004167,0.008333,...,0.004167,0.000000,0.000000,0.000000,0.004167,0.000000,0.000000,0.004167,0.000000,0.004167
O04249,0.005859,0.001953,0.000000,0.003906,0.009766,0.011719,0.001953,0.003906,0.001953,0.005859,...,0.000000,0.000000,0.000000,0.001953,0.000000,0.003906,0.000000,0.001953,0.000000,0.000000
Q56ZZ7,0.011009,0.001835,0.003670,0.003670,0.007339,0.018349,0.000000,0.012844,0.003670,0.020183,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.003670,0.000000,0.001835,0.001835,0.001835
Q8H184,0.004843,0.000000,0.002421,0.000000,0.007264,0.004843,0.002421,0.002421,0.002421,0.004843,...,0.000000,0.002421,0.000000,0.000000,0.000000,0.002421,0.002421,0.004843,0.002421,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q94B65,0.009317,0.000000,0.000000,0.006211,0.003106,0.006211,0.003106,0.003106,0.018634,0.018634,...,0.000000,0.006211,0.003106,0.000000,0.003106,0.003106,0.003106,0.000000,0.000000,0.000000
Q0WWW9,0.005386,0.000000,0.007181,0.005386,0.003591,0.003591,0.000000,0.003591,0.001795,0.017953,...,0.001795,0.000000,0.000000,0.001795,0.001795,0.000000,0.000000,0.001795,0.000000,0.003591
Q2V4B9,0.006073,0.000000,0.004049,0.010121,0.004049,0.008097,0.000000,0.004049,0.004049,0.010121,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004049,0.006073,0.000000,0.000000
Q94EI9,0.005900,0.000000,0.005900,0.005900,0.002950,0.008850,0.000000,0.011799,0.000000,0.014749,...,0.000000,0.002950,0.002950,0.002950,0.000000,0.000000,0.000000,0.005900,0.000000,0.002950


## Independent test set

In [5]:
# Convert the feature table and labels into X / y, keeping the feature and
# sample names so that individual proteins can be traced after the split.
X, y, feature_names, sample_names = preprocess_pandas(
    df_paac,
    labels,
    return_names=True,
)

# Set aside an independent test split (test_size=0.2); everything up to the
# "Validation" section works on the training split only.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)



## Model comparison

In [6]:
# Quick baseline comparison of several classifiers, using the training split only.
# The resulting table has one row per estimator with five score columns (0-4)
# followed by their mean and standard deviation.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.729,0.842,0.708,0.862,0.63,0.754,0.097
KNeighborsClassifier(),0.936,0.864,0.725,1.0,0.804,0.866,0.108
"LinearSVC(class_weight='balanced', max_iter=1000000.0)",0.808,0.927,0.887,0.926,0.75,0.86,0.078
LinearSVC(max_iter=1000000.0),0.808,0.927,0.887,0.926,0.75,0.86,0.078
RandomForestClassifier(),0.604,0.424,0.562,0.6,0.734,0.585,0.111
RandomForestClassifier(class_weight='balanced'),0.737,0.737,0.406,0.734,0.419,0.607,0.177
SGDClassifier(),0.878,0.927,0.887,0.926,0.673,0.858,0.106
SVC(),0.424,0.424,0.406,0.419,0.419,0.419,0.007
SVC(class_weight='balanced'),0.737,0.604,0.782,0.839,0.734,0.739,0.087


## Parameter tuning

In [7]:
# Hyperparameter search for a linear-kernel SVM on the full feature set
# (no dimensionality reduction), over C in {1, 10, 100}.
best_estimator_svc = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction=None,
    C=[1, 10, 100],
)
# Generic alias for the most recent search result.
gsearch = best_estimator_svc

{'linearsvc__C': 10, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0}
0.893


## Dimensionality reduction

In [8]:
# Same linear-kernel search, now with "kbest" feature selection in front of the
# classifier; the printed parameters include the selected number of features.
best_estimator_svc_kbest = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction="kbest",
    C=[1, 10, 100],
)
# Generic alias for the most recent search result.
gsearch = best_estimator_svc_kbest

{'linearsvc__C': 100, 'linearsvc__class_weight': None, 'linearsvc__dual': False, 'linearsvc__max_iter': 100000000.0, 'selectkbest__k': 241}
0.893


In [9]:
# Linear-kernel search on PCA-reduced features. Explicit C / gamma grids of
# [0.1, 0.01, 0.001] are left disabled, so the helper's default grid is used.
# The result is only kept under the generic name and is overwritten by the next search.
gsearch = optimize_hyperparams(X_train, y_train, kernel="linear", dim_reduction="pca")

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0, 'pca__n_components': 0.8}
0.907


PCA seems to perform best. How about the RBF kernel on the lower-dimensional feature set?

In [10]:
# RBF-kernel search on PCA-reduced features. Explicit C / gamma grids of
# [0.1, 0.01, 0.001] are left disabled, so the helper's default grid is used.
best_estimator_svc_pca = optimize_hyperparams(
    X_train, y_train, kernel="rbf", dim_reduction="pca"
)
# Generic alias for the most recent search result.
gsearch = best_estimator_svc_pca

{'pca__n_components': 0.81, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.936


## Validation

### Kbest

In [11]:
# Confusion matrix of the k-best linear SVM on the independent test split.
# NOTE(review): `labels` is the full label Series of the dataset, not only the
# class names -- confirm this is what get_confusion_matrix expects.
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,6,1
Sugar transport,1,16


In [12]:
# Per-class precision, recall, F1 score and support of the k-best linear SVM
# on the independent test split.
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.857,0.857,0.857,7
Sugar transport,0.941,0.941,0.941,17
macro avg,0.899,0.899,0.899,24
weighted avg,0.917,0.917,0.917,24


### PCA

In [13]:
# Confusion matrix of the PCA + RBF-kernel SVM on the independent test split.
# NOTE(review): `labels` is the full label Series of the dataset, not only the
# class names -- confirm this is what get_confusion_matrix expects.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,7,0
Sugar transport,0,17


In [14]:
# Per-class precision, recall, F1 score and support of the PCA + RBF-kernel SVM
# on the independent test split.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,1.0,1.0,7
Sugar transport,1.0,1.0,1.0,17
macro avg,1.0,1.0,1.0,24
weighted avg,1.0,1.0,1.0,24


## Estimating validation variance

Mean and standard deviation of the F1 score over 10 repetitions, each with randomly selected training and test sets.

In [15]:
# Run the PCA + RBF-kernel SVM evaluation for 10 repetitions, starting again
# from the feature table. Returns the per-repetition scores and the
# hyperparameters chosen in each repetition.
df_scores, df_params = full_test(
    df_paac,
    labels,
    kernel="rbf",
    dim_reduction="pca",
    repetitions=10,
)

In [16]:
# Group the scores by class label and by dataset (train / test). as_index=False
# keeps both keys as ordinary columns, which the pivots on the aggregates rely on.
df_scores_gr = df_scores.groupby(by=["label", "dataset"], as_index=False)

#### Mean F1

In [17]:
# Mean F1 score across the repetitions: one row per class, one column per dataset.
(
    df_scores_gr
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8395,0.8818
Sugar transport,0.9394,0.9554


#### Standard deviation F1

In [18]:
# Standard deviation of the F1 score across the repetitions: one row per class,
# one column per dataset.
(
    df_scores_gr
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.081425,0.052482
Sugar transport,0.028068,0.017115


#### Parameters

In [19]:
# Hyperparameters selected in each repetition (one column per repetition, 0-9),
# to check how stable the chosen settings are across random splits.
df_params

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.81,0.8,0.82,0.8,0.88,0.97,0.9,0.85,0.88,0.8
svc__C,1,10,1,1,10,1,1,10,1,10
svc__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,scale,scale,0.01,0.01,scale,0.01,scale,scale,0.01,scale
