# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features and of their combination.

# Imports

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.util import get_feature_score
from subpred.compositions import calculate_paac

# Dataset

In [2]:
# Build the transporter dataset from the reviewed Swiss-Prot export, restricted
# to the "Amino-acid transport" and "Sugar transport" substrate keywords.
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    # Accessions handed to create_dataset as outliers (presumably excluded
    # from the dataset — confirm in subpred.transporter_dataset).
    outliers=[
        "P76773",
        "Q47706",
        "P02943",
        "P75733",
        "P69856",
        "P56579",
        "P64550",
    ],
    verbose=True,
    # NOTE(review): 83333 looks like the taxonomy ID for E. coli K-12, which
    # matches the "ecoli" log file name — confirm.
    tax_ids_filter=[83333],
    output_log="../logs/ecoli_amino_sugar_dataset.log",
    # Clustering threshold; the cell output reports cd-hit running at threshold 70.
    sequence_clustering=70,
)

cd-hit: clustered 98 sequences into 97 clusters at threshold 70


# Feature generation

In [3]:
# The transport keyword column serves as the class label for every sample.
labels = df["keywords_transport"]
# Display the number of samples per class.
labels.value_counts()

Amino-acid transport    51
Sugar transport         46
Name: keywords_transport, dtype: int64

In [4]:
# Compute the PAAC feature table from the protein sequences. The displayed
# frame is indexed by UniProt accession and has one column per two-letter
# residue pair (AA ... YY).
df_paac = calculate_paac(df["sequence"])
df_paac

Unnamed: 0_level_0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YM,YN,YP,YQ,YR,YS,YT,YV,YW,YY
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P69801,0.030189,0.007547,0.007547,0.000000,0.007547,0.030189,0.000000,0.015094,0.000000,0.018868,...,0.000000,0.003774,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
P36672,0.006356,0.000000,0.000000,0.000000,0.002119,0.006356,0.002119,0.010593,0.002119,0.016949,...,0.000000,0.000000,0.002119,0.006356,0.004237,0.000000,0.000000,0.000000,0.002119,0.002119
P56580,0.003145,0.003145,0.000000,0.009434,0.003145,0.003145,0.003145,0.009434,0.003145,0.012579,...,0.000000,0.000000,0.003145,0.003145,0.000000,0.000000,0.000000,0.003145,0.000000,0.000000
P0AA47,0.004435,0.002217,0.000000,0.002217,0.011086,0.008869,0.004435,0.011086,0.002217,0.024390,...,0.002217,0.000000,0.002217,0.000000,0.000000,0.000000,0.002217,0.006652,0.000000,0.000000
P08722,0.006410,0.001603,0.006410,0.001603,0.004808,0.014423,0.001603,0.009615,0.001603,0.012821,...,0.000000,0.000000,0.001603,0.001603,0.000000,0.003205,0.001603,0.001603,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P19642,0.007561,0.001890,0.001890,0.000000,0.007561,0.011342,0.000000,0.011342,0.001890,0.015123,...,0.001890,0.001890,0.000000,0.000000,0.000000,0.000000,0.000000,0.003781,0.000000,0.001890
P0AAD4,0.009950,0.000000,0.007463,0.000000,0.002488,0.024876,0.000000,0.007463,0.004975,0.029851,...,0.002488,0.000000,0.002488,0.002488,0.002488,0.000000,0.002488,0.000000,0.000000,0.000000
P23173,0.002415,0.000000,0.002415,0.002415,0.007246,0.009662,0.000000,0.012077,0.000000,0.009662,...,0.000000,0.000000,0.002415,0.000000,0.000000,0.002415,0.000000,0.000000,0.000000,0.000000
P33361,0.018229,0.005208,0.000000,0.000000,0.002604,0.007812,0.000000,0.013021,0.000000,0.046875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.002604,0.000000,0.000000,0.000000,0.000000


One of the features is constant and can be removed at the start of this analysis.

In [5]:
# NOTE(review): the zero-variance filter below is commented out, so every PAAC
# column (including any constant one) is passed on to the following cells.
# Confirm whether it should be re-enabled or this cell removed.
# df_paac = df_paac.loc[:,df_paac.var() != 0]

## Independent test set

In [6]:
# Preprocess the feature table and labels; return_names=True additionally
# yields the feature names and the sample names.
X, y, feature_names, sample_names = preprocess_pandas(df_paac, labels, return_names=True)

# Set aside 20 % of the samples as an independent test set, keeping the sample
# names aligned with each split.
# NOTE(review): no seed is passed here; whether the split is reproducible
# across runs depends on get_independent_test_set — confirm.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)

## Model comparison

In [7]:
# Compare a set of classifiers on the training split only. The resulting table
# has one row per estimator, five score columns (presumably one per
# cross-validation fold — confirm in subpred.eval), and their mean and std.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.676,0.812,0.583,0.641,0.641,0.671,0.086
KNeighborsClassifier(),0.676,0.873,0.661,0.583,0.583,0.675,0.119
"LinearSVC(class_weight='balanced', max_iter=1000000.0)",0.746,0.875,0.861,0.796,0.598,0.775,0.112
LinearSVC(max_iter=1000000.0),0.746,0.875,0.861,0.796,0.598,0.775,0.112
RandomForestClassifier(),0.619,0.619,0.7,0.661,0.732,0.666,0.05
RandomForestClassifier(class_weight='balanced'),0.561,0.812,0.7,0.598,0.525,0.639,0.117
SGDClassifier(),0.812,0.746,0.796,0.732,0.667,0.751,0.058
SVC(),0.746,0.746,0.7,0.722,0.444,0.672,0.128
SVC(class_weight='balanced'),0.75,0.875,0.785,0.796,0.598,0.761,0.102


## Parameter tuning

In [8]:
# Hyperparameter search for an RBF-kernel SVC without dimensionality reduction.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction=None,
    C=[0.1, 1, 10],
    gamma=["scale", "auto", 1e-0, 1e-1, 1e-2],
)
# NOTE(review): the name suggests a fitted best estimator, but this stores
# whatever optimize_hyperparams returns — confirm the return type.
best_estimator_svc = gsearch

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.761


## Dimensionality reduction

In [9]:
# Same RBF-kernel search, this time with a PCA step in front of the classifier.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="pca",
    C=[1, 0.1, 0.01],
    # Alternative grid left disabled: gamma=[0.1, 0.01, 0.001]
    gamma=[1e-0, 1e-1, 1e-2, 1e-3, "scale"],
)
# NOTE(review): this stores the raw return value of optimize_hyperparams under
# a "best estimator" name — confirm the return type.
best_estimator_svc_pca = gsearch

{'pca__n_components': 0.95, 'svc__C': 1, 'svc__class_weight': None, 'svc__gamma': 'scale'}
0.791


## Validation

### PCA

In [10]:
# Confusion matrix of the PCA + SVC search result on the held-out test split.
# NOTE(review): `labels` is the label Series of the whole dataset, not only of
# the test split — presumably used just for the class names; confirm.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,6,5
Sugar transport,2,7


In [11]:
# Per-class precision, recall and F1 of the PCA + SVC search result on the
# held-out test split.
# NOTE(review): `labels` is the label Series of the whole dataset, not only of
# the test split — presumably used just for the class names; confirm.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.75,0.545,0.632,11
Sugar transport,0.583,0.778,0.667,9
macro avg,0.667,0.662,0.649,20
weighted avg,0.675,0.65,0.647,20


## Estimating validation variance

Mean and standard deviation of the scores over repeated, randomly selected training and validation splits.

In [12]:
# Run the whole PCA + RBF-SVC evaluation 10 times on the full feature table and
# collect the scores and the selected parameters of each repetition.
df_scores, df_params = full_test(
    df_paac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
)

In [13]:
# Group the scores by class label and by dataset; as_index=False keeps "label"
# and "dataset" as regular columns so the aggregates can be pivoted afterwards.
df_scores_gr = df_scores.groupby(["label", "dataset"], as_index=False)

#### Mean F1

In [14]:
# Mean F1 score per class label (rows) and dataset (columns).
# NOTE(review): on recent pandas versions GroupBy.mean() raises on non-numeric
# columns unless numeric_only is set — verify the columns of df_scores.
(
    df_scores_gr
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.7848,0.7987
Sugar transport,0.762,0.7943


#### Standard deviation F1

In [15]:
# Standard deviation of the F1 score per class label (rows) and dataset (columns).
(
    df_scores_gr
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.112179,0.032366
Sugar transport,0.118061,0.038733


#### Parameters

In [16]:
# Parameters returned by full_test: one row per hyperparameter and, in the
# displayed table, columns 0-9 (presumably one per repetition — confirm).
df_params

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
pca__n_components,0.82,0.86,0.85,0.87,0.95,0.8,0.94,0.84,0.91,0.95
svc__C,10,1.0,1.0,1.0,10,1,10,1,10,1
svc__class_weight,balanced,,,,balanced,balanced,balanced,balanced,balanced,balanced
svc__gamma,scale,0.01,0.01,0.01,scale,0.01,0.01,0.1,0.01,0.01
