# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features and of their combination.

# Imports

In [26]:
from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    get_independent_test_set,
    optimize_hyperparams,
    preprocess_pandas,
    models_quick_compare,
    get_confusion_matrix,
    get_classification_report,
    full_test
)
from subpred.compositions import calculate_aac

# Dataset

In [2]:
# Build the dataset from the reviewed Swiss-Prot export, restricted to
# taxonomy ID 83333 and to the amino-acid / sugar transport keywords.
# The filter, outlier and clustering arguments are passed straight through to
# create_dataset; cd-hit reports the clustering result in the output below.
df = create_dataset(
    # Input and logging
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/ecoli_amino_sugar_dataset.log",
    verbose=True,
    # Protein selection
    tax_ids_filter=[83333],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    outliers=["P76773", "Q47706", "P02943", "P75733", "P69856"],
    # Sequence redundancy reduction (cd-hit threshold)
    sequence_clustering=70,
)

cd-hit: clustered 100 sequences into 99 clusters at threshold 70


# Feature generation

## Labels

In [4]:
# Class labels are taken from the transport keyword column;
# value_counts() shows how balanced the two classes are.
labels = df["keywords_transport"]
labels.value_counts()

Amino-acid transport    52
Sugar transport         47
Name: keywords_transport, dtype: int64

## AAC

In [5]:
# Amino acid composition (AAC): one row per protein, one column per amino acid.
df_aac = calculate_aac(df["sequence"])
df_aac

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
P69801,0.150376,0.007519,0.030075,0.022556,0.037594,0.093985,0.015038,0.120301,0.011278,0.101504,0.048872,0.052632,0.030075,0.033835,0.018797,0.037594,0.056391,0.105263,0.007519,0.018797
P36672,0.093023,0.012685,0.023256,0.023256,0.044397,0.103594,0.016913,0.105708,0.029598,0.120507,0.038055,0.031712,0.052854,0.052854,0.033827,0.048626,0.040169,0.082452,0.014799,0.031712
P56580,0.097179,0.012539,0.040752,0.034483,0.034483,0.122257,0.015674,0.100313,0.040752,0.100313,0.018809,0.012539,0.065831,0.034483,0.031348,0.056426,0.062696,0.087774,0.012539,0.018809
P0AA47,0.106195,0.006637,0.028761,0.026549,0.077434,0.070796,0.013274,0.075221,0.030973,0.110619,0.044248,0.030973,0.046460,0.019912,0.033186,0.064159,0.064159,0.097345,0.015487,0.037611
P08722,0.100800,0.012800,0.038400,0.033600,0.062400,0.100800,0.016000,0.092800,0.035200,0.108800,0.024000,0.027200,0.054400,0.030400,0.024000,0.056000,0.065600,0.084800,0.014400,0.017600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P19642,0.090566,0.013208,0.026415,0.024528,0.066038,0.105660,0.024528,0.084906,0.032075,0.120755,0.033962,0.037736,0.045283,0.032075,0.028302,0.056604,0.056604,0.094340,0.013208,0.013208
P0AAD4,0.116625,0.007444,0.019851,0.012407,0.064516,0.106700,0.019851,0.062035,0.017370,0.171216,0.032258,0.019851,0.037221,0.027295,0.032258,0.059553,0.057072,0.091811,0.014888,0.029777
P23173,0.084337,0.016867,0.024096,0.012048,0.093976,0.096386,0.014458,0.113253,0.040964,0.120482,0.019277,0.043373,0.033735,0.009639,0.016867,0.079518,0.050602,0.072289,0.024096,0.033735
P33361,0.135065,0.012987,0.025974,0.007792,0.044156,0.083117,0.012987,0.067532,0.015584,0.197403,0.023377,0.020779,0.046753,0.038961,0.041558,0.046753,0.044156,0.090909,0.028571,0.015584


### Independent test set

In [6]:
# Convert the feature frame and labels into model inputs, keeping the feature
# and sample names alongside them.
X, y, feature_names, sample_names = preprocess_pandas(df_aac, labels, return_names=True)

# Hold out 20% of the samples as an independent test set; the sample names are
# split together with the data so that test proteins remain identifiable.
X_train, X_test, y_train, y_test, sample_names_train, sample_names_test = (
    get_independent_test_set(X, y, sample_names=sample_names, test_size=0.2)
)



### Model selection

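SVC (with the default RBF kernel) looks the most promising: together with KNeighborsClassifier it reaches the highest mean score over the five runs (0.732 vs. 0.734), so we continue with the SVC.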

In [7]:
# Compare several baseline classifiers using the training split only,
# so that the independent test set stays untouched during model selection.
models_quick_compare(X_train, y_train)

Unnamed: 0_level_0,0,1,2,3,4,mean,std
est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB(),0.746,0.619,0.686,0.686,0.722,0.692,0.048
KNeighborsClassifier(),0.812,0.625,0.686,0.746,0.8,0.734,0.079
"LinearSVC(class_weight='balanced', max_iter=1000000.0)",0.686,0.686,0.686,0.746,0.732,0.707,0.029
LinearSVC(max_iter=1000000.0),0.686,0.686,0.686,0.746,0.732,0.707,0.029
RandomForestClassifier(),0.75,0.619,0.686,0.561,0.796,0.682,0.095
RandomForestClassifier(class_weight='balanced'),0.686,0.625,0.686,0.686,0.796,0.696,0.062
SGDClassifier(),0.561,0.619,0.686,0.686,0.866,0.684,0.115
SVC(),0.812,0.676,0.686,0.625,0.861,0.732,0.1
SVC(class_weight='balanced'),0.812,0.619,0.686,0.686,0.796,0.72,0.082


### Parameter tuning

The RBF kernel delivers better results than the linear kernel (0.745 vs. 0.707); in both cases the best value is C=1.

In [11]:
# Hyperparameter search for a linear SVM on the AAC features,
# without dimensionality reduction.
gsearch = optimize_hyperparams(
    X_train,
    y_train,
    kernel="linear",
    dim_reduction=None,
    C=[1, 0.1, 10],
)

{'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'linearsvc__max_iter': 100000000.0}
0.707


In [12]:
# Same search with an RBF kernel, still without dimensionality reduction.
# The result is stored under a dedicated name; `gsearch` is kept as an alias.
best_estimator_svc = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction=None,
    C=[1, 0.1, 10],
)
gsearch = best_estimator_svc

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.745


### Dimensionality reduction

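PCA performs marginally better than SelectKBest (0.797 vs. 0.796), and both improve on the SVC without dimensionality reduction (0.745).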

In [14]:
# RBF-kernel search with "kbest" feature selection in front of the classifier.
# The result is stored under a dedicated name; `gsearch` is kept as an alias.
best_estimator_svc_kbest = optimize_hyperparams(
    X_train,
    y_train,
    kernel="rbf",
    dim_reduction="kbest",
    C=[1, 10, 100],
)
gsearch = best_estimator_svc_kbest

{'selectkbest__k': 8, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.796


In [20]:
# RBF-kernel search with PCA in front of the classifier; here gamma is
# searched explicitly in addition to C.
# The result is stored under a dedicated name; `gsearch` is kept as an alias.
best_estimator_svc_pca = optimize_hyperparams(
    X_train, y_train,
    kernel="rbf", dim_reduction="pca",
    C=[1, 0.1, 10], gamma=["scale", 0.1, 0.01, 0.001],
)
gsearch = best_estimator_svc_pca

{'pca__n_components': 0.8400000000000001, 'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.797


### Validation

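The PCA pipeline leads to the best evaluation on the independent test set (18 of 20 samples classified correctly, compared to 13 of 20 for SelectKBest), and seems to be the most stable.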

In [21]:
# Confusion matrix of the kbest pipeline on the held-out test set.
get_confusion_matrix(X_test, y_test, best_estimator_svc_kbest, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,6,5
Sugar transport,2,7


In [22]:
# Per-class precision, recall and F1 score of the kbest pipeline on the held-out test set.
get_classification_report(X_test, y_test, best_estimator_svc_kbest, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.75,0.545,0.632,11
Sugar transport,0.583,0.778,0.667,9
macro avg,0.667,0.662,0.649,20
weighted avg,0.675,0.65,0.647,20


In [23]:
# Confusion matrix of the PCA pipeline on the held-out test set.
get_confusion_matrix(X_test, y_test, best_estimator_svc_pca, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,1
Sugar transport,1,8


In [24]:
# Per-class precision, recall and F1 score of the PCA pipeline on the held-out test set.
get_classification_report(X_test, y_test, best_estimator_svc_pca, labels=labels)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.909,0.909,0.909,11
Sugar transport,0.889,0.889,0.889,9
macro avg,0.899,0.899,0.899,20
weighted avg,0.9,0.9,0.9,20


## Checking dependence on the train/test split



In [43]:
# Run the complete PCA + RBF-SVM evaluation with 10 repetitions to see how
# much the scores vary from run to run.
df_scores, df_params = full_test(
    df_aac,
    labels,
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
)

In [44]:
# Mean F1 score per dataset (train/test): first averaged within each
# (label, dataset) group, then averaged over the labels.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
    .mean()
)

dataset
test     0.83480
train    0.87645
dtype: float64

In [45]:
# Single summary number: the per-dataset mean F1 scores averaged once more,
# i.e. the mean of the train and test values.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
    .mean()
    .mean()
)

0.855625

In [46]:
# Standard deviation of the F1 score within each (label, dataset) group,
# shown as a label x dataset table.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.067548,0.048607
Sugar transport,0.116889,0.054288


### Full test without the two potential sequence outliers

In notebook 2, we found two sequence outliers that only appear for PAAC. Do they influence the AAC results?

In [47]:
# Drop the two potential sequence outliers and repeat the full evaluation.
# NOTE(review): the mask is built from df_aac.index but also applied to
# `labels`; this assumes both share the same index order - confirm.
mask = ~df_aac.index.isin(["P56579", "P64550"])
df_scores, df_params = full_test(
    df_aac.loc[mask],
    labels.loc[mask],
    dim_reduction="pca",
    kernel="rbf",
    repetitions=10,
)

In [48]:
# Mean F1 score within each (label, dataset) group, shown as a
# label x dataset table.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.8158,0.8952
Sugar transport,0.7537,0.883


In [49]:
# Mean F1 score per dataset (train/test): first averaged within each
# (label, dataset) group, then averaged over the labels.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
    .mean()
)

dataset
test     0.78475
train    0.88910
dtype: float64

In [50]:
# Single summary number: the per-dataset mean F1 scores averaged once more,
# i.e. the mean of the train and test values.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .mean()
    .pivot(index="label", columns="dataset", values="F1 score")
    .mean()
    .mean()
)

0.836925

In [51]:
# Standard deviation of the F1 score within each (label, dataset) group,
# shown as a label x dataset table.
(
    df_scores
    .groupby(["label", "dataset"], as_index=False)
    .std()
    .pivot(index="label", columns="dataset", values="F1 score")
)

dataset,test,train
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,0.107324,0.039894
Sugar transport,0.146362,0.05068


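Slightly worse scores for the test set, slightly better for the training set: without the two proteins, the mean test F1 score is about 0.05 lower (0.785 vs. 0.835) and the mean training F1 score about 0.01 higher (0.889 vs. 0.876). Averaged over both sets, the F1 score is about 0.02 worse. Since the test F1 scores have a standard deviation of roughly 0.07–0.15 across the 10 repetitions, this is not much of a difference.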