# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features, and their combination, for E. Coli

# Imports

In [19]:
import os
import sys
from IPython.display import display

sys.path.append('../src')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone

import matplotlib.pyplot as plt

from yellowbrick.features import ParallelCoordinates
from yellowbrick.features import Rank1D, Rank2D

import pandas as pd
import numpy as np
import seaborn as sns

from dataset.transporter_dataset import create_dataset
from dataset.cluster_fasta import cd_hit
from features.labels import fasta_to_labels
from features.compositions import calculate_composition_feature
from features.pssm import calculate_pssm_feature
from features.coexp import calculate_coexp_feature
from models.eval import nested_crossval
from visualization.feature_plots import create_plot

# Globals

In [20]:
N_THREADS = 16
IDENTITY_THRESHOLD=70
TAX_ID = 9606
ORGANISM = "human"
LOG_FILE = "../logs/human_amino_sugar.log"

# Dataset

In [21]:
# Delete previous log
if os.path.exists(LOG_FILE):
    with open(LOG_FILE, "w"):
        pass

create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    outliers=["Q9HBR0", "Q07837"],
    tax_ids_filter=[TAX_ID],
    output_tsv=f"../data/datasets/{ORGANISM}_amino_sugar.tsv",
    output_fasta=f"../data/datasets/{ORGANISM}_amino_sugar.fasta",
    output_log=LOG_FILE,
)


## Clustering

In [22]:
cd_hit(
    executable_location="cd-hit",
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar.fasta",
    output_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    log_file=LOG_FILE,
    identity_threshold=IDENTITY_THRESHOLD,
    n_threads=N_THREADS,
    memory=4096,
    verbose=True,
)

## Annotations

In [23]:
df_annotations = pd.read_table(f"../data/datasets/{ORGANISM}_amino_sugar.tsv", index_col=0)
df_annotations.head()

Unnamed: 0_level_0,keywords_transport,keywords_location,keywords_transport_related,gene_names,protein_names,tcdb_id,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q9BWM7,Amino-acid transport,Membrane;Mitochondrion;Transmembrane,Transport,SFXN3,Sideroflexin-3,,9606,MGELPLDINIQEPRWDQSTFLGRARHFFTVTDPRNLLLSGAQLEAS...
Q9BRV3,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,SLC50A1 RAG1AP1 SCP,Sugar transporter SWEET1 (HsSWEET1) (RAG1-acti...,2.A.123.1.4,9606,MEAGGFLDSLIYGACVVFTLGMFSAGLSDLRHMRMTRSVDNVQFLP...
Q5M8T2,Sugar transport,Membrane;Transmembrane,Transport,SLC35D3 FRCL1,Solute carrier family 35 member D3 (Fringe con...,2.A.7.15.5,9606,MRQLCRGRVLGISVAIAHGVFSGSLNILLKFLISRYQFSFLTLVQC...
Q969S0,Sugar transport,Membrane;Transmembrane,Transport,SLC35B4 YEA4 PSEC0055,UDP-xylose and UDP-N-acetylglucosamine transpo...,2.A.7.10.2,9606,MRPALAVGLVFAGCCSNVIFLELLARKHPGCGNIVTFAQFLFIAVE...
O75387,Amino-acid transport,Membrane;Transmembrane,Transport,SLC43A1 LAT3 PB39 POV1,Large neutral amino acids transporter small su...,2.A.1.44.1,9606,MAPTLQQAYRRRWWMACTAVLENLFFSAVLLGWGSLLIILKNEGFY...


# Feature generation

## Labels

In [24]:
fasta_to_labels(
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    output_tsv=f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
)
df_labels = pd.read_table(
    f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
    index_col=0,
)
df_labels.labels.value_counts()

Amino-acid transport    48
Sugar transport         34
Name: labels, dtype: int64

## PAAC

In [25]:
calculate_composition_feature(
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    output_tsv=f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_paac.tsv",
    feature_type="paac",
)

df_paac = pd.read_table(
    f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_paac.tsv",
    index_col=0,
)

# Functions

In [26]:
def get_feature_stats(df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"]):
    df_stats = pd.concat(
        {
            "corr": df_features.corrwith(
                df_labels_.labels.transform(lambda x: 1.0 if x == labels[1] else 0.0)
            ),
            "mean": df_features.mean(),
            "std": df_features.std(),
        },
        axis=1,
    )

    df_stats["corr_abs"] = df_stats["corr"].abs()

    df_stats["mean0"] = df_features.loc[df_labels_[df_labels_.labels == labels[0]].index].mean()
    df_stats["mean1"] = df_features.loc[df_labels_[df_labels_.labels == labels[1]].index].mean()

    df_stats["median0"] = df_features.loc[
        df_labels_[df_labels_.labels == labels[0]].index
    ].median()
    df_stats["median1"] = df_features.loc[
        df_labels_[df_labels_.labels == labels[1]].index
    ].median()

    df_stats["mediandiff"] = (df_stats["median0"] - df_stats["median1"]).abs()
    df_stats = df_stats.sort_values("mediandiff", ascending=False)
    return df_stats

In [27]:
def get_independent_test_set(
    df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"], test_size=0.2
):
    X = df_features.to_numpy()
    y = np.where(df_labels_.labels == labels[1], 1, 0)
    feature_names = df_features.columns.to_numpy()
    sample_names = df_features.index.to_numpy()
    (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
    ) = train_test_split(
        X, y, sample_names, stratify=y, random_state=42, shuffle=True, test_size=test_size
    )
    return (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
        feature_names,
    )


In [28]:
def print_validation_results(y_true_, y_pred_, labels = ["Amino", "Sugar"]):
    report_dict = classification_report(y_true=y_true_, y_pred=y_pred_, output_dict=True)
    report_dict = {
        labels[0]: report_dict['0'],
        labels[1]: report_dict['1'],
        "Macro": report_dict["macro avg"],
        "Weighted": report_dict["weighted avg"]
    }
    report_df = pd.DataFrame.from_dict(report_dict)
    confusion_matrix_df = pd.DataFrame(
        confusion_matrix(y_true_, y_pred_),
        columns=labels,
        index=labels,
    )
    return report_df, confusion_matrix_df

# Individual Features

## PAAC

### Stats

In [29]:
df_stats = get_feature_stats(df_paac, df_labels).head(10)
# 10 best features
display(df_stats.head(10))

display(df_stats.sort_values("corr", ascending=False))

Unnamed: 0,corr,mean,std,corr_abs,mean0,mean1,median0,median1,mediandiff
LL,0.187241,0.01974,0.009885,0.187241,0.018191,0.021925,0.016832,0.020641,0.003809
AG,0.234006,0.006984,0.005249,0.234006,0.005957,0.008435,0.005705,0.00901,0.003305
LF,0.234161,0.007838,0.004091,0.234161,0.007036,0.008969,0.006295,0.009259,0.002964
LY,0.251269,0.003548,0.003126,0.251269,0.002891,0.004475,0.002029,0.004925,0.002896
II,-0.296286,0.005978,0.005293,0.296286,0.007289,0.004126,0.005696,0.003031,0.002665
LQ,0.178381,0.00476,0.003795,0.178381,0.004193,0.005559,0.003416,0.005866,0.00245
RE,-0.277293,0.00177,0.001813,0.277293,0.00219,0.001176,0.002165,0.0,0.002165
GG,0.303436,0.005146,0.00365,0.303436,0.00422,0.006454,0.004064,0.00618,0.002116
LS,0.206344,0.010641,0.005504,0.206344,0.009691,0.011982,0.009582,0.011633,0.002051
SA,0.176616,0.006156,0.004273,0.176616,0.005525,0.007047,0.004099,0.006146,0.002047


Unnamed: 0,corr,mean,std,corr_abs,mean0,mean1,median0,median1,mediandiff
GG,0.303436,0.005146,0.00365,0.303436,0.00422,0.006454,0.004064,0.00618,0.002116
LY,0.251269,0.003548,0.003126,0.251269,0.002891,0.004475,0.002029,0.004925,0.002896
LF,0.234161,0.007838,0.004091,0.234161,0.007036,0.008969,0.006295,0.009259,0.002964
AG,0.234006,0.006984,0.005249,0.234006,0.005957,0.008435,0.005705,0.00901,0.003305
LS,0.206344,0.010641,0.005504,0.206344,0.009691,0.011982,0.009582,0.011633,0.002051
LL,0.187241,0.01974,0.009885,0.187241,0.018191,0.021925,0.016832,0.020641,0.003809
LQ,0.178381,0.00476,0.003795,0.178381,0.004193,0.005559,0.003416,0.005866,0.00245
SA,0.176616,0.006156,0.004273,0.176616,0.005525,0.007047,0.004099,0.006146,0.002047
RE,-0.277293,0.00177,0.001813,0.277293,0.00219,0.001176,0.002165,0.0,0.002165
II,-0.296286,0.005978,0.005293,0.296286,0.007289,0.004126,0.005696,0.003031,0.002665


### Independent test set

In [30]:
(
    X_train,
    X_test,
    y_train,
    y_test,
    sample_names_train,
    sample_names_test,
    feature_names,
) = get_independent_test_set(df_paac, df_labels)

### Model selection

In [31]:
for estimator in [
    LinearSVC(),
    LinearSVC(class_weight="balanced"),
    SVC(),
    SVC(class_weight="balanced"),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    SGDClassifier(),
]:
    pipe = make_pipeline(StandardScaler(), estimator)
    scores = cross_val_score(estimator, X_train, y_train, scoring="f1_macro")
    print("### ", type(estimator).__name__)
    print(f"CV folds: {scores.round(3)}")
    print(f"Mean: {scores.mean().round(3)}")
    print(f"Std: {scores.std().round(3)}")


###  LinearSVC
CV folds: [0.35  0.35  0.381 0.381 0.381]
Mean: 0.369
Std: 0.015
###  LinearSVC
CV folds: [0.513 0.764 0.675 0.606 0.675]
Mean: 0.646
Std: 0.084
###  SVC
CV folds: [0.511 0.35  0.381 0.35  0.381]
Mean: 0.395
Std: 0.06
###  SVC
CV folds: [0.575 0.764 0.819 0.745 0.819]
Mean: 0.745
Std: 0.09
###  GaussianNB
CV folds: [0.606 0.69  0.838 0.639 0.707]
Mean: 0.696
Std: 0.079
###  KNeighborsClassifier
CV folds: [0.575 0.575 0.707 0.745 0.567]
Mean: 0.634
Std: 0.076
###  RandomForestClassifier
CV folds: [0.745 0.639 0.639 0.511 0.35 ]
Mean: 0.577
Std: 0.135
###  SGDClassifier
CV folds: [0.316 0.316 0.381 0.381 0.381]
Mean: 0.355
Std: 0.032


### Hyperparameters

In [32]:
# params = {
#     "kneighborsclassifier__n_neighbors": list(range(3,10)),
#     "kneighborsclassifier__weights": ["uniform", "distance"],
#     "kneighborsclassifier__metric": ["minkowski", "euclidean", "manhattan"],
# }
params = {
    "svc__C": [0.1, 1, 10,],
    "svc__gamma": ["scale", "auto", 1e-0, 1e-1, 1e-2],
    "svc__class_weight": ["balanced", None],
}
gsearch = GridSearchCV(
    make_pipeline(StandardScaler(), SVC()),
    param_grid=params,
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_svc = gsearch.best_estimator_


{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.6076745786033712


### Dimensionality reduction

In [33]:
pca = PCA()
pca.fit(X_train)
csum = np.cumsum(pca.explained_variance_ratio_)
print("Number of components to explain 95% variance:", np.argmax(csum >= 0.95) + 1)

gsearch = GridSearchCV(
    estimator=Pipeline(
        [
            ("scale", StandardScaler()),
            ("pca", PCA()),
            ("scale2", StandardScaler()),
            ("svc", SVC()),
        ]
    ),
    param_grid={
        "svc__C": [0.1, 1, 10],
        "svc__gamma": [1e-0, 1e-1, 1e-2, 1e-3,"scale"],
        "svc__class_weight": ["balanced", None],
        "pca__n_components": np.linspace(0.8, 0.99, 18),
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-2,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_.round(3))
best_estimator_svc_pca = gsearch.best_estimator_


Number of components to explain 95% variance: 53
{'pca__n_components': 0.9564705882352942, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.732


### Validation

In [34]:
best_estimator = best_estimator_svc_pca

In [35]:
best_scores = cross_val_score(
    estimator=clone(best_estimator), X=X_train, y=y_train, scoring="f1_macro"
)
print(f"Train scores: {best_scores.mean().round(3)}+-{best_scores.std().round(3)}")

y_pred = best_estimator.predict(X_test)
y_true = y_test.copy()

report_df, confusion_matrix_df = print_validation_results(y_true, y_pred, labels=["Amino", "Sugar"])
display(report_df.round(3))
display(confusion_matrix_df)

Train scores: 0.732+-0.142


Unnamed: 0,Amino,Sugar,Macro,Weighted
precision,0.875,0.667,0.771,0.789
recall,0.7,0.857,0.779,0.765
f1-score,0.778,0.75,0.764,0.766
support,10.0,7.0,17.0,17.0


Unnamed: 0,Amino,Sugar
Amino,7,3
Sugar,1,6


# Alternative Eval

In [38]:
from sklearn.model_selection import LeaveOneOut

X = df_paac.values
labels = df_labels.labels
y = np.where(labels == "Sugar transport", 1, 0)


params_svc = {
    "svc__class_weight": ["balanced"],
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", 1e-1, 1e-2, 1e-3],
}
gsearch = GridSearchCV(
    make_pipeline(StandardScaler(), SVC()),
    param_grid=params_svc,
    cv=LeaveOneOut(),
    scoring="f1_macro",
    # n_jobs=1,
    return_train_score=True,
    refit=True,
)
# the score does not make a difference, since it's either 1 (correct) or 0 (incorrect)
res = cross_val_predict(gsearch, X, y, n_jobs=-1, cv=LeaveOneOut())
print(f1_score(y, res, average="macro"))


params_svc = {
    "svc__class_weight": ["balanced"],
    "svc__C": [0.1, 1, 10],
    # "svc__gamma": ["scale", 1e-1, 1e-2, 1e-3],
    "pca__n_components": [0.8, 0.9],
}
gsearch = GridSearchCV(
    make_pipeline(StandardScaler(), PCA(), StandardScaler(), SVC()),
    param_grid=params_svc,
    cv=LeaveOneOut(),
    scoring="f1_macro",
    # n_jobs=1,
    return_train_score=True,
    refit=True,
)
# the score does not make a difference, since it's either 1 (correct) or 0 (incorrect)
res = cross_val_predict(gsearch, X, y, n_jobs=-1, cv=LeaveOneOut())
print(f1_score(y, res, average="macro"))


0.7781314658602578
0.816025641025641
