# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features, and their combination, for E. Coli

# Imports

In [1]:
import os
import sys
from IPython.display import display

sys.path.append('../src')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone

import matplotlib.pyplot as plt

from yellowbrick.features import ParallelCoordinates
from yellowbrick.features import Rank1D, Rank2D

import pandas as pd
import numpy as np
import seaborn as sns

from dataset.transporter_dataset import create_dataset
from dataset.cluster_fasta import cd_hit
from features.labels import fasta_to_labels
from features.compositions import calculate_composition_feature
from features.pssm import calculate_pssm_feature
from features.coexp import calculate_coexp_feature
from models.eval import nested_crossval
from visualization.feature_plots import create_plot

# Globals

In [2]:
N_THREADS = 16
IDENTITY_THRESHOLD=70

LOG_FILE = "../logs/meta_amino_sugar.log"
N_THREADS = 16
ORGANISM = "meta"

# Dataset

In [3]:
# Delete previous log
if os.path.exists(LOG_FILE):
    with open(LOG_FILE, "w"):
        pass
# e coli, a thaliana, human
create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    outliers=["P76773", "Q47706", "P64550", "P02943", "P75733", "P69856"]
    + ["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"]
    + ["Q9HBR0", "Q07837"],
    tax_ids_filter=[3702, 9606, 83333, 559292],
    output_tsv=f"../data/datasets/{ORGANISM}_amino_sugar.tsv",
    output_fasta=f"../data/datasets/{ORGANISM}_amino_sugar.fasta",
    output_log=LOG_FILE,
)


Unnamed: 0_level_0,keywords_transport,keywords_location,keywords_transport_related,gene_names,protein_names,tcdb_id,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P0CD99,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,MPH2 YDL247W,Alpha-glucosides permease MPH2 (Maltose transp...,,559292,MKNLSFLINRRKENTSDSNVYPGKAKSHEPSWIEMDDQTKKDGLDI...
P69801,Sugar transport,Cell inner membrane;Cell membrane;Membrane;Tra...,Transport,manY pel ptsP b1818 JW1807,PTS system mannose-specific EIIC component (EI...,4.A.6.1.1,83333,MEITTLQIVLVFIVACIAGMGSILDEFQFHRPLIACTLVGIVLGDM...
Q9SFG0,Sugar transport,Membrane;Transmembrane,Symport;Transport,STP6 At3g05960 F2O10.8,Sugar transport protein 6 (Hexose transporter 6),2.A.1.1.56,3702,MAVVVSNANAPAFEAKMTVYVFICVMIAAVGGLIFGYDIGISGGVS...
Q9BWM7,Amino-acid transport,Membrane;Mitochondrion;Transmembrane,Transport,SFXN3,Sideroflexin-3,,9606,MGELPLDINIQEPRWDQSTFLGRARHFFTVTDPRNLLLSGAQLEAS...
Q9ZVK6,Sugar transport,Cell membrane;Membrane;Transmembrane,Symport;Transport,SUC8 At2g14670 T6B13.9,Sucrose transport protein SUC8 (Sucrose permea...,,3702,MSDLQAKNDVVAVDRQSSSSLADLDGPSPLRKMISVASIAAGIQFG...
...,...,...,...,...,...,...,...,...
Q04162,Sugar transport,Membrane;Transmembrane,Transport,YDR387C,Probable metabolite transport protein YDR387C,2.A.1.1.101,559292,MSTDESEDVYSDLYSIISQVTSNTANDIEQLPYALTFKTSLIFVGA...
P33361,Amino-acid transport,Cell inner membrane;Cell membrane;Membrane;Tra...,Transport,yehY b2130 JW2118,Glycine betaine uptake system permease protein...,3.A.1.12.15,83333,MTYFRINPVLALLLLLTAIAAALPFISYAPNRLVSGEGRHLWQLWP...
P39328,Sugar transport,Cell inner membrane;Cell membrane;Membrane;Tra...,Transport,ytfT b4230 JW5753,Galactofuranose transporter permease protein YtfT,3.A.1.2.25,83333,MMPQSLPDTTTPKRRFRWPTGMPQLVALLLVLLVDSLVAPHFWQVV...
Q93Z80,Sugar transport,Membrane;Transmembrane,Transport,At3g05160 T12H1.13,Sugar transporter ERD6-like 10,,3702,MEEGLLRHENDRDDRRITACVILSTFVAVCSSFSYGCANGYTSGAE...


## Clustering

In [4]:
cd_hit(
    executable_location="cd-hit",
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar.fasta",
    output_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    log_file=LOG_FILE,
    identity_threshold=IDENTITY_THRESHOLD,
    n_threads=N_THREADS,
    memory=4096,
    verbose=True,
)

## Annotations

In [5]:
df_annotations = pd.read_table(f"../data/datasets/{ORGANISM}_amino_sugar.tsv", index_col=0)
df_annotations.head()

Unnamed: 0_level_0,keywords_transport,keywords_location,keywords_transport_related,gene_names,protein_names,tcdb_id,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P0CD99,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,MPH2 YDL247W,Alpha-glucosides permease MPH2 (Maltose transp...,,559292,MKNLSFLINRRKENTSDSNVYPGKAKSHEPSWIEMDDQTKKDGLDI...
P69801,Sugar transport,Cell inner membrane;Cell membrane;Membrane;Tra...,Transport,manY pel ptsP b1818 JW1807,PTS system mannose-specific EIIC component (EI...,4.A.6.1.1,83333,MEITTLQIVLVFIVACIAGMGSILDEFQFHRPLIACTLVGIVLGDM...
Q9SFG0,Sugar transport,Membrane;Transmembrane,Symport;Transport,STP6 At3g05960 F2O10.8,Sugar transport protein 6 (Hexose transporter 6),2.A.1.1.56,3702,MAVVVSNANAPAFEAKMTVYVFICVMIAAVGGLIFGYDIGISGGVS...
Q9BWM7,Amino-acid transport,Membrane;Mitochondrion;Transmembrane,Transport,SFXN3,Sideroflexin-3,,9606,MGELPLDINIQEPRWDQSTFLGRARHFFTVTDPRNLLLSGAQLEAS...
Q9ZVK6,Sugar transport,Cell membrane;Membrane;Transmembrane,Symport;Transport,SUC8 At2g14670 T6B13.9,Sucrose transport protein SUC8 (Sucrose permea...,,3702,MSDLQAKNDVVAVDRQSSSSLADLDGPSPLRKMISVASIAAGIQFG...


# Feature generation

## Labels

In [6]:
fasta_to_labels(
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    output_tsv=f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
)
df_labels = pd.read_table(
    f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
    index_col=0,
)
df_labels.labels.value_counts()

Sugar transport         181
Amino-acid transport    166
Name: labels, dtype: int64

## PAAC

In [7]:
calculate_composition_feature(
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    output_tsv=f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_paac.tsv",
    feature_type="paac",
)

df_paac = pd.read_table(
    f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_paac.tsv",
    index_col=0,
)

# Functions

In [8]:
def get_feature_stats(df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"]):
    df_stats = pd.concat(
        {
            "corr": df_features.corrwith(
                df_labels_.labels.transform(lambda x: 1.0 if x == labels[1] else 0.0)
            ),
            "mean": df_features.mean(),
            "std": df_features.std(),
        },
        axis=1,
    )

    df_stats["corr_abs"] = df_stats["corr"].abs()

    df_stats["mean0"] = df_features.loc[df_labels_[df_labels_.labels == labels[0]].index].mean()
    df_stats["mean1"] = df_features.loc[df_labels_[df_labels_.labels == labels[1]].index].mean()

    df_stats["median0"] = df_features.loc[
        df_labels_[df_labels_.labels == labels[0]].index
    ].median()
    df_stats["median1"] = df_features.loc[
        df_labels_[df_labels_.labels == labels[1]].index
    ].median()

    df_stats["mediandiff"] = (df_stats["median0"] - df_stats["median1"]).abs()
    df_stats = df_stats.sort_values("mediandiff", ascending=False)
    return df_stats

In [9]:
def get_independent_test_set(
    df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"], test_size=0.2
):
    X = df_features.to_numpy()
    y = np.where(df_labels_.labels == labels[1], 1, 0)
    feature_names = df_features.columns.to_numpy()
    sample_names = df_features.index.to_numpy()
    (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
    ) = train_test_split(
        X, y, sample_names, stratify=y, random_state=42, shuffle=True, test_size=test_size
    )
    return (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
        feature_names,
    )


In [10]:
def print_validation_results(y_true_, y_pred_, labels = ["Amino", "Sugar"]):
    report_dict = classification_report(y_true=y_true_, y_pred=y_pred_, output_dict=True)
    report_dict = {
        labels[0]: report_dict['0'],
        labels[1]: report_dict['1'],
        "Macro": report_dict["macro avg"],
        "Weighted": report_dict["weighted avg"]
    }
    report_df = pd.DataFrame.from_dict(report_dict)
    confusion_matrix_df = pd.DataFrame(
        confusion_matrix(y_true_, y_pred_),
        columns=labels,
        index=labels,
    )
    return report_df, confusion_matrix_df

# Individual Features

## PAAC

### Stats

In [11]:
df_stats = get_feature_stats(df_paac, df_labels).head(10)
# 10 best features
display(df_stats.head(10))

display(df_stats.sort_values("corr", ascending=False))

Unnamed: 0,corr,mean,std,corr_abs,mean0,mean1,median0,median1,mediandiff
RF,-0.131965,0.001864,0.002134,0.131965,0.002157,0.001594,0.002073,0.0,0.002073
PK,-0.142659,0.00183,0.002002,0.142659,0.002128,0.001557,0.002022,0.0,0.002022
LP,-0.233056,0.005915,0.003765,0.233056,0.00683,0.005076,0.006515,0.004535,0.00198
KN,-0.117439,0.001845,0.002176,0.117439,0.002112,0.001601,0.001931,0.0,0.001931
GQ,-0.17284,0.001796,0.002064,0.17284,0.002168,0.001455,0.001905,0.0,0.001905
SQ,-0.111435,0.001873,0.002186,0.111435,0.002127,0.00164,0.00188,0.0,0.00188
QQ,0.233321,0.001355,0.001878,0.233321,0.000898,0.001774,0.0,0.001835,0.001835
DK,0.127839,0.00153,0.001801,0.127839,0.00129,0.00175,0.0,0.001805,0.001805
NP,-0.151699,0.001489,0.002032,0.151699,0.00181,0.001194,0.001784,0.0,0.001784
TY,-0.133844,0.001708,0.002108,0.133844,0.002002,0.001438,0.001779,0.0,0.001779


Unnamed: 0,corr,mean,std,corr_abs,mean0,mean1,median0,median1,mediandiff
QQ,0.233321,0.001355,0.001878,0.233321,0.000898,0.001774,0.0,0.001835,0.001835
DK,0.127839,0.00153,0.001801,0.127839,0.00129,0.00175,0.0,0.001805,0.001805
SQ,-0.111435,0.001873,0.002186,0.111435,0.002127,0.00164,0.00188,0.0,0.00188
KN,-0.117439,0.001845,0.002176,0.117439,0.002112,0.001601,0.001931,0.0,0.001931
RF,-0.131965,0.001864,0.002134,0.131965,0.002157,0.001594,0.002073,0.0,0.002073
TY,-0.133844,0.001708,0.002108,0.133844,0.002002,0.001438,0.001779,0.0,0.001779
PK,-0.142659,0.00183,0.002002,0.142659,0.002128,0.001557,0.002022,0.0,0.002022
NP,-0.151699,0.001489,0.002032,0.151699,0.00181,0.001194,0.001784,0.0,0.001784
GQ,-0.17284,0.001796,0.002064,0.17284,0.002168,0.001455,0.001905,0.0,0.001905
LP,-0.233056,0.005915,0.003765,0.233056,0.00683,0.005076,0.006515,0.004535,0.00198


### Independent test set

In [12]:
(
    X_train,
    X_test,
    y_train,
    y_test,
    sample_names_train,
    sample_names_test,
    feature_names,
) = get_independent_test_set(df_paac, df_labels)

### Model selection

In [13]:
for estimator in [
    LinearSVC(random_state=0),
    LinearSVC(class_weight="balanced", random_state=0),
    SVC(),
    SVC(class_weight="balanced"),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
    SGDClassifier(random_state=0),
]:
    pipe = make_pipeline(StandardScaler(), estimator)
    scores = cross_val_score(estimator, X_train, y_train, scoring="f1_macro")
    print("### ", type(estimator).__name__)
    print(f"CV folds: {scores.round(3)}")
    print(f"Mean: {scores.mean().round(3)}")
    print(f"Std: {scores.std().round(3)}")


###  LinearSVC
CV folds: [0.341 0.341 0.337 0.345 0.345]
Mean: 0.342
Std: 0.003
###  LinearSVC
CV folds: [0.605 0.802 0.745 0.724 0.69 ]
Mean: 0.713
Std: 0.065
###  SVC
CV folds: [0.839 0.803 0.781 0.761 0.78 ]
Mean: 0.793
Std: 0.027
###  SVC
CV folds: [0.821 0.821 0.781 0.781 0.781]
Mean: 0.797
Std: 0.019
###  GaussianNB
CV folds: [0.768 0.857 0.818 0.577 0.854]
Mean: 0.775
Std: 0.104
###  KNeighborsClassifier
CV folds: [0.766 0.764 0.68  0.687 0.724]
Mean: 0.724
Std: 0.036
###  RandomForestClassifier
CV folds: [0.714 0.786 0.745 0.664 0.78 ]
Mean: 0.738
Std: 0.045
###  SGDClassifier
CV folds: [0.341 0.341 0.337 0.345 0.345]
Mean: 0.342
Std: 0.003


### Hyperparameters

In [14]:
# params = {
#     "kneighborsclassifier__n_neighbors": list(range(3,10)),
#     "kneighborsclassifier__weights": ["uniform", "distance"],
#     "kneighborsclassifier__metric": ["minkowski", "euclidean", "manhattan"],
# }
params = {
    "svc__C": [1, 10,100],
    "svc__gamma": ["scale", "auto", 1e-0, 1e-1, 1e-2],
    "svc__class_weight": ["balanced", None],
}
gsearch = GridSearchCV(
    make_pipeline(StandardScaler(), SVC()),
    param_grid=params,
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_svc = gsearch.best_estimator_


{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.7909884594496935


### Dimensionality reduction

In [15]:
pca = PCA()
pca.fit(X_train)
csum = np.cumsum(pca.explained_variance_ratio_)
print("Number of components to explain 95% variance:", np.argmax(csum >= 0.95) + 1)

gsearch = GridSearchCV(
    estimator=Pipeline(
        [
            ("scale", StandardScaler()),
            ("pca", PCA()),
            ("scale2", StandardScaler()),
            ("svc", SVC()),
        ]
    ),
    param_grid={
        "svc__C": [1, 10, 100],
        "svc__gamma": [1e-1, 1e-2, 1e-3,"scale"],
        "svc__class_weight": ["balanced", None],
        "pca__n_components": np.linspace(0.8, 0.99, 18),
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-2,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_.round(3))
best_estimator_svc_pca = gsearch.best_estimator_


Number of components to explain 95% variance: 160
{'pca__n_components': 0.8670588235294118, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01}
0.823


In [16]:

gsearch = GridSearchCV(
    estimator=Pipeline(
        [
            ("scale", StandardScaler()),
            ("kbest", SelectKBest()),
            ("scale2", StandardScaler()),
            ("svc", SVC()),
        ]
    ),
    param_grid={
        "svc__C": [1, 10, 100],
        "svc__gamma": [1e-1, 1e-2, 1e-3,"scale"],
        "svc__class_weight": ["balanced"],
        "kbest__k": range(1,400, 5),
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-2,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_.round(3))
best_estimator_svc_kbest = gsearch.best_estimator_


{'kbest__k': 371, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.814


### Validation

In [17]:
best_estimator = best_estimator_svc_pca

In [18]:
best_scores = cross_val_score(
    estimator=clone(best_estimator), X=X_train, y=y_train, scoring="f1_macro"
)
print(f"Train scores: {best_scores.mean().round(3)}+-{best_scores.std().round(3)}")

y_pred = best_estimator.predict(X_test)
y_true = y_test.copy()

report_df, confusion_matrix_df = print_validation_results(y_true, y_pred, labels=["Amino", "Sugar"])
display(report_df.round(3))
display(confusion_matrix_df)

Train scores: 0.823+-0.018


Unnamed: 0,Amino,Sugar,Macro,Weighted
precision,0.75,0.824,0.787,0.789
recall,0.818,0.757,0.787,0.786
f1-score,0.783,0.789,0.786,0.786
support,33.0,37.0,70.0,70.0


Unnamed: 0,Amino,Sugar
Amino,27,6
Sugar,9,28
