# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features, and their combination, for human amino-acid and sugar transporters.

# Imports

In [1]:
import os
import sys
from IPython.display import display

sys.path.append('../src')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, RFE, VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator,TransformerMixin

from yellowbrick.features import ParallelCoordinates
from yellowbrick.features import Rank1D, Rank2D

import pandas as pd
import numpy as np
import seaborn as sns

from dataset.transporter_dataset import create_dataset
from dataset.cluster_fasta import cd_hit
from features.labels import fasta_to_labels
from features.compositions import calculate_composition_feature
from features.pssm import calculate_pssm_feature
from features.coexp import calculate_coexp_feature
from models.eval import nested_crossval
from visualization.feature_plots import create_plot

# Globals

In [2]:
# Thread count handed to the external tools (cd-hit, psiblast)
N_THREADS = 16
# Identity threshold passed to cd-hit; also part of the clustered file names
IDENTITY_THRESHOLD = 70
# Organism selection: the taxonomy id filters the dataset, the name is used in file paths
TAX_ID = 9606
ORGANISM = "human"
# Log file shared by the dataset creation and clustering steps
LOG_FILE = f"../logs/{ORGANISM}_amino_sugar.log"

# Dataset

In [3]:
# Start from an empty log: truncate (not remove) the file left by a previous run
if os.path.exists(LOG_FILE):
    open(LOG_FILE, "w").close()

# Both dataset outputs share this path prefix
dataset_prefix = f"../data/datasets/{ORGANISM}_amino_sugar"

# Kept as the last expression so the returned value is displayed below the cell
create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    tax_ids_filter=[TAX_ID],
    multi_substrate="integrate",
    outliers=["Q9HBR0", "Q07837"],
    output_tsv=f"{dataset_prefix}.tsv",
    output_fasta=f"{dataset_prefix}.fasta",
    output_log=LOG_FILE,
    verbose=True,
)


Unnamed: 0_level_0,keywords_transport,keywords_location,keywords_transport_related,gene_names,protein_names,tcdb_id,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q9BWM7,Amino-acid transport,Membrane;Mitochondrion;Transmembrane,Transport,SFXN3,Sideroflexin-3,,9606,MGELPLDINIQEPRWDQSTFLGRARHFFTVTDPRNLLLSGAQLEAS...
Q9BRV3,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,SLC50A1 RAG1AP1 SCP,Sugar transporter SWEET1 (HsSWEET1) (RAG1-acti...,2.A.123.1.4,9606,MEAGGFLDSLIYGACVVFTLGMFSAGLSDLRHMRMTRSVDNVQFLP...
Q5M8T2,Sugar transport,Membrane;Transmembrane,Transport,SLC35D3 FRCL1,Solute carrier family 35 member D3 (Fringe con...,2.A.7.15.5,9606,MRQLCRGRVLGISVAIAHGVFSGSLNILLKFLISRYQFSFLTLVQC...
Q969S0,Sugar transport,Membrane;Transmembrane,Transport,SLC35B4 YEA4 PSEC0055,UDP-xylose and UDP-N-acetylglucosamine transpo...,2.A.7.10.2,9606,MRPALAVGLVFAGCCSNVIFLELLARKHPGCGNIVTFAQFLFIAVE...
O75387,Amino-acid transport,Membrane;Transmembrane,Transport,SLC43A1 LAT3 PB39 POV1,Large neutral amino acids transporter small su...,2.A.1.44.1,9606,MAPTLQQAYRRRWWMACTAVLENLFFSAVLLGWGSLLIILKNEGFY...
...,...,...,...,...,...,...,...,...
Q9BYW1,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,SLC2A11 GLUT11,"Solute carrier family 2, facilitated glucose t...",2.A.1.1.44,9606,MRALRRLIQGRILLLTICAAGIGGTFQFGYNLSIINAPTLHIQEFT...
P11169,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,SLC2A3 GLUT3,"Solute carrier family 2, facilitated glucose t...",2.A.1.1.91,9606,MGTQKVTPALIFAITVATIGSFQFGYNTGVINAPEKIIKEFINKTL...
P14672,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,SLC2A4 GLUT4,"Solute carrier family 2, facilitated glucose t...",2.A.1.1.80,9606,MPSGFQQIGSEDGEPPQQRVTGTLVLAVFSAVLGSLQFGYNIGVIN...
Q96AA3,Sugar transport,Membrane;Transmembrane,Transport,RFT1,Protein RFT1 homolog,2.A.66.3.2,9606,MGSQEVLGHAARLASSGLLLQVLFRLITFVLNAFILRFLSKEIVGV...


## Clustering

In [4]:
# Run the cd-hit wrapper on the dataset FASTA. The output FASTA is the input
# of the label and PSSM feature steps further down.
unclustered_fasta = f"../data/datasets/{ORGANISM}_amino_sugar.fasta"
clustered_fasta = f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta"

cd_hit(
    executable_location="cd-hit",
    input_fasta=unclustered_fasta,
    output_fasta=clustered_fasta,
    identity_threshold=IDENTITY_THRESHOLD,
    n_threads=N_THREADS,
    memory=4096,
    log_file=LOG_FILE,
    verbose=True,
)

## Annotations

In [5]:
# Load the tab-separated annotation table written by the dataset step;
# the first column becomes the index.
annotations_tsv = f"../data/datasets/{ORGANISM}_amino_sugar.tsv"
df_annotations = pd.read_csv(annotations_tsv, sep="\t", index_col=0)
df_annotations.head()

Unnamed: 0_level_0,keywords_transport,keywords_location,keywords_transport_related,gene_names,protein_names,tcdb_id,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q9BWM7,Amino-acid transport,Membrane;Mitochondrion;Transmembrane,Transport,SFXN3,Sideroflexin-3,,9606,MGELPLDINIQEPRWDQSTFLGRARHFFTVTDPRNLLLSGAQLEAS...
Q9BRV3,Sugar transport,Cell membrane;Membrane;Transmembrane,Transport,SLC50A1 RAG1AP1 SCP,Sugar transporter SWEET1 (HsSWEET1) (RAG1-acti...,2.A.123.1.4,9606,MEAGGFLDSLIYGACVVFTLGMFSAGLSDLRHMRMTRSVDNVQFLP...
Q5M8T2,Sugar transport,Membrane;Transmembrane,Transport,SLC35D3 FRCL1,Solute carrier family 35 member D3 (Fringe con...,2.A.7.15.5,9606,MRQLCRGRVLGISVAIAHGVFSGSLNILLKFLISRYQFSFLTLVQC...
Q969S0,Sugar transport,Membrane;Transmembrane,Transport,SLC35B4 YEA4 PSEC0055,UDP-xylose and UDP-N-acetylglucosamine transpo...,2.A.7.10.2,9606,MRPALAVGLVFAGCCSNVIFLELLARKHPGCGNIVTFAQFLFIAVE...
O75387,Amino-acid transport,Membrane;Transmembrane,Transport,SLC43A1 LAT3 PB39 POV1,Large neutral amino acids transporter small su...,2.A.1.44.1,9606,MAPTLQQAYRRRWWMACTAVLENLFFSAVLLGWGSLLIILKNEGFY...


# Feature generation

## Labels

In [6]:
# Write the label table for the clustered sequences ...
labels_tsv = f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv"
fasta_to_labels(
    input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    output_tsv=labels_tsv,
)

# ... and show how many samples each class has
pd.read_table(labels_tsv, index_col=0)["labels"].value_counts()

Amino-acid transport    48
Sugar transport         34
Name: labels, dtype: int64

## PSSM

In [7]:
# One PSSM feature table per (UniRef database, psiblast iteration count) combination
for uniref_cluster_threshold in [50, 90]:
    for psiblast_iterations in [1, 3]:
        # Shared tag of the intermediate folder, e.g. "uniref50_1it"
        run_tag = f"uniref{uniref_cluster_threshold}_{psiblast_iterations}it"
        calculate_pssm_feature(
            input_fasta=f"../data/datasets/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
            output_tsv=(
                f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}"
                f"_pssm_ur{uniref_cluster_threshold}_{psiblast_iterations}it.tsv"
            ),
            tmp_folder=f"../data/intermediate/blast/pssm_{run_tag}",
            blast_db=(
                f"../data/raw/uniref/uniref{uniref_cluster_threshold}"
                f"/uniref{uniref_cluster_threshold}.fasta"
            ),
            iterations=psiblast_iterations,
            psiblast_executable="psiblast",
            psiblast_threads=N_THREADS,
            verbose=False,
        )


## Reading dataframes

In [8]:
# All feature tables of this dataset share the same path prefix
features_prefix = f"../data/features/{ORGANISM}_amino_sugar_cluster{IDENTITY_THRESHOLD}"

df_labels = pd.read_table(f"{features_prefix}_labels.tsv", index_col=0)

# The four PSSM tables are read in the same order as before:
# (UniRef threshold, iterations) = (50, 1), (50, 3), (90, 1), (90, 3)
df_pssm_50_1it, df_pssm_50_3it, df_pssm_90_1it, df_pssm_90_3it = (
    pd.read_table(f"{features_prefix}_pssm_ur{uniref}_{n_iter}it.tsv", index_col=0)
    for uniref, n_iter in [(50, 1), (50, 3), (90, 1), (90, 3)]
)

## Combining dataframes

In [9]:
# Column suffix "_<uniref threshold>_<iterations>" marks which PSSM variant a
# feature comes from; the selector transformer relies on these names.
pssm_variants = {
    "_50_1": df_pssm_50_1it,
    "_50_3": df_pssm_50_3it,
    "_90_1": df_pssm_90_1it,
    "_90_3": df_pssm_90_3it,
}
df_pssm_all = pd.concat(
    [frame.add_suffix(suffix) for suffix, frame in pssm_variants.items()],
    axis=1,
)


## Custom Transformer to try all parameters

In [10]:
class PSSMSelector(BaseEstimator, TransformerMixin):
    """Select the PSSM feature columns that belong to one parameter combination.

    Every feature name is expected to end in ``_<uniref threshold>_<iterations>``
    (for example ``"KW_50_3"``). Both parts are compared as whole tokens, so
    digits that occur elsewhere in a name can never select a column by accident.

    Parameters
    ----------
    feature_names : array-like of str
        Name of every column of ``X``, in column order.
    uniref_threshold : {50, 90, "all"}, default="all"
        Keep only features computed with this UniRef database, or all of them.
    iterations : {1, 3, "all"}, default="all"
        Keep only features computed with this iteration count, or all of them.

    Attributes
    ----------
    mask : ndarray of bool
        Set by ``fit``; True for the columns that ``transform`` keeps.
    """

    def __init__(self, feature_names, uniref_threshold="all", iterations="all"):
        self.feature_names = feature_names
        self.uniref_threshold = uniref_threshold
        self.iterations = iterations

    @staticmethod
    def _token_matches(suffixes, position, wanted):
        """Return a bool array: True where suffix token `position` equals `wanted` (all True for "all")."""
        if wanted == "all":
            return np.ones(len(suffixes), dtype=bool)
        wanted = str(wanted)
        return np.array(
            [len(tokens) == 2 and tokens[position] == wanted for tokens in suffixes],
            dtype=bool,
        )

    def fit(self, X, y=None):
        """Compute the column mask for the configured threshold and iteration count.

        Raises
        ------
        ValueError
            If ``uniref_threshold`` or ``iterations`` has an unsupported value, or
            if the number of feature names differs from the number of columns of ``X``.
        """
        if self.uniref_threshold != "all" and self.uniref_threshold not in {50, 90}:
            raise ValueError(f"Incorrect uniref threshold {self.uniref_threshold}")
        if self.iterations != "all" and self.iterations not in {1, 3}:
            raise ValueError(f"Incorrect iteration count: {self.iterations}")

        names = np.asarray(self.feature_names, dtype=str)
        # Fail here with a clear message instead of with an IndexError in transform
        if np.ndim(X) == 2 and np.shape(X)[1] != len(names):
            raise ValueError(
                f"Got {len(names)} feature names for {np.shape(X)[1]} columns"
            )

        # "KW_50_3" -> ["50", "3"]; names without both suffix parts yield a
        # shorter list and therefore never match a specific value.
        suffixes = [name.rsplit("_", 2)[1:] for name in names]
        has_uniref = self._token_matches(suffixes, 0, self.uniref_threshold)
        has_iterations = self._token_matches(suffixes, 1, self.iterations)
        self.mask = np.bitwise_and(has_uniref, has_iterations)
        return self

    def transform(self, X, y=None):
        """Return ``X`` as an array restricted to the columns selected in ``fit``."""
        X = np.array(X)
        X = X[:, self.mask]
        return X


# Functions

In [11]:
def get_feature_stats(df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"]):
    """Summarize every feature column overall and per class.

    Parameters
    ----------
    df_features : DataFrame with one row per sample and one column per feature.
    df_labels_ : DataFrame with a ``labels`` column, indexed like ``df_features``.
    labels : the two class names; ``labels[1]`` is encoded as 1.0 for the correlation.

    Returns
    -------
    DataFrame indexed by feature name with the columns ``corr``, ``mean``, ``std``,
    ``sum`` (row-wise sum of the previous three), ``corr_abs``, ``mean0``, ``mean1``,
    ``median0``, ``median1`` and ``mediandiff``, sorted by ``mediandiff`` descending.
    """
    class_column = df_labels_.labels
    # Numeric class encoding used as the correlation target
    encoded_class = class_column.eq(labels[1]).astype(float)

    overall = {
        "corr": df_features.corrwith(encoded_class),
        "mean": df_features.mean(),
        "std": df_features.std(),
    }
    df_stats = pd.concat(overall, axis=1)

    # Must be computed before any further column is added
    df_stats["sum"] = df_stats.sum(axis=1)
    df_stats["corr_abs"] = df_stats["corr"].abs()

    # Feature rows of each class, selected through the label index
    rows_class0 = df_features.loc[df_labels_.index[(class_column == labels[0]).to_numpy()]]
    rows_class1 = df_features.loc[df_labels_.index[(class_column == labels[1]).to_numpy()]]

    df_stats["mean0"] = rows_class0.mean()
    df_stats["mean1"] = rows_class1.mean()
    df_stats["median0"] = rows_class0.median()
    df_stats["median1"] = rows_class1.median()

    median_gap = df_stats["median0"] - df_stats["median1"]
    df_stats["mediandiff"] = median_gap.abs()
    return df_stats.sort_values("mediandiff", ascending=False)

In [12]:
def get_independent_test_set(
    df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"], test_size=0.2
):
    """Split features and labels into a stratified training and test set.

    Labels are matched to the feature rows by index. If the two indexes differ
    (different order, or additional label rows), the labels are aligned to
    ``df_features`` first instead of being paired by position.

    Parameters
    ----------
    df_features : DataFrame with one row per sample and one column per feature.
    df_labels_ : DataFrame with a ``labels`` column, indexed by sample.
    labels : the two class names; ``labels[1]`` becomes class 1, everything else class 0.
    test_size : passed on to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sample_names_train,
        sample_names_test, feature_names)``

    Raises
    ------
    ValueError
        If a sample of ``df_features`` has no entry in ``df_labels_``.
    """
    label_column = df_labels_.labels
    if not label_column.index.equals(df_features.index):
        missing = df_features.index.difference(label_column.index)
        if len(missing) > 0:
            raise ValueError(f"No label found for samples: {list(missing)}")
        # Same samples in a different order (or extra label rows): align by index
        label_column = label_column.loc[df_features.index]

    X = df_features.to_numpy()
    y = np.where(label_column == labels[1], 1, 0)
    feature_names = df_features.columns.to_numpy(dtype=str)
    sample_names = df_features.index.to_numpy(dtype=str)
    (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
    ) = train_test_split(
        X, y, sample_names, stratify=y, random_state=42, shuffle=True, test_size=test_size
    )
    return (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
        feature_names,
    )


In [13]:
def print_validation_results(y_true_, y_pred_, labels=["Amino", "Sugar"]):
    """Build a classification report and a confusion matrix for binary 0/1 predictions.

    Despite its name, this function prints nothing; it returns two DataFrames.

    Parameters
    ----------
    y_true_ : true classes, encoded as 0 and 1.
    y_pred_ : predicted classes, encoded as 0 and 1.
    labels : display names for class 0 and class 1, in that order.

    Returns
    -------
    report_df : DataFrame with one column per class plus "Macro" and "Weighted".
    confusion_matrix_df : 2x2 DataFrame labelled with ``labels`` on both axes.
    """
    # Name both classes explicitly so that the report keys and the 2x2 matrix
    # shape do not depend on which classes happen to occur in the inputs.
    class_ids = [0, 1]
    report_dict = classification_report(
        y_true=y_true_, y_pred=y_pred_, labels=class_ids, output_dict=True
    )
    report_dict = {
        labels[0]: report_dict["0"],
        labels[1]: report_dict["1"],
        "Macro": report_dict["macro avg"],
        "Weighted": report_dict["weighted avg"],
    }
    report_df = pd.DataFrame.from_dict(report_dict)
    confusion_matrix_df = pd.DataFrame(
        confusion_matrix(y_true_, y_pred_, labels=class_ids),
        columns=labels,
        index=labels,
    )
    return report_df, confusion_matrix_df

# Individual Features

## PSSM

### Stats, Plots

In [14]:
df_stats = get_feature_stats(df_pssm_50_1it, df_labels)

# Run the Shapiro-Wilk test once per feature and keep both the statistic and
# the p-value (previously the test was executed twice per column).
shapiro_results = pd.DataFrame(
    {name: list(shapiro(column)) for name, column in df_pssm_50_1it.items()},
    index=["shapiro", "shapiro_p"],
)
df_stats["shapiro_p"] = shapiro_results.loc["shapiro_p"].round(4)
df_stats["shapiro"] = shapiro_results.loc["shapiro"]

# Features whose test statistic is below 0.9
display(df_stats[df_stats.shapiro < 0.9])

df_stats.sort_values("std")

Unnamed: 0,corr,mean,std,sum,corr_abs,mean0,mean1,median0,median1,mediandiff,shapiro_p,shapiro
KW,0.130546,0.448004,0.07192,0.650469,0.130546,0.44015,0.459091,0.45141,0.475981,0.0245713,0.0,0.872628
PY,-0.059589,0.478312,0.068809,0.487531,0.059589,0.481741,0.47347,0.501351,0.479466,0.02188521,0.0,0.881027
PP,0.097569,0.463165,0.066388,0.627122,0.097569,0.457747,0.470814,0.469259,0.485859,0.01659931,0.0,0.895989
PW,0.016948,0.473864,0.086385,0.577197,0.016948,0.472639,0.475593,0.499487,0.483958,0.01552818,0.0,0.826853
EW,0.168187,0.526445,0.049552,0.744184,0.168187,0.519474,0.536287,0.535066,0.549905,0.01483956,0.0,0.894557
PD,-0.041031,0.467217,0.061046,0.487232,0.041031,0.469312,0.464259,0.479671,0.465385,0.0142865,0.0,0.898678
PC,-0.011805,0.438355,0.074033,0.500583,0.011805,0.439086,0.437323,0.451108,0.436932,0.01417592,0.0,0.891193
PN,-0.069732,0.516856,0.052148,0.499272,0.069732,0.519898,0.512562,0.531115,0.517789,0.01332585,0.0,0.8988
FV,0.134724,0.480585,0.055893,0.671201,0.134724,0.474286,0.489477,0.487216,0.498885,0.01166886,0.0,0.896803
IL,-0.023549,0.963293,0.057869,0.997614,0.023549,0.964433,0.961684,1.0,0.990347,0.009652881,0.0,0.696228


Unnamed: 0,corr,mean,std,sum,corr_abs,mean0,mean1,median0,median1,mediandiff,shapiro_p,shapiro
ID,0.093043,0.000841,0.005932,0.099817,0.093043,0.000379,0.001493,0.000000,0.000000,0.000000,0.0000,0.128216
IE,0.072322,0.080704,0.020080,0.173106,0.072322,0.079490,0.082419,0.079188,0.079937,0.000748,0.6362,0.987819
IK,0.055065,0.129887,0.025368,0.210320,0.055065,0.128718,0.131536,0.129837,0.126276,0.003561,0.8795,0.991633
IQ,0.298550,0.206797,0.033192,0.538539,0.298550,0.198508,0.218499,0.201180,0.213179,0.011998,0.8770,0.991585
HQ,-0.072157,0.575896,0.033321,0.537059,0.072157,0.577907,0.573056,0.581999,0.580386,0.001612,0.0005,0.935785
...,...,...,...,...,...,...,...,...,...,...,...,...
GR,0.255087,0.309462,0.101014,0.665564,0.255087,0.287908,0.339891,0.283145,0.338456,0.055311,0.3512,0.983008
GN,0.248699,0.320306,0.101140,0.670145,0.248699,0.299266,0.350010,0.302953,0.349901,0.046948,0.3171,0.982266
IW,0.329764,0.455329,0.103936,0.889030,0.329764,0.426659,0.495804,0.436359,0.501225,0.064866,0.5585,0.986650
GE,0.262606,0.289759,0.105163,0.657528,0.262606,0.266658,0.322371,0.261996,0.318764,0.056768,0.4225,0.984393


### Independent test set

In [15]:
# Hold out 20% of the samples as an independent test set; all model selection
# below only sees the training part.
pssm_split = get_independent_test_set(df_pssm_all, df_labels, test_size=0.2)
X_train, X_test, y_train, y_test = pssm_split[:4]
sample_names_train, sample_names_test, feature_names = pssm_split[4:]

### Model selection

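Without tuning, LinearSVC reaches the highest mean macro-F1 in the comparison below. SVC (with default RBF kernel) becomes the most promising model once it is tuned together with PCA in the following sections.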

In [16]:
# Untuned baseline comparison: each candidate is standardized and scored with
# cross-validated macro-F1 on the training split.
# max_iter is given as an integer because scikit-learn >= 1.2 validates
# parameter types and rejects the float 1e6.
candidate_estimators = [
    LinearSVC(max_iter=1_000_000, class_weight="balanced", random_state=0),
    SVC(class_weight="balanced"),
    RandomForestClassifier(class_weight="balanced", random_state=0),
    LinearSVC(max_iter=1_000_000, random_state=0),
    SVC(),
    RandomForestClassifier(random_state=0),
    GaussianNB(),
    KNeighborsClassifier(),
    SGDClassifier(random_state=0),
]
for estimator in candidate_estimators:
    pipe = make_pipeline(StandardScaler(), estimator)
    scores = cross_val_score(pipe, X_train, y_train, scoring="f1_macro")
    print("### ", str(estimator))
    print(f"CV folds: {scores.round(3)}")
    print(f"Mean: {scores.mean().round(3)}")
    print(f"Std: {scores.std().round(3)}")


###  LinearSVC(class_weight='balanced', max_iter=1000000.0)
CV folds: [0.764 0.845 0.838 0.764 0.707]
Mean: 0.783
Std: 0.052
###  SVC(class_weight='balanced')
CV folds: [0.675 0.923 0.764 0.764 0.405]
Mean: 0.706
Std: 0.17
###  RandomForestClassifier(class_weight='balanced')
CV folds: [0.675 0.838 0.838 0.675 0.458]
Mean: 0.697
Std: 0.14
###  LinearSVC(max_iter=1000000.0)
CV folds: [0.764 0.845 0.838 0.764 0.707]
Mean: 0.783
Std: 0.052
###  SVC()
CV folds: [0.675 0.745 0.838 0.764 0.458]
Mean: 0.696
Std: 0.13
###  RandomForestClassifier()
CV folds: [0.675 0.764 0.838 0.639 0.458]
Mean: 0.675
Std: 0.128
###  GaussianNB()
CV folds: [0.606 0.845 0.615 0.69  0.448]
Mean: 0.641
Std: 0.129
###  KNeighborsClassifier()
CV folds: [0.764 0.769 0.764 0.838 0.575]
Mean: 0.742
Std: 0.088
###  SGDClassifier()
CV folds: [0.69  0.845 0.764 0.764 0.405]
Mean: 0.694
Std: 0.152


### Parameter tuning

In [17]:
# Grid search over the PSSM variant (via the selector) and the LinearSVC
# hyperparameters, scored with 5-fold cross-validated macro-F1.
gsearch = GridSearchCV(
    estimator=make_pipeline(
        PSSMSelector(feature_names=feature_names),
        StandardScaler(),
        # Integer max_iter: scikit-learn >= 1.2 rejects the float 1e6
        LinearSVC(max_iter=1_000_000, random_state=0),
    ),
    param_grid={
        "pssmselector__uniref_threshold": [50, 90, "all"],
        "pssmselector__iterations": [1, 3, "all"],
        "linearsvc__class_weight": ["balanced", None],
        "linearsvc__C": [1, 0.001, 0.01, 0.1],
        "linearsvc__dual": [True, False],
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
# NOTE: despite the name, this holds the tuned LinearSVC pipeline (no PCA)
best_estimator_svc = gsearch.best_estimator_


{'linearsvc__C': 0.01, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50}
0.8104378954378955


### Dimensionality reduction

In [18]:
# Fit a full PCA on the training features and look at the cumulative explained variance
pca = PCA().fit(X_train)
csum = pca.explained_variance_ratio_.cumsum()

# Index of the first component at which the cumulative ratio reaches 0.97, as a count
n_components_97 = np.argmax(csum >= 0.97) + 1
print("Number of components to explain 97% of variance:", n_components_97)

Number of components to explain 97% of variance: 33


In [19]:
# Same search as before, with a PCA step (and a second scaling of the
# components) between the feature selection and the LinearSVC.
gsearch = GridSearchCV(
    estimator=make_pipeline(
        PSSMSelector(feature_names=feature_names),
        StandardScaler(),
        PCA(),
        StandardScaler(),
        # Integer max_iter: scikit-learn >= 1.2 rejects the float 1e6
        LinearSVC(max_iter=1_000_000, random_state=0),
    ),
    param_grid={
        "pssmselector__uniref_threshold": [50, 90, "all"],
        "pssmselector__iterations": [1, 3, "all"],
        "linearsvc__class_weight": ["balanced", None],
        "linearsvc__C": [1, 0.01, 0.1],
        "linearsvc__dual": [True, False],
        # Fraction of variance to keep: 20 evenly spaced values from 0.8 to 0.99
        "pca__n_components": np.linspace(0.8, 0.99, 20),
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_lsvc_pca = gsearch.best_estimator_

{'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'pca__n_components': 0.96, 'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 50}
0.8556404706404706


In [20]:
# Pipeline: PSSM variant selection -> scaling -> PCA -> scaling -> SVC
svc_pca_pipeline = make_pipeline(
    PSSMSelector(feature_names=feature_names),
    StandardScaler(),
    PCA(),
    StandardScaler(),
    SVC(),
)
svc_pca_param_grid = {
    "pssmselector__uniref_threshold": [50, 90, "all"],
    "pssmselector__iterations": [1, 3, "all"],
    "svc__class_weight": ["balanced", None],
    "svc__C": [1, 10, 100],
    "svc__gamma": ["scale", 1, 0.1, 0.01],
    "pca__n_components": np.linspace(0.8, 0.99, 20),
}

gsearch = GridSearchCV(
    estimator=svc_pca_pipeline,
    param_grid=svc_pca_param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)

print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_svc_pca = gsearch.best_estimator_

{'pca__n_components': 0.9299999999999999, 'pssmselector__iterations': 'all', 'pssmselector__uniref_threshold': 50, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1}
0.9032338086749853


### Validation

In [21]:
# Validate the tuned SVC + PCA pipeline
best_estimator = best_estimator_svc_pca

# Cross-validated macro-F1 of an unfitted copy on the training split
best_scores = cross_val_score(clone(best_estimator), X_train, y_train, scoring="f1_macro")
cv_mean = best_scores.mean().round(3)
cv_std = best_scores.std().round(3)
print(f"Train scores: {cv_mean}+-{cv_std}")

# Predictions of the refitted grid-search winner on the held-out test set
y_true = y_test.copy()
y_pred = best_estimator.predict(X_test)

report_df, confusion_matrix_df = print_validation_results(y_true, y_pred, labels=["Amino", "Sugar"])
display(report_df.round(3), confusion_matrix_df)

Train scores: 0.903+-0.033


Unnamed: 0,Amino,Sugar,Macro,Weighted
precision,0.909,1.0,0.955,0.947
recall,1.0,0.857,0.929,0.941
f1-score,0.952,0.923,0.938,0.94
support,10.0,7.0,17.0,17.0


Unnamed: 0,Amino,Sugar
Amino,10,0
Sugar,1,6


In [22]:
# Validate the tuned LinearSVC + PCA pipeline
best_estimator = best_estimator_lsvc_pca

# Cross-validated macro-F1 of an unfitted copy on the training split
best_scores = cross_val_score(clone(best_estimator), X_train, y_train, scoring="f1_macro")
cv_mean = best_scores.mean().round(3)
cv_std = best_scores.std().round(3)
print(f"Train scores: {cv_mean}+-{cv_std}")

# Predictions of the refitted grid-search winner on the held-out test set
y_true = y_test.copy()
y_pred = best_estimator.predict(X_test)

report_df, confusion_matrix_df = print_validation_results(y_true, y_pred, labels=["Amino", "Sugar"])
display(report_df.round(3), confusion_matrix_df)

Train scores: 0.856+-0.06


Unnamed: 0,Amino,Sugar,Macro,Weighted
precision,0.909,1.0,0.955,0.947
recall,1.0,0.857,0.929,0.941
f1-score,0.952,0.923,0.938,0.94
support,10.0,7.0,17.0,17.0


Unnamed: 0,Amino,Sugar
Amino,10,0
Sugar,1,6


# Alternative Eval

In [23]:
from sklearn.model_selection import LeaveOneOut

# Nested leave-one-out evaluation on the full dataset: the inner LOO grid search
# picks the hyperparameters, the outer LOO produces one prediction per sample.
X = df_pssm_all.values
# Align the labels with the feature rows by index instead of relying on row order
labels = df_labels.labels.loc[df_pssm_all.index]
y = np.where(labels == "Sugar transport", 1, 0)

# Grid entries shared by both setups; svc__gamma is left at its default
svc_grid = {
    "svc__class_weight": ["balanced"],
    "svc__C": [0.1, 1, 10],
}
selector_grid = {
    "pssmselector__uniref_threshold": [50, 90, "all"],
    "pssmselector__iterations": [1, 3, "all"],
}

nested_loo_setups = [
    # 1) SVC on the scaled PSSM features
    (
        {**svc_grid, **selector_grid},
        make_pipeline(PSSMSelector(feature_names=feature_names), StandardScaler(), SVC()),
    ),
    # 2) SVC on the scaled principal components
    (
        {**svc_grid, "pca__n_components": [0.8, 0.9], **selector_grid},
        make_pipeline(
            PSSMSelector(feature_names=feature_names),
            StandardScaler(),
            PCA(),
            StandardScaler(),
            SVC(),
        ),
    ),
]

for params_svc, loo_pipeline in nested_loo_setups:
    # Train scores are not requested: cross_val_predict only uses the refitted
    # best estimator, so computing them would be wasted work.
    gsearch = GridSearchCV(
        loo_pipeline,
        param_grid=params_svc,
        cv=LeaveOneOut(),
        scoring="f1_macro",
        refit=True,
    )
    # the score does not make a difference, since it's either 1 (correct) or 0 (incorrect)
    res = cross_val_predict(gsearch, X, y, n_jobs=-1, cv=LeaveOneOut())
    print(f1_score(y, res, average="macro"))


0.873222016079159
0.861209416833359
