# Feature performance comparison

The purpose of this notebook is to compare the classification performance of the individual features, and their combination, for A. Thaliana


# TODO

# Imports

In [1]:
import os
import sys
from IPython.display import display

from sklearnex import patch_sklearn
patch_sklearn()

sys.path.append('../src')
from sklearn.preprocessing import LabelEncoder, StandardScaler, Binarizer
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, RFE, VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator,TransformerMixin

from yellowbrick.features import ParallelCoordinates
from yellowbrick.features import Rank1D, Rank2D

import pandas as pd
import numpy as np
import seaborn as sns

from dataset.transporter_dataset import create_dataset
from dataset.cluster_fasta import cd_hit
from features.labels import fasta_to_labels
from features.compositions import calculate_composition_feature
from features.pssm import calculate_pssm_feature
from features.coexp import calculate_coexp_feature
from models.eval import nested_crossval
from visualization.feature_plots import create_plot

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Globals

In [2]:
LOG_FILE = "../logs/athaliana_amino_sugar.log"
N_THREADS = 16
IDENTITY_THRESHOLD=70

# Dataset

In [3]:
# Delete previous log
if os.path.exists(LOG_FILE):
    with open(LOG_FILE, "w"):
        pass

create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="remove",
    outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
    verbose=True,
    tax_ids_filter=[3702],
    output_tsv="../data/datasets/athaliana_amino_sugar.tsv",
    output_fasta="../data/datasets/athaliana_amino_sugar.fasta",
    output_log=LOG_FILE,
)

## Clustering

In [None]:
cd_hit(
    executable_location="cd-hit",
    input_fasta=f"../data/datasets/athaliana_amino_sugar.fasta",
    output_fasta=f"../data/datasets/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    log_file=LOG_FILE,
    identity_threshold=IDENTITY_THRESHOLD,
    n_threads=N_THREADS,
    memory=4096,
    verbose=True,
)

## Annotations

In [133]:
df_annotations = pd.read_table("../data/datasets/athaliana_amino_sugar.tsv", index_col=0)
df_annotations.head()

Unnamed: 0_level_0,keywords_transport,gene_names,protein_names,tcdb_id,organism_id,sequence
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Q9SFG0,Sugar transport,STP6 At3g05960 F2O10.8,Sugar transport protein 6 (Hexose transporter 6),2.A.1.1.56,3702,MAVVVSNANAPAFEAKMTVYVFICVMIAAVGGLIFGYDIGISGGVS...
Q9ZVK6,Sugar transport,SUC8 At2g14670 T6B13.9,Sucrose transport protein SUC8 (Sucrose permea...,,3702,MSDLQAKNDVVAVDRQSSSSLADLDGPSPLRKMISVASIAAGIQFG...
Q84WN3,Sugar transport,SWEET17 At4g15920 dl4000c FCAALL.237,Bidirectional sugar transporter SWEET17 (AtSWE...,,3702,MAEASFYIGVIGNVISVLVFLSPVETFWKIVKRRSTEEYKSLPYIC...
O04249,Sugar transport,STP7 At4g02050 AGAA.1 T10M13.6,Sugar transport protein 7 (Hexose transporter 7),,3702,MAGGSFGPTGVAKERAEQYQGKVTSYVIIACLVAAIGGSIFGYDIG...
Q56ZZ7,Sugar transport,At5g16150 T21H19.70,Plastidic glucose transporter 4 (AtpGlcT),2.A.1.1.102,3702,MQSSTYAVKGNAAFAFQRRTFSSDRSTTSTGIRFAGYKSLATTGPL...


# Feature generation

## Labels

In [134]:
fasta_to_labels(
    input_fasta=f"../data/datasets/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    output_tsv=f"../data/features/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
)
pd.read_table(
    f"../data/features/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
    index_col=0,
).labels.value_counts()

Sugar transport         84
Amino-acid transport    33
Name: labels, dtype: int64

## COEXP

In [135]:
neighbor_counts = list(range(5,21,5))
selected_neighbor_counts = lambda x: range(1,x,1)

In [None]:
# TODO only transporter neighbors, or only transporters, or both.
for neighbor_count in neighbor_counts:
    for selected_neighbor_count in selected_neighbor_counts(neighbor_count):
        print(neighbor_count, selected_neighbor_count)
        calculate_coexp_feature(
            gene_expression_folder="../data/intermediate/gene_expression/athaliana",
            fasta_file_training="../data/datasets/athaliana_amino_sugar_cluster{}.fasta".format(
                IDENTITY_THRESHOLD
            ),
            log_file=LOG_FILE,
            gene_pos_file="../data/intermediate/gene_positions/gene_positions_athaliana.tsv",
            go_file="../data/intermediate/gene_ontology/goa_athaliana.tsv",
            output_folder="../data/features/coexp/athaliana_amino_sugar_cluster{}".format(
                IDENTITY_THRESHOLD
            ),
            neighbor_count=neighbor_count,
            selected_neighbor_count=selected_neighbor_count,
            go_percentage_threshold=None,
            feature_type="percentage",
            ontologies=["F", "P", "C"],
            verbose=True,
            n_threads=N_THREADS,
        )


5 1
5 2
5 3
5 4
10 1
10 2
10 3
10 4
10 5
10 6
10 7
10 8
10 9
15 1
15 2
15 3
15 4
15 5
15 6
15 7
15 8
15 9
15 10
15 11
15 12
15 13
15 14
20 1
20 2
20 3
20 4
20 5
20 6
20 7
20 8
20 9
20 10
20 11
20 12
20 13
20 14
20 15
20 16
20 17
20 18
20 19


## Reading dataframes

In [136]:
df_labels = pd.read_table(
    f"../data/features/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}_labels.tsv",
    index_col=0,
)

coexp_dataframes = []

for neighbor_count in neighbor_counts:
    for selected_neighbor_count in selected_neighbor_counts(neighbor_count):
        for normalized_array in [True, False]:
            df = pd.read_table(
                "../data/features/coexp/athaliana_amino_sugar_cluster70/e_tabm_17{}_n{}_s{}_pNone_percentage_CFP.tsv".format(
                    "_norm" if normalized_array else "",
                    neighbor_count,
                    selected_neighbor_count,
                ),
                index_col=0,
            )
            df = df.rename(
                columns=lambda c: c.replace(" ", "_")
                + "_{}_{}_{}".format(
                    neighbor_count,
                    selected_neighbor_count,
                    "norm" if normalized_array else "notnorm",
                )
            )

            coexp_dataframes.append(df)


## Combining dataframes

In [137]:
df_coexp_all = pd.concat(
    coexp_dataframes,
    axis=1,
)
df_labels = df_labels.loc[df_coexp_all.index]
df_coexp_all.shape

(111, 2800)

## Custom Transformer to try all parameters

In [138]:
class CoexpParameterSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names, normalized_expression=True, neighbors=(5, 3)):
        self.feature_names = feature_names
        self.neighbors = neighbors
        self.normalized_expression = normalized_expression

    def fit(self, X, y=None):
        self.mask = np.char.endswith(
            self.feature_names,
            "{}_{}_{}".format(
                self.neighbors[0],
                self.neighbors[1],
                "norm" if self.normalized_expression else "notnorm",
            ),
        )
        return self

    def transform(self, X, y=None):
        X = np.array(X)
        X = X[:, self.mask]
        return X


# Functions

In [139]:
def get_feature_stats(df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"]):
    df_stats = pd.concat(
        {
            "corr": df_features.corrwith(
                df_labels_.labels.transform(lambda x: 1.0 if x == labels[1] else 0.0)
            ),
            "mean": df_features.mean(),
            "std": df_features.std(),
        },
        axis=1,
    )

    df_stats["sum"] = df_stats.sum(axis=1)
    df_stats["corr_abs"] = df_stats["corr"].abs()

    df_stats["mean0"] = df_features.loc[df_labels_[df_labels_.labels == labels[0]].index].mean()
    df_stats["mean1"] = df_features.loc[df_labels_[df_labels_.labels == labels[1]].index].mean()

    df_stats["median0"] = df_features.loc[
        df_labels_[df_labels_.labels == labels[0]].index
    ].median()
    df_stats["median1"] = df_features.loc[
        df_labels_[df_labels_.labels == labels[1]].index
    ].median()

    df_stats["mediandiff"] = (df_stats["median0"] - df_stats["median1"]).abs()
    df_stats = df_stats.sort_values("mediandiff", ascending=False)
    return df_stats

In [140]:
def get_independent_test_set(
    df_features, df_labels_, labels=["Amino-acid transport", "Sugar transport"], test_size=0.2
):
    X = df_features.to_numpy()
    y = np.where(df_labels_.labels == labels[1], 1, 0)
    feature_names = df_features.columns.to_numpy(dtype=str)
    sample_names = df_features.index.to_numpy(dtype=str)
    (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
    ) = train_test_split(
        X, y, sample_names, stratify=y, random_state=42, shuffle=True, test_size=test_size
    )
    return (
        X_train,
        X_test,
        y_train,
        y_test,
        sample_names_train,
        sample_names_test,
        feature_names,
    )


In [141]:
def print_validation_results(y_true_, y_pred_, labels = ["Amino", "Sugar"]):
    report_dict = classification_report(y_true=y_true_, y_pred=y_pred_, output_dict=True)
    report_dict = {
        labels[0]: report_dict['0'],
        labels[1]: report_dict['1'],
        "Macro": report_dict["macro avg"],
        "Weighted": report_dict["weighted avg"]
    }
    report_df = pd.DataFrame.from_dict(report_dict)
    confusion_matrix_df = pd.DataFrame(
        confusion_matrix(y_true_, y_pred_),
        columns=labels,
        index=labels,
    )
    return report_df, confusion_matrix_df

# Individual Features

## COEXP

### Stats, Plots

In [142]:
df_stats = get_feature_stats(df_coexp_all, df_labels)


df_stats["shapiro_p"] = df_coexp_all.apply(lambda col: shapiro(col)[1], axis=0).round(4)
df_stats["shapiro"] = df_coexp_all.apply(lambda col: shapiro(col)[0], axis=0)

# display(df_stats[df_stats.shapiro < 0.9])

In [143]:
# df_pssm_50_1it = df_pssm_50_1it.drop(["ID", "IP"], axis=1)
# print(df_pssm_50_1it.shape)

### Independent test set

In [144]:
(
    X_train,
    X_test,
    y_train,
    y_test,
    sample_names_train,
    sample_names_test,
    feature_names,
) = get_independent_test_set(df_coexp_all, df_labels, test_size=0.2)

### Model selection

No models look promising

In [145]:
for estimator in [
    LinearSVC(max_iter=1e6, class_weight="balanced"),
    SVC(class_weight="balanced"),
    RandomForestClassifier(class_weight="balanced"),
    LinearSVC(max_iter=1e6),
    SVC(),
    RandomForestClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    SGDClassifier(),
]:
    pipe = make_pipeline(StandardScaler(),VarianceThreshold(),PCA(0.90),StandardScaler(), estimator)
    scores = cross_val_score(pipe, X_train, y_train, scoring="f1_macro")
    print("### ", str(estimator))
    print(f"CV folds: {scores.round(3)}")
    print(f"Mean: {scores.mean().round(3)}")
    print(f"Std: {scores.std().round(3)}")


###  LinearSVC(class_weight='balanced', max_iter=1000000.0)
CV folds: [0.481 0.379 0.379 0.575 0.346]
Mean: 0.432
Std: 0.085
###  SVC(class_weight='balanced')
CV folds: [0.446 0.419 0.4   0.414 0.393]
Mean: 0.414
Std: 0.018
###  RandomForestClassifier(class_weight='balanced')
CV folds: [0.419 0.419 0.4   0.414 0.414]
Mean: 0.413
Std: 0.007
###  LinearSVC(max_iter=1000000.0)
CV folds: [0.518 0.4   0.379 0.575 0.346]
Mean: 0.444
Std: 0.088
###  SVC()
CV folds: [0.419 0.419 0.4   0.414 0.414]
Mean: 0.413
Std: 0.007
###  RandomForestClassifier()
CV folds: [0.419 0.419 0.4   0.414 0.414]
Mean: 0.413
Std: 0.007
###  GaussianNB()
CV folds: [0.357 0.419 0.4   0.393 0.37 ]
Mean: 0.388
Std: 0.022
###  KNeighborsClassifier()
CV folds: [0.419 0.419 0.4   0.414 0.414]
Mean: 0.413
Std: 0.007
###  SGDClassifier()
CV folds: [0.518 0.419 0.379 0.575 0.346]
Mean: 0.448
Std: 0.086


### Parameter tuning

In [146]:
neighbors_parameters = []
for neighbor_count in neighbor_counts:
    for selected_neighbor_count in selected_neighbor_counts(neighbor_count):
        neighbors_parameters.append((neighbor_count, selected_neighbor_count))

#### Frequencies

In [147]:
gsearch = GridSearchCV(
    estimator=make_pipeline(
        CoexpParameterSelector(feature_names=feature_names),
        StandardScaler(),
        LinearSVC(max_iter=1e6),
    ),
    param_grid={
        "coexpparameterselector__normalized_expression": [True, False],
        "coexpparameterselector__neighbors": neighbors_parameters,
        "linearsvc__class_weight": ["balanced"],
        "linearsvc__C": [0.1, 1, 10],
        "linearsvc__dual": [True, False],
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_lsvc_freq = gsearch.best_estimator_

{'coexpparameterselector__neighbors': (20, 10), 'coexpparameterselector__normalized_expression': False, 'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True}
0.7008305961349439


#### Binary

In [148]:
gsearch = GridSearchCV(
    estimator=make_pipeline(
        CoexpParameterSelector(feature_names=feature_names),
        Binarizer(),
        StandardScaler(),
        LinearSVC(max_iter=1e6),
    ),
    param_grid={
        "coexpparameterselector__normalized_expression": [True, False],
        "coexpparameterselector__neighbors": neighbors_parameters,
        "binarizer__threshold": np.linspace(0.1, 0.9, 9),
        "linearsvc__class_weight": ["balanced"],
        "linearsvc__C": [0.1, 1, 10],
        "linearsvc__dual": [True, False],
    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_lsvc_binary = gsearch.best_estimator_

{'binarizer__threshold': 0.2, 'coexpparameterselector__neighbors': (5, 4), 'coexpparameterselector__normalized_expression': True, 'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True}
0.6973427699514655


### Dimensionality reduction

#### Frequencies

In [149]:
gsearch = GridSearchCV(
    estimator=make_pipeline(
        CoexpParameterSelector(feature_names=feature_names),
        StandardScaler(),
        PCA(),
        StandardScaler(),
        LinearSVC(max_iter=1e6),
    ),
    param_grid={
        "coexpparameterselector__normalized_expression": [True, False],
        "coexpparameterselector__neighbors": neighbors_parameters,
        "linearsvc__class_weight": ["balanced"],
        "linearsvc__C": [0.1, 1, 10],
        "linearsvc__dual": [True, False],
        "pca__n_components": np.linspace(0.4, 0.95, 11)

    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_lsvc_freq_pca = gsearch.best_estimator_

{'coexpparameterselector__neighbors': (20, 2), 'coexpparameterselector__normalized_expression': False, 'linearsvc__C': 0.1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'pca__n_components': 0.565}
0.6696138643964731


#### Binary

In [150]:
gsearch = GridSearchCV(
    estimator=make_pipeline(
        CoexpParameterSelector(feature_names=feature_names),
        Binarizer(),
        StandardScaler(),
        PCA(),
        StandardScaler(),
        LinearSVC(max_iter=1e6),
    ),
    param_grid={
        "coexpparameterselector__normalized_expression": [True, False],
        "coexpparameterselector__neighbors": neighbors_parameters,
        "binarizer__threshold": np.linspace(0.1, 0.9, 9),
        "linearsvc__class_weight": ["balanced"],
        "linearsvc__C": [0.1, 1, 10],
        "linearsvc__dual": [True, False],
        "pca__n_components": np.linspace(0.4, 0.95, 11)

    },
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
    return_train_score=True,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)
print(gsearch.best_score_)
best_estimator_lsvc_binary_pca = gsearch.best_estimator_

{'binarizer__threshold': 0.5, 'coexpparameterselector__neighbors': (20, 8), 'coexpparameterselector__normalized_expression': False, 'linearsvc__C': 1, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': True, 'pca__n_components': 0.7849999999999999}
0.7494384842210928


### Validation

In [151]:
best_estimator = best_estimator_lsvc_freq
best_scores = cross_val_score(
    estimator=clone(best_estimator), X=X_train, y=y_train, scoring="f1_macro"
)
print(f"Train scores: {best_scores.mean().round(3)}+-{best_scores.std().round(3)}")

y_pred = best_estimator.predict(X_test)
y_true = y_test.copy()

report_df, confusion_matrix_df = print_validation_results(y_true, y_pred, labels=["Amino", "Sugar"])
display(report_df.round(3))
display(confusion_matrix_df)

Train scores: 0.701+-0.146


Unnamed: 0,Amino,Sugar,Macro,Weighted
precision,0.375,0.733,0.554,0.624
recall,0.429,0.688,0.558,0.609
f1-score,0.4,0.71,0.555,0.615
support,7.0,16.0,23.0,23.0


Unnamed: 0,Amino,Sugar
Amino,3,4
Sugar,5,11
