In [3]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from subpred.transporter_dataset import create_dataset
from subpred.cdhit import cd_hit
from subpred.eval import (
    optimize_hyperparams,
    preprocess_pandas,
    get_confusion_matrix,
    get_classification_report,
)
from subpred.compositions import calculate_paac
from subpred.pssm import calculate_pssms_notebook

LOG_FILE = "../logs/cross_organism_amino_sugar_paac.log"


# Training on A thaliana dataset

In [4]:
df_at = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="remove",
    outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
    verbose=True,
    tax_ids_filter=[3702],
    output_log=LOG_FILE,
    sequence_clustering=70
)

cd-hit: clustered 165 sequences into 117 clusters at threshold 70


In [5]:
df_at_paac = calculate_paac(df_at.sequence)

In [6]:
X_at, y_at = preprocess_pandas(
    df_at_paac, df_at.keywords_transport
)

In [7]:
gsearch_at = optimize_hyperparams(X_at,y_at)
gsearch_at

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.794


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': [1, 0.1, 10],
                         'svc__class_weight': ['balanced', None],
                         'svc__gamma': ['scale', 0.01, 0.1, 1]},
             return_train_score=True, scoring='f1_macro')

In [8]:
best_estimator_at = gsearch_at.best_estimator_

In [9]:
get_confusion_matrix(X_at, y_at, best_estimator_at, labels=df_at.keywords_transport)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,33,0
Sugar transport,0,84


# Testing on human dataset

In [10]:
df_human = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    outliers=["Q9HBR0", "Q07837"],
    tax_ids_filter=[9606],
    output_log=LOG_FILE,
    sequence_clustering=70
)

cd-hit: clustered 87 sequences into 82 clusters at threshold 70


In [11]:
df_human.keywords_transport.value_counts()

Amino-acid transport    48
Sugar transport         34
Name: keywords_transport, dtype: int64

In [12]:
df_human_paac = calculate_paac(df_human.sequence)

In [13]:
labels = df_human.keywords_transport
labels.value_counts()

Amino-acid transport    48
Sugar transport         34
Name: keywords_transport, dtype: int64

In [14]:
X_human, y_human = preprocess_pandas(df_human_paac, labels)

In [15]:
get_confusion_matrix(X_test=X_human, y_test=y_human, clf=best_estimator_at, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,4,44
Sugar transport,0,34


In [16]:
get_classification_report(
    X_test=X_human, y_test=y_human, clf=best_estimator_at, labels=labels
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,0.083,0.154,48
Sugar transport,0.436,1.0,0.607,34
macro avg,0.718,0.542,0.38,82
weighted avg,0.766,0.463,0.342,82


# Testing on E Coli

In [17]:
df_ecoli = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    outliers=['P76773', 'Q47706', 'P64550', 'P02943', 'P75733', 'P69856'],
    verbose=True,
    tax_ids_filter=[83333],
    output_log=LOG_FILE,
    sequence_clustering=70
)

cd-hit: clustered 99 sequences into 98 clusters at threshold 70


In [18]:
df_ecoli_paac = calculate_paac(df_ecoli.sequence)

In [19]:
X_ecoli, y_ecoli = preprocess_pandas(df_ecoli_paac, labels=df_ecoli.keywords_transport)

In [20]:
get_confusion_matrix(X_test=X_ecoli, y_test=y_ecoli, clf=best_estimator_at, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,1,50
Sugar transport,2,45


In [21]:
get_classification_report(
    X_test=X_ecoli, y_test=y_ecoli, clf=best_estimator_at, labels=labels
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.333,0.02,0.037,51
Sugar transport,0.474,0.957,0.634,47
macro avg,0.404,0.489,0.335,98
weighted avg,0.401,0.469,0.323,98


# Testing on Yeast

In [22]:
df_yeast = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[559292],
    output_log=LOG_FILE,
    sequence_clustering=70,
)

cd-hit: clustered 64 sequences into 51 clusters at threshold 70


In [23]:
df_yeast_paac = calculate_paac(df_yeast.sequence)

In [24]:
X_yeast, y_yeast = preprocess_pandas(df_yeast_paac, labels=df_yeast.keywords_transport)

In [25]:
get_confusion_matrix(X_test=X_yeast, y_test=y_yeast, clf=best_estimator_at, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,8,26
Sugar transport,0,17


In [26]:
get_classification_report(
    X_test=X_ecoli, y_test=y_ecoli, clf=best_estimator_at, labels=labels
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.333,0.02,0.037,51
Sugar transport,0.474,0.957,0.634,47
macro avg,0.404,0.489,0.335,98
weighted avg,0.401,0.469,0.323,98


## Conclusion

As with AAC, the models are not transferrable between organisms. This is because the models show very different rankings for the most important feature regarding the classificaiton task.