In [1]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from subpred.transporter_dataset import create_dataset
from subpred.cdhit import cd_hit
from subpred.eval import (
    optimize_hyperparams,
    preprocess_pandas,
    get_confusion_matrix,
    get_classification_report,
    get_independent_test_set,
)
from subpred.compositions import calculate_aac
from subpred.pssm import calculate_pssms_notebook

# Log file handed to every create_dataset call in this notebook via output_log.
LOG_FILE = "../logs/cross_organism_amino_sugar_aac.log"

# Training on the A. thaliana dataset

In [2]:
# A. thaliana (taxonomy id 3702) amino-acid vs. sugar transporters.
# Proteins annotated with more than one substrate are removed here
# (multi_substrate="remove"); sequences are clustered with cd-hit at threshold 70.
df_at = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    tax_ids_filter=[3702],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="remove",
    # Accessions excluded by hand from the training set.
    outliers=[
        "O81775",
        "Q9SW07",
        "Q9FHH5",
        "Q8S8A0",
        "Q3E965",
        "Q3EAV6",
        "Q3E8L0",
    ],
    sequence_clustering=70,
    verbose=True,
    output_log=LOG_FILE,
)

cd-hit: clustered 165 sequences into 117 clusters at threshold 70


In [3]:
# AAC feature table computed from the A. thaliana protein sequences.
df_at_aac = calculate_aac(df_at["sequence"])

In [4]:
# Turn the AAC frame and the substrate keyword column into (X, y) for the model.
X_at, y_at = preprocess_pandas(df_at_aac, labels=df_at["keywords_transport"])

In [5]:
# Hyperparameter search on the A. thaliana data; the call prints the best
# parameters and score, and the bare expression below shows the search object.
gsearch_at = optimize_hyperparams(X_at, y_at)
gsearch_at

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.839


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': [1, 0.1, 10],
                         'svc__class_weight': ['balanced', None],
                         'svc__gamma': ['scale', 0.01, 0.1, 1]},
             return_train_score=True, scoring='f1_macro')

In [6]:
# Best pipeline found by the A. thaliana grid search; reused below as the
# fixed model for all cross-organism tests.
best_estimator_at = gsearch_at.best_estimator_

In [7]:
# Sanity check on the training organism itself: the model is scored on the same
# samples the search was run on, so this is not a generalization estimate.
get_confusion_matrix(
    X_test=X_at,
    y_test=y_at,
    clf=best_estimator_at,
    labels=df_at["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,33,0
Sugar transport,2,82


# Testing on human dataset

In [8]:
# Human (taxonomy id 9606) amino-acid vs. sugar transporters.
# NOTE(review): multi_substrate is "integrate" here but "remove" for the
# A. thaliana training set — confirm the difference is intentional.
df_human = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    tax_ids_filter=[9606],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    # Accessions excluded by hand.
    outliers=["Q9HBR0", "Q07837"],
    sequence_clustering=70,
    verbose=True,
    output_log=LOG_FILE,
)

cd-hit: clustered 87 sequences into 82 clusters at threshold 70


In [9]:
# Class balance of the human dataset after clustering.
df_human["keywords_transport"].value_counts()

Amino-acid transport    48
Sugar transport         34
Name: keywords_transport, dtype: int64

In [10]:
# AAC feature table computed from the human protein sequences.
df_human_aac = calculate_aac(df_human["sequence"])

In [11]:
# `labels` holds the HUMAN substrate keywords; later cells that reuse this name
# are therefore reading human labels.
labels = df_human["keywords_transport"]
labels.value_counts()

Amino-acid transport    48
Sugar transport         34
Name: keywords_transport, dtype: int64

In [12]:
# Human features and targets in the form expected by the estimator.
X_human, y_human = preprocess_pandas(
    df_human_aac,
    labels=labels,
)

In [13]:
# Cross-organism test: A. thaliana-trained model applied to the human set.
get_confusion_matrix(
    X_test=X_human,
    y_test=y_human,
    clf=best_estimator_at,
    labels=labels,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,20,28
Sugar transport,5,29


In [14]:
# Per-class precision/recall/F1 of the A. thaliana-trained model on human data.
get_classification_report(
    X_test=X_human,
    y_test=y_human,
    clf=best_estimator_at,
    labels=labels,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.8,0.417,0.548,48
Sugar transport,0.509,0.853,0.637,34
macro avg,0.654,0.635,0.593,82
weighted avg,0.679,0.598,0.585,82


# Testing on E. coli

In [16]:
# E. coli (taxonomy id 83333) amino-acid vs. sugar transporters.
# NOTE(review): the component filter is "Transmembrane" here, while the other
# organisms in this notebook use "Membrane" — confirm this is deliberate.
df_ecoli = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    tax_ids_filter=[83333],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    # Accessions excluded by hand.
    outliers=["P76773", "Q47706", "P64550", "P02943", "P75733", "P69856"],
    sequence_clustering=70,
    verbose=True,
    output_log=LOG_FILE,
)

cd-hit: clustered 99 sequences into 98 clusters at threshold 70


In [17]:
# AAC feature table computed from the E. coli protein sequences.
df_ecoli_aac = calculate_aac(df_ecoli["sequence"])

In [18]:
# E. coli features and targets in the form expected by the estimator.
X_ecoli, y_ecoli = preprocess_pandas(
    df_ecoli_aac,
    labels=df_ecoli["keywords_transport"],
)

In [19]:
# Cross-organism test: A. thaliana-trained model applied to the E. coli set.
# Class labels are taken from the E. coli frame itself instead of the `labels`
# variable, which holds the human keywords; the cell no longer depends on state
# left over from the human section.
get_confusion_matrix(
    X_test=X_ecoli,
    y_test=y_ecoli,
    clf=best_estimator_at,
    labels=df_ecoli.keywords_transport,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,14,37
Sugar transport,2,45


In [20]:
# Per-class precision/recall/F1 of the A. thaliana-trained model on E. coli.
# Labels come from the E. coli frame rather than the human `labels` variable,
# so the report is tied to the dataset actually being evaluated.
get_classification_report(
    X_test=X_ecoli,
    y_test=y_ecoli,
    clf=best_estimator_at,
    labels=df_ecoli.keywords_transport,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.875,0.275,0.418,51
Sugar transport,0.549,0.957,0.698,47
macro avg,0.712,0.616,0.558,98
weighted avg,0.719,0.602,0.552,98


# Testing on Yeast

In [21]:
# Yeast (taxonomy id 559292) amino-acid vs. sugar transporters.
# No manual outlier list is passed for this organism.
df_yeast = create_dataset(
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    tax_ids_filter=[559292],
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    multi_substrate="integrate",
    sequence_clustering=70,
    verbose=True,
    output_log=LOG_FILE,
)

cd-hit: clustered 64 sequences into 51 clusters at threshold 70


In [22]:
# AAC feature table computed from the yeast protein sequences.
df_yeast_aac = calculate_aac(df_yeast["sequence"])

In [23]:
# Yeast features and targets in the form expected by the estimator.
X_yeast, y_yeast = preprocess_pandas(
    df_yeast_aac,
    labels=df_yeast["keywords_transport"],
)

In [24]:
# Cross-organism test: A. thaliana-trained model applied to the yeast set.
# Class labels are taken from the yeast frame itself instead of the `labels`
# variable, which holds the human keywords; the cell no longer depends on state
# left over from the human section.
get_confusion_matrix(
    X_test=X_yeast,
    y_test=y_yeast,
    clf=best_estimator_at,
    labels=df_yeast.keywords_transport,
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,10,24
Sugar transport,3,14


In [26]:
# Per-class precision/recall/F1 of the A. thaliana-trained model on yeast.
# Labels come from the yeast frame rather than the human `labels` variable,
# so the report is tied to the dataset actually being evaluated.
get_classification_report(
    X_test=X_yeast,
    y_test=y_yeast,
    clf=best_estimator_at,
    labels=df_yeast.keywords_transport,
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.769,0.294,0.426,34
Sugar transport,0.368,0.824,0.509,17
macro avg,0.569,0.559,0.467,51
weighted avg,0.636,0.471,0.453,51


# Training on E. coli

In [27]:
# Hyperparameter search with E. coli as the training organism; the call prints
# the best parameters and the best score.
gsearch_ecoli = optimize_hyperparams(X_ecoli, y_ecoli)

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.825


In [28]:
# Best pipeline from the E. coli search, used for the evaluations that follow.
best_estimator_ecoli = gsearch_ecoli.best_estimator_

In [29]:
# E. coli model scored on its own training organism (not a held-out estimate).
get_confusion_matrix(
    X_test=X_ecoli,
    y_test=y_ecoli,
    clf=best_estimator_ecoli,
    labels=df_ecoli["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,51,0
Sugar transport,3,44


In [30]:
# E. coli model applied to the human dataset.
get_confusion_matrix(
    X_test=X_human,
    y_test=y_human,
    clf=best_estimator_ecoli,
    labels=df_human["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,28,20
Sugar transport,26,8


In [31]:
# E. coli model applied to the yeast dataset.
get_confusion_matrix(
    X_test=X_yeast,
    y_test=y_yeast,
    clf=best_estimator_ecoli,
    labels=df_yeast["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,24,10
Sugar transport,8,9


In [32]:
# E. coli model applied to the A. thaliana dataset.
get_confusion_matrix(
    X_test=X_at,
    y_test=y_at,
    clf=best_estimator_ecoli,
    labels=df_at["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,28,5
Sugar transport,56,28


# Training on Human

In [33]:
# Hyperparameter search with human as the training organism; the call prints
# the best parameters and the best score.
gsearch_human = optimize_hyperparams(X_human, y_human)
# Best pipeline from the human search, used for the evaluations that follow.
best_estimator_human = gsearch_human.best_estimator_

{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.624


In [34]:
# Human model scored on its own training organism (not a held-out estimate).
get_confusion_matrix(
    X_test=X_human,
    y_test=y_human,
    clf=best_estimator_human,
    labels=df_human["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,45,3
Sugar transport,2,32


In [35]:
# Human model applied to the yeast dataset.
get_confusion_matrix(
    X_test=X_yeast,
    y_test=y_yeast,
    clf=best_estimator_human,
    labels=df_yeast["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,15,19
Sugar transport,8,9


In [36]:
# Human model applied to the A. thaliana dataset.
get_confusion_matrix(
    X_test=X_at,
    y_test=y_at,
    clf=best_estimator_human,
    labels=df_at["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,25,8
Sugar transport,26,58


In [37]:
# Human model applied to the E. coli dataset.
get_confusion_matrix(
    X_test=X_ecoli,
    y_test=y_ecoli,
    clf=best_estimator_human,
    labels=df_ecoli["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,17,34
Sugar transport,11,36


# Training on Yeast

In [38]:
# Hyperparameter search with yeast as the training organism; the call prints
# the best parameters and the best score.
gsearch_yeast = optimize_hyperparams(X_yeast, y_yeast)
# Best pipeline from the yeast search, used for the evaluations that follow.
best_estimator_yeast = gsearch_yeast.best_estimator_

{'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.821


In [39]:
# Yeast model scored on its own training organism (not a held-out estimate).
get_confusion_matrix(
    X_test=X_yeast,
    y_test=y_yeast,
    clf=best_estimator_yeast,
    labels=df_yeast["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,34,0
Sugar transport,0,17


In [40]:
# Yeast model applied to the human dataset.
get_confusion_matrix(
    X_test=X_human,
    y_test=y_human,
    clf=best_estimator_yeast,
    labels=df_human["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,42,6
Sugar transport,29,5


In [41]:
# Yeast model applied to the A. thaliana dataset.
get_confusion_matrix(
    X_test=X_at,
    y_test=y_at,
    clf=best_estimator_yeast,
    labels=df_at["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,31,2
Sugar transport,61,23


In [42]:
# Yeast model applied to the E. coli dataset.
get_confusion_matrix(
    X_test=X_ecoli,
    y_test=y_ecoli,
    clf=best_estimator_yeast,
    labels=df_ecoli["keywords_transport"],
)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,51,0
Sugar transport,47,0


## Conclusion

Models trained on one organism using the AAC feature cannot reliably classify sugar and amino-acid transporters from other organisms. This result makes sense, considering that individual amino acids often had vastly different feature importances between organisms. For sugar vs. amino-acid transport, the substrate specificity seems to be determined by different residue types in different organisms.