In [2]:
import sys

from subpred.transporter_dataset import create_dataset
from subpred.eval import (
    optimize_hyperparams,
    preprocess_pandas,
    get_confusion_matrix,
    get_classification_report,
)
from subpred.pssm import calculate_pssms_notebook

LOG_FILE = "../logs/cross_organism_amino_sugar_pssm.log"

# Training on A thaliana dataset

In [3]:
df_at = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="remove",
    outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
    verbose=True,
    tax_ids_filter=[3702],
    output_log=LOG_FILE,
    sequence_clustering=70
)

cd-hit: clustered 165 sequences into 117 clusters at threshold 70


In [4]:
df_at_pssm = calculate_pssms_notebook(df_at.sequence)

df_at_pssm

Unnamed: 0_level_0,AA_50_1,AR_50_1,AN_50_1,AD_50_1,AC_50_1,AQ_50_1,AE_50_1,AG_50_1,AH_50_1,AI_50_1,...,VL_90_3,VK_90_3,VM_90_3,VF_90_3,VP_90_3,VS_90_3,VT_90_3,VW_90_3,VY_90_3,VV_90_3
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9SFG0,0.784223,0.252900,0.327146,0.238979,0.394432,0.350348,0.276102,0.545244,0.227378,0.317865,...,0.434307,0.381387,0.421533,0.578467,0.357664,0.390511,0.392336,0.512774,0.656934,0.417883
Q84WN3,0.664740,0.416185,0.462428,0.427746,0.624277,0.445087,0.456647,0.526012,0.479769,0.543353,...,0.383260,0.264317,0.374449,0.726872,0.215859,0.286344,0.312775,0.493392,0.982379,0.352423
O04249,0.735484,0.286022,0.352688,0.281720,0.479570,0.352688,0.318280,0.531183,0.279570,0.417204,...,0.476898,0.415842,0.471947,0.592409,0.387789,0.415842,0.422442,0.514851,0.702970,0.450495
Q56ZZ7,0.798913,0.173913,0.217391,0.153986,0.389493,0.251812,0.190217,0.481884,0.148551,0.367754,...,0.488636,0.433442,0.472403,0.608766,0.420455,0.461039,0.462662,0.514610,0.657468,0.470779
Q8H184,0.652482,0.308511,0.372340,0.269504,0.510638,0.365248,0.301418,0.457447,0.368794,0.425532,...,0.486772,0.391534,0.473545,0.595238,0.370370,0.417989,0.431217,0.505291,0.679894,0.473545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q94B65,0.814126,0.304833,0.408922,0.271375,0.579926,0.408922,0.349442,0.516729,0.334572,0.539033,...,0.542056,0.489097,0.529595,0.716511,0.426791,0.523364,0.504673,0.613707,0.894081,0.520249
Q0WWW9,0.760736,0.388037,0.424847,0.380368,0.556748,0.412577,0.423313,0.532209,0.375767,0.469325,...,0.544582,0.465021,0.515775,0.632373,0.455418,0.486968,0.482853,0.539095,0.703704,0.514403
Q2V4B9,0.800373,0.294776,0.341418,0.298507,0.468284,0.386194,0.386194,0.537313,0.261194,0.414179,...,0.521151,0.495770,0.510998,0.566836,0.477157,0.502538,0.502538,0.509306,0.602369,0.500846
Q94EI9,0.807471,0.396552,0.425287,0.350575,0.718391,0.465517,0.410920,0.589080,0.393678,0.591954,...,0.469697,0.412121,0.463636,0.660606,0.378788,0.433333,0.445455,0.633333,0.903030,0.463636


In [5]:
X_at, y_at, feature_names, sample_names = preprocess_pandas(
    df_at_pssm, df_at.keywords_transport, return_names=True
)

In [7]:
gsearch_at = optimize_hyperparams(X_at,y_at, feature_transformer="pssm", feature_names=feature_names)
gsearch_at

{'pssmselector__iterations': 3, 'pssmselector__uniref_threshold': 50, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
0.977


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('pssmselector',
                                        PSSMSelector(feature_names=array(['AA_50_1', 'AR_50_1', 'AN_50_1', ..., 'VW_90_3', 'VY_90_3',
       'VV_90_3'], dtype='<U7'))),
                                       ('standardscaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'pssmselector__iterations': [1, 3, 'all'],
                         'pssmselector__uniref_threshold': [50, 90, 'all'],
                         'svc__C': [1, 0.1, 10],
                         'svc__class_weight': ['balanced', None],
                         'svc__gamma': ['scale', 0.01, 0.1, 1]},
             return_train_score=True, scoring='f1_macro')

In [8]:
best_estimator_at = gsearch_at.best_estimator_

best_estimator_at

Pipeline(steps=[('pssmselector',
                 PSSMSelector(feature_names=array(['AA_50_1', 'AR_50_1', 'AN_50_1', ..., 'VW_90_3', 'VY_90_3',
       'VV_90_3'], dtype='<U7'),
                              iterations=3, uniref_threshold=50)),
                ('standardscaler', StandardScaler()),
                ('svc', SVC(C=10, class_weight='balanced'))])

In [9]:
get_confusion_matrix(X_at, y_at, best_estimator_at, labels=df_at.keywords_transport)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,33,0
Sugar transport,0,84


# Testing on human dataset

In [11]:
df_human = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    outliers=["Q9HBR0", "Q07837"],
    tax_ids_filter=[9606],
    output_log=LOG_FILE,
    sequence_clustering=70
)

cd-hit: clustered 87 sequences into 82 clusters at threshold 70


In [12]:
df_human.keywords_transport.value_counts()

Amino-acid transport    48
Sugar transport         34
Name: keywords_transport, dtype: int64

In [13]:
df_human_pssm = calculate_pssms_notebook(df_human.sequence)

In [14]:
labels = df_human.keywords_transport
labels.value_counts()

Amino-acid transport    48
Sugar transport         34
Name: keywords_transport, dtype: int64

In [15]:
X_human, y_human, feature_names, sample_names = preprocess_pandas(
    df_human_pssm, df_human.keywords_transport, return_names=True
)

In [16]:
get_confusion_matrix(X_test=X_human, y_test=y_human, clf=best_estimator_at, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,21,27
Sugar transport,3,31


In [17]:
get_classification_report(
    X_test=X_human, y_test=y_human, clf=best_estimator_at, labels=labels
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,0.875,0.438,0.583,48
Sugar transport,0.534,0.912,0.674,34
macro avg,0.705,0.675,0.629,82
weighted avg,0.734,0.634,0.621,82


# Testing on E Coli

In [19]:
df_ecoli = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    outliers=['P76773', 'Q47706', 'P64550', 'P02943', 'P75733', 'P69856'],
    verbose=True,
    tax_ids_filter=[83333],
    output_log=LOG_FILE,
    sequence_clustering=70
)

cd-hit: clustered 99 sequences into 98 clusters at threshold 70


In [20]:
df_ecoli_pssm = calculate_pssms_notebook(df_ecoli.sequence)

In [21]:
X_ecoli, y_ecoli, feature_names, sample_names = preprocess_pandas(df_ecoli_pssm, labels=df_ecoli.keywords_transport, return_names=True)

In [22]:
get_confusion_matrix(X_test=X_ecoli, y_test=y_ecoli, clf=best_estimator_at, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,8,43
Sugar transport,0,47


In [23]:
get_classification_report(
    X_test=X_ecoli, y_test=y_ecoli, clf=best_estimator_at, labels=labels
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,0.157,0.271,51
Sugar transport,0.522,1.0,0.686,47
macro avg,0.761,0.578,0.479,98
weighted avg,0.771,0.561,0.47,98


# Testing on Yeast

In [24]:
df_yeast = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[559292],
    output_log=LOG_FILE,
    sequence_clustering=70,
)

cd-hit: clustered 64 sequences into 51 clusters at threshold 70


In [25]:
df_yeast_pssm = calculate_pssms_notebook(df_yeast.sequence)

In [26]:
X_yeast, y_yeast= preprocess_pandas(df_yeast_pssm, labels=df_yeast.keywords_transport)

In [27]:
get_confusion_matrix(X_test=X_yeast, y_test=y_yeast, clf=best_estimator_at, labels=labels)

predicted,Amino-acid transport,Sugar transport
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
Amino-acid transport,9,25
Sugar transport,0,17


In [29]:
get_classification_report(
    X_test=X_ecoli, y_test=y_ecoli, clf=best_estimator_at, labels=labels
)

Unnamed: 0,precision,recall,f1-score,support
Amino-acid transport,1.0,0.157,0.271,51
Sugar transport,0.522,1.0,0.686,47
macro avg,0.761,0.578,0.479,98
weighted avg,0.771,0.561,0.47,98


## Conclusion

As with the other two features, transferring PSSM-based models between organisms is not really possible. The decision function seems to be in the wrong position when providing data from another organism.