- Create a long-format table with the columns: accession, feature type, feature column name, feature value, and feature generation parameters (as JSON).
- Write a transformer that can read the correct features into the pipeline.
- Alternative: a feature-generation transformer that takes sequences and transforms them automatically. Then use `SelectorMixin` to select the features?

In [1]:
from subpred.transporter_dataset import create_dataset
from subpred.feature_generator import FeatureGenerator
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from collections import OrderedDict

In [2]:
# Assemble the transporter dataset that the rest of the notebook works on.
df = create_dataset(
    # Input table and processing log. The paths suggest a reviewed
    # Swiss-Prot export and an A. thaliana sugar/amino-acid run.
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    output_log="../logs/athaliana_sugar_amino.log",
    verbose=True,
    # Organism filter: taxonomy id 3702 (presumably A. thaliana, matching
    # the log file name; TODO confirm).
    tax_ids_filter=[3702],
    # Keyword filters for substrate, cellular component and transport.
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    # Clean-up: entries with several substrates are handled with "remove",
    # and the listed accessions are passed as outliers.
    multi_substrate="remove",
    outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
    # cd-hit reports clustering "at threshold 70" for this setting
    # (see the cell output).
    sequence_clustering=70,
)

cd-hit: clustered 165 sequences into 117 clusters at threshold 70


In [3]:
# Build the feature table from the dataset's `sequence` column.
fg = FeatureGenerator()
df_features = fg.generate_features(df.sequence)

# Display only the columns whose name starts with "AAC"
# (this prefix does not match the "PAAC" columns).
is_aac_column = df_features.columns.str.startswith("AAC")
df_features.loc[:, is_aac_column]

Unnamed: 0_level_0,AAC__A,AAC__C,AAC__D,AAC__E,AAC__F,AAC__G,AAC__H,AAC__I,AAC__K,AAC__L,AAC__M,AAC__N,AAC__P,AAC__Q,AAC__R,AAC__S,AAC__T,AAC__V,AAC__W,AAC__Y
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Q9SFG0,0.090730,0.015779,0.033531,0.035503,0.086785,0.088757,0.015779,0.092702,0.033531,0.104536,0.033531,0.033531,0.039448,0.029586,0.039448,0.055227,0.041420,0.086785,0.017751,0.025641
Q84WN3,0.066390,0.004149,0.020747,0.053942,0.070539,0.074689,0.008299,0.070539,0.041494,0.120332,0.029046,0.033195,0.041494,0.016598,0.029046,0.087137,0.058091,0.103734,0.016598,0.053942
O04249,0.079922,0.013645,0.017544,0.042885,0.066277,0.107212,0.011696,0.076023,0.035088,0.120858,0.031189,0.038986,0.035088,0.035088,0.037037,0.064327,0.058480,0.081871,0.017544,0.029240
Q56ZZ7,0.128205,0.009158,0.020147,0.038462,0.051282,0.106227,0.003663,0.058608,0.031136,0.120879,0.020147,0.018315,0.027473,0.027473,0.040293,0.115385,0.051282,0.089744,0.014652,0.027473
Q8H184,0.065217,0.012077,0.045894,0.057971,0.084541,0.062802,0.009662,0.050725,0.050725,0.111111,0.038647,0.026570,0.031401,0.028986,0.026570,0.072464,0.079710,0.096618,0.021739,0.026570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q94B65,0.095975,0.009288,0.015480,0.027864,0.055728,0.077399,0.012384,0.068111,0.055728,0.148607,0.043344,0.030960,0.030960,0.021672,0.015480,0.086687,0.065015,0.099071,0.003096,0.037152
Q0WWW9,0.082437,0.010753,0.028674,0.037634,0.066308,0.114695,0.007168,0.060932,0.039427,0.154122,0.014337,0.014337,0.044803,0.026882,0.041219,0.093190,0.041219,0.075269,0.012545,0.034050
Q2V4B9,0.092929,0.020202,0.024242,0.038384,0.062626,0.105051,0.008081,0.050505,0.036364,0.137374,0.026263,0.020202,0.032323,0.020202,0.044444,0.101010,0.050505,0.101010,0.012121,0.016162
Q94EI9,0.082353,0.014706,0.014706,0.041176,0.058824,0.064706,0.011765,0.088235,0.052941,0.132353,0.044118,0.044118,0.029412,0.011765,0.020588,0.073529,0.052941,0.105882,0.017647,0.038235


In [4]:
# # this should be used in combination with a StandardScaler, since the features are not in the same ranges.
# class FeatureCombinator(BaseEstimator, TransformerMixin):
#     # feature_names have the shape "feature_type__feature_name", separated by two underscores.
#     def __init__(self, feature_names: np.array, feature_types: np.array):
#         # grid search sets these fields with the values from the parameter grid, then fit is called.
#         self.feature_names = feature_names
#         self.feature_types = feature_types

#     def __filter_column_names(self):
#         for feature_name in self.feature_names:
#             feature_type = feature_name.split("__")[0]
#             yield feature_type in self.feature_types

#     def fit(self, X: np.array, y: np.array = None):
#         self.mask = list(self.__filter_column_names())
#         return self

#     def transform(self, X: np.array, y: np.array = None):
#         # X[:, mask] would raise on a pandas DataFrame (it needs .loc/.iloc), so convert to an ndarray first
#         if isinstance(X, pd.DataFrame):
#             X = np.array(X)
#         X = X[:, self.mask]
#         return X


In [5]:
# feature_names

In [6]:
from subpred.eval import preprocess_pandas, full_test, get_independent_test_set

# Evaluate using only the feature columns whose name starts with "PSSM_50".
# The cell output is a pair: a table of per-label F1 scores for the train and
# test sets, and a table with the selected hyper-parameters
# (featurecombinator__feature_types and the svc__* settings).
full_test(
    df_features.loc[:, df_features.columns.str.startswith("PSSM_50")],
    df.keywords_transport,
    repetitions=1,
    feature_combination_method="pssm_combinations",
    # Disabled alternative kept for reference: explicit feature type groups.
    # feature_types=[
    #     ["AAC"],
    #     ["PAAC"],
    #     [
    #         "PSSM_50_1",
    #         "PSSM_50_3",
    #         "PSSM_90_1",
    #         "PSSM_90_3",
    #     ],
    # ],
)


(                  label  F1 score dataset
 0       Sugar transport     0.978   train
 1       Sugar transport     0.944    test
 2  Amino-acid transport     0.939   train
 3  Amino-acid transport     0.833    test,
                                             0
 featurecombinator__feature_types  [PSSM_50_3]
 svc__C                                     10
 svc__class_weight                    balanced
 svc__gamma                              scale)

In [7]:
# Same evaluation on the complete feature table. No
# feature_combination_method is passed, so full_test's default is used.
full_test(
    df_features,
    df.keywords_transport,
    repetitions=1,
)

(                  label  F1 score dataset
 0       Sugar transport     0.993   train
 1       Sugar transport     0.919    test
 2  Amino-acid transport     0.980   train
 3  Amino-acid transport     0.727    test,
                                             0
 featurecombinator__feature_types  [PSSM_90_3]
 svc__C                                      1
 svc__class_weight                    balanced
 svc__gamma                              scale)

In [9]:
from subpred.custom_transformers import FeatureCombinator

# Split the feature table and labels into the pieces used below. With
# return_names=True the call also yields the feature and sample names.
# Presumably X and y are NumPy arrays (TODO confirm).
X, y, feature_names, sample_names = preprocess_pandas(
    df_features,
    labels=df.keywords_transport,
    return_names=True,
)

# Transformer configured with the "AAC" and "PSSM_50_3" feature types.
fc = FeatureCombinator(
    feature_names,
    feature_types=["AAC", "PSSM_50_3"],
)

In [10]:
# Fit the combinator, apply it to X and wrap the result in a DataFrame for
# display. The output has 420 columns, and its leading columns match the AAC
# values shown earlier in the notebook.
pd.DataFrame(
    data=fc.fit_transform(X),
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,410,411,412,413,414,415,416,417,418,419
0,0.090730,0.015779,0.033531,0.035503,0.086785,0.088757,0.015779,0.092702,0.033531,0.104536,...,0.549356,0.469957,0.536481,0.643777,0.459227,0.484979,0.484979,0.637339,0.701717,0.508584
1,0.066390,0.004149,0.020747,0.053942,0.070539,0.074689,0.008299,0.070539,0.041494,0.120332,...,0.485000,0.290000,0.490000,0.840000,0.205000,0.385000,0.385000,0.545000,1.000000,0.435000
2,0.079922,0.013645,0.017544,0.042885,0.066277,0.107212,0.011696,0.076023,0.035088,0.120858,...,0.567468,0.495379,0.548983,0.639556,0.493530,0.502773,0.512015,0.606285,0.709797,0.530499
3,0.128205,0.009158,0.020147,0.038462,0.051282,0.106227,0.003663,0.058608,0.031136,0.120879,...,0.530612,0.459184,0.525510,0.664966,0.438776,0.511905,0.510204,0.554422,0.695578,0.511905
4,0.065217,0.012077,0.045894,0.057971,0.084541,0.062802,0.009662,0.050725,0.050725,0.111111,...,0.610592,0.492212,0.626168,0.700935,0.467290,0.538941,0.557632,0.613707,0.722741,0.613707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,0.095975,0.009288,0.015480,0.027864,0.055728,0.077399,0.012384,0.068111,0.055728,0.148607,...,0.598870,0.525424,0.587571,0.751412,0.460452,0.550847,0.545198,0.692090,0.858757,0.584746
113,0.082437,0.010753,0.028674,0.037634,0.066308,0.114695,0.007168,0.060932,0.039427,0.154122,...,0.598837,0.502907,0.590116,0.703488,0.497093,0.555233,0.547965,0.590116,0.725291,0.566860
114,0.092929,0.020202,0.024242,0.038384,0.062626,0.105051,0.008081,0.050505,0.036364,0.137374,...,0.609174,0.565138,0.607339,0.653211,0.541284,0.583486,0.585321,0.596330,0.678899,0.579817
115,0.082353,0.014706,0.014706,0.041176,0.058824,0.064706,0.011765,0.088235,0.052941,0.132353,...,0.644886,0.465909,0.602273,0.741477,0.426136,0.517045,0.525568,0.710227,0.855114,0.585227


In [12]:
from subpred.eval import _get_feature_type_combinations

# Feature type prefixes handed to the combination helper.
feature_types = ["AAC", "PAAC", "PSSM_50_1", "PSSM_50_3", "PSSM_90_1", "PSSM_90_3"]

# In "pssm_combinations" mode the printed list holds every type on its own,
# plus multi-element groups built only from the PSSM types (see the output).
# The other modes below were tried here and are left disabled.
# print(_get_feature_type_combinations("individual"))
# print(_get_feature_type_combinations("all_combinations"))
pssm_combinations = _get_feature_type_combinations("pssm_combinations", feature_types)
print(pssm_combinations)
# print(_get_feature_type_combinations("asddf"))

[['AAC'], ['PAAC'], ['PSSM_50_1'], ['PSSM_50_3'], ['PSSM_90_1'], ['PSSM_90_3'], ['PSSM_50_1', 'PSSM_50_3'], ['PSSM_50_1', 'PSSM_90_1'], ['PSSM_50_1', 'PSSM_90_3'], ['PSSM_50_3', 'PSSM_90_1'], ['PSSM_50_3', 'PSSM_90_3'], ['PSSM_90_1', 'PSSM_90_3'], ['PSSM_50_1', 'PSSM_50_3', 'PSSM_90_1'], ['PSSM_50_1', 'PSSM_50_3', 'PSSM_90_3'], ['PSSM_50_1', 'PSSM_90_1', 'PSSM_90_3'], ['PSSM_50_3', 'PSSM_90_1', 'PSSM_90_3']]
