In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from subpred.transporter_dataset import create_dataset
import re

In [2]:
# df = create_dataset(
#     keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
#     keywords_component_filter=["Membrane"],
#     keywords_transport_filter=["Transport"],
#     input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
#     multi_substrate="remove",
#     outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
#     verbose=True,
#     tax_ids_filter=[3702],
#     output_log="../logs/athaliana_sugar_amino.log",
#     sequence_clustering=70
# )

In [5]:
def read_raw(input_file: str, force_update: bool = False):
    input_path = Path(input_file)
    pickle_path = Path(input_path.parent, input_path.name + ".pkl")
    print(pickle_path)
    if pickle_path.exists() and not force_update:
        print("Found pickle, reading...")
        df = pd.read_pickle(pickle_path)
    else:
        print("Reading text file...")
        if not force_update:
            print("Did not find pickle, creating new version...")
        else:
            print("Overwriting existing pickle...")
        df = pd.read_table(input_file, index_col=0, dtype=str)
        df.to_pickle(pickle_path)
    return df

In [6]:
old_file = "../data/raw/swissprot/uniprot-reviewed_yes.tab.gz"
new_file = "../data/raw/swissprot/uniprot_data_2022_04.tab.gz"

In [7]:
df_old = read_raw(old_file)

../data/raw/swissprot/uniprot-reviewed_yes.tab.gz.pkl
Found pickle, reading...


In [8]:
df_new = read_raw(new_file)

../data/raw/swissprot/uniprot_data_2022_04.tab.gz.pkl
Found pickle, reading...


In [66]:
def get_keywords_df(df: pd.DataFrame, use_keyword_names: bool = True):
    srs_keywords = (
        df.Keywords.str.split(";").explode().str.strip()
        if use_keyword_names
        else df.keyword_ids.str.split(";").explode().str.strip()
    )

    df_keywords = srs_keywords.to_frame(name="keyword").reset_index(drop=False)
    df_keywords = df_keywords[~df_keywords.keyword.isnull()]

    return df_keywords

In [117]:
keywords_substrate_filter=["Amino-acid transport", "Sugar transport"]
keywords_keep=["Transmembrane", "Transport"]
keywords_remove=["Transmembrane"]
# keywords_transport_filter=[],

In [111]:
df_keywords = get_keywords_df(df_new)

In [125]:
# accessions_or = df_keywords[df_keywords.keyword.isin(set(keywords_substrate_filter))].Entry.tolist()
# df_new.loc[accessions_or]

In [113]:
accessions_or = df_keywords[df_keywords.keyword.isin(set(keywords_substrate_filter))].Entry.tolist()
df_new.loc[accessions_or]

filter for keywords, then check how many matches exist for each protein
keyword_matches = df_keywords[df_keywords.keyword.isin(set(keywords_keep))].groupby("Entry").apply(len)
accessions_and = keyword_matches[keyword_matches == len(keywords_keep)].index

df_new.loc[accessions_and]

In [127]:
from subpred.dataset import create_dataset

outliers = (
        ["Q9HBR0", "Q07837"]
        + ["P76773", "Q47706", "P02943", "P75733", "P69856", "P64550"]
        + ["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"]
    )
df = create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Transmembrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="integrate",
    verbose=True,
    tax_ids_filter=[3702, 9606, 83333, 559292],
    outliers=outliers,
    sequence_clustering=70,
    evidence_code=2
    # force_update=True
)

../data/raw/swissprot/uniprot-reviewed_yes.tab.gz.pkl
Found pickle, reading...
cd-hit: clustered 413 sequences into 347 clusters at threshold 70


See ticktick for commands

Goals: 
- Faster reading through pickle
- No hardcoding
- More flexibility: filter for any keyword/go term
- Result: Cleaned dataframe with class labels and annotations


In [9]:
table_new_test = pd.read_table("~/up_newdata_test.tsv.gz", index_col=0, dtype=str)

In [None]:
table_new_test.columns

Index(['Gene Names', 'Protein names', 'Organism', 'Organism (ID)', 'Keywords',
       'Keyword ID', 'Gene Ontology (biological process)',
       'Gene Ontology (cellular component)',
       'Gene Ontology (molecular function)', 'Gene Ontology IDs',
       'Protein existence', 'Fragment', 'Sequence',
       'Subcellular location [CC]', 'Intramembrane', 'Topological domain',
       'Transmembrane', 'Protein families', 'TCDB', 'Function [CC]',
       'Active site', 'Binding site', 'Kinetics'],
      dtype='object')

In [None]:
table_new_test["TCDB"].str.split(";").explode()

Entry
A0A024B7W1    NaN
A0A024SC78    NaN
A0A024SH76    NaN
A0A026W182    NaN
A0A044RE18    NaN
             ... 
W6JGV7        NaN
W6JHZ8        NaN
W6JQK2        NaN
X6R8D5        NaN
X6R8R1        NaN
Name: TCDB, Length: 576525, dtype: object

In [None]:



outliers = ["A0A0B5AC19"]
not_outlier_mask = ~table_new_test.index.isin(set(outliers))


In [None]:
table_new_test[table_new_test["Gene Ontology IDs"] == ""].index

Index(['A0A0B5AC19', 'A0A6M7H989', 'A0JP26', 'A4D161', 'A6H8Z2', 'A6NEM1',
       'M1MR49', 'O34860', 'P0CG20', 'P0DV56',
       ...
       'Q9ZW38', 'Q9ZWX5', 'U3H040', 'U3H0A9', 'V5QPS4', 'W5X2N3', 'W6JHZ8',
       'W6JQK2', 'X6R8D5', 'X6R8R1'],
      dtype='object', name='Entry', length=24076)

In [None]:
table_new_test[table_new_test["Keywords"] == ""].index

Index(['P0DUE9', 'Q6U2P6', 'A0A023PZI1', 'A0A023PZL7', 'A0A0H2XI17',
       'A0A0H3PIP6', 'A0A0U2WCB2', 'A0A140JWS6', 'A0A1X9IRT6', 'A0A2U8U2L2',
       ...
       'Q9ZKF3', 'Q9ZKH6', 'Q9ZKQ6', 'Q9ZL19', 'Q9ZLU4', 'Q9ZLX9', 'Q9ZM67',
       'Q9ZMG1', 'Q9ZWX5', 'W5X2N3'],
      dtype='object', name='Entry', length=8334)

In [None]:
table_new_test.columns

Index(['Gene Names', 'Protein names', 'Organism', 'Organism (ID)', 'Keywords',
       'Keyword ID', 'Gene Ontology (biological process)',
       'Gene Ontology (cellular component)',
       'Gene Ontology (molecular function)', 'Gene Ontology IDs',
       'Protein existence', 'Fragment', 'Sequence',
       'Subcellular location [CC]', 'Intramembrane', 'Topological domain',
       'Transmembrane', 'Protein families', 'TCDB', 'Function [CC]',
       'Active site', 'Binding site', 'Kinetics'],
      dtype='object')

In [None]:
table_new_test[table_new_test.Sequence.str.contains("X")].Sequence

Entry
A1YES6    MPKRGKKGAVAEDGDELKTEPEAKKSKTAAKKNDKEAAGEGPALYE...
A1Z651    MGQTVTTPLSLTLQHWGDVQRIASNQSVDVKKRRWVTFCSAEWPTF...
A7X8B3    MTELKAKGPRAPHVAGGPPSPEVGSPLLCRPAAGPFPGSQTSDTLP...
B2LT61    MPRALWTAWVWAXIILSTEGASDQASSLSCDSTGVCDGHSRSLNSI...
B5T267    MPRALWTAWVWAVIILSTEGASDQASSLSCDPTGVCDGHSRSLNSI...
                                ...                        
Q9R4P3                                  AEKKTVTVKQTGSPIXXXK
Q9R4P4                                TKIADLRSQTVDQLSDXLXKL
Q9R4P6                                      VQIFVXDNNVDQALK
Q9R5V8                                  MKATELREKSAQQLNXQLL
Q9R5V9                             ANTVKVTLIKSTNGRLANHKAXVK
Name: Sequence, Length: 2266, dtype: object

In [None]:
table_new_test.loc["A1YES6","Sequence"]

'MPKRGKKGAVAEDGDELKTEPEAKKSKTAAKKNDKEAAGEGPALYEDPPDQKTSPSGKPATLKICSWNVDGLRAWIKKKGLDWVKEEAPDILCLQETKCSENKLPAELQELPGLSYQYWSAPXXKEGYSGVGLLSRQCPLKVSYGIGEEEHDQEGRVIVAEFDSFVLVTAYVPNAGRGLVRLEYRQRWDEAFRRFLKGLASRKPLVLCGDLNVAHEEIDLRNPKGNKKNAGFTPQERQGFGELLQAVPLADSFRHLYPNTPYAYTFWTYMMNARSKNVGWRLDYFLLSHSLLPALCDSKIRSKALGSDHCPITLYLAL'

In [None]:
# table_new_test['Transmembrane'].unique()[0]

table_new_test.Sequence.str.replace(re.compile("[^ACDEFGHIKLMNPQRSTVWY]+"),"")["A1YES6"]

'MPKRGKKGAVAEDGDELKTEPEAKKSKTAAKKNDKEAAGEGPALYEDPPDQKTSPSGKPATLKICSWNVDGLRAWIKKKGLDWVKEEAPDILCLQETKCSENKLPAELQELPGLSYQYWSAPKEGYSGVGLLSRQCPLKVSYGIGEEEHDQEGRVIVAEFDSFVLVTAYVPNAGRGLVRLEYRQRWDEAFRRFLKGLASRKPLVLCGDLNVAHEEIDLRNPKGNKKNAGFTPQERQGFGELLQAVPLADSFRHLYPNTPYAYTFWTYMMNARSKNVGWRLDYFLLSHSLLPALCDSKIRSKALGSDHCPITLYLAL'

Plan: 

Input: Custom Uniprot download with many columns
Output: 
    - Machine learning training dataset
        - Sequence
        - Labels
        - Filtered by labels

In [166]:
import re

"""
1. Cleanup
1.1 Drop/rename cols
2. Filtering
3. Descriptions/Annotations
"""
# TODO use go terms or keywords or both
# TODO doc string
"""
Cleanup steps:
- Replace NaN with ""
- Remove trailing ; from TCDB (pay attention to multiple TCDB ids)
- Remove amino acid types that describe multiple AAs (X,U), or the entire protein?
- No gene names
- Protein existence proof
- Sequence fragments
- Tax ids
- TODO No keywords/GO terms
Filters:
- Pre-determined outliers
- Transport annotations
- Location annotations
- Substrate class annotations
- Multi-substrate handling
- Clustering with cd-hit as last step
- TODO Re-check script
"""


class UniprotReader:
    def __init__(
        self,
        raw_file_name: str = "~/up_newdata_test.tsv.gz",
        force_update: bool = False,
    ):
        df_raw = self.read_raw(input_file=raw_file_name, force_update=force_update)
        df_raw = self.parse_table(df_raw)
        print("done")
        self.go_df = self.create_go_df(df_raw)
        self.kw_df = self.create_keywords_df(df_raw.Keywords, df_raw["Keyword ID"])
        pass

    def parse_table(self, df_raw: pd.DataFrame):
        # 1: protein level, 2: transcript 3: inferred from homology, 4: Predicted, 5: Uncertain
        evidence_code = 1
        organism_ids = [9606, 3702, 83333, 559292]

        df_raw = df_raw.fillna("")

        gene_names_exist = df_raw["Gene Names"] != ""
        protein_names_exist = df_raw["Protein names"] != ""
        not_sequence_fragment = df_raw["Fragment"] != "fragment"

        assert evidence_code > 0 and evidence_code <= 5
        evidence_levels_filter = [
            "Evidence at protein level",
            "Evidence at transcript level",
            "Inferred from homology",
            "Predicted",
            "Uncertain",
        ][:evidence_code]

        protein_existence_mask = df_raw["Protein existence"].isin(
            set(evidence_levels_filter)
        )

        natural_amino_acids = re.compile("[ACDEFGHIKLMNPQRSTVWY]+")
        valid_amino_acids = df_raw.Sequence.str.match(natural_amino_acids)

        df_raw["Organism (ID)"] = df_raw["Organism (ID)"].astype(int)
        correct_organism_mask = df_raw["Organism (ID)"].isin(set(organism_ids))

        df_raw = df_raw[
            (
                gene_names_exist
                & protein_names_exist
                & not_sequence_fragment
                & protein_existence_mask
                & valid_amino_acids
                & correct_organism_mask
            )
        ]
        df_raw = df_raw.drop(["Fragment"], axis=1)
        return df_raw

    def get_dataset():

        pass

    def read_raw(self, input_file: str, force_update: bool = False):
        input_path = Path(input_file)
        pickle_path = Path(input_path.parent, input_path.name + ".pkl")
        print(pickle_path)
        if pickle_path.exists() and not force_update:
            print("Found pickle, reading...")
            df = pd.read_pickle(pickle_path)
        else:
            print("Reading text file...")
            if not force_update:
                print("Did not find pickle, creating new version...")
            else:
                print("Overwriting existing pickle...")
            df = pd.read_table(input_file, index_col=0, dtype=str)
            df.to_pickle(pickle_path)
        return df

    def create_keywords_df(self, keywords: pd.Series, keyword_ids: pd.Series):
        keywords = keywords.astype(str)
        srs_keywords = keywords.str.split(";").explode()
        srs_keyword_ids = keyword_ids.str.split(";").explode()

        df_keywords = pd.concat([srs_keywords, srs_keyword_ids], axis=1)
        df_keywords.columns = ["keyword", "keyword_id"]

        return df_keywords

    def __parse_go_column(self, go_column: pd.Series, ontology_name: str):
        go_column = go_column.astype(str)
        go_id_pattern = re.compile("\[(GO\:[0-9]{7})\]")

        srs_go_long = go_column.str.split(";").explode()

        srs_go_long_ids = srs_go_long.str.extract(go_id_pattern)
        srs_go_long_terms = srs_go_long.str.replace(go_id_pattern, "").str.strip()

        df_go = pd.concat([srs_go_long_ids, srs_go_long_terms], axis=1)
        df_go.columns = ["go_id", "go_term"]
        df_go["ontology"] = ontology_name

        return df_go

    def create_go_df(self, df: pd.DataFrame):
        return pd.concat(
            [
                self.__parse_go_column(df["Gene Ontology (biological process)"], "bp"),
                self.__parse_go_column(df["Gene Ontology (molecular function)"], "mf"),
                self.__parse_go_column(df["Gene Ontology (cellular component)"], "cc"),
            ]
        )


# df_go = create_go_df(table_new_test)


In [167]:
upr = UniprotReader("/home/ad/up_newdata_test.tsv.gz",force_update=False)

/home/ad/up_newdata_test.tsv.gz.pkl
Found pickle, reading...
done


In [None]:
upr

In [23]:
upr = UniprotReader(force_update=True)

Reading text file...
Overwriting existing pickle...


In [14]:
df_go[df_go.go_term.str.contains("membrane")].go_term.unique().tolist()

['plasma membrane fusion',
 'cation transmembrane transport',
 'regulation of membrane potential',
 'fusion of virus membrane with host endosome membrane',
 'Golgi to plasma membrane protein transport',
 'Golgi to plasma membrane transport',
 'xenobiotic transmembrane transport',
 'import across plasma membrane',
 'inorganic cation transmembrane transport',
 'iron ion transmembrane transport',
 'manganese ion transmembrane transport',
 'zinc ion import across plasma membrane',
 'zinc ion transmembrane transport',
 'chloride transmembrane transport',
 'potassium ion import across plasma membrane',
 'sodium ion transmembrane transport',
 'calcium ion transmembrane transport',
 'heme transmembrane transport',
 'calcium ion import across plasma membrane',
 'thylakoid membrane organization',
 'ion transmembrane transport',
 'endoplasmic reticulum tubular network membrane organization',
 'vesicle-mediated transport to the plasma membrane',
 'protein localization to plasma membrane',
 'endopl

In [15]:
string = table_new_transport.loc["P0AAM1"    ,'Gene Ontology (cellular component)']
elements = string.split(";")
print(elements)



[re.search(go_id_pattern,element).group(0) for element in elements]
[re.sub("\[|\]","",re.search(go_id_pattern,element).group(0)) for element in elements]

# [re.findall(go_id_pattern,element)[0] for element in elements]
# re.sub("\[GO\:[0-9]{7}\]","", string).split(";")
# # re.match("\[GO\:[0-9]{7}\]", string)

# re.sub("\[GO\:[0-9]{7}\]","", string).split(";")


NameError: name 'table_new_transport' is not defined