In [1]:
import os
import sys

# TODO turn this into script, proper debugging!

# from sklearnex import patch_sklearn
# patch_sklearn()
sys.path.append('../src')

import pandas as pd
import numpy as np

from util.fasta import read_fasta
from dataset.transporter_dataset import create_dataset
from dataset.cluster_fasta import cd_hit

In [2]:
LOG_FILE = "../logs/athaliana_amino_sugar.log"
N_THREADS = 16
IDENTITY_THRESHOLD=70

# Delete previous log
if os.path.exists(LOG_FILE):
    with open(LOG_FILE, "w"):
        pass

create_dataset(
    keywords_substrate_filter=["Amino-acid transport", "Sugar transport"],
    keywords_component_filter=["Membrane"],
    keywords_transport_filter=["Transport"],
    input_file="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    multi_substrate="remove",
    outliers=["O81775", "Q9SW07", "Q9FHH5", "Q8S8A0", "Q3E965", "Q3EAV6", "Q3E8L0"],
    verbose=True,
    tax_ids_filter=[3702],
    output_tsv="../data/datasets/athaliana_amino_sugar.tsv",
    output_fasta="../data/datasets/athaliana_amino_sugar.fasta",
    output_log=LOG_FILE,
)

cd_hit(
    executable_location="cd-hit",
    input_fasta=f"../data/datasets/athaliana_amino_sugar.fasta",
    output_fasta=f"../data/datasets/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta",
    log_file=LOG_FILE,
    identity_threshold=IDENTITY_THRESHOLD,
    n_threads=N_THREADS,
    memory=4096,
    verbose=True,
)

In [43]:
from scipy.stats import pearsonr

# Whole Genome
class CoexpWG:
    #################
    # Preprocessing #
    #################

    def __init__(
        self,
        expression_tsv: str,
        go_tsv: str,
        swissprot_tsv: str,
        tax_id: int,
    ):
        df_exp = pd.read_table(expression_tsv, index_col=0)
        df_go_long = pd.read_table(go_tsv)

        proteins_swissprot = self.__get_swissprot_proteins(
            swissprot_tsv=swissprot_tsv, tax_id=tax_id
        )

        proteins_enough_data = (
            set(proteins_swissprot) & set(df_exp.index) & set(df_go_long.Uniprot)
        )
        df_exp = df_exp[df_exp.index.isin(proteins_enough_data)]
        df_go_long = df_go_long[
            df_go_long.Uniprot.isin(proteins_enough_data)
        ].reset_index(drop=True)

        # df_go_long = df_go_long.merge()

        group_sizes_go = df_go_long.groupby("Uniprot").size().rename("n_proteins")

        df_go_long = df_go_long.merge(
            group_sizes_go, left_on="Uniprot", right_index=True
        )

        self.__proteins_whitelist = proteins_enough_data
        self.__df_coexp = df_exp.T.corr()
        self.__df_go_long = df_go_long

    def __get_swissprot_proteins(self, swissprot_tsv: str, tax_id: int):
        df_swissprot = pd.read_table(
            swissprot_tsv,
            index_col=0,
            usecols=["Entry", "Organism ID", "Protein existence", "Fragment"],
        )
        df_swissprot_filtered = df_swissprot.copy()
        df_swissprot_filtered = df_swissprot_filtered[
            (df_swissprot_filtered["Organism ID"] == tax_id)
        ]
        df_swissprot_filtered = df_swissprot_filtered[
            ~df_swissprot_filtered["Protein existence"].isin(["Predicted", "Uncertain"])
        ]
        df_swissprot_filtered = df_swissprot_filtered[
            df_swissprot_filtered.Fragment.isnull()
        ]
        return df_swissprot_filtered.index.values

    #################
    # Calculation   #
    #################

    def get_selected_neighbors(self, accession: str, n_selected: int) -> pd.DataFrame:
        selected_neighbors = self.__df_coexp.loc[accession].sort_values(
            ascending=False
        )[:n_selected]
        return selected_neighbors

    def __get_percentages(
        self, go_profiles: dict, selected_neighbors_go: pd.DataFrame
    ) -> pd.DataFrame:
        records = []
        for label, profile in sorted(go_profiles.items()):
            i = 0
            for neighbor, neighbor_go in selected_neighbors_go.iteritems():
                percentage = (
                    neighbor_go[np.isin(neighbor_go, profile)].size / neighbor_go.size
                )
                records.append([f"{label}_{i}", neighbor, percentage])
                i += 1
        df_percentages = pd.DataFrame.from_records(
            records, columns=["profile", "neighbor", "percentage"]
        )
        return df_percentages

    def __check_inputs(
        self, accession, training_accessions, training_labels, n_selected
    ):
        assert training_accessions.size == training_labels.size
        assert accession not in training_accessions
        assert isinstance(accession, str)
        assert isinstance(training_accessions, np.ndarray)
        assert isinstance(training_labels, np.ndarray)
        assert isinstance(n_selected, int)

    def __percentage_to_aggregate(self, df_percentages: pd.DataFrame) -> pd.Series:
        df_percentages.profile = (
            df_percentages.profile.str.split("_")
            .str[:3]
            .transform(lambda x: "_".join(x))
        )
        df_percentages = df_percentages.groupby("profile").percentage.max()
        df_percentages = df_percentages.reset_index()
        return df_percentages

    def get_feature(
        self,
        accession: str,
        training_accessions: np.ndarray,
        training_labels: np.ndarray,
        # n_neighbors: int,
        n_selected: int,
        threshold: float,
        # method: str = "binary",
        aggregate: bool = False,
        binary: bool = True,
    ):
        self.__check_inputs(accession, training_accessions, training_labels, n_selected)
        if accession not in self.__proteins_whitelist:
            return None

        df_training = pd.DataFrame(training_accessions, columns=["Uniprot"])
        df_training["Label"] = training_labels

        # TODO filter by valid proteins?
        df_training = df_training[df_training.Uniprot.isin(self.__proteins_whitelist)]

        #################
        # Get neighbors #
        #################

        selected_neighbors_go = (
            self.__df_coexp.loc[accession]
            .sort_values(ascending=False)[:n_selected]
            .index.values
        )
        df_training_exp = df_training.copy()
        # TODO bug: protein itself is still in vector
        df_training_exp["neighbor"] = df_training.Uniprot.apply(
            lambda x: self.__df_coexp[x]
            .sort_values(ascending=False)[:n_selected]
            .index.values
        )
        df_training_exp = df_training_exp.explode("neighbor")

        print(df_training_exp)

        #################
        # Get GO terms  #
        #################
        # TODO filter go terms?

        df_accession_neighbors_go = self.__df_go_long[
            self.__df_go_long.Uniprot.isin(selected_neighbors_go)
        ]
        df_training_go = df_training.merge(self.__df_go_long, on="Uniprot", how="inner")

        print(df_training_exp.neighbor.drop_duplicates())
        df_training_neighbors_go = self.__df_go_long.merge(df_training_exp[["Label", "neighbor"]].drop_duplicates())
        print(df_training_neighbors_go)
        # self.__df_go_long[
        #     self.__df_go_long.Uniprot.isin(df_training_exp.neighbor.drop_duplicates())
        # ].reset_index(drop=True)


        ##########################
        # Calculate percentages  #
        ##########################

        # for label in df_training_neighbors_go.labels.unique:
        print(df_training_neighbors_go)

        print(df_accession_neighbors_go[["Uniprot", "go_id"]].groupby("Uniprot").apply(lambda gr: gr.go_id))

        ##########################
        # Aggregate              #
        ##########################

        # TODO refactor
        # TODO ability to choose other methods for encoding than GO terms

        # return result

    def get_features(
        self,
        training_accessions: np.ndarray,
        training_labels: np.ndarray,
        n_selected: int,
        threshold: float,
        aggregate: bool = False,
        binary: bool = True,
    ):
        res = []
        for accession in training_accessions:
            mask = training_accessions != accession
            accessions_subset = training_accessions[mask]
            labels_subset = training_labels[mask]

            feature = self.get_feature(
                accession=accession,
                training_accessions=accessions_subset,
                training_labels=labels_subset,
                n_selected=n_selected,
                threshold=threshold,
                aggregate=aggregate,
                binary=binary,
            )
            if isinstance(feature, pd.Series):
                res.append(feature)
        res = pd.concat(res, axis=1).transpose()
        return res


In [44]:
coexp5 = CoexpWG(
    expression_tsv="../data/intermediate/gene_expression/athaliana/athaliana_columbia_flower.tsv",
    go_tsv="../data/intermediate/gene_ontology/goa_athaliana.tsv",
    swissprot_tsv="../data/raw/swissprot/uniprot-reviewed_yes.tab.gz",
    tax_id=3702,
)

In [45]:
IDENTITY_THRESHOLD=70
def get_training_data(fasta_file: str):
    fasta_data = read_fasta(fasta_file_name=fasta_file)
    fasta_data = [header.split("|")[1:] + [sequence] for header, sequence in fasta_data]
    df_training = pd.DataFrame.from_records(
        fasta_data,
        columns=[
            "Uniprot",
            "Symbols",
            "Taxid",
            "TCDB",
            "Substrate",
            "Description",
            "Sequence",
        ],
        exclude=["Symbols", "Taxid", "TCDB", "Description", "Sequence"],
    )
    return df_training

df_training = get_training_data(fasta_file=f"../data/datasets/athaliana_amino_sugar_cluster{IDENTITY_THRESHOLD}.fasta")
accession = "Q9LUE3"
accession_label = df_training.loc[df_training.Uniprot == accession].Substrate.values
print(accession_label)
training_df = df_training[df_training.Uniprot != accession]
training_accessions = training_df.Uniprot.values
training_labels = training_df.Substrate.values

result = coexp5.get_feature(
    accession=accession,
    training_accessions=training_accessions,
    training_labels=training_labels,
    n_selected=3,
    threshold=0.8,
    aggregate=True,
    binary=False,
)
result

['Sugar transport']
    Uniprot            Label neighbor
0    Q9SFG0  Sugar transport   Q9SFG0
0    Q9SFG0  Sugar transport   O48776
0    Q9SFG0  Sugar transport   Q8W036
1    Q84WN3  Sugar transport   Q84WN3
1    Q84WN3  Sugar transport   Q9XFH9
..      ...              ...      ...
113  Q2V4B9  Sugar transport   Q9FWR2
113  Q2V4B9  Sugar transport   Q8LF05
114  Q94EI9  Sugar transport   Q94EI9
114  Q94EI9  Sugar transport   Q9M1S3
114  Q94EI9  Sugar transport   Q9CAP4

[333 rows x 3 columns]
0      Q9SFG0
0      O48776
0      Q8W036
1      Q84WN3
1      Q9XFH9
        ...  
113    Q9FWR2
113    Q8LF05
114    Q94EI9
114    Q9M1S3
114    Q9CAP4
Name: neighbor, Length: 322, dtype: object


MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False