In [1]:

import sys 
from collections import defaultdict
from itertools import permutations
from datetime import datetime

In [2]:
class StringDB:
    """In-memory lookup table of pairwise scores read from a tab-separated file.

    Each record of the file is expected to hold at least three tab-separated
    columns: first identifier, second identifier, score. Records are stored
    under the key ``"<first> <second>"`` and the score is kept as the raw
    string read from the file.

    Attributes
    ----------
    size : str
        Human readable size of the lookup table, e.g. ``"0.0002 MB"``.
        Computed with ``sys.getsizeof``, which is shallow: it measures the
        dictionary object itself, not the keys and values it references.
    """

    # Interaction codes accepted by ``get_pair_score``.
    _KNOWN_INTERACTION_TYPES = ("pp", "pd", "pr", "rc", "cr", "gl", "pm", "mp")

    def __init__(self, fname):
        db = self._parse_string_db(fname)
        self._db = db
        self.size = "{} MB".format(round(sys.getsizeof(db) / 1024**2, 4))

    def _parse_string_db(self, fname: str) -> dict:
        """Parse the score file into a ``"<id1> <id2>" -> score`` mapping.

        Arguments
        ---------
        fname : str
            Path to the tab-separated score file.

        Returns
        -------
        dict
            A ``defaultdict`` whose missing keys evaluate to ``None``.
            Scores are stored as strings exactly as found in the file.

        Raises
        ------
        IndexError
            If a non-blank record has fewer than three tab-separated columns.
        """
        db_contents = defaultdict(lambda: None)
        with open(fname, "r") as infile:
            # Stream the file instead of loading every line into a list first.
            for record in infile:
                record = record.replace("\n", "")
                if not record.strip():
                    # Blank lines carry no record; skip them instead of crashing.
                    continue
                data = record.split("\t")
                protein_pair = "{} {}".format(data[0], data[1])
                db_contents[protein_pair] = data[2]
        return db_contents

    def get_pair_score(self, locus_name: str, proteins: list, interaction_type="pp") -> list:
        """Query the database for every ordered pair of the given proteins.

        Summary
        -------
        Creates all ordered pairs (permutations of length 2) of the entries in
        ``proteins`` and looks each pair up in the database. Pairs without a
        score are skipped. A pair is also skipped when its reverse has
        already been recorded, so each interaction is reported at most once
        per direction-insensitive pair.

        Arguments
        ---------
        locus_name : str
            Name of the locus the proteins belong to. Accepted for the
            caller's convenience; it is not used by the lookup.

        proteins : list
            Identifiers to pair up. A single identifier passed as a plain
            string is treated as a one-element list (and therefore yields
            no pairs).

        interaction_type : str (choices=["pp", "pd", "pr", "rc", "cr", "gl", "pm", "mp"])
            Interaction type of both genes/molecules. Supported interaction types are:
            - pp: protein - protein interaction
            - pd: protein -> DNA
            - pr: protein -> reaction
            - rc: reaction -> compound
            - cr: compound -> reaction
            - gl: genetic lethal relationship
            - pm: protein-metabolite interaction
            - mp: metabolite-protein interaction

        Returns
        -------
        list
            Strings of the form ``"<id1> <interaction_type> <id2> <score>"``,
            one per recorded pair, in permutation order. These lines are the
            body of the sif file.

        Raises
        ------
        ValueError
            If ``interaction_type`` is not one of the supported codes.
        """
        if interaction_type not in self._KNOWN_INTERACTION_TYPES:
            raise ValueError("'{}' is an unsupported interaction type. Supported interaction: {}".format(interaction_type, ", ".join(self._KNOWN_INTERACTION_TYPES)))

        # A bare string would otherwise be permuted character by character.
        if isinstance(proteins, str):
            proteins = [proteins]

        results = []
        recorded = set()  # (id1, id2) pairs already written to results
        for gene1, gene2 in permutations(proteins, 2):
            # .get() avoids inserting a None entry into the defaultdict for
            # every pair that is absent from the database.
            score = self._db.get("{} {}".format(gene1, gene2))
            if score is None:
                continue
            if (gene2, gene1) in recorded:
                continue
            results.append("{} {} {} {}".format(gene1, interaction_type, gene2, score))
            recorded.add((gene1, gene2))
        return results

# single function for parsing input file
def parse_input(input_file):
    """Parse a tab-separated locus file into a locus -> gene-list mapping.

    Each non-blank line is split on tabs. The locus key is the last
    whitespace-separated token of the second column, and every non-empty
    field from the third column onwards is taken as a gene of that locus.
    If the same locus key appears on several lines, the last line wins.

    Arguments
    ---------
    input_file : str
        Path to the tab-separated input file.

    Returns
    -------
    defaultdict
        Maps locus key (str) to its list of gene names; missing keys
        evaluate to ``None``.

    Raises
    ------
    IndexError
        If a non-blank line has no second column or the second column is empty.
    """
    locus_genes = defaultdict(lambda: None)
    with open(input_file, "r") as infile:
        for line in infile:
            line = line.strip("\n")
            if not line.strip():
                # Blank lines hold no locus; skip them instead of crashing.
                continue
            data = line.split("\t")
            locus = data[1].split()[-1]
            # Trailing tabs yield empty fields, which are not gene names.
            locus_genes[locus] = [gene for gene in data[2:] if gene.strip()]

    return locus_genes

def _flatten_data(data):
    """ Flattens data into 1D array

    This is usefull and cleaner processing for the data. embeded lists
    is a result of multple loci being present in the input. This will
    flatten the list of list into one single list.

    Arguments:
    ---------
    data : dict
        Labled data containing the locus name paried with all protein protein
        interaction scores. 

    Returns:
    --------
    list
        a flatten list conining all proteins pair interactions scores
    """

    flatten_data = []
    for gene_list in data.values():
        flatten_data += gene_list

    return flatten_data

def save_as_sif(data):
    """ Write the interactions to sif files in the current directory.

    One file named "<locus>-<timestamp>.sif" is written per locus, followed
    by a combined "all_nodes-<timestamp>.sif" holding the interactions of
    every locus. All files share the same timestamp and start with the
    same header line.

    Arguments:
    ---------
    data : dict
        Mapping of locus name to the list of interaction lines for that locus.
    """
    stamp = datetime.today().strftime("%m%d%y-%H%M%S")

    def _dump(path, rows):
        # header first, then one interaction per line
        with open(path, "w") as handle:
            handle.write("gene1 interaction gene2 score\n")
            for row in rows:
                handle.write("{}\n".format(row))

    for locus_name in data:
        _dump("{}-{}.sif".format(locus_name, stamp), data[locus_name])

    _dump("all_nodes-{}.sif".format(stamp), _flatten_data(data))


In [3]:
# paths (relative) to the tab-separated score table and to the locus/gene input file
string_path = "../Data/String.txt"
input_path = "../input/Input.gmt.txt"
# build the in-memory score lookup, then parse the locus -> gene-list mapping
db = StringDB(string_path)
input_data = parse_input(input_path)

In [4]:
# query the database for every locus and collect the scored pairs per locus
# NOTE(review): `count` is never read in the visible cells -- confirm it is unused before removing
count = 0
# locus name -> list of "<gene1> pp <gene2> <score>" lines; missing keys evaluate to None
results = defaultdict(lambda: None)
for locus_name, gene_list in input_data.items():
    result = db.get_pair_score(locus_name, gene_list, interaction_type="pp")
    results[locus_name] = result

In [5]:
# write one sif file per locus plus a combined "all_nodes" file
save_as_sif(results)

In [6]:
# preview of the timestamp format that save_as_sif uses as the output file-name suffix
datetime.today().strftime("%m%d%y-%H%M%S")

# presumably the recorded output of the expression above from an earlier run
'092221-202756'