# STRING DB project 

## layout of the data 
- the data will be converted into a SIF format. 
- SIF requires 3 things 
    - Node A
    - Interactions
    - Node B
    - Score

The types of interactions that the SIF handles in cyotscape is 
  - pp .................. protein - protein interaction <-- Using this one
  - pd .................. protein -> DNA
  - pr .................. protein -> reaction
  - rc .................. reaction -> compound
  - cr .................. compound -> reaction
  - gl .................. genetic lethal relationship
  - pm .................. protein-metabolite interaction
  - mp .................. metabolite-protein interaction

since our STRING  DATABASE is protein protein intearctions there pp will be the selected interaction type

## Downsides
---
The main disadvantage is that this format does not include any layout information, forcing Cytoscape to re-compute a new layout of the network each time it is loaded.


In [2]:
import warnings 
import pandas as pd
import numpy as np
from collections import defaultdict


In [3]:
class StringDB:
    def __init__(self, fname):

        # automatically loading in database when instantiating StringDB object
        self.fname = fname
        db = self._load_string_data(fname)
        self.db = db


    def _load_string_data(self, f_path: str) -> pd.DataFrame:
        """ Loads in the STRING database and converts it into a pandas dataframe object"""
        string_df = pd.read_csv(f_path, sep="\t")
        string_df.columns = ["gene1", "gene2", "score"]
        return string_df


    def _cross_ref(self, locus: str, target: list, reference: list) -> None:
        """Cross references matches with initial inputs to see which pairs
        were not found"""

        target_set = set(target)
        ref_set = set(reference)

        
        missing_set = ref_set - target_set
        for missing in missing_set:
            msg = "Not found {} - {}".format(locus, missing)
            warnings.warn(msg)
    

    def _to_adjacency_dict(self, locus, selected_pairs):
        """ converts the selected pairs into a adjacency dict
        
        Arguments
        ---------
        locus: str
            Targeted gene

        selected_pairs: list, np.ndarray
            An array of genes that were identified in the STRING database 
        
        Returns
        -------
        adjacency_dict: dict {str : {str : float}}
            Returns an adjacency dict where the locus is the main key and the
            value is a sub dictionary containing the protein gene and interaction
            score (float)

        >>> # Example result
        >>> adj_dict = {"locus gene":{"matched_gene1":score, "matched_gene2":score, "matched_gene3":score}}
        
        """

        main_result = {} # stores locus and all genes and scores
        gene_score = {} # --> stores gene and score
        print(locus, selected_pairs)
        for idx in range(len(selected_pairs.index.tolist())):
            data = selected_pairs.iloc[idx]
            gene, score = (data["gene2"], data["score"])
            gene_score[gene] = score

        main_result[locus] = gene_score
        return main_result
    

    def find_pairs(self, locus, genes, verbose=False):
        """ Attempts to find all pairs with a given locus.
        
        arguments
        ----------
        locus: str
            Main gene that will be used to compare all genes
        genes: list, np.array
            An array genes names

        returns
        -------
        dict
            A adjacency_dict containing the locus as the main key and the sub dictionary 
            containing the queryed gene along with its score. If the score is not found,
            then it will not be included in the adjacency_dict
        
        """
        # data type checking
        if not isinstance(locus, str):
            raise TypeError("locus must be a string. you have provided {}". type(locus))
        if not isinstance(genes, list) and not isinstance(genes, np.ndarray):
            raise TypeError("'genes' data must be a list or numpy array, you have provided {}".format(type(genes)))

        # query searches all given genes with one locus
        # -- this process is vectorized does not use a for loop to find every single match
        # -- pairs that are NOT found will not be included in the results
        # -- -- We use the _cross_ref() function to let the user know which pairs where not found
        query = self.db.loc[(self.db["gene1"] == locus) & (self.db["gene2"].isin(genes))] 
        selected_genes = query["gene2"].values.tolist()
        print(query)
        # checking for missing pairs
        if verbose is True:
            self._cross_ref(locus, selected_genes, genes)
        
        # return results
        results = self._to_adjacency_dict(locus, query)
        return results


# functions
def save_as_sif(adjacency_dict, interaction, outname, path="."):
    """ Converts adjacency_list into SIF file
    
    Argument
    -------
    adjacency_dict : dict
        Contains all adjacency_dict pathways 
    interaction : str (choices=["pp", "pd", "pr", "rc", "cr", "gl", "pm", "mp"])
        Interaction type of both genes/molecules. Supported interaction types are:
        - p:protein - protein interaction 
        - pd: protein -> DNA
        - pr: protein -> reaction
        - rc: reaction -> compound
        - cr: compound -> reaction
        - gl: genetic lethal relationship
        - pm: protein-metabolite interaction
        - mp: metabolite-protein interaction

    outname : str
        name of the output SIF file
    path : string, optional (default=".")
        Path where the SIF file is going to be written. By default it will be 
        created at current dictory 
    optional

    Returns:
    -------
    ValueError 
        Raised when an incorrect interaction is provided
    File
        SIF file written at provided path (default: current path)
    """
    # type checking 
    known_interaction_types = ["pp", "pd", "pr", "rc", "cr", "gl", "pm", "mp"]
    if interaction not in known_interaction_types:
        raise ValueError("'{}' is an unsupported interaction type. Supported interaction: {}".format(interaction, ", ".join(known_interaction_types)))

    # writing out SIF output file
    full_path = "{}/{}".format(path, outname)
    with open(full_path, "w") as outfile:
        for locus, matches in adjacency_dict.items():
            for gene, score in matches.items():
                result = "{} {} {} {}\n".format(locus, interaction, gene, score)
                outfile.write(result)

def parse_input(input_file):
    locus_genes = defaultdict(None)
    with open(input_file, 'r') as infile:
        lines = infile.readlines()
        for line in lines:
            data = line.split("\t")
            locus = data[1].split()[-1]
            genes = data[2:]
            locus_genes[locus] = genes

    return locus_genes

In [8]:
# loading and parsing input file
parsed_input = parse_input("../input/Input.gmt.txt")

# connecting to string database
string_db = StringDB("../Data/STRING.txt")
db = string_db.db

In [5]:
query_results = []
for locus, genes in parsed_input.items():
    result = string_db.find_pairs(locus, genes)
    query_results.append(result)
    

Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
PALB2 Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
FANCF Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
BRIP1 Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
FANCC Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
FANCA Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
UBE2T Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
FANCD2 Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
FANCE Empty DataFrame
Columns: [gene1, gene2, score]
Index: []
         gene1  gene2  score
1536241  B

In [9]:
db.loc[(db["gene1"] == "PALB2") & (db["gene2"].isin(parsed_input["PPALB2ALB2"]))]

KeyError: 'PPALB2ALB2'

In [99]:
np.array(parsed_input.values()).ndim

0

In [106]:
ls = []
ks = []
for k, lss in parsed_input.items():
    ls += lss
    ks += [k]

In [14]:
df1 = db.loc[(db["gene1"] == "PALB2") & (db["gene2"].isin(parsed_input["PALB2"]))]
df2 = db.loc[(db["gene1"].isin(parsed_input["PALB2"])) & (db["gene2"] == "PALB2")]

In [11]:
parsed_input

defaultdict(None,
            {'PALB2': ['NUPR1',
              'CTB-134H23.2',
              'SLC5A11',
              'KIAA0556',
              'CD19',
              'SH2B1',
              'CCDC101',
              'GTF3C1',
              'IL27',
              'ARHGAP17',
              'ERN2',
              'DCTN5',
              'NSMCE1',
              'AQP8',
              'RABEP2',
              'XPO6',
              'ATP2A1',
              'CHP2',
              'BOLA2',
              'KDM8',
              'EIF3C',
              'ATXN2L',
              'LAT',
              'ZKSCAN2',
              'SULT1A1',
              'HS3ST4',
              'EIF3CL',
              'TUFM',
              'NPIPL1',
              'SNX29P2',
              'IL21R',
              'PRKCB',
              'SPNS1',
              'TNRC6A',
              'CACNG3',
              'PLK1',
              'RBBP6',
              'NFATC2IP',
              'APOBR',
              'IL4R',
              'PALB2',
      

In [18]:
df2

Unnamed: 0,gene1,gene2,score
