# STRING DB project 

## layout of the data 
- the data will be converted into a SIF format. 
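- SIF requires 3 things (Node A, the interaction type, Node B); the Score listed last is the extra value we keep for each pair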
    - Node A
    - Interactions
    - Node B
    - Score

The interaction types that SIF handles in Cytoscape are:
  pp .................. protein - protein interaction <-- Using this one
  pd .................. protein -> DNA
  pr .................. protein -> reaction
  rc .................. reaction -> compound
  cr .................. compound -> reaction
  gl .................. genetic lethal relationship
  pm .................. protein-metabolite interaction
  mp .................. metabolite-protein interaction

Since our STRING database contains protein-protein interactions, pp will be the selected interaction type.

## Downsides
---
The main disadvantage is that this format does not include any layout information, forcing Cytoscape to re-compute a new layout of the network each time it is loaded.


In [132]:
import pandas as pd
import numpy as np

In [165]:
class StringDB:
    """In-memory view of a STRING interaction table.

    The file is read once, at construction time, into ``self.db``: a
    DataFrame whose three columns are renamed to ``gene1``, ``gene2`` and
    ``score``.
    """

    def __init__(self, fname):
        # Load eagerly so every later query runs against the in-memory table.
        self.fname = fname
        self.db = self._load_string_data(fname)

    def _load_string_data(self, f_path: str) -> pd.DataFrame:
        """Read the tab-separated file at ``f_path`` into a DataFrame.

        The file is expected to have exactly three columns; they are renamed
        to ``gene1``, ``gene2`` and ``score`` regardless of the header text.
        """
        string_df = pd.read_csv(f_path, sep="\t")
        string_df.columns = ["gene1", "gene2", "score"]
        return string_df

    def _cross_ref(self, locus: str, target: list, reference: list) -> None:
        """Print a warning for every requested gene that was not matched.

        arguments
        ----------
        locus: str
            Gene the partners were queried against (only used in the message).
        target: list
            Partner genes that were actually found.
        reference: list, np.array
            Partner genes that were requested.
        """
        found = set(target)

        # Walk the request in its original order (de-duplicated) so the
        # warnings come out in a stable, predictable sequence.
        for gene in dict.fromkeys(reference):
            if gene not in found:
                print("WARNING: Not found {} - {}".format(locus, gene))

    def _to_adjacency_dict(self, selected_pairs, locus=None):
        """Convert the selected pairs into an adjacency dict.

        arguments
        ----------
        selected_pairs: pd.DataFrame
            Rows with at least the ``gene1``, ``gene2`` and ``score`` columns.
        locus: str, optional
            Key of the outer dictionary. When omitted it is taken from the
            ``gene1`` column, which must then hold exactly one distinct value.

        returns
        -------
        dict
            ``{locus: {gene2: score, ...}}``. If a partner gene occurs more
            than once, the last score wins.

        raises
        ------
        ValueError
            If ``locus`` is omitted and cannot be inferred unambiguously.
        """
        if locus is None:
            loci = selected_pairs["gene1"].unique()
            if len(loci) != 1:
                raise ValueError(
                    "locus was not provided and cannot be inferred from 'gene1': "
                    "found {} distinct values".format(len(loci))
                )
            locus = loci[0]

        # tolist() yields plain Python scalars, keeping the result easy to
        # compare and serialise.
        gene_score = dict(
            zip(selected_pairs["gene2"].tolist(), selected_pairs["score"].tolist())
        )
        return {locus: gene_score}

    def find_pairs(self, locus, genes):
        """ Attempts to find all pairs with a given locus.

        arguments
        ----------
        locus: str
            Main gene that will be used to compare all genes
        genes: list, np.array
            An array of gene names

        returns
        -------
        dict
            A nested dictionary containing the locus as the main key and a sub
            dictionary mapping each queried gene that was found to its score.
            Genes with no matching pair are left out of the result and are
            reported with a printed warning instead.

        raises
        ------
        TypeError
            If ``locus`` is not a string or ``genes`` is not a list/numpy array.
        """
        # data type checking
        if not isinstance(locus, str):
            raise TypeError("locus must be a string. you have provided {}".format(type(locus)))
        if not isinstance(genes, (list, np.ndarray)):
            raise TypeError("'genes' data must be a list or numpy array, you have provided {}".format(type(genes)))

        # Single vectorized lookup: rows whose first gene is the locus and
        # whose second gene is one of the requested partners.
        is_match = (self.db["gene1"] == locus) & (self.db["gene2"].isin(genes))
        query = self.db.loc[is_match]

        # Let the user know which requested pairs are absent from the table.
        self._cross_ref(locus, query["gene2"].tolist(), genes)

        # Pass the locus explicitly so the result never depends on a global.
        return self._to_adjacency_dict(query, locus)

In [166]:
# Relative location of the STRING table; it is resolved against the
# notebook's current working directory.
path = "../Data/STRING.txt"
# Constructing StringDB reads the whole file into memory immediately.
string_db = StringDB(path)

In [167]:
# Example query: one locus checked against a handful of partner genes.
locus = "ARF5"
# The whitespace-separated names are split and wrapped in a numpy array.
# NOTE(review): "NOGENE" looks like a deliberately unknown name used to
# exercise the not-found warning — confirm.
genes = np.array("DYRK4 PPP5C MAP4K5 RALBP1 PKP2 NOGENE".split())


# Returns {locus: {partner: score}}; the notebook displays it as cell output.
string_db.find_pairs(locus=locus, genes=genes)

<class 'numpy.ndarray'>


{'ARF5': {'DYRK4': 0.166,
  'PPP5C': 0.254968,
  'MAP4K5': 0.157276,
  'RALBP1': 0.156,
  'PKP2': 0.16021}}

In [79]:
def save_to_sif()

numpy.ndarray