# BioGRID Protein Interactions

### Setup

In [2]:
import os
import sys
sys.path[0] = '../'
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import re
from py2neo import Graph, Node, Relationship
from py2neo.ogm import GraphObject, Property

In [3]:
# Display options
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [4]:
# Set access key for BioGRID REST API
load_dotenv()
BIOGRID_ACCESS_KEY = os.getenv('BIOGRID_ACCESS_KEY')
NEO4J_HOME = os.getenv('NEO4J_HOME')
importDir = '/Users/gregory/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-d0b780b3-ce77-46cc-b7ed-bd0f78a46581/installation-3.5.14/import/'

### Import and Clean Data

In [156]:
# Loads data and formats columns
def load_data(gene_specifier):

    if not isinstance(genes, list): 
        gene_specifier = gene_specifier
    else:
        gene_specifier = '|'.join(gene_specifier)
    
    url = f"https://webservice.thebiogrid.org/interactions/?searchNames=true&geneList={gene_specifier}" \
    "&taxId=9606&includeInteractors=true&includeInteractorInteractions=true&includeHeader=true" \
    f"&accesskey={BIOGRID_ACCESS_KEY}"

    # Load data
    data = pd.read_csv(url, sep='\t', header=0)

    # Remove leading hash character
    data.rename(columns={"#BioGRID Interaction ID":"BioGRID Interaction ID"}, inplace=True)

    # Replace pipe separators with commas
    data = data.replace('\|', ',', regex=True)

    # Select str columns and replace '-' with np.nan
    cols = ['Systematic Name Interactor A', 
          'Systematic Name Interactor B', 
          'Score', 
          'Modification', 
          'Phenotypes',
          'Qualifications',
          'Tags']

    data[cols] = data[cols].applymap(lambda col: re.sub(r'^-$', str(np.NaN), col))
       
    return data

In [157]:
# Selects and transforms columns
def preprocess_data(data):
    
     # Convert Score column to float
    data['Score'] = data['Score'].astype('float64')
    
    # Select columns of interest for graph
    data = data[['BioGRID Interaction ID', 'Official Symbol Interactor A', 'Entrez Gene Interactor A', 
                       'Synonyms Interactor A', 'Organism Interactor A', 'Official Symbol Interactor B', 
                       'Entrez Gene Interactor B', 'Synonyms Interactor B', 'Organism Interactor B', 
                       'Author', 'Pubmed ID', 'Experimental System', 'Experimental System Type', 'Throughput']]

    # Create Year column
    data['Publication Year'] = data['Author'].str.split(' ').str[-1].str.strip('()')    
    
    # Remove Year from Author column
    data['Author'] = data['Author'].str.split(' ').str[:2]
    data['Author'] = data['Author'].apply(lambda x: ', '.join(x))
    
    return data

In [161]:
# Select gene; create for loop and append dataframes for multiple genes
gene = 'CYP1A2'
genes = ['FECH', 'POR', 'PGRMC1', 'CYP1A2', 'CYB5A']

# Fetch and clean data
data = load_data(genes)
data = preprocess_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [163]:
data.head(100)

Unnamed: 0,BioGRID Interaction ID,Official Symbol Interactor A,Entrez Gene Interactor A,Synonyms Interactor A,Organism Interactor A,Official Symbol Interactor B,Entrez Gene Interactor B,Synonyms Interactor B,Organism Interactor B,Author,Pubmed ID,Experimental System,Experimental System Type,Throughput,Publication Year
0,18063,MAP2K2,5605,"CFC4,MAPKK2,MEK2,MKK2,PRKMK2",9606,ARAF,369,"A-RAF,ARAF1,PKS2,RAFA1",9606,"Yin, XL",11909642,Two-hybrid,physical,Low Throughput,2002
1,119392,NELFE,7936,"D6S45,NELF-E,RD,RDBP,RDP",9606,NELFB,25920,"COBRA1,NELF-B",9606,"Lehner, B",14667819,Two-hybrid,physical,Low Throughput,2004
2,119626,EMC2,9694,"KIAA0103,TTC35",9606,EMC8,10328,"C16orf2,C16orf4,COX4NB,FAM158B,NOC4",9606,"Rual, JF",16189514,Two-hybrid,physical,High Throughput,2005
3,120272,RCBTB2,1102,"CHC1L,RLG",9606,RCBTB2,1102,"CHC1L,RLG",9606,"Rual, JF",16189514,Two-hybrid,physical,High Throughput,2005
4,120639,NDUFA5,4698,"B13,CI-13KD-B,CI-13kB,NUFM,UQOR13",9606,NDUFB1,4707,"CI-MNLL,CI-SGDH,MNLL",9606,"Rual, JF",16189514,Two-hybrid,physical,High Throughput,2005
5,120801,EWSR1,2130,"EWS,bK984G1.4",9606,NDUFV1,4723,"CI-51K,CI51KD,UQOR1",9606,"Rual, JF",16189514,Two-hybrid,physical,High Throughput,2005
6,241612,UQCRC1,7384,"D3S3191,QCR1,UQCR1",9606,UQCRC2,7385,"MC3DN5,QCR2,UQCR2",9606,"Ewing, RM",17353931,Affinity Capture-MS,physical,High Throughput,2007
7,241834,PGRMC1,10857,"HPR6.6,MPR",9606,EFHD1,80303,"MST133,MSTP133,SWS2",9606,"Ewing, RM",17353931,Affinity Capture-MS,physical,High Throughput,2007
8,241928,NELFCD,51497,"NELF-C,NELF-D,TH1,TH1L",9606,NELFE,7936,"D6S45,NELF-E,RD,RDBP,RDP",9606,"Ewing, RM",17353931,Affinity Capture-MS,physical,High Throughput,2007
9,242114,NELFCD,51497,"NELF-C,NELF-D,TH1,TH1L",9606,NELFB,25920,"COBRA1,NELF-B",9606,"Ewing, RM",17353931,Affinity Capture-MS,physical,High Throughput,2007


In [56]:
# Save in project folder
data.to_csv('../data/clean/' + gene + '.csv', index=False)

# Save to Neo4j imports folder for LOAD CSV command
data.to_csv(importDir + gene + '.csv', index=False)

In [57]:
graph_data.groupby('Author')['Pubmed ID'].value_counts()

Author                     Pubmed ID
Guengerich FP (1997)       9398194      1
Huttlin EL (2017)          28514442     5
Luck K (2020)              32296183     1
Moutaoufik MT (2019)       31536960     1
Ozalp C (2005)             15980100     1
Reed JR (2012)             22738171     1
Shimada T (2005)           15680923     3
Szczesna-Skorupa E (2011)  21081644     2
Wu CK (2001)               11175906     1
Name: Pubmed ID, dtype: int64

In [150]:
genes = list(graph_data['Official Symbol Interactor A'].unique())
genes

['FECH', 'POR', 'PGRMC1', 'CYP1A2', 'CYB5A']

In [153]:
genes = 'CYP1A2'

In [154]:
genes

'CYP1A2'

In [155]:
if not isinstance(genes, list): 
    genes = genes
else:
    genes = '|'.join(genes)
    
genes

'CYP1A2'

In [115]:
genes = ['|'.join(genes)]

In [116]:
genes

['C|Y|P|1|A|2']