# BioGRID Protein Interactions

### Setup

In [135]:
import os
import sys
sys.path[0] = '../'
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import re
from py2neo import Graph, Node, Relationship
from py2neo.ogm import GraphObject, Property

In [67]:
# Display options: lift pandas' truncation limits so frames render in full
pd.set_option('display.max_rows', None)     # no cap on displayed rows
pd.set_option('display.max_columns', None)  # no cap on displayed columns

In [68]:
# Load variables from a local .env file, then read the BioGRID REST API
# access key and the Neo4j home directory from the environment.
load_dotenv()
BIOGRID_ACCESS_KEY = os.getenv('BIOGRID_ACCESS_KEY')
NEO4J_HOME = os.getenv('NEO4J_HOME')

# Directory the cleaned CSV is copied into for Neo4j.
# Set NEO4J_IMPORT_DIR (e.g. in .env) to use a different installation; the
# fallback is the original machine-specific path, kept so existing setups
# behave exactly as before.
# os.path.join(..., '') guarantees a trailing separator, because file names
# are appended to importDir with plain string concatenation.
importDir = os.path.join(
    os.getenv(
        'NEO4J_IMPORT_DIR',
        '/Users/gregory/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-d0b780b3-ce77-46cc-b7ed-bd0f78a46581/installation-3.5.14/import/',
    ),
    '',
)

In [69]:
# Gene symbol whose interactions are requested
gene = 'CYP1A2'

# BioGRID REST query, one parameter per line (adjacent literals are joined
# by the parser into a single string)
url = (
    'https://webservice.thebiogrid.org/interactions/'
    '?searchNames=true'
    f'&geneList={gene}'
    '&taxId=9606'
    '&includeInteractors=true'
    '&includeInteractorInteractions=true'
    '&includeHeader=true'
    f'&accesskey={BIOGRID_ACCESS_KEY}'
)


### Import and Clean Data

In [70]:
# Load data (tab-separated; the first line holds the column names)
data = pd.read_csv(url, sep='\t', header=0)

# Remove leading hash character from the first column name.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
data = data.rename(columns={"#BioGRID Interaction ID": "BioGRID Interaction ID"})

# Replace pipe separators with commas.
# Raw string: '\|' in a normal literal is an invalid escape sequence.
data = data.replace(r'\|', ',', regex=True)

# Concatenate all synonyms of genes (currently disabled)
#data['Synonyms Interactor A'] = data['Official Symbol Interactor A'] + ',' + data['Synonyms Interactor A']
#data['Synonyms Interactor B'] = data['Official Symbol Interactor B'] + ',' + data['Synonyms Interactor B']

# Columns in which a lone '-' stands for "no value"
cols = ['Systematic Name Interactor A',
        'Systematic Name Interactor B',
        'Score',
        'Modification',
        'Phenotypes',
        'Qualifications',
        'Tags']

# Turn the '-' placeholder into a real missing value.
# Only cells that are exactly '-' are touched. Unlike the previous per-cell
# re.sub, this does not raise on non-string cells (numbers, NaN) and stores
# an actual NaN instead of the text 'nan'.
data[cols] = data[cols].replace('-', np.nan)

# Convert Score column to float
data['Score'] = data['Score'].astype('float64')

In [71]:
#data.reset_index()

In [72]:
# Save the cleaned table twice under "<gene>.csv": once in the project's
# clean-data folder and once in the Neo4j import directory
for target_dir in ('../data/clean/', importDir):
    data.to_csv(f'{target_dir}{gene}.csv', index=False)

In [73]:
# Read the saved file back as a sanity check; the trailing ';' hides the output
pd.read_csv(f'../data/clean/{gene}.csv');

In [74]:
# Preview the first five rows of the cleaned table
data.head(n=5)

Unnamed: 0,BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,Synonyms Interactor B,Experimental System,Experimental System Type,Author,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,303872,2235,2235,108526,108526,,,FECH,FECH,"EPP,FCE","EPP,FCE",Co-crystal Structure,physical,Wu CK (2001),11175906,9606,9606,Low Throughput,,,,,,BIOGRID
1,305778,5447,1544,111443,107924,,,POR,CYP1A2,"CPR,CYPOR,P450R","CP12,P3-450,P450(PA)",Reconstituted Complex,physical,Guengerich FP (1997),9398194,9606,9606,Low Throughput,,,,,,BIOGRID
2,831166,10857,5447,116068,111443,,,PGRMC1,POR,"HPR6.6,MPR","CPR,CYPOR,P450R",Affinity Capture-Western,physical,Szczesna-Skorupa E (2011),21081644,9606,9606,Low Throughput,,,,,,BIOGRID
3,831168,5447,10857,111443,116068,,,POR,PGRMC1,"CPR,CYPOR,P450R","HPR6.6,MPR",Affinity Capture-Western,physical,Szczesna-Skorupa E (2011),21081644,9606,9606,Low Throughput,,,,,,BIOGRID
4,833260,5447,1544,111443,107924,,,POR,CYP1A2,"CPR,CYPOR,P450R","CP12,P3-450,P450(PA)",Reconstituted Complex,physical,Shimada T (2005),15680923,9606,9606,Low Throughput,,,,,,BIOGRID


In [75]:
# Columns carried over into the graph model
graph_columns = [
    'BioGRID Interaction ID',
    # interactor A
    'Official Symbol Interactor A',
    'Entrez Gene Interactor A',
    'Synonyms Interactor A',
    'Organism Interactor A',
    # interactor B
    'Official Symbol Interactor B',
    'Entrez Gene Interactor B',
    'Synonyms Interactor B',
    'Organism Interactor B',
    # publication and experiment
    'Author',
    'Pubmed ID',
    'Experimental System',
    'Experimental System Type',
    'Throughput',
]
graph_data = data[graph_columns]

In [76]:
# Columns of the cleaned table that were left out of graph_data
set(data.columns).difference(graph_data.columns)

{'BioGRID ID Interactor A',
 'BioGRID ID Interactor B',
 'Modification',
 'Phenotypes',
 'Qualifications',
 'Score',
 'Source Database',
 'Systematic Name Interactor A',
 'Systematic Name Interactor B',
 'Tags'}

In [77]:
# Display the graph subset. With display.max_rows set to None earlier in the
# notebook, every row is rendered.
graph_data

Unnamed: 0,BioGRID Interaction ID,Official Symbol Interactor A,Entrez Gene Interactor A,Synonyms Interactor A,Organism Interactor A,Official Symbol Interactor B,Entrez Gene Interactor B,Synonyms Interactor B,Organism Interactor B,Author,Pubmed ID,Experimental System,Experimental System Type,Throughput
0,303872,FECH,2235,"EPP,FCE",9606,FECH,2235,"EPP,FCE",9606,Wu CK (2001),11175906,Co-crystal Structure,physical,Low Throughput
1,305778,POR,5447,"CPR,CYPOR,P450R",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,Guengerich FP (1997),9398194,Reconstituted Complex,physical,Low Throughput
2,831166,PGRMC1,10857,"HPR6.6,MPR",9606,POR,5447,"CPR,CYPOR,P450R",9606,Szczesna-Skorupa E (2011),21081644,Affinity Capture-Western,physical,Low Throughput
3,831168,POR,5447,"CPR,CYPOR,P450R",9606,PGRMC1,10857,"HPR6.6,MPR",9606,Szczesna-Skorupa E (2011),21081644,Affinity Capture-Western,physical,Low Throughput
4,833260,POR,5447,"CPR,CYPOR,P450R",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,Shimada T (2005),15680923,Reconstituted Complex,physical,Low Throughput
5,833263,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,POR,5447,"CPR,CYPOR,P450R",9606,Shimada T (2005),15680923,Reconstituted Complex,physical,Low Throughput
6,838618,CYB5A,1528,"CYB5,MCB5",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,Shimada T (2005),15680923,Reconstituted Complex,physical,Low Throughput
7,1527656,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,Reed JR (2012),22738171,FRET,physical,Low Throughput
8,1870216,POR,5447,"CPR,CYPOR,P450R",9606,POR,5447,"CPR,CYPOR,P450R",9606,Ozalp C (2005),15980100,PCA,physical,Low Throughput
9,2231171,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,ASAH1,427,"AC,ACDase,ASAH,PHP,PHP32,SMAPME",9606,Huttlin EL (2017),28514442,Affinity Capture-MS,physical,High Throughput


In [139]:
# Save the graph subset as CSV without the DataFrame index column
graph_data.to_csv('../data/clean/graph_data.csv', index=False)

In [78]:
# Connect to Neo4j over Bolt.
# URI, user and password are read from the environment (NEO4J_URI,
# NEO4J_USER, NEO4J_PASSWORD) so real credentials need not be stored in the
# notebook; the fallbacks are the original local development values.
graph = Graph(
    os.getenv('NEO4J_URI', "bolt://localhost:7687"),
    auth=(os.getenv('NEO4J_USER', "neo4j"), os.getenv('NEO4J_PASSWORD', "test")),
)

In [79]:
# Smoke test: two nodes labelled "Protein", each with a `name` property
cyp1a2 = Node("Protein", name="CYP1A2")
fech = Node("Protein", name="FECH")

In [80]:
# Relationship of type INTERACTS_WITH from the CYP1A2 node to the FECH node
cyp1a2_fech = Relationship(cyp1a2, "INTERACTS_WITH", fech)

In [81]:
# Attach the BioGRID interaction id as a relationship property.
# NOTE(review): stored here as a string, while the 'BioGRID Interaction ID'
# column displays as integers — confirm which type the graph should hold.
cyp1a2_fech["biogrid_interaction_id"] = "2253759"

In [82]:
# Open a transaction on the graph; it is committed in the next cell
tx = graph.begin()

In [83]:
# Queue both nodes and their relationship on the open transaction `tx`,
# then commit them together
entities = [cyp1a2, fech, cyp1a2_fech]

for entity in entities:
    tx.create(entity)
    
tx.commit()

In [84]:
# Clean up after the smoke test.
# WARNING: per the py2neo documentation, delete_all() removes all nodes and
# relationships from the connected graph, not only the entities created
# above. Only run this against a scratch database.
graph.delete_all()

In [152]:
# Rows in which CYP1A2 is interactor A
is_cyp1a2_a = graph_data['Official Symbol Interactor A'] == 'CYP1A2'
graph_data[is_cyp1a2_a]

Unnamed: 0,BioGRID Interaction ID,Official Symbol Interactor A,Entrez Gene Interactor A,Synonyms Interactor A,Organism Interactor A,Official Symbol Interactor B,Entrez Gene Interactor B,Synonyms Interactor B,Organism Interactor B,Author,Pubmed ID,Experimental System,Experimental System Type,Throughput
5,833263,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,POR,5447,"CPR,CYPOR,P450R",9606,Shimada T (2005),15680923,Reconstituted Complex,physical,Low Throughput
7,1527656,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,Reed JR (2012),22738171,FRET,physical,Low Throughput
9,2231171,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,ASAH1,427,"AC,ACDase,ASAH,PHP,PHP32,SMAPME",9606,Huttlin EL (2017),28514442,Affinity Capture-MS,physical,High Throughput
10,2247600,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,EPHA4,2043,"HEK8,SEK,TYRO1",9606,Huttlin EL (2017),28514442,Affinity Capture-MS,physical,High Throughput
11,2253759,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,FECH,2235,"EPP,FCE",9606,Huttlin EL (2017),28514442,Affinity Capture-MS,physical,High Throughput
13,2270857,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,PGRMC1,10857,"HPR6.6,MPR",9606,Huttlin EL (2017),28514442,Affinity Capture-MS,physical,High Throughput
15,2727673,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,ABHD16A,7920,"BAT5,D6S82E,NG26,PP199",9606,Luck K (2020),32296183,Two-hybrid,physical,High Throughput


### Interactors

In [86]:
# Distinct official symbols on the A side, in order of first appearance
interactor_a = pd.unique(graph_data['Official Symbol Interactor A'])
interactor_a

array(['FECH', 'POR', 'PGRMC1', 'CYP1A2', 'CYB5A'], dtype=object)

In [87]:
# Distinct official symbols on the B side, in order of first appearance
interactor_b = pd.unique(graph_data['Official Symbol Interactor B'])
interactor_b

array(['FECH', 'CYP1A2', 'POR', 'PGRMC1', 'ASAH1', 'EPHA4', 'CYB5A',
       'ABHD16A'], dtype=object)

In [88]:
# Genes not in interactor_a (present only as interactor B).
# A one-sided set difference is used: symmetric_difference would also
# return genes that occur only in interactor_a, which contradicts the intent.
set(interactor_b).difference(interactor_a)

{'ABHD16A', 'ASAH1', 'EPHA4'}

In [150]:
# Every gene symbol that occurs on either side of an interaction
set(interactor_a) | set(interactor_b)

{'ABHD16A', 'ASAH1', 'CYB5A', 'CYP1A2', 'EPHA4', 'FECH', 'PGRMC1', 'POR'}

### Entrez ID

In [154]:
# Distinct Entrez Gene ids per side, then the combined set of both sides
entrez_id_a = pd.unique(graph_data['Entrez Gene Interactor A'])
entrez_id_b = pd.unique(graph_data['Entrez Gene Interactor B'])
set(entrez_id_a) | set(entrez_id_b)

{427, 1528, 1544, 2043, 2235, 5447, 7920, 10857}

### Create Nodes

In [89]:
# Create Interaction nodes
# .tolist() converts the NumPy scalars returned by unique() to built-in ints
interaction_nodes = graph_data['BioGRID Interaction ID'].unique().tolist()

# Create Gene nodes
# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# list is identical on every run (iterating a set of strings is not)
gene_nodes = list(dict.fromkeys([*interactor_a, *interactor_b]))

# Create Article nodes
article_nodes = graph_data['Pubmed ID'].unique().tolist()

# Create Author nodes
# Strip the trailing parenthesised year, e.g. "Wu CK (2001)" -> "Wu CK".
# Removing the suffix (rather than keeping the first two tokens) preserves
# multi-word surnames. De-duplicating afterwards collapses an author who
# appears with several publication years into one entry.
author_nodes = [
    re.sub(r'\s*\([^()]*\)\s*$', '', author).strip()
    for author in graph_data['Author'].unique()
]
author_nodes = list(dict.fromkeys(author_nodes))

In [132]:
# Collect the four node lists and print each one on its own line
nodes = [interaction_nodes, gene_nodes, article_nodes, author_nodes]

print(*nodes, sep='\n')

[303872, 305778, 831166, 831168, 833260, 833263, 838618, 1527656, 1870216, 2231171, 2247600, 2253759, 2264891, 2270857, 2625925, 2727673]
['FECH', 'CYB5A', 'ABHD16A', 'PGRMC1', 'CYP1A2', 'ASAH1', 'POR', 'EPHA4']
[11175906, 9398194, 21081644, 15680923, 22738171, 15980100, 28514442, 31536960, 32296183]
['Wu CK', 'Guengerich FP', 'Szczesna-Skorupa E', 'Shimada T', 'Reed JR', 'Ozalp C', 'Huttlin EL', 'Moutaoufik MT', 'Luck K']


#### Interaction Nodes

In [145]:
# author property
# Author entry of the first interaction. .iloc[0] takes the first matching
# row by position; the previous .loc[0] was a label lookup and only worked
# when the matching row happened to have index label 0.
interaction_author = graph_data.loc[
    graph_data['BioGRID Interaction ID'] == interaction_nodes[0], 'Author'
].iloc[0]

# Drop the last space-separated token and join the rest with ', '
# (e.g. "Wu CK (2001)" -> "Wu, CK")
interaction_author = ', '.join(interaction_author.split(' ')[:-1])
interaction_author

'Wu, CK'

In [147]:
# Build nodes
# Node for the first interaction, carrying its BioGRID id and author string.
# NOTE(review): no label is passed here, whereas the demo nodes earlier in
# the notebook use "Protein" — confirm whether a label is intended.
interaction = Node(biogrid_id=interaction_nodes[0], author = interaction_author)

In [None]:
# pubmed id property
# Pubmed ID of the first interaction, looked up the same way as the author.
# The unfinished cell bound this name to the `graph` connection object
# instead of a value taken from graph_data.
# int() converts the NumPy scalar to a built-in int.
interaction_pubmed_id = int(
    graph_data.loc[
        graph_data['BioGRID Interaction ID'] == interaction_nodes[0], 'Pubmed ID'
    ].iloc[0]
)

In [90]:
class Interactions(GraphObject):
    """OGM mapping for one BioGRID interaction record, keyed by its BioGRID id."""

    __primarykey__ = "biogrid_id"

    # The primary-key property is declared explicitly (as Gene does for
    # `name`) so it can be read and assigned as an attribute on instances.
    biogrid_id = Property()

    # Publication the interaction was reported in
    author = Property()
    pubmed_id = Property()

    # How the interaction was detected
    experimental_system = Property()
    experimental_system_type = Property()

#### Gene Nodes

In [91]:
class Gene(GraphObject):
    """OGM mapping for a gene, keyed by its `name` property."""

    __primarykey__ = "name"

    # Identity
    name = Property()
    entrez_id = Property()
    synonyms = Property()

    # Classification
    organism = Property()
    locus_type = Property()

    # Descriptive text and external links
    description = Property()
    url = Property()
    wikipedia = Property()