# BioGRID Protein Interactions

## Setup

In [1]:
import os
import sys
sys.path[0] = '../'
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import re
import requests

In [2]:
# Display options
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [3]:
# Set access key for BioGRID REST API
load_dotenv()
BIOGRID_ACCESS_KEY = os.getenv('BIOGRID_ACCESS_KEY')
NEO4J_HOME = os.getenv('NEO4J_HOME')
importDir = '/Users/gregory/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-d0b780b3-ce77-46cc-b7ed-bd0f78a46581/installation-3.5.14/import/'

## Import and Clean Data

In [20]:
# Loads data and formats columns
def load_data(gene_specifier):

    if not isinstance(gene_specifier, list): 
        gene_specifier = gene_specifier
    else:
        gene_specifier = '|'.join(gene_specifier)
    
    url = f"https://webservice.thebiogrid.org/interactions/?searchNames=true&geneList={gene_specifier}" \
    "&taxId=9606&includeInteractors=true&includeInteractorInteractions=true&includeHeader=true" \
    f"&accesskey={BIOGRID_ACCESS_KEY}"

    # Load data
    data = pd.read_csv(url, sep='\t', header=0)

    # Remove leading hash character
    data.rename(columns={"#BioGRID Interaction ID":"BioGRID Interaction ID"}, inplace=True)

    # Replace pipe separators with commas
    data = data.replace('\|', ',', regex=True)

    # Select str columns and replace '-' with np.nan
    cols = ['Systematic Name Interactor A', 
          'Systematic Name Interactor B', 
          'Score', 
          'Modification', 
          'Phenotypes',
          'Qualifications',
          'Tags']

    data[cols] = data[cols].applymap(lambda col: re.sub(r'^-$', str(np.NaN), col))
       
    return data

In [21]:
# Selects and transforms columns
def preprocess_data(data):
    
     # Convert Score column to float
    data['Score'] = data['Score'].astype('float64')
    
    # Select columns of interest for graph
    data = data[['BioGRID Interaction ID', 'Official Symbol Interactor A', 'Entrez Gene Interactor A', 
                       'Synonyms Interactor A', 'Organism Interactor A', 'Official Symbol Interactor B', 
                       'Entrez Gene Interactor B', 'Synonyms Interactor B', 'Organism Interactor B', 
                       'Author', 'Pubmed ID', 'Experimental System', 'Experimental System Type', 'Throughput']]

    # Create Year column
    data['Publication Year'] = data['Author'].str.split(' ').str[-1].str.strip('()')    
    
    # Remove Year from Author column
    data['Author'] = data['Author'].str.split(' ').str[:2]
    data['Author'] = data['Author'].apply(lambda x: ', '.join(x))
    
    return data

In [30]:
# Select gene; create for loop and append dataframes for multiple genes
gene = 'CYP1A2'
genes = ['FECH', 'CYP1A2']

# Fetch and clean data
data = load_data(genes)
data = preprocess_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [31]:
data.shape

(377, 15)

In [32]:
data.head()

Unnamed: 0,BioGRID Interaction ID,Official Symbol Interactor A,Entrez Gene Interactor A,Synonyms Interactor A,Organism Interactor A,Official Symbol Interactor B,Entrez Gene Interactor B,Synonyms Interactor B,Organism Interactor B,Author,Pubmed ID,Experimental System,Experimental System Type,Throughput,Publication Year
0,119392,NELFE,7936,"D6S45,NELF-E,RD,RDBP,RDP",9606,NELFB,25920,"COBRA1,NELF-B",9606,"Lehner, B",14667819,Two-hybrid,physical,Low Throughput,2004
1,120272,RCBTB2,1102,"CHC1L,RLG",9606,RCBTB2,1102,"CHC1L,RLG",9606,"Rual, JF",16189514,Two-hybrid,physical,High Throughput,2005
2,241928,NELFCD,51497,"NELF-C,NELF-D,TH1,TH1L",9606,NELFE,7936,"D6S45,NELF-E,RD,RDBP,RDP",9606,"Ewing, RM",17353931,Affinity Capture-MS,physical,High Throughput,2007
3,242114,NELFCD,51497,"NELF-C,NELF-D,TH1,TH1L",9606,NELFB,25920,"COBRA1,NELF-B",9606,"Ewing, RM",17353931,Affinity Capture-MS,physical,High Throughput,2007
4,245113,ATM,472,"AT1,ATA,ATC,ATD,ATDC,ATE,TEL1,TELO1",9606,ATM,472,"AT1,ATA,ATC,ATD,ATDC,ATE,TEL1,TELO1",9606,"Kim, ST",10608806,Protein-peptide,physical,Low Throughput,1999


In [33]:
# Save in project folder
data.to_csv('../data/clean/biogrid_ppi_data.csv', index=False)

# Save to Neo4j imports folder for LOAD CSV command
data.to_csv(importDir + 'biogrid_ppi_data.csv', index=False)

In [34]:
data.groupby('Author')['Pubmed ID'].value_counts()[:5]

Author          Pubmed ID
Abdelmohsen, K  19322201     6
Alkhaja, AK     22114354     1
Arroyo, R       25640309     1
Arumughan, A    27762274     2
Banks, CA       27609421     7
Name: Pubmed ID, dtype: int64

## HGNC API

Parses XML from request to:
https://www.genenames.org/help/rest/

In [11]:
### Create new columns for gene: Gene Description, NCBI url, Locus type, Uniprot url, Wikipedia url, full sequence url, chromosome location, #bp
### Create new columns for article: Article Title, Publication, Pubmed url
# Create new column: subcellular location, condition, PTM/Processing

In [183]:
gene = 'CYP1A2'

url = f"http://rest.genenames.org/fetch/symbol/{gene}";
r = requests.get(url)

In [184]:
from xml.etree.ElementTree import fromstring, ElementTree
tree = ElementTree(fromstring(r.text))
root = tree.getroot()

In [185]:
str_attribs = ['symbol', 'name', 'entrez_id', 'locus_type', 'location', 'ensembl_gene_id', 
                'locus_group']

arr_attribs = ['pubmed_id', 'gene_group', 'uniprot_ids']

In [195]:
gene_dict = dict()

# retrieve <str> attributes
for index, name in enumerate(str_attribs):
    elements = root.findall(f".//str[@name='{name}']")
    gene_dict[name] = [elements[0].text]

# retrieve <arr> attributes
for index, name in enumerate(arr_attribs):
    elements = root.findall(f".//arr[@name='{name}']/*")
    gene_dict[name] = [elements[0].text]

In [196]:
gene_dict

{'symbol': ['CYP1A2'],
 'name': ['cytochrome P450 family 1 subfamily A member 2'],
 'entrez_id': ['1544'],
 'locus_type': ['gene with protein product'],
 'location': ['15q24.1'],
 'ensembl_gene_id': ['ENSG00000140505'],
 'locus_group': ['protein-coding gene'],
 'pubmed_id': ['15128046'],
 'gene_group': ['Cytochrome P450 family 1'],
 'uniprot_ids': ['P05177']}

In [197]:
pd.DataFrame.from_dict(gene_dict)

Unnamed: 0,symbol,name,entrez_id,locus_type,location,ensembl_gene_id,locus_group,pubmed_id,gene_group,uniprot_ids
0,CYP1A2,cytochrome P450 family 1 subfamily A member 2,1544,gene with protein product,15q24.1,ENSG00000140505,protein-coding gene,15128046,Cytochrome P450 family 1,P05177
