# BioGRID Protein Interactions

## Setup

In [232]:
import os
import sys
sys.path[0] = '../'
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import re
import requests
from xml.etree.ElementTree import fromstring, ElementTree
import time

In [2]:
# Display options
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [3]:
# Set access key for BioGRID REST API
load_dotenv()
BIOGRID_ACCESS_KEY = os.getenv('BIOGRID_ACCESS_KEY')
NEO4J_HOME = os.getenv('NEO4J_HOME')
importDir = '/Users/gregory/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-d0b780b3-ce77-46cc-b7ed-bd0f78a46581/installation-3.5.14/import/'

## Import and Clean Data

In [20]:
# Loads data and formats columns
def load_data(gene_specifier):

    if not isinstance(gene_specifier, list): 
        gene_specifier = gene_specifier
    else:
        gene_specifier = '|'.join(gene_specifier)
    
    url = f"https://webservice.thebiogrid.org/interactions/?searchNames=true&geneList={gene_specifier}" \
    "&taxId=9606&includeInteractors=true&includeInteractorInteractions=true&includeHeader=true" \
    f"&accesskey={BIOGRID_ACCESS_KEY}"

    # Load data
    data = pd.read_csv(url, sep='\t', header=0)

    # Remove leading hash character
    data.rename(columns={"#BioGRID Interaction ID":"BioGRID Interaction ID"}, inplace=True)

    # Replace pipe separators with commas
    data = data.replace('\|', ',', regex=True)

    # Select str columns and replace '-' with np.nan
    cols = ['Systematic Name Interactor A', 
          'Systematic Name Interactor B', 
          'Score', 
          'Modification', 
          'Phenotypes',
          'Qualifications',
          'Tags']

    data[cols] = data[cols].applymap(lambda col: re.sub(r'^-$', str(np.NaN), col))
       
    return data

In [21]:
# Selects and transforms columns
def preprocess_data(data):
    
     # Convert Score column to float
    data['Score'] = data['Score'].astype('float64')
    
    # Select columns of interest for graph
    data = data[['BioGRID Interaction ID', 'Official Symbol Interactor A', 'Entrez Gene Interactor A', 
                       'Synonyms Interactor A', 'Organism Interactor A', 'Official Symbol Interactor B', 
                       'Entrez Gene Interactor B', 'Synonyms Interactor B', 'Organism Interactor B', 
                       'Author', 'Pubmed ID', 'Experimental System', 'Experimental System Type', 'Throughput']]

    # Create Year column
    data['Publication Year'] = data['Author'].str.split(' ').str[-1].str.strip('()')    
    
    # Remove Year from Author column
    data['Author'] = data['Author'].str.split(' ').str[:2]
    data['Author'] = data['Author'].apply(lambda x: ', '.join(x))
    
    return data

In [214]:
# Select gene; create for loop and append dataframes for multiple genes
gene = 'CYP1A2'
genes = ['FECH', 'CYP1A2']

# Fetch and clean data
data = load_data(gene)
data = preprocess_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [215]:
data.shape

(16, 15)

In [216]:
data.head()

Unnamed: 0,BioGRID Interaction ID,Official Symbol Interactor A,Entrez Gene Interactor A,Synonyms Interactor A,Organism Interactor A,Official Symbol Interactor B,Entrez Gene Interactor B,Synonyms Interactor B,Organism Interactor B,Author,Pubmed ID,Experimental System,Experimental System Type,Throughput,Publication Year
0,303872,FECH,2235,"EPP,FCE",9606,FECH,2235,"EPP,FCE",9606,"Wu, CK",11175906,Co-crystal Structure,physical,Low Throughput,2001
1,305778,POR,5447,"CPR,CYPOR,P450R",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,"Guengerich, FP",9398194,Reconstituted Complex,physical,Low Throughput,1997
2,831166,PGRMC1,10857,"HPR6.6,MPR",9606,POR,5447,"CPR,CYPOR,P450R",9606,"Szczesna-Skorupa, E",21081644,Affinity Capture-Western,physical,Low Throughput,2011
3,831168,POR,5447,"CPR,CYPOR,P450R",9606,PGRMC1,10857,"HPR6.6,MPR",9606,"Szczesna-Skorupa, E",21081644,Affinity Capture-Western,physical,Low Throughput,2011
4,833260,POR,5447,"CPR,CYPOR,P450R",9606,CYP1A2,1544,"CP12,P3-450,P450(PA)",9606,"Shimada, T",15680923,Reconstituted Complex,physical,Low Throughput,2005


In [217]:
# Save in project folder
data.to_csv('../data/clean/biogrid_ppi_data.csv', index=False)

# Save to Neo4j imports folder for LOAD CSV command
data.to_csv(importDir + 'biogrid_ppi_data.csv', index=False)

In [218]:
data.groupby('Author')['Pubmed ID'].value_counts()[:5]

Author          Pubmed ID
Guengerich, FP  9398194      1
Huttlin, EL     28514442     5
Luck, K         32296183     1
Moutaoufik, MT  31536960     1
Ozalp, C        15980100     1
Name: Pubmed ID, dtype: int64

## APIs

Fetching data from various APIs to build a table for genes.

In [219]:
### Create new columns for gene: Gene Description, NCBI url, Locus type, Uniprot url, Wikipedia url, full sequence url, chromosome location, #bp
### Create new columns for article: Article Title, Publication, Pubmed url
# Create new column: subcellular location, condition, PTM/Processing, chromosome

### HGNC 

Parses XML from request to:
https://www.genenames.org/help/rest/

In [373]:
gene = 'CYP1A2'

def fetch_hgnc_data(gene, data):
    
    def build_gene_dict(gene):

        url = f"http://rest.genenames.org/fetch/symbol/{gene}";
        r = requests.get(url)

        tree = ElementTree(fromstring(r.text))
        root = tree.getroot()

        str_attribs = ['symbol', 'name', 'entrez_id', 'locus_type', 'location', 'ensembl_gene_id', 
                    'locus_group']

        arr_attribs = ['pubmed_id', 'gene_group', 'uniprot_ids', 'omim_id']

        gene_dict = dict()

        # retrieve <str> attributes
        for index, name in enumerate(str_attribs):
            elements = root.findall(f".//str[@name='{name}']")
            gene_dict[name] = [elements[0].text]

        # retrieve <arr> attributes
        for index, name in enumerate(arr_attribs):
            elements = root.findall(f".//arr[@name='{name}']/*")
            try:
                gene_dict[name] = [elements[0].text]
            except Exception as e:
                print(f"Error: {e}") # Log error
                gene_dict[name] = ['NULL']
                continue

        return gene_dict
    
    def build_gene_dataframe(data):

        genes = list(data['Official Symbol Interactor A'].unique())

        gene_dict_list = []

        for i, gene in enumerate(genes):
            gene_dict = build_gene_dict(gene)
            gene_dict_list.append(gene_dict)

            if i != 0 and i % 10 == 0:
                time.sleep(2)

        merge_gene_dict_list = {}

        for key in gene_dict_list[0].keys():
            merge_gene_dict_list[key] = [gene_dict_list[i][key][0] for i in range(len(gene_dict_list))]

        return pd.DataFrame(merge_gene_dict_list)
    
    return build_gene_dataframe(data)

In [374]:
gene = 'CYP1A2'

fetch_hgnc_data(gene, data)

Error: list index out of range
Error: list index out of range


Unnamed: 0,symbol,name,entrez_id,locus_type,location,ensembl_gene_id,locus_group,pubmed_id,gene_group,uniprot_ids,omim_id
0,FECH,ferrochelatase,2235,gene with protein product,18q21.31,ENSG00000066926,protein-coding gene,1838349,,P22830,612386
1,POR,cytochrome p450 oxidoreductase,5447,gene with protein product,7q11.23,ENSG00000127948,protein-coding gene,2516426,MicroRNA protein coding host genes,P16435,124015
2,PGRMC1,progesterone receptor membrane component 1,10857,gene with protein product,Xq24,ENSG00000101856,protein-coding gene,9705155,Membrane associated progesterone receptor family,O00264,300435
3,CYP1A2,cytochrome P450 family 1 subfamily A member 2,1544,gene with protein product,15q24.1,ENSG00000140505,protein-coding gene,15128046,Cytochrome P450 family 1,P05177,124060
4,CYB5A,cytochrome b5 type A,1528,gene with protein product,18q22.3,ENSG00000166347,protein-coding gene,1840560,,P00167,613218


In [363]:
def build_gene_dict(gene):

    url = f"http://rest.genenames.org/fetch/symbol/{gene}";
    r = requests.get(url)

    tree = ElementTree(fromstring(r.text))
    root = tree.getroot()

    str_attribs = ['symbol', 'name', 'entrez_id', 'locus_type', 'location', 'ensembl_gene_id', 
                'locus_group']

    arr_attribs = ['pubmed_id', 'gene_group', 'uniprot_ids', 'omim_id']

    gene_dict = dict()

    # retrieve <str> attributes
    for index, name in enumerate(str_attribs):
        elements = root.findall(f".//str[@name='{name}']")
        gene_dict[name] = [elements[0].text]

    # retrieve <arr> attributes
    for index, name in enumerate(arr_attribs):
        elements = root.findall(f".//arr[@name='{name}']/*")
        try:
            gene_dict[name] = [elements[0].text]
        except Exception as e:
            print(f"Error: {e}") # Log error
            gene_dict[name] = ['NULL']
            continue

    return gene_dict

In [369]:
def build_gene_dataframe(data):
    
    genes = list(data['Official Symbol Interactor A'].unique())
    
    gene_dict_list = []

    for i, gene in enumerate(genes):
        gene_dict = build_gene_dict(gene)
        gene_dict_list.append(gene_dict)

        if i != 0 and i % 10 == 0:
            time.sleep(2)
            
    merge_gene_dict_list = {}

    for key in gene_dict_list[0].keys():
        merge_gene_dict_list[key] = [gene_dict_list[i][key][0] for i in range(len(gene_dict_list))]
    
    return pd.DataFrame(merge_gene_dict_list)
    

In [370]:
build_gene_dataframe(data)

0 FECH
Error: list index out of range
1 POR
2 PGRMC1
3 CYP1A2
4 CYB5A
Error: list index out of range


Unnamed: 0,symbol,name,entrez_id,locus_type,location,ensembl_gene_id,locus_group,pubmed_id,gene_group,uniprot_ids,omim_id
0,FECH,ferrochelatase,2235,gene with protein product,18q21.31,ENSG00000066926,protein-coding gene,1838349,,P22830,612386
1,POR,cytochrome p450 oxidoreductase,5447,gene with protein product,7q11.23,ENSG00000127948,protein-coding gene,2516426,MicroRNA protein coding host genes,P16435,124015
2,PGRMC1,progesterone receptor membrane component 1,10857,gene with protein product,Xq24,ENSG00000101856,protein-coding gene,9705155,Membrane associated progesterone receptor family,O00264,300435
3,CYP1A2,cytochrome P450 family 1 subfamily A member 2,1544,gene with protein product,15q24.1,ENSG00000140505,protein-coding gene,15128046,Cytochrome P450 family 1,P05177,124060
4,CYB5A,cytochrome b5 type A,1528,gene with protein product,18q22.3,ENSG00000166347,protein-coding gene,1840560,,P00167,613218


In [364]:
temp_gene_df = pd.DataFrame.from_dict(gene_dict)
gene_list = list(data['Official Symbol Interactor A'].unique())

gene_list

['FECH', 'POR', 'PGRMC1', 'CYP1A2', 'CYB5A']

In [365]:
gene_dict_list = []

for i, gene in enumerate(genes):
    print(i, gene)
    gene_dict = build_gene_dict(gene)
    gene_dict_list.append(gene_dict)
    
    if i != 0 and i % 10 == 0:
        time.sleep(2)

0 FECH
Error: list index out of range
1 POR
2 PGRMC1
3 CYP1A2
4 CYB5A
Error: list index out of range


In [366]:
merge = {}

for key in gene_dict_list[0].keys():
    merge[key] = [gene_dict_list[i][key][0] for i in range(len(gene_dict_list))]

In [368]:
pd.DataFrame(merge)

Unnamed: 0,symbol,name,entrez_id,locus_type,location,ensembl_gene_id,locus_group,pubmed_id,gene_group,uniprot_ids,omim_id
0,FECH,ferrochelatase,2235,gene with protein product,18q21.31,ENSG00000066926,protein-coding gene,1838349,,P22830,612386
1,POR,cytochrome p450 oxidoreductase,5447,gene with protein product,7q11.23,ENSG00000127948,protein-coding gene,2516426,MicroRNA protein coding host genes,P16435,124015
2,PGRMC1,progesterone receptor membrane component 1,10857,gene with protein product,Xq24,ENSG00000101856,protein-coding gene,9705155,Membrane associated progesterone receptor family,O00264,300435
3,CYP1A2,cytochrome P450 family 1 subfamily A member 2,1544,gene with protein product,15q24.1,ENSG00000140505,protein-coding gene,15128046,Cytochrome P450 family 1,P05177,124060
4,CYB5A,cytochrome b5 type A,1528,gene with protein product,18q22.3,ENSG00000166347,protein-coding gene,1840560,,P00167,613218
