In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import time

## Load Pathway Commons Raw Data (All interactions)
#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.txt.gz
Downloaded: June 15, 2017  
Last Updated: May 25, 2017  
Citation: Pathway Commons, a web resource for biological pathway data. Cerami E et al. Nucleic Acids Research (2011).  
A Note about filtering interactions: Pathway Commons also contains interactions between proteins and small molecules from the CHEBI database. These interactions will need to be filtered out as they are not protein-protein interactions.  
Also note: The text file has more lines than the SIF file in Pathway Commons. However, the text file contains some interactions whose resolution is unclear, so in this case we use the SIF file provided by Pathway Commons (`PathwayCommons9.All.hgnc.sif`).

In [2]:
# Directory holding the downloaded Pathway Commons files
wd = '/data/'
# The file is read as tab-separated with no header row, so the columns
# receive integer labels
PC_Raw = pd.read_csv(
    wd + 'PathwayCommons9.All.hgnc.sif',
    header=None,
    sep='\t',
)
print('Raw interactions in Pathway Commons v9:', len(PC_Raw))

Raw interactions in Pathway Commons v9: 1546602


In [3]:
# Drop every interaction in which column 0 or column 2 contains a ':',
# the marker of CHEBI: items (small molecules rather than proteins)
col0_has_colon = PC_Raw[0].str.contains(':')
col2_has_colon = PC_Raw[2].str.contains(':')
PC_filt = PC_Raw[~(col0_has_colon | col2_has_colon)]
# Keep only columns 0 and 2, as a list of two-element lists
PC_edgelist = PC_filt.loc[:, [0, 2]].values.tolist()
print('Protein-Protein interactions in Pathway Commons v9:', len(PC_edgelist))

Protein-Protein interactions in Pathway Commons v9: 987778


In [4]:
# Put the two entries of every edge into sorted order, so a given pair has
# the same representation whichever way round it was listed
PC_edgelist_sorted = list(map(sorted, PC_edgelist))

In [5]:
# Pass the sorted edge list through gct.filter_converted_edgelist.
# NOTE(review): the helper is not visible here; the original note says it
# removes duplicates and self-edges -- presumably duplicate *edges* (which the
# sorting step above makes comparable) rather than duplicate nodes. Confirm
# against network_evaluation_tools.gene_conversion_tools.
PC_edgelist_filt = gct.filter_converted_edgelist(PC_edgelist_sorted)

In [6]:
# Write the filtered edge list to disk
outdir = '/data/'
symbol_sif_path = outdir + 'PathwayCommons_Symbol.sif'
gct.write_edgelist(PC_edgelist_filt, symbol_sif_path)