In [1]:
import numpy as np
import pandas as pd
import re
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Reading in the several .txt files from STRING as a .csv file to import into pandas

input_file = "inputs/input_string_network_files/9606.protein.links.full.v11.0.txt"
if not os.path.exists(input_file):
    sys.exit("Can't locate input file %s" % input_file)
string_protein_links_df = pd.read_csv(input_file, delim_whitespace=True)


input_file = "inputs/input_string_network_files/9606.protein.info.v11.0.txt"
if not os.path.exists(input_file):
    sys.exit("Can't locate input file %s" % input_file)
string_protein_info_df = pd.read_csv(input_file, sep='\t')

In [3]:
string_protein_links_df.head(3)

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000272298,0,0,0,332,0,0,62,0,181,0,0,0,125,490
1,9606.ENSP00000000233,9606.ENSP00000253401,0,0,0,0,0,0,0,0,186,0,0,0,56,198
2,9606.ENSP00000000233,9606.ENSP00000401445,0,0,0,0,0,0,0,0,160,0,0,0,0,159


In [4]:
string_protein_info_df.head(3)

Unnamed: 0,protein_external_id,preferred_name,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...


In [5]:
# list of proteins preferred_name's with 2 protein_external_id's
proteins_with_redundant_ids = ["ENSG00000258947", "ENSG00000239810", "ENSG00000253117", "ENSG00000183628", "ENSG00000205457", 
                               "ENSG00000166160", "ENSG00000216937", "ENSG00000197054", "ENSG00000242852", "ENSG00000243667"]

# Displaying specific proteins with 2 protein_external_id's
string_protein_info_df[string_protein_info_df["preferred_name"].isin(proteins_with_redundant_ids)].sort_values(by='preferred_name', ascending=False)

Unnamed: 0,protein_external_id,preferred_name,protein_size,annotation
6807,9606.ENSP00000320295,ENSG00000258947,450,Tubulin beta-3 chain; Tubulin is the major con...
17817,9606.ENSP00000451560,ENSG00000258947,797,Uncharacterized protein; Tubulin is the major ...
1825,9606.ENSP00000254627,ENSG00000253117,477,Otoconin 90; It is unlikely that this protein ...
2500,9606.ENSP00000262283,ENSG00000253117,689,Otoconin-90; It is unlikely that this protein ...
4579,9606.ENSP00000295121,ENSG00000243667,357,Uncharacterized protein DKFZp434B156; Seems to...
18716,9606.ENSP00000477980,ENSG00000243667,765,WD repeat-containing protein 92; Seems to act ...
15763,9606.ENSP00000404127,ENSG00000242852,641,Zinc finger protein 709; May be involved in tr...
14173,9606.ENSP00000380840,ENSG00000242852,641,Zinc finger protein 709; May be involved in tr...
9156,9606.ENSP00000347211,ENSG00000239810,436,PRAME family member 11
18900,9606.ENSP00000480027,ENSG00000239810,478,PRAME family member 11


In [None]:
# The Code below shows how I found out the discrepancy 

In [28]:
# Count the number of DISTINCT Protein IDs from 9606.protein.info.v11.0.txt and 9606.protein.links.full.v11.0.txt

# "Interaction Table" - 9606.protein.links.full.v11.0.txt
# "Protein Info Table" - 9606.protein.info.v11.0.txt

string_unique_id_interaction_list = []
string_unique_id_interaction_one_list = string_protein_links_df["protein1"].unique()
string_unique_id_interaction_two_list = string_protein_links_df["protein2"].unique()
string_unique_id_interaction_list.extend(string_unique_id_interaction_one_list)
string_unique_id_interaction_list.extend(string_unique_id_interaction_two_list)
string_unique_id_interaction_list = pd.Series(string_unique_id_interaction_list).unique()

print()
print("Number of unique Protein 1 IDs from Interaction table:", len(string_unique_id_interaction_one_list))
print("Number of unique Protein 2 IDs from Interaction table:", len(string_unique_id_interaction_two_list))
print("Number of unique Protein IDs from Interaction table:", len(string_unique_id_interaction_list))
print("Number of unique Protein IDs from Protein Info Table:", len(string_protein_info_df["protein_external_id"]))
print("Number of unique Proteins from Protein Info Table:", len(string_protein_info_df["preferred_name"]))

print()

string_unique_id_interaction_set = set(string_unique_id_interaction_list)
string_unique_id_info_set = set(string_protein_info_df["protein_external_id"])

intersecting_nodes = string_unique_id_interaction_set & string_unique_id_info_set

string_unique_id_interaction_disjoint_set = string_unique_id_interaction_set - intersecting_nodes
string_unique_id_info_disjoint_set = string_unique_id_info_set - intersecting_nodes

print("Number of Protein IDs only in the Interaction table:", len(string_unique_id_interaction_disjoint_set))
print("Number of Protein IDs only in the Info table:",len(string_unique_id_info_disjoint_set))
print()


Number of unique Protein 1 IDs from Interaction table: 19354
Number of unique Protein 2 IDs from Interaction table: 19354
Number of unique Protein IDs from Interaction table: 19354
Number of unique Protein IDs from Protein Info Table: 19566
Number of unique Proteins from Protein Info Table: 19566

Number of Protein IDs only in the Interaction table: 0
Number of Protein IDs only in the Info table: 212



In [None]:
# From this, we know there are 212 listed Proteins NOT in any interaction (isolated vertices in the network graph)
# 212 Protein IDs are listed in 9606.protein.info.v11.0.txt but NOT in 9606.protein.links.full.v11.0.txt

In [23]:
string_protein_info_df

Unnamed: 0,protein_external_id,preferred_name,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi..."
...,...,...,...,...
19561,9606.ENSP00000485671,ENSG00000280273,120,HCG1991042
19562,9606.ENSP00000485672,ENSG00000279458,86,annotation not available
19563,9606.ENSP00000485673,ENSG00000279988,243,annotation not available
19564,9606.ENSP00000485675,ENSG00000280116,84,annotation not available


In [30]:
# There are 19566 records/rows in the 9606.protein.links.full.v11.0.txt file. 

# If there existed a 1-to-1 correspondence between a protein's IDs and a protein's preferred name, then there should be 19566 * 2 = 39132 distinct elements. 
# There is no record of a protein where their ID is the same as their preferred name because all protein IDs for Homo Sapiens start with 9606. 

# 19566 * 2 = 39132

lst = []
lst.extend(list(string_protein_info_df["protein_external_id"]))
lst.extend(list(string_protein_info_df["preferred_name"]))
lst = pd.Series(lst).unique()
print("The number of distinct elements in 'protein_external_id' + 'preferred_name' columns is:", len(lst))

The number of distinct elements in 'protein_external_id' + 'preferred_name' columns is: 39122


In [12]:
# 39132 - 39122 = 10
# 10 Proteins 

# There should have been a 1-to-1 correspondance in terms of counting the number of "protein_external_id" and "preferred_name"
# If every protein's preferred name had a distinct protein ID then this means there should have been 39132 elements in the list...
# ...not 39122...this means a redundancy exists: more protein IDs than Protein Preferred names

In [24]:
# Finding Proteins with more than 1 protein ID in 9606.protein.info.v11.0.txt

df = string_protein_info_df[["preferred_name", "protein_size"]]
df = df.groupby(["preferred_name"]).agg(["count"])
df.columns = df.columns.droplevel(0)
duplicate_protein_ids_count = df.reset_index().sort_values(by='count', ascending=False)

# only display the first 20 records of 9606.protein.info.v11.0.txt .... to show count
duplicate_protein_ids_count.head(20)

Unnamed: 0,preferred_name,count
5207,ENSG00000258947,2
5115,ENSG00000239810,2
5161,ENSG00000253117,2
5083,ENSG00000183628,2
5097,ENSG00000205457,2
5076,ENSG00000166160,2
5102,ENSG00000216937,2
5090,ENSG00000197054,2
5119,ENSG00000242852,2
5121,ENSG00000243667,2


In [33]:
# We now know only these 10 proteins have 2 protein IDs

"""

ENSG00000258947
ENSG00000239810
ENSG00000253117
ENSG00000183628
ENSG00000205457
ENSG00000166160
ENSG00000216937
ENSG00000197054
ENSG00000242852
ENSG00000243667

"""
print()




In [None]:
"""

The Protein: ENSG00000258947 has 2 IDs:

- 9606.ENSP00000320295
- 9606.ENSP00000451560

.....yet there exists interaction with similar and distinct proteins....

Protein IDs 2: (See below)
- 9606.ENSP00000485586
- 9606.ENSP00000484841
- 9606.ENSP00000484803
- 9606.ENSP00000484789
- 9606.ENSP00000000233
"""
print()

In [37]:
string_protein_links_df[string_protein_links_df["protein1"] == "9606.ENSP00000320295"].sort_values(by='protein2', ascending=False)

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
4487786,9606.ENSP00000320295,9606.ENSP00000485663,0,0,0,0,0,145,78,0,0,0,0,0,49,184
4487709,9606.ENSP00000320295,9606.ENSP00000485586,0,0,0,0,0,0,61,0,85,0,0,222,0,273
4486345,9606.ENSP00000320295,9606.ENSP00000484841,0,0,0,0,0,0,63,0,157,0,0,56,98,237
4487086,9606.ENSP00000320295,9606.ENSP00000484803,0,0,0,0,0,0,0,0,77,0,0,89,85,163
4487742,9606.ENSP00000320295,9606.ENSP00000484789,0,0,0,0,0,0,63,0,109,0,0,60,68,170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4487327,9606.ENSP00000320295,9606.ENSP00000062104,0,0,0,0,0,0,63,0,0,0,0,114,81,170
4487519,9606.ENSP00000320295,9606.ENSP00000044462,0,0,0,0,0,183,0,0,50,0,0,53,0,200
4486578,9606.ENSP00000320295,9606.ENSP00000013807,0,0,0,0,0,0,62,0,0,0,0,669,0,676
4486446,9606.ENSP00000320295,9606.ENSP00000001008,0,0,0,0,0,87,62,0,0,0,0,87,73,178


In [38]:
string_protein_links_df[string_protein_links_df["protein1"] == "9606.ENSP00000451560"].sort_values(by='protein2', ascending=False)

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
10937078,9606.ENSP00000451560,9606.ENSP00000485586,0,0,0,0,0,0,61,0,85,0,0,217,0,268
10936115,9606.ENSP00000451560,9606.ENSP00000485478,0,0,0,0,0,0,0,0,0,0,0,221,0,221
10935518,9606.ENSP00000451560,9606.ENSP00000484841,0,0,0,0,0,0,63,0,157,0,0,58,98,239
10936371,9606.ENSP00000451560,9606.ENSP00000484803,0,0,0,0,0,0,0,0,84,0,0,124,85,201
10937121,9606.ENSP00000451560,9606.ENSP00000484789,0,0,0,0,0,0,63,0,109,0,0,56,68,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10935711,9606.ENSP00000451560,9606.ENSP00000043402,0,0,0,0,0,76,62,0,50,0,0,0,107,166
10935788,9606.ENSP00000451560,9606.ENSP00000013807,0,0,0,0,0,0,62,0,0,0,0,647,0,654
10935940,9606.ENSP00000451560,9606.ENSP00000005286,0,0,0,0,0,152,0,0,0,0,0,0,0,151
10937239,9606.ENSP00000451560,9606.ENSP00000005257,0,0,0,0,0,0,0,0,167,0,0,0,49,173
