In [26]:
# Import libraries
import pandas as pd
import numpy as np

In [27]:
# Original DIP interaction pairs (version 20160430): tab-separated, no header row.
# The two columns are named 'first'/'second' and every row gets label 1.
dip_orig = (
    pd.read_csv('dip_pairs.txt', sep='\t', header=None)
    .set_axis(['first', 'second'], axis=1)
    .assign(label=1)
)
dip_orig

Unnamed: 0,first,second,label
0,DIP-617N,DIP-617N,1
1,DIP-493N,DIP-147N,1
2,DIP-582N,DIP-472N,1
3,DIP-1078N,DIP-51N,1
4,DIP-189N,DIP-199N,1
...,...,...,...
6908,DIP-50081N,DIP-29011N,1
6909,DIP-29011N,DIP-39768N,1
6910,DIP-39768N,DIP-48673N,1
6911,DIP-46409N,DIP-46411N,1


In [28]:
# Mapping table read from a tab-separated file without a header row;
# column 0 is named 'uni' and column 1 is named 'dip'.
dip_uni = (
    pd.read_csv('uniprot_protein_set_dip.tab', sep='\t', header=None)
    .set_axis(['uni', 'dip'], axis=1)
)
dip_uni


Unnamed: 0,uni,dip
0,Q15435,DIP-1005N
1,P19174,DIP-100N
2,P53779,DIP-1015N
3,P22897,DIP-101N
4,P11277,DIP-1021N
...,...,...
3195,P17947,DIP-953N
3196,P40189,DIP-95N
3197,Q13424,DIP-966N
3198,P29317,DIP-96N


In [29]:
# Lookup table keyed by the 'dip' column, giving the 'uni' value of the same row.
# If a 'dip' value occurs in several rows, the last row wins.
dict_dip_uni = dict(zip(dip_uni['dip'].values, dip_uni['uni'].values))
dict_dip_uni

{'DIP-1005N': 'Q15435',
 'DIP-100N': 'P19174',
 'DIP-1015N': 'P53779',
 'DIP-101N': 'P22897',
 'DIP-1021N': 'P11277',
 'DIP-1022N': 'P01241',
 'DIP-102N': 'P61024',
 'DIP-1030N': 'P08571',
 'DIP-1037N': 'P07996',
 'DIP-103N': 'Q96Q89',
 'DIP-1042N': 'P00519',
 'DIP-1043N': 'P10415',
 'DIP-1045N': 'P15056',
 'DIP-1047N': 'P01100',
 'DIP-1048N': 'P04049',
 'DIP-1049N': 'P09769',
 'DIP-104N': 'P27797',
 'DIP-1050N': 'P01112',
 'DIP-1051N': 'P08631',
 'DIP-1052N': 'P17275',
 'DIP-1053N': 'P17535',
 'DIP-1055N': 'P10721',
 'DIP-1056N': 'P07948',
 'DIP-1057N': 'P10242',
 'DIP-1058N': 'P01111',
 'DIP-1059N': 'P12931',
 'DIP-1061N': 'P15498',
 'DIP-1062N': 'P29597',
 'DIP-1063N': 'P62820',
 'DIP-106N': 'P19838',
 'DIP-1077N': 'Q00403',
 'DIP-1078N': 'P20226',
 'DIP-1080N': 'P01589',
 'DIP-1083N': 'P02766',
 'DIP-1098N': 'P12004',
 'DIP-109N': 'P15924',
 'DIP-1107N': 'P01033',
 'DIP-1108N': 'Q01081',
 'DIP-1118N': 'P01023',
 'DIP-1119N': 'P22303',
 'DIP-1120N': 'P02649',
 'DIP-1121N': 'P16070',

In [30]:
def dip_to_uni(dip_id):
    """Translate a DIP ID through the module-level ``dict_dip_uni`` mapping.

    Args:
        dip_id: Key to look up in ``dict_dip_uni``.

    Returns:
        The value stored for ``dip_id`` when it is a key of ``dict_dip_uni``;
        otherwise the integer ``0``, which callers use as a "no mapping" marker.
    """
    return dict_dip_uni.get(dip_id, 0)

In [38]:
# Build the DIP interaction dataset (version 20160430) expressed in UniProt IDs.
# Both ID columns are translated with dip_to_uni, which returns 0 for an ID that has
# no mapping; rows where either partner is 0 are dropped.
# Only 'first' and 'second' are mapped: 'label' is not an ID, and DataFrame.applymap
# is deprecated in recent pandas, so Series.map is applied per column instead.
dip_transformed = (
    dip_orig[['first', 'second']]
    .apply(lambda col: col.map(dip_to_uni))
    .loc[lambda df: (df['first'] != 0) & (df['second'] != 0)]
    .reset_index(drop=True)  # drop=True discards the old row numbers -> fresh 0..n-1 index
    .assign(label=1)
)
dip_transformed

Unnamed: 0,first,second,label
0,P01730,P01730,1
1,P06400,P29375,1
2,P20226,P09086,1
3,P22681,P46108,1
4,P06213,P27986,1
...,...,...,...
4627,P24928,P50750,1
4628,O75909,P24928,1
4629,P24928,Q9NYV4,1
4630,Q9NYV4,Q8N7H5,1


In [39]:
# Export every protein ID that appears in the mapped DIP pairs, one ID per line,
# so that the organism of each protein can be checked by hand (see the note below).
# Reads dip_transformed, the mapped frame built earlier in this notebook.
all_organism_set = set(dip_transformed['first']) | set(dip_transformed['second'])
# 'with' guarantees the file is flushed and closed; sorted() makes the file content
# independent of set iteration order.
with open('protein_set_all_organims.txt', 'w') as f:
    for entry in sorted(all_organism_set):
        f.write(entry + '\n')

# Manual step: upload the file to the UniProt (Swiss-Prot) ID mapping, select the
# interaction filter to check partners and the organism to check whether each protein
# belongs to human.
# Outcome recorded for that check: all is correct.

In [51]:
# Table with one 'uni' ID and its 'seq' string per row. Despite the .fasta extension,
# the file is parsed as two tab-separated columns with no header row.
uni_seq = pd.read_csv(
    'uni_seq.fasta',
    sep='\t',
    header=None,
    names=['uni', 'seq'],
)
uni_seq

Unnamed: 0,uni,seq
0,Q15435,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...
1,P19174,MAGAASPCANGCGPGAPSDAEVLHLCRSLEVGTVMTLFYSKKSQRP...
2,P53779,MSLHFLYYCSEPTLDVKIAFCQGFDKQVDVSYIAKHYNMSKSKVDN...
3,P22897,MRLPLLLVFASVIPGAVLLLDTRQFLIYNEDHKRCVDAVSPSAVQT...
4,P11277,MTSATEFENVGNQPPYSRINARWDAPDDELDNDNSSARLFERSRIK...
...,...,...
3195,P17947,MLQACKMEGFPLVPPPSEDLVPYDTDLYQRQTHEYYPYLSSDGESH...
3196,P40189,MLTLQTWLVQALFIFLTTESTGELLDPCGYISPESPVVQLHSNFTA...
3197,Q13424,MASGRRAPRTGLLELRAGAGSGAGGERWQRVLLSLAEDVLTVSPAD...
3198,P29317,MELQAARACFALLWGCALAAAAAAQGKEVVLLDFAAAGGELGWLTH...


In [41]:
# Lookup table keyed by the 'uni' column, giving the 'seq' value of the same row.
# If a 'uni' value occurs in several rows, the last row wins.
dict_uni_seq = dict(zip(uni_seq['uni'].values, uni_seq['seq'].values))
dict_uni_seq

{'Q15435': 'MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQSLKDGEERGEEDPEEEHELPVDMETINLDRDAEDVDLNHYRIGKIEGFEVLKKVKTLCLRQNLIKCIENLEELQSLRELDLYDNQIKKIENLEALTELEILDISFNLLRNIEGVDKLTRLKKLFLVNNKISKIENLSNLHQLQMLELGSNRIRAIENIDTLTNLESLFLGKNKITKLQNLDALTNLTVLSMQSNRLTKIEGLQNLVNLRELYLSHNGIEVIEGLENNNKLTMLDIASNRIKKIENISHLTELQEFWMNDNLLESWSDLDELKGARSLETVYLERNPLQKDPQYRRKVMLALPSVRQIDATFVRF',
 'P19174': 'MAGAASPCANGCGPGAPSDAEVLHLCRSLEVGTVMTLFYSKKSQRPERKTFQVKLETRQITWSRGADKIEGAIDIREIKEIRPGKTSRDFDRYQEDPAFRPDQSHCFVILYGMEFRLKTLSLQATSEDEVNMWIKGLTWLMEDTLQAPTPLQIERWLRKQFYSVDRNREDRISAKDLKNMLSQVNYRVPNMRFLRERLTDLEQRSGDITYGQFAQLYRSLMYSAQKTMDLPFLEASTLRAGERPELCRVSLPEFQQFLLDYQGELWAVDRLQVQEFMLSFLRDPLREIEEPYFFLDEFVTFLFSKENSVWNSQLDAVCPDTMNNPLSHYWISSSHNTYLTGDQFSSESSLEAYARCLRMGCRCIELDCWDGPDGMPVIYHGHTLTTKIKFSDVLHTIKEHAFVASEYPVILSIEDHCSIAQQRNMAQYFKKVLGDTLLTKPVEISADGLPSPNQLKRKILIKHKKLAEGSAYEEVPTSMMYSENDISNSIKNGILYLEDPVNHEWYPHYFVLTSSKIYYSEETSSDQGNEDEEEPKEVSSSTELHSNEKWFHGKLGAGRDGRHIAERLLTEYCIETGAPDGSFLVRESETFVGDYTLSFWRNGKVQHCRIHSR

In [42]:
# Load the "pan" pair dataset: tab-separated, no header row, columns named
# first / second / label.
pan_df = (
    pd.read_csv('pan_uni_pairs.tsv', sep='\t', header=None)
    .set_axis(['first', 'second', 'label'], axis=1)
)
pan_df

Unnamed: 0,first,second,label
0,Q13114,P26842,1
1,P09016,Q09472,1
2,Q9Y6K9,Q14790,1
3,Q01780,Q9Y333,1
4,Q6UVK1,P60953,1
...,...,...,...
61887,Q9UFW8,O60759,0
61888,A5D8V6,P55212,0
61889,P21246,Q9H213,0
61890,P15923,Q15172,0


In [43]:
# One frozenset per pan_df row, built from all values of the row (both IDs and the
# label), so two rows compare equal regardless of the order of their two IDs.
list_pairs_pan = [frozenset(pan_df.iloc[row, :]) for row in range(len(pan_df))]
list_pairs_pan

[frozenset({1, 'P26842', 'Q13114'}),
 frozenset({1, 'P09016', 'Q09472'}),
 frozenset({1, 'Q14790', 'Q9Y6K9'}),
 frozenset({1, 'Q01780', 'Q9Y333'}),
 frozenset({1, 'P60953', 'Q6UVK1'}),
 frozenset({1, 'P04406', 'P14672'}),
 frozenset({1, 'P06213', 'Q9P104'}),
 frozenset({1, 'P35222', 'P48729'}),
 frozenset({1, 'Q04724', 'Q15475'}),
 frozenset({1, 'P02452', 'P21810'}),
 frozenset({1, 'P60604', 'Q05086'}),
 frozenset({1, 'P00451', 'P00740'}),
 frozenset({1, 'O00287', 'O14593'}),
 frozenset({1, 'P55036', 'Q9UMX0'}),
 frozenset({1, 'Q13114', 'Q9HAU5'}),
 frozenset({1, 'O15085', 'O95477'}),
 frozenset({1, 'P0CG47', 'P46527'}),
 frozenset({1, 'O95990', 'P27824'}),
 frozenset({1, 'P36897', 'Q9NRW1'}),
 frozenset({1, 'P21246', 'Q96NT1'}),
 frozenset({1, 'O95071', 'P06401'}),
 frozenset({1, 'O14965', 'Q96EP1'}),
 frozenset({1, 'P68032', 'Q14847'}),
 frozenset({1, 'P30926', 'Q9UMX0'}),
 frozenset({1, 'Q92558', 'Q96F07'}),
 frozenset({1, 'P14672', 'Q13033'}),
 frozenset({1, 'P05549', 'P29466'}),
 

In [44]:
# One frozenset per dip_transformed row, built from all values of the row (both IDs
# and the label). A row whose two IDs are identical collapses to a two-element set.
list_pairs_dip = [
    frozenset(dip_transformed.iloc[row, :]) for row in range(len(dip_transformed))
]
list_pairs_dip

[frozenset({1, 'P01730'}),
 frozenset({1, 'P06400', 'P29375'}),
 frozenset({1, 'P09086', 'P20226'}),
 frozenset({1, 'P22681', 'P46108'}),
 frozenset({1, 'P06213', 'P27986'}),
 frozenset({1, 'P04049', 'P31946'}),
 frozenset({1, 'P15172', 'Q08999'}),
 frozenset({1, 'P17947', 'Q08945'}),
 frozenset({1, 'P25490', 'Q09472'}),
 frozenset({1, 'P36956', 'Q92793'}),
 frozenset({1, 'P15172', 'P15923'}),
 frozenset({1, 'P15884', 'Q02535'}),
 frozenset({1, 'P00519', 'Q13315'}),
 frozenset({1, 'P31314', 'Q15172'}),
 frozenset({1, 'P15172'}),
 frozenset({1, 'P08571', 'P18428'}),
 frozenset({1, 'P10415'}),
 frozenset({1, 'P10415', 'Q07812'}),
 frozenset({1, 'P19438', 'Q15628'}),
 frozenset({1, 'P01584', 'P29466'}),
 frozenset({1, 'P01350', 'P32239'}),
 frozenset({1, 'P43405', 'Q15349'}),
 frozenset({1, 'P09874', 'P42574'}),
 frozenset({1, 'P04637', 'Q00987'}),
 frozenset({1, 'Q16514'}),
 frozenset({1, 'P20226', 'P21675'}),
 frozenset({1, 'P49848', 'Q16594'}),
 frozenset({1, 'Q00403', 'Q16594'}),
 fro

In [45]:
# List of new pairs: DIP pairs that do not occur among the pan pairs.
# Membership is tested against a set (hash lookup) instead of the list, which would
# rescan every pan pair for each DIP pair; the resulting list and its order are the same.
_pairs_pan_lookup = set(list_pairs_pan)
list_pairs_dip_new = [pairs for pairs in list_pairs_dip if pairs not in _pairs_pan_lookup]
list_pairs_dip_new

[frozenset({1, 'P01730'}),
 frozenset({1, 'P15172', 'Q08999'}),
 frozenset({1, 'P15884', 'Q02535'}),
 frozenset({1, 'P31314', 'Q15172'}),
 frozenset({1, 'P15172'}),
 frozenset({1, 'P10415'}),
 frozenset({1, 'P01350', 'P32239'}),
 frozenset({1, 'P43405', 'Q15349'}),
 frozenset({1, 'P09874', 'P42574'}),
 frozenset({1, 'Q16514'}),
 frozenset({1, 'P20226', 'Q15572'}),
 frozenset({1, 'P08047', 'Q00403'}),
 frozenset({1, 'P25490', 'Q00403'}),
 frozenset({1, 'P04637', 'P49848'}),
 frozenset({1, 'P06400', 'P29374'}),
 frozenset({1, 'P06127', 'Q8NEB9'}),
 frozenset({1, 'P06127', 'Q00839'}),
 frozenset({1, 'P06127', 'P06239'}),
 frozenset({1, 'P25445'}),
 frozenset({1, 'P14778', 'P51617'}),
 frozenset({1, 'P13747', 'P27797'}),
 frozenset({1, 'P14859', 'P20226'}),
 frozenset({1, 'P09661', 'Q12874'}),
 frozenset({1, 'P22681', 'Q12965'}),
 frozenset({1, 'P04899', 'P81274'}),
 frozenset({1, 'P08069', 'P29353'}),
 frozenset({1, 'P06213', 'P29353'}),
 frozenset({1, 'P14598', 'P19878'}),
 frozenset({1,

In [47]:
# Map each DIP pair (frozenset) to its position in list_pairs_dip.
# NOTE(review): a pair that occurs more than once keeps only its last position.
dict_pairs_dip = {pair: position for position, pair in enumerate(list_pairs_dip)}

In [48]:
# Positions (as stored in dict_pairs_dip) of the pairs identified as new.
id_new = list(map(dict_pairs_dip.__getitem__, list_pairs_dip_new))
id_new

[0,
 6,
 11,
 13,
 14,
 16,
 20,
 21,
 22,
 24,
 28,
 29,
 30,
 32,
 35,
 44,
 45,
 46,
 55,
 58,
 59,
 60,
 61,
 63,
 68,
 69,
 70,
 74,
 76,
 78,
 80,
 86,
 87,
 88,
 91,
 93,
 96,
 99,
 101,
 104,
 105,
 106,
 107,
 108,
 110,
 113,
 114,
 115,
 116,
 117,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 130,
 132,
 135,
 136,
 140,
 141,
 142,
 144,
 150,
 151,
 153,
 156,
 161,
 163,
 164,
 171,
 174,
 175,
 176,
 177,
 178,
 182,
 183,
 184,
 185,
 186,
 188,
 189,
 190,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 206,
 208,
 209,
 215,
 218,
 220,
 226,
 227,
 229,
 233,
 234,
 235,
 237,
 238,
 241,
 243,
 245,
 249,
 250,
 257,
 261,
 262,
 264,
 267,
 270,
 277,
 278,
 279,
 280,
 281,
 282,
 287,
 288,
 290,
 291,
 292,
 293,
 306,
 309,
 312,
 314,
 315,
 317,
 319,
 321,
 325,
 326,
 327,
 328,
 340,
 342,
 343,
 344,
 346,
 347,
 352,
 353,
 357,
 361,
 363,
 373,
 375,
 389,
 391,
 395,
 397,
 407,
 408,
 409,
 412,
 416,
 417,
 418,
 420,
 426,
 440,
 442,
 443,
 444,
 449,
 4

In [49]:
# Pick the rows of the new interaction pairs by position and save them as
# tab-separated text without a header row or an index column.
dip_new = dip_transformed.iloc[id_new]
dip_new.to_csv(
    'hip_new.tsv',
    sep='\t',
    header=None,
    index=None,
)
dip_new

Unnamed: 0,first,second,label
0,P01730,P01730,1
6,P15172,Q08999,1
11,P15884,Q02535,1
13,P31314,Q15172,1
14,P15172,P15172,1
...,...,...,...
4625,O75909,Q6PD62,1
4626,Q8N7H5,P50750,1
4629,P24928,Q9NYV4,1
4630,Q9NYV4,Q8N7H5,1


In [50]:
# Collect the protein IDs of the new pairs (a list, so repeats are kept) and look up
# the sequence of each one in dict_uni_seq; an ID missing from dict_uni_seq raises KeyError.
# NOTE(review): this cell only builds the mapping in memory; nothing is written to a file.
set_protein_new = dip_new['first'].tolist() + dip_new['second'].tolist()
dict_protein_new = {protein: dict_uni_seq[protein] for protein in set_protein_new}

In [None]:
### Interaction pairs file is hip_new.tsv (the name passed to to_csv when dip_new is saved)
### Dictionary file is uni_seq.fasta
