In [51]:
# Import libraries
import pandas as pd
import numpy as np

In [52]:
# DIP interaction pairs, release 20160430: a tab-separated file without a
# header row, read here as two identifier columns.
dip_orig = pd.read_csv('dip_pairs.txt', header=None, delimiter='\t')
dip_orig.columns = ['first', 'second']
# Every row read from the DIP file gets the constant label 1.
dip_orig = dip_orig.assign(label=1)
dip_orig

Unnamed: 0,first,second,label
0,DIP-617N,DIP-617N,1
1,DIP-493N,DIP-147N,1
2,DIP-582N,DIP-472N,1
3,DIP-1078N,DIP-51N,1
4,DIP-189N,DIP-199N,1
...,...,...,...
6908,DIP-50081N,DIP-29011N,1
6909,DIP-29011N,DIP-39768N,1
6910,DIP-39768N,DIP-48673N,1
6911,DIP-46409N,DIP-46411N,1


In [53]:
# Collect every distinct DIP identifier that appears on either side of a pair
# and write them to a text file, one identifier per line.
# Sorting keeps the file identical between runs (iteration order of a set of
# strings is not stable across interpreter sessions).
dip_orig_proteins = sorted(set(dip_orig['first']) | set(dip_orig['second']))
# The context manager closes (and therefore flushes) the file. The handle was
# previously left open, so buffered lines might not have reached the disk.
with open('dip_full_proteins.txt', 'w') as f:
    f.writelines(id_protein + '\n' for id_protein in dip_orig_proteins)

In [54]:
# UniProt / DIP identifier mapping table (tab-separated; the first row is
# consumed as a header). The file's own column names are replaced with the
# short names used in the rest of the notebook.
dip_uni = pd.read_csv('uni2dip.tsv', delimiter='\t')
dip_uni.columns = ['uni', 'dip']
dip_uni


Unnamed: 0,uni,dip
0,Q9H3P2,DIP-48478N
1,Q15750,DIP-27524N
2,Q9ULR0,DIP-56636N
3,Q12923,DIP-40449N
4,Q9HC16,DIP-37519N
...,...,...
2735,P21333,DIP-1136N
2736,P42226,DIP-39855N
2737,A6NKD9,DIP-56878N
2738,O15379,DIP-24253N


In [55]:
# Lookup table: DIP identifier -> UniProt accession.
# When a DIP identifier occurs on several rows, the last row wins.
dict_dip_uni = dict(zip(dip_uni['dip'], dip_uni['uni']))
dict_dip_uni

{'DIP-48478N': 'Q9H3P2',
 'DIP-27524N': 'Q15750',
 'DIP-56636N': 'Q9ULR0',
 'DIP-40449N': 'Q12923',
 'DIP-37519N': 'Q9HC16',
 'DIP-29296N': 'P52294',
 'DIP-3776N': 'Q14626',
 'DIP-29750N': 'P05455',
 'DIP-368N': 'P04637',
 'DIP-5884N': 'P51686',
 'DIP-30886N': 'P27540',
 'DIP-42723N': 'Q9UMW8',
 'DIP-33829N': 'O95613',
 'DIP-394N': 'Q16619',
 'DIP-29451N': 'Q8WXD2',
 'DIP-29044N': 'P53999',
 'DIP-1037N': 'P07996',
 'DIP-6006N': 'Q14005',
 'DIP-38920N': 'Q5VWG9',
 'DIP-1108N': 'Q01081',
 'DIP-59718N': 'Q9BZE1',
 'DIP-34187N': 'P05771',
 'DIP-33720N': 'Q13489',
 'DIP-48656N': 'Q4FZB7',
 'DIP-238N': 'Q14186',
 'DIP-39705N': 'Q9UQM7',
 'DIP-6222N': 'Q13114',
 'DIP-31221N': 'P62306',
 'DIP-39709N': 'Q7L7X3',
 'DIP-523N': 'P35568',
 'DIP-31244N': 'P50570',
 'DIP-35702N': 'Q9HBA0',
 'DIP-42222N': 'Q9UNL4',
 'DIP-42077N': 'Q13098',
 'DIP-33873N': 'P05141',
 'DIP-33218N': 'P38919',
 'DIP-33123N': 'Q9UM54',
 'DIP-47624N': 'Q4G0J3',
 'DIP-29007N': 'Q07666',
 'DIP-57N': 'P17181',
 'DIP-24187N': 'P

In [56]:
def dip_to_uni(dip_id, mapping=None):
    """Translate a DIP identifier into its UniProt accession.

    Parameters
    ----------
    dip_id : hashable
        Identifier to look up (e.g. ``'DIP-368N'``).
    mapping : dict, optional
        DIP -> UniProt lookup table. When omitted, the notebook-level
        ``dict_dip_uni`` is used, which is the original behaviour.

    Returns
    -------
    The value stored for ``dip_id``, or the integer ``0`` when ``dip_id`` has
    no entry. Callers recognise unmapped identifiers by this ``0`` sentinel.
    """
    if mapping is None:
        mapping = dict_dip_uni
    # One hash lookup instead of a membership test followed by indexing.
    return mapping.get(dip_id, 0)

In [57]:
# Translate both identifier columns of the DIP pairs (release 20160430) to
# UniProt accessions and keep only the pairs where both sides could be mapped.
# dip_to_uni returns 0 for an unmapped identifier, which is what the filter
# below tests for. Only the two identifier columns are translated: the label
# column holds no identifiers, and DataFrame.applymap (previously applied to
# the whole frame) is deprecated in recent pandas releases.
dip_mapped = dip_orig[['first', 'second']].apply(lambda col: col.map(dip_to_uni))
both_mapped = (dip_mapped['first'] != 0) & (dip_mapped['second'] != 0)
# reset_index(drop=True) renumbers the surviving rows from 0 and discards the
# old row numbers instead of keeping them as an extra column.
dip_transformed = dip_mapped[both_mapped].reset_index(drop=True).assign(label=1)
# Written without header or index so the file holds only the three data columns.
dip_transformed.to_csv('dip_transformed.tsv', sep='\t', header=False, index=False)
dip_transformed

Unnamed: 0,first,second,label
0,P01730,P01730,1
1,P06400,P29375,1
2,P22681,P46108,1
3,P06213,P27986,1
4,P15172,Q08999,1
...,...,...,...
3459,P24928,P50750,1
3460,O75909,P24928,1
3461,P24928,Q9NYV4,1
3462,Q9NYV4,Q8N7H5,1


In [58]:
# Select human only.
# Export every distinct UniProt accession found in the mapped pairs so that
# the organism of each one can be checked outside the notebook.
all_organism_set = set(dip_transformed['first']) | set(dip_transformed['second'])
# The context manager closes and flushes the file before it is uploaded (the
# handle was previously left open); entries are sorted so the file is
# identical between runs.
with open('dip_filtered_proteins.txt', 'w') as f:
    f.writelines(entry + '\n' for entry in sorted(all_organism_set))

# Manual step: upload the file to the UniProt (Swiss-Prot) ID-mapping service,
# use the interaction filter to check the partners, and use the organism
# column to check that each entry belongs to human.
# Result recorded by the original author: all entries are correct.

In [70]:
# UniProt accession / sequence table. Despite the .fasta extension the file is
# parsed as tab-separated text with no header row and two named columns.
uni_seq = pd.read_csv('uni_seq.fasta', header=None, names=['uni', 'seq'], delimiter='\t')
uni_seq

Unnamed: 0,uni,seq
0,P06493,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...
1,P49763,MPVMRLFPCFLQLLAGLALPAVPPQQWALSAGNGSSEVEVVPFQEV...
2,Q9NUB1,MAARTLGRGVGRLLGSLRGLSGQPARPPCGVSAPRRAASGPSGSAP...
3,Q9UQN3,MASLFKKKTVDDVIKEQNRELRGTQRAIIRDRAALEKQEKQLELEI...
4,Q13336,MEDSPTMVRVDSPTMVRGENQVSPCQGRRCFPKALGYVTGDMKELA...
...,...,...
2294,O95714,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...
2295,Q16873,MKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEF...
2296,Q9Y2Y0,MDALEGESFALSFSSASDAEFDAVVGYLEDIIMDDEFQLLQRNFMD...
2297,P25054,MAAASYDQLLKQVEALKMENSNLRQELEDNSNHLTKLETEASNMKE...


In [71]:
# Lookup table: UniProt accession -> sequence string.
# When an accession occurs on several rows, the last row wins.
dict_uni_seq = dict(zip(uni_seq['uni'], uni_seq['seq']))
dict_uni_seq

{'P06493': 'MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPSTAIREISLLKELRHPNIVSLQDVLMQDSRLYLIFEFLSMDLKKYLDSIPPGQYMDSSLVKSYLYQILQGIVFCHSRRVLHRDLKPQNLLIDDKGTIKLADFGLARAFGIPIRVYTHEVVTLWYRSPEVLLGSARYSTPVDIWSIGTIFAELATKKPLFHGDSEIDQLFRIFRALGTPNNEVWPEVESLQDYKNTFPKWKPGSLASHVKNLDENGLDLLSKMLIYDPAKRISGKMALNHPYFNDLDNQIKKM',
 'P49763': 'MPVMRLFPCFLQLLAGLALPAVPPQQWALSAGNGSSEVEVVPFQEVWGRSYCRALERLVDVVSEYPSEVEHMFSPSCVSLLRCTGCCGDENLHCVPVETANVTMQLLKIRSGDRPSYVELTFSQHVRCECRHSPGRQSPDMPGDFRADAPSFLPPRRSLPMLFRMEWGCALTGSQSAVWPSSPVPEEIPRMHPGRNGKKQQRKPLREKMKPERCGDAVPRR',
 'Q9NUB1': 'MAARTLGRGVGRLLGSLRGLSGQPARPPCGVSAPRRAASGPSGSAPAVAAAAAQPGSYPALSAQAAREPAAFWGPLARDTLVWDTPYHTVWDCDFSTGKIGWFLGGQLNVSVNCLDQHVRKSPESVALIWERDEPGTEVRITYRELLETTCRLANTLKRHGVHRGDRVAIYMPVSPLAVAAMLACARIGAVHTVIFAGFSAESLAGRINDAKCKVVITFNQGLRGGRVVELKKIVDEAVKHCPTVQHVLVAHRTDNKVHMGDLDVPLEQEMAKEDPVCAPESMGSEDMLFMLYTSGSTGMPKGIVHTQAGYLLYAALTHKLVFDHQPGDIFGCVADIGWITGHSYVVYGPLCNGATSVLFESTPVYPNAGRYWETVERLKINQFYGAPTAVRLLLKYGDAWVKKYDRSSLRTLGSVGEPINCEAWEWLHRVVGDSRCTLV

In [72]:
# Reference pair set ("pan") that the DIP pairs are compared against below:
# a tab-separated file without a header row, read as two identifier columns
# plus a label column.
# NOTE(review): the previous comment mentioned HPRD; the file actually read
# here is pan_pairs.tsv - confirm which dataset it contains.
pan_df = pd.read_csv('pan_pairs.tsv', delimiter='\t', header=None)
pan_df.columns = ['first', 'second', 'label']
pan_df

Unnamed: 0,first,second,label
0,Q13114,P26842,1
1,P09016,Q09472,1
2,Q9Y6K9,Q14790,1
3,Q01780,Q9Y333,1
4,Q6UVK1,P60953,1
...,...,...,...
62087,Q9UFW8,Q9NRV9,0
62088,Q9GZR1,P55212,0
62089,P21246,Q9H213,0
62090,P15923,Q9H9P8,0


In [73]:
# One frozenset per row of pan_df, in row order. The whole row goes into the
# set, so each set holds the label next to the two identifiers and does not
# depend on which identifier was in which column.
list_pairs_pan = [frozenset(pan_df.iloc[row_no, :]) for row_no in range(len(pan_df))]
list_pairs_pan

[frozenset({1, 'P26842', 'Q13114'}),
 frozenset({1, 'P09016', 'Q09472'}),
 frozenset({1, 'Q14790', 'Q9Y6K9'}),
 frozenset({1, 'Q01780', 'Q9Y333'}),
 frozenset({1, 'P60953', 'Q6UVK1'}),
 frozenset({1, 'P04406', 'P14672'}),
 frozenset({1, 'P06213', 'Q9P104'}),
 frozenset({1, 'P35222', 'P48729'}),
 frozenset({1, 'Q04724', 'Q15475'}),
 frozenset({1, 'P02452', 'P21810'}),
 frozenset({1, 'P60604', 'Q05086'}),
 frozenset({1, 'P00451', 'P00740'}),
 frozenset({1, 'O00287', 'O14593'}),
 frozenset({1, 'P55036', 'Q9UMX0'}),
 frozenset({1, 'Q92900', 'Q9HAU5'}),
 frozenset({1, 'O15085', 'O95477'}),
 frozenset({1, 'P0CG47', 'P46527'}),
 frozenset({1, 'O95990', 'P27824'}),
 frozenset({1, 'P36897', 'Q9NRW1'}),
 frozenset({1, 'P21246', 'Q96NT1'}),
 frozenset({1, 'O95071', 'P06401'}),
 frozenset({1, 'O14965', 'Q96EP1'}),
 frozenset({1, 'P68032', 'Q14847'}),
 frozenset({1, 'P30926', 'Q9UMX0'}),
 frozenset({1, 'Q92558', 'Q96F07'}),
 frozenset({1, 'P14672', 'Q13033'}),
 frozenset({1, 'P05549', 'P29466'}),
 

In [63]:
# One frozenset per row of dip_transformed, in row order. The label is part of
# the set, and a row whose two identifiers are equal collapses to a set with a
# single identifier plus the label.
list_pairs_dip = [frozenset(dip_transformed.iloc[row_no, :]) for row_no in range(len(dip_transformed))]
list_pairs_dip

[frozenset({1, 'P01730'}),
 frozenset({1, 'P06400', 'P29375'}),
 frozenset({1, 'P22681', 'P46108'}),
 frozenset({1, 'P06213', 'P27986'}),
 frozenset({1, 'P15172', 'Q08999'}),
 frozenset({1, 'P17947', 'Q08945'}),
 frozenset({1, 'P25490', 'Q09472'}),
 frozenset({1, 'P36956', 'Q92793'}),
 frozenset({1, 'P15172', 'P15923'}),
 frozenset({1, 'P00519', 'Q13315'}),
 frozenset({1, 'P31314', 'Q15172'}),
 frozenset({1, 'P15172'}),
 frozenset({1, 'P10415'}),
 frozenset({1, 'P10415', 'Q07812'}),
 frozenset({1, 'P19438', 'Q15628'}),
 frozenset({1, 'P01350', 'P32239'}),
 frozenset({1, 'P43405', 'Q15349'}),
 frozenset({1, 'P09874', 'P42574'}),
 frozenset({1, 'P04637', 'Q00987'}),
 frozenset({1, 'Q16514'}),
 frozenset({1, 'P49848', 'Q16594'}),
 frozenset({1, 'Q00403', 'Q16594'}),
 frozenset({1, 'P08047', 'Q00403'}),
 frozenset({1, 'P25490', 'Q00403'}),
 frozenset({1, 'P04637', 'Q16594'}),
 frozenset({1, 'P04637', 'P49848'}),
 frozenset({1, 'P08047', 'P25490'}),
 frozenset({1, 'P06400', 'P32519'}),
 fro

In [74]:
# List of new pairs: DIP pairs (in their original order) that do not occur in
# the pan pair list.
# Membership is tested against a set built once, so each lookup is a hash
# probe instead of a scan over the whole pan list; the result is the same.
# NOTE(review): the compared sets include the label, so a DIP pair (label 1)
# only matches a pan row that has the same two identifiers AND label 1 -
# confirm that pan rows labelled 0 are meant to be ignored here.
pan_pair_lookup = set(list_pairs_pan)
list_pairs_dip_new = [pairs for pairs in list_pairs_dip if pairs not in pan_pair_lookup]
list_pairs_dip_new

[frozenset({1, 'P01730'}),
 frozenset({1, 'P15172', 'Q08999'}),
 frozenset({1, 'P31314', 'Q15172'}),
 frozenset({1, 'P15172'}),
 frozenset({1, 'P10415'}),
 frozenset({1, 'P01350', 'P32239'}),
 frozenset({1, 'P43405', 'Q15349'}),
 frozenset({1, 'P09874', 'P42574'}),
 frozenset({1, 'Q16514'}),
 frozenset({1, 'P08047', 'Q00403'}),
 frozenset({1, 'P25490', 'Q00403'}),
 frozenset({1, 'P04637', 'P49848'}),
 frozenset({1, 'P06400', 'P29374'}),
 frozenset({1, 'P06127', 'Q00839'}),
 frozenset({1, 'P06127', 'P06239'}),
 frozenset({1, 'P25445'}),
 frozenset({1, 'P14778', 'P51617'}),
 frozenset({1, 'P13747', 'P27797'}),
 frozenset({1, 'P09661', 'Q12874'}),
 frozenset({1, 'P22681', 'Q12965'}),
 frozenset({1, 'P04899', 'P81274'}),
 frozenset({1, 'P08069', 'P29353'}),
 frozenset({1, 'P06213', 'P29353'}),
 frozenset({1, 'P14598', 'P19878'}),
 frozenset({1, 'P30305', 'P31946'}),
 frozenset({1, 'Q06124', 'Q14451'}),
 frozenset({1, 'P05067'}),
 frozenset({1, 'P10415', 'Q07820'}),
 frozenset({1, 'P19438'}

In [75]:
# Map each DIP pair to its position in list_pairs_dip.
# A pair that occurs more than once ends up mapped to its last position.
dict_pairs_dip = {pair: position for position, pair in enumerate(list_pairs_dip)}
dict_pairs_dip

{frozenset({1, 'P01730'}): 0,
 frozenset({1, 'P06400', 'P29375'}): 1,
 frozenset({1, 'P22681', 'P46108'}): 2,
 frozenset({1, 'P06213', 'P27986'}): 3,
 frozenset({1, 'P15172', 'Q08999'}): 4,
 frozenset({1, 'P17947', 'Q08945'}): 5,
 frozenset({1, 'P25490', 'Q09472'}): 6,
 frozenset({1, 'P36956', 'Q92793'}): 7,
 frozenset({1, 'P15172', 'P15923'}): 8,
 frozenset({1, 'P00519', 'Q13315'}): 9,
 frozenset({1, 'P31314', 'Q15172'}): 10,
 frozenset({1, 'P15172'}): 11,
 frozenset({1, 'P10415'}): 12,
 frozenset({1, 'P10415', 'Q07812'}): 13,
 frozenset({1, 'P19438', 'Q15628'}): 14,
 frozenset({1, 'P01350', 'P32239'}): 15,
 frozenset({1, 'P43405', 'Q15349'}): 16,
 frozenset({1, 'P09874', 'P42574'}): 17,
 frozenset({1, 'P04637', 'Q00987'}): 18,
 frozenset({1, 'Q16514'}): 19,
 frozenset({1, 'P49848', 'Q16594'}): 20,
 frozenset({1, 'Q00403', 'Q16594'}): 21,
 frozenset({1, 'P08047', 'Q00403'}): 22,
 frozenset({1, 'P25490', 'Q00403'}): 23,
 frozenset({1, 'P04637', 'Q16594'}): 24,
 frozenset({1, 'P04637', 

In [66]:
# Row positions (within dip_transformed) of the new interaction pairs.
# dict_pairs_dip stores a single position per distinct pair, so a pair that
# occurs several times in list_pairs_dip_new would produce the same position
# repeatedly and the row would later be selected more than once.
# dict.fromkeys drops such repeats while keeping first-seen order.
id_new = list(dict.fromkeys(dict_pairs_dip[key] for key in list_pairs_dip_new))
id_new

[0,
 4,
 10,
 11,
 12,
 15,
 16,
 17,
 19,
 22,
 23,
 25,
 28,
 34,
 35,
 44,
 47,
 48,
 49,
 51,
 55,
 56,
 57,
 60,
 62,
 63,
 65,
 72,
 73,
 76,
 80,
 83,
 87,
 88,
 89,
 91,
 94,
 95,
 96,
 97,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 108,
 112,
 113,
 119,
 120,
 122,
 125,
 127,
 129,
 130,
 134,
 137,
 138,
 139,
 140,
 143,
 144,
 145,
 146,
 148,
 149,
 150,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 165,
 167,
 168,
 171,
 173,
 174,
 177,
 178,
 179,
 183,
 184,
 185,
 187,
 190,
 192,
 195,
 201,
 204,
 205,
 207,
 210,
 215,
 216,
 217,
 218,
 219,
 220,
 223,
 224,
 226,
 227,
 228,
 234,
 237,
 239,
 243,
 245,
 246,
 247,
 255,
 257,
 261,
 265,
 268,
 270,
 277,
 279,
 291,
 295,
 296,
 303,
 304,
 305,
 307,
 311,
 312,
 314,
 318,
 331,
 332,
 333,
 334,
 338,
 341,
 342,
 346,
 347,
 348,
 350,
 351,
 352,
 359,
 360,
 362,
 365,
 368,
 369,
 372,
 374,
 383,
 385,
 386,
 391,
 393,
 400,
 403,
 405,
 406,
 407,
 412,
 413,
 414,
 415,
 416,
 417,
 418,
 420,
 423

In [77]:
# Select the rows of dip_transformed at the positions listed in id_new and
# save them as a tab-separated file without header or index column.
dip_new = dip_transformed.iloc[id_new]
dip_new.to_csv('dip_new.tsv', header=False, index=False, sep='\t')
dip_new

Unnamed: 0,first,second,label
0,P01730,P01730,1
4,P15172,Q08999,1
10,P31314,Q15172,1
11,P15172,P15172,1
12,P10415,P10415,1
...,...,...,...
3457,O75909,Q6PD62,1
3458,Q8N7H5,P50750,1
3461,P24928,Q9NYV4,1
3462,Q9NYV4,Q8N7H5,1


In [85]:
# Gather the distinct proteins that appear on either side of the new pairs
# and show how many there are (nothing is written to disk in this cell).
set_protein_new = set(dip_new['first']).union(dip_new['second'])
len(set_protein_new)


1986

KeyError: 'P01350'

In [None]:
### The interaction pairs file is dip_new.tsv
### The UniProt ID -> sequence dictionary is built from uni_seq.fasta
