In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [17]:
# Original DIP dataset, version 20160430.
# The source file is tab-separated and has no header row, so pandas numbers
# the columns first; they are then renamed to 'first' and 'second'
# (one DIP identifier in each column of a row).
dip_20160430_orig = pd.read_table('sun_source.txt', header=None)
dip_20160430_orig.columns = ['first', 'second']
dip_20160430_orig

Unnamed: 0,first,second
0,DIP-617N,DIP-617N
1,DIP-493N,DIP-147N
2,DIP-582N,DIP-472N
3,DIP-1078N,DIP-51N
4,DIP-189N,DIP-199N
...,...,...
6908,DIP-50081N,DIP-29011N
6909,DIP-29011N,DIP-39768N
6910,DIP-39768N,DIP-48673N
6911,DIP-46409N,DIP-46411N


In [19]:
# Load the DIP <-> UniProt lookup table.
# Tab-separated, no header row: the first column is labelled 'uni' and the
# second 'dip' after loading.
dip_uni = pd.read_table('uniprot_protein_set_dip.tab', header=None)
dip_uni.columns = ['uni', 'dip']
dip_uni


Unnamed: 0,uni,dip
0,Q15435,DIP-1005N
1,P19174,DIP-100N
2,P53779,DIP-1015N
3,P22897,DIP-101N
4,P11277,DIP-1021N
...,...,...
3195,P17947,DIP-953N
3196,P40189,DIP-95N
3197,Q13424,DIP-966N
3198,P29317,DIP-96N


In [21]:
# Build the lookup dictionary: key = value from the 'dip' column,
# value = the 'uni' entry on the same row.
# If a 'dip' value occurs on several rows, the last row wins.
dict_dip_uni = dict(zip(dip_uni['dip'].values, dip_uni['uni'].values))
dict_dip_uni

{'DIP-1005N': 'Q15435',
 'DIP-100N': 'P19174',
 'DIP-1015N': 'P53779',
 'DIP-101N': 'P22897',
 'DIP-1021N': 'P11277',
 'DIP-1022N': 'P01241',
 'DIP-102N': 'P61024',
 'DIP-1030N': 'P08571',
 'DIP-1037N': 'P07996',
 'DIP-103N': 'Q96Q89',
 'DIP-1042N': 'P00519',
 'DIP-1043N': 'P10415',
 'DIP-1045N': 'P15056',
 'DIP-1047N': 'P01100',
 'DIP-1048N': 'P04049',
 'DIP-1049N': 'P09769',
 'DIP-104N': 'P27797',
 'DIP-1050N': 'P01112',
 'DIP-1051N': 'P08631',
 'DIP-1052N': 'P17275',
 'DIP-1053N': 'P17535',
 'DIP-1055N': 'P10721',
 'DIP-1056N': 'P07948',
 'DIP-1057N': 'P10242',
 'DIP-1058N': 'P01111',
 'DIP-1059N': 'P12931',
 'DIP-1061N': 'P15498',
 'DIP-1062N': 'P29597',
 'DIP-1063N': 'P62820',
 'DIP-106N': 'P19838',
 'DIP-1077N': 'Q00403',
 'DIP-1078N': 'P20226',
 'DIP-1080N': 'P01589',
 'DIP-1083N': 'P02766',
 'DIP-1098N': 'P12004',
 'DIP-109N': 'P15924',
 'DIP-1107N': 'P01033',
 'DIP-1108N': 'Q01081',
 'DIP-1118N': 'P01023',
 'DIP-1119N': 'P22303',
 'DIP-1120N': 'P02649',
 'DIP-1121N': 'P16070',

In [66]:
def dip_to_uni(dip_id):
    """Look up ``dip_id`` in the module-level ``dict_dip_uni`` mapping.

    Returns the value stored under ``dip_id``; when the key is absent the
    integer ``0`` is returned as a "no mapping" marker.
    """
    return dict_dip_uni.get(dip_id, 0)

In [69]:
# Build the DIP interaction dataset (ver 20160430) expressed in UniProt ids.
#
# Step 1: translate every cell with dip_to_uni, which yields 0 for a DIP id
#         that has no entry in the lookup dictionary. Series.map is applied
#         column by column instead of DataFrame.applymap, because applymap is
#         deprecated in recent pandas releases; the element-wise result is the
#         same.
dip_20160430_transformed = dip_20160430_orig.apply(lambda column: column.map(dip_to_uni))

# Step 2: keep only the rows where BOTH partners were translated (neither cell
#         is the 0 marker). reset_index(drop=True) renumbers the surviving rows
#         from 0 and discards the old row labels instead of keeping them as an
#         extra column.
both_partners_mapped = (dip_20160430_transformed['first'] != 0) & (dip_20160430_transformed['second'] != 0)
dip_20160430_transformed = dip_20160430_transformed[both_partners_mapped].reset_index(drop=True)
dip_20160430_transformed

Unnamed: 0,first,second
0,P01730,P01730
1,P06400,P29375
2,P20226,P09086
3,P22681,P46108
4,P06213,P27986
...,...,...
4627,P24928,P50750
4628,O75909,P24928
4629,P24928,Q9NYV4
4630,Q9NYV4,Q8N7H5


In [63]:
# Select human only
# Collect every distinct id that appears on either side of a mapped interaction.
all_organism_set = set(dip_20160430_transformed['first']) | set(dip_20160430_transformed['second'])

# Write one id per line.
# - The `with` block closes the file when the loop finishes, so everything is
#   flushed to disk before the file is uploaded in the manual step below.
# - sorted() gives the file the same line order on every run (plain set
#   iteration order is not stable between runs).
# - The file name is kept exactly as it was (including its spelling) so that
#   existing files and later steps still match.
with open('protein_set_all_organims.txt', 'w') as f:
    for entry in sorted(all_organism_set):
        f.write(entry+'\n')

# Manual step: upload the file to the UniProt (Swiss-Prot) ID mapping tool, use the
# interaction filter to check the partners and the organism field to check whether
# each entry belongs to human.
# Outcome of that check as recorded by the author: all is correct!

In [59]:
# Load the id -> sequence table.
# 'dip.fasta' is read here as a tab-separated table; the supplied names label
# its columns 'uni' and 'seq'.
uni_seq = pd.read_table('dip.fasta', names=['uni', 'seq'])
uni_seq

Unnamed: 0,uni,seq
0,Q15435,MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQ...
1,P19174,MAGAASPCANGCGPGAPSDAEVLHLCRSLEVGTVMTLFYSKKSQRP...
2,P53779,MSLHFLYYCSEPTLDVKIAFCQGFDKQVDVSYIAKHYNMSKSKVDN...
3,P22897,MRLPLLLVFASVIPGAVLLLDTRQFLIYNEDHKRCVDAVSPSAVQT...
4,P11277,MTSATEFENVGNQPPYSRINARWDAPDDELDNDNSSARLFERSRIK...
...,...,...
3195,P17947,MLQACKMEGFPLVPPPSEDLVPYDTDLYQRQTHEYYPYLSSDGESH...
3196,P40189,MLTLQTWLVQALFIFLTTESTGELLDPCGYISPESPVVQLHSNFTA...
3197,Q13424,MASGRRAPRTGLLELRAGAGSGAGGERWQRVLLSLAEDVLTVSPAD...
3198,P29317,MELQAARACFALLWGCALAAAAAAQGKEVVLLDFAAAGGELGWLTH...


In [65]:
# Build the lookup dictionary: key = value from the 'uni' column,
# value = the 'seq' entry on the same row.
# If a 'uni' value occurs on several rows, the last row wins.
dict_uni_seq = dict(zip(uni_seq['uni'].values, uni_seq['seq'].values))
dict_uni_seq

{'Q15435': 'MAAERGAGQQQSQEMMEVDRRVESEESGDEEGKKHSSGIVADLSEQSLKDGEERGEEDPEEEHELPVDMETINLDRDAEDVDLNHYRIGKIEGFEVLKKVKTLCLRQNLIKCIENLEELQSLRELDLYDNQIKKIENLEALTELEILDISFNLLRNIEGVDKLTRLKKLFLVNNKISKIENLSNLHQLQMLELGSNRIRAIENIDTLTNLESLFLGKNKITKLQNLDALTNLTVLSMQSNRLTKIEGLQNLVNLRELYLSHNGIEVIEGLENNNKLTMLDIASNRIKKIENISHLTELQEFWMNDNLLESWSDLDELKGARSLETVYLERNPLQKDPQYRRKVMLALPSVRQIDATFVRF',
 'P19174': 'MAGAASPCANGCGPGAPSDAEVLHLCRSLEVGTVMTLFYSKKSQRPERKTFQVKLETRQITWSRGADKIEGAIDIREIKEIRPGKTSRDFDRYQEDPAFRPDQSHCFVILYGMEFRLKTLSLQATSEDEVNMWIKGLTWLMEDTLQAPTPLQIERWLRKQFYSVDRNREDRISAKDLKNMLSQVNYRVPNMRFLRERLTDLEQRSGDITYGQFAQLYRSLMYSAQKTMDLPFLEASTLRAGERPELCRVSLPEFQQFLLDYQGELWAVDRLQVQEFMLSFLRDPLREIEEPYFFLDEFVTFLFSKENSVWNSQLDAVCPDTMNNPLSHYWISSSHNTYLTGDQFSSESSLEAYARCLRMGCRCIELDCWDGPDGMPVIYHGHTLTTKIKFSDVLHTIKEHAFVASEYPVILSIEDHCSIAQQRNMAQYFKKVLGDTLLTKPVEISADGLPSPNQLKRKILIKHKKLAEGSAYEEVPTSMMYSENDISNSIKNGILYLEDPVNHEWYPHYFVLTSSKIYYSEETSSDQGNEDEEEPKEVSSSTELHSNEKWFHGKLGAGRDGRHIAERLLTEYCIETGAPDGSFLVRESETFVGDYTLSFWRNGKVQHCRIHSR