In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the original Pan pair table: a headerless TSV with two protein
# identifiers and a 0/1 label per row.
pan_orig = pd.read_csv(
    'pan_orig.tsv',
    header=None,
    sep='\t',
).set_axis(['first', 'second', 'label'], axis=1)
pan_orig

Unnamed: 0,first,second,label
0,NP_663777.1,NP_001233.1,1
1,NP_055436.2,NP_001420.2,1
2,NP_003630.1,NP_001073594.1,1
3,NP_001001998.1,NP_067000.1,1
4,NP_001888.2,NP_001782.1,1
...,...,...,...
73105,NP_003654.3,Q9NRV9,0
73106,Q9GZR1,NP_001217.2,0
73107,NP_002816.1,NP_054780.2,0
73108,NP_003191.1,Q9H9P8,0


In [3]:
# Load the identifier lookup table stored in dict_uni_ref.tsv (headerless TSV);
# column 'ref' holds the source identifier, column 'uni' the UniProt accession.
ref_uni_columns = ['ref', 'uni']
df_ref_uni = pd.read_csv('dict_uni_ref.tsv', sep='\t', header=None).set_axis(ref_uni_columns, axis=1)
df_ref_uni

Unnamed: 0,ref,uni
0,NP_663777.1,Q13114
1,NP_001233.1,P26842
2,NP_055436.2,P09016
3,NP_001420.2,Q09472
4,NP_003630.1,Q9Y6K9
...,...,...
7744,NP_055410.1,Q8TCD5
7745,NP_055199.1,Q9UKU7
7746,NP_071403.1,Q9BYC2
7747,NP_003242.1,Q92748


In [4]:
# Load the identifier lookup table stored in dict_uni_ena.tsv (headerless TSV);
# column 'ena' holds the source identifier, column 'uni' the UniProt accession.
df_ena_uni = (
    pd.read_csv('dict_uni_ena.tsv', sep='\t', header=None)
    .set_axis(['ena', 'uni'], axis=1)
)
df_ena_uni

Unnamed: 0,ena,uni
0,AAF91084.1,Q96EP1
1,AAH07560.1,Q14847
2,AAQ86961.1,Q14005
3,AAH28049.1,P16298
4,AAA59925.1,P21359
...,...,...
200,AAB54009.1,Q16836
201,BAC04299.1,Q8N9N2
202,AAN76356.1,Q9Y3C4
203,AAC83687.1,Q9UN79


In [5]:
# Load the UniProt -> UniProt lookup table (headerless TSV). The two columns
# are not always equal: the displayed dictionary built from this table maps
# 'Q15531' to 'P61266'.
df_uni_uni = pd.read_csv(
    'dict_uni_uni.tsv',
    sep='\t',
    header=None,
).set_axis(['uni_orig', 'uni'], axis=1)
df_uni_uni

Unnamed: 0,uni_orig,uni
0,Q92900,Q92900
1,Q01892,Q01892
2,P07910,P07910
3,P15941,P15941
4,Q8NFH8,Q8NFH8
...,...,...
895,O43739,O43739
896,P48995,P48995
897,P17948,P17948
898,O75414,O75414


In [6]:
# Build the 'ref' -> 'uni' lookup dictionary from the two table columns.
# If a 'ref' value occurs more than once, the last row wins.
dict_ref_uni = dict(zip(df_ref_uni['ref'].values, df_ref_uni['uni'].values))
dict_ref_uni

{'NP_663777.1': 'Q13114',
 'NP_001233.1': 'P26842',
 'NP_055436.2': 'P09016',
 'NP_001420.2': 'Q09472',
 'NP_003630.1': 'Q9Y6K9',
 'NP_001073594.1': 'Q14790',
 'NP_001001998.1': 'Q01780',
 'NP_067000.1': 'Q9Y333',
 'NP_001888.2': 'Q6UVK1',
 'NP_001782.1': 'P60953',
 'NP_001033.1': 'P14672',
 'NP_002037.2': 'P04406',
 'NP_003001.1': 'P45985',
 'NP_060901.2': 'Q9P104',
 'NP_000199.2': 'P06213',
 'NP_001020276.1': 'P48729',
 'NP_001895.1': 'P35222',
 'NP_005973.1': 'Q15475',
 'NP_005068.2': 'Q04724',
 'NP_036377.1': 'Q13573',
 'NP_001702.1': 'P21810',
 'NP_000079.2': 'P02452',
 'NP_003334.2': 'P60604',
 'NP_000453.2': 'Q05086',
 'NP_000124.1': 'P00740',
 'NP_000123.1': 'P00451',
 'NP_003712.1': 'O14593',
 'NP_000529.1': 'O00287',
 'NP_003391.1': 'O14980',
 'NP_002801.1': 'P55036',
 'NP_038466.2': 'Q9UMX0',
 'NP_542166.1': 'Q9HAU5',
 'NP_005493.2': 'O95477',
 'NP_937879.1': 'O15085',
 'NP_004214.1': 'O14933',
 'NP_061828.1': 'P0CG47',
 'NP_004055.1': 'P46527',
 'NP_001737.1': 'P27824',
 'N

In [7]:
# Build the 'ena' -> 'uni' lookup dictionary from the two table columns.
# If an 'ena' value occurs more than once, the last row wins.
dict_ena_uni = dict(
    zip(df_ena_uni['ena'].values, df_ena_uni['uni'].values)
)
dict_ena_uni

{'AAF91084.1': 'Q96EP1',
 'AAH07560.1': 'Q14847',
 'AAQ86961.1': 'Q14005',
 'AAH28049.1': 'P16298',
 'AAA59925.1': 'P21359',
 'AAH13362.1': 'Q13761',
 'CAA64246.1': 'O00358',
 'AAA52539.1': 'P05019',
 'CAA89989.1': 'P41235',
 'AAH14214.1': 'O95782',
 'CAA41990.1': 'P26640',
 'AAD39742.1': 'Q9UHW9',
 'AAD02233.1': 'O95155',
 'AAB94503.1': 'Q13324',
 'AAH00632.1': 'Q99584',
 'AAC08964.1': 'O75531',
 'AAH00566.1': 'Q9BW83',
 'AAQ03994.1': 'Q9Y547',
 'AAH17247.1': 'Q96AE4',
 'AAA50404.1': 'P61812',
 'BAC87138.1': 'Q9UJ41',
 'AAH11837.2': 'Q99615',
 'AAF19526.1': 'Q9NPI1',
 'AAH20590.1': 'Q96BI3',
 'BAA89210.1': 'Q9UIG0',
 'AAH30590.1': 'Q99708',
 'BAA21128.1': 'O15198',
 'AAB48304.1': 'Q9H307',
 'BAA18998.1': 'P49023',
 'AAH06201.1': 'P63010',
 'BAA92038.1': 'Q9NUT2',
 'BAA05892.1': 'P41594',
 'BAB62909.1': 'Q96S53',
 'AAH11604.1': 'Q969H4',
 'AAM77350.1': 'Q7Z4I7',
 'AAH02810.1': 'O95201',
 'AAK69623.1': 'Q9BTT6',
 'AAA36585.1': 'P31751',
 'AAA51946.1': 'P06729',
 'AAK69431.1': 'Q96RT1',


In [8]:
# Build the 'uni_orig' -> 'uni' lookup dictionary from the two table columns.
# If a 'uni_orig' value occurs more than once, the last row wins.
dict_uni_uni = dict(
    zip(
        df_uni_uni['uni_orig'].values,
        df_uni_uni['uni'].values,
    )
)
dict_uni_uni

{'Q92900': 'Q92900',
 'Q01892': 'Q01892',
 'P07910': 'P07910',
 'P15941': 'P15941',
 'Q8NFH8': 'Q8NFH8',
 'Q15531': 'P61266',
 'P10071': 'P10071',
 'Q14643': 'Q14643',
 'O15049': 'O15049',
 'P33316': 'P33316',
 'O75916': 'O75916',
 'P45974': 'P45974',
 'Q14242': 'Q14242',
 'P08651': 'P08651',
 'Q9H0U9': 'Q9H0U9',
 'Q13084': 'Q13084',
 'Q13972': 'Q13972',
 'Q9ULK5': 'Q9ULK5',
 'Q96J02': 'Q96J02',
 'Q14526': 'Q14526',
 'Q14642': 'Q14642',
 'Q96NY7': 'Q96NY7',
 'O60673': 'O60673',
 'P20290': 'P20290',
 'Q13574': 'Q13574',
 'P28906': 'P28906',
 'Q02509': 'Q02509',
 'O75486': 'O75486',
 'Q9UGN5': 'Q9UGN5',
 'P10243': 'P10243',
 'Q96LA8': 'Q96LA8',
 'P55316': 'P55316',
 'Q8NCN2': 'Q8NCN2',
 'P14598': 'P14598',
 'P39880': 'P39880',
 'Q08495': 'Q08495',
 'Q9Y2D0': 'Q9Y2D0',
 'Q8N183': 'Q8N183',
 'P57723': 'P57723',
 'Q9Y291': 'Q9Y291',
 'Q5VUA4': 'Q5VUA4',
 'Q9BWT1': 'Q9BWT1',
 'O75251': 'O75251',
 'P36402': 'P36402',
 'Q96GX9': 'Q96GX9',
 'O75880': 'O75880',
 'Q96T88': 'Q96T88',
 'O60284': 'O

In [9]:
def id_to_uni(id, mappings=None):
    """Translate a protein identifier to its mapped accession.

    The lookup tables are consulted in order and the first one that contains
    ``id`` supplies the result.

    Parameters
    ----------
    id : hashable
        Identifier to translate. (The name shadows the ``id`` builtin inside
        this function; it is kept so existing callers are unaffected.)
    mappings : iterable of dict, optional
        Lookup tables to search, in priority order. When omitted, the
        notebook-level dictionaries ``dict_ref_uni``, ``dict_ena_uni`` and
        ``dict_uni_uni`` are used, in that order.

    Returns
    -------
    object
        The mapped value from the first table containing ``id``, or the
        integer ``0`` as a "not found" sentinel when no table contains it.
    """
    if mappings is None:
        # Resolved at call time so the function can be defined before (or
        # independently of) the notebook-level dictionaries.
        mappings = (dict_ref_uni, dict_ena_uni, dict_uni_uni)
    for mapping in mappings:
        # Membership test directly on the dict; no need for .keys().
        if id in mapping:
            return mapping[id]
    return 0

In [10]:
# Create the pair table with mapped identifiers.
# Only the two identifier columns are translated; 'label' is carried over
# unchanged from pan_orig. Identifiers with no mapping become the sentinel 0
# returned by id_to_uni. Series.map is used per column instead of
# DataFrame.applymap, which is deprecated in recent pandas and previously also
# ran over the 'label' column only for the result to be overwritten.
pan_transformed = (
    pan_orig[['first', 'second']]
    .apply(lambda column: column.map(id_to_uni))
    .assign(label=pan_orig['label'])
)
pan_transformed

Unnamed: 0,first,second,label
0,Q13114,P26842,1
1,P09016,Q09472,1
2,Q9Y6K9,Q14790,1
3,Q01780,Q9Y333,1
4,Q6UVK1,P60953,1
...,...,...,...
73105,Q9UFW8,Q9NRV9,0
73106,Q9GZR1,P55212,0
73107,P21246,Q9H213,0
73108,P15923,Q9H9P8,0


In [11]:
# Keep only the pairs in which both identifiers were mapped, i.e. neither
# column holds the 0 sentinel, and renumber the rows from 0.
both_mapped = pan_transformed['first'].ne(0) & pan_transformed['second'].ne(0)
pan_transformed = pan_transformed.loc[both_mapped].reset_index(drop=True)
pan_transformed

Unnamed: 0,first,second,label
0,Q13114,P26842,1
1,P09016,Q09472,1
2,Q9Y6K9,Q14790,1
3,Q01780,Q9Y333,1
4,Q6UVK1,P60953,1
...,...,...,...
61887,Q9UFW8,Q9NRV9,0
61888,Q9GZR1,P55212,0
61889,P21246,Q9H213,0
61890,P15923,Q9H9P8,0


In [12]:
# Persist the mapped and filtered pairs as a tab-separated file without a
# header row or index column.
pan_transformed.to_csv(
    'pan_pairs.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [13]:
# Export every accession that appears in the mapped pairs (either column),
# one per line, so the list can be checked for organism outside the notebook.
all_organism_set = set(list(pan_transformed['first'])+list(pan_transformed['second']))
# `with` closes (and therefore flushes) the file as soon as writing is done;
# a bare open() would leave the handle open for the rest of the session.
# Sorting makes the file content identical from run to run.
with open('protein_set_all_organims.txt', 'w') as f:
    for entry in sorted(all_organism_set):
        f.write(entry+'\n')

# Manual follow-up (outside this notebook): upload the file to the UniProt
# Swiss-Prot ID mapping service, use the interaction filter to check the
# partners, and use the organism field to check that each entry is human.
# Recorded outcome of that check: all entries are correct.

In [14]:
# Load the accession -> sequence table. Despite the .fasta extension the file
# is parsed here as a two-column, tab-separated table with no header row.
uni_seq = pd.read_csv(
    'uni_seq.fasta',
    names=['uni', 'seq'],
    sep='\t',
)
uni_seq

Unnamed: 0,uni,seq
0,P46094,MESSGNPESTTFFYYDLQSQPCENQAWVFATLATTVLYCLVFLLSL...
1,Q9Y279,MGILLGLLLLGHLTVDTYGRPILEVPESVTGPWKGDVNLPCTYDPL...
2,A8K8P3,MKNLLTEKCISSHNFHQKVIKQRMEKKVDSRYFKDGAVKKPYSAKT...
3,Q9NWB1,MNCEREQLRGNQEAAAAPDTMAQPYASAQFAPPQNGIPAEYTAPHP...
4,Q4VX76,MAQEIDLSALKELEREAILQVLYRDQAVQNTEEERTRKLKTHLQHL...
...,...,...
7015,P51451,MGLVSSKKPDKEKPIKEKDKGQWSPLKVSAQDKDAPPLPPLVVFNH...
7016,Q9ULK0,MEALTLWLLPWICQCVSVRADSIIHIGAIFEENAAKDDRVFQLAVS...
7017,O43808,MASVLSYESLVHAVAGAVGSVTAMTVFFPLDTARLRLQVDEKRKSK...
7018,O75665,MMAQSNMFTVADVLSQDELRKKLYQTFKDRGILDTLKTQLRNQLIH...


In [15]:
# Build the 'uni' -> 'seq' lookup dictionary from the two table columns.
# If a 'uni' value occurs more than once, the last row wins.
dict_uni_seq = dict(zip(uni_seq['uni'].values, uni_seq['seq'].values))
dict_uni_seq


{'P46094': 'MESSGNPESTTFFYYDLQSQPCENQAWVFATLATTVLYCLVFLLSLVGNSLVLWVLVKYESLESLTNIFILNLCLSDLVFACLLPVWISPYHWGWVLGDFLCKLLNMIFSISLYSSIFFLTIMTIHRYLSVVSPLSTLRVPTLRCRVLVTMAVWVASILSSILDTIFHKVLSSGCDYSELTWYLTSVYQHNLFFLLSLGIILFCYVEILRTLFRSRSKRRHRTVKLIFAIVVAYFLSWGPYNFTLFLQTLFRTQIIRSCEAKQQLEYALLICRNLAFSHCCFNPVLYVFVGVKFRTHLKHVLRQFWFCRLQAPSPASIPHSPGAFAYEGASFY',
 'Q9Y279': 'MGILLGLLLLGHLTVDTYGRPILEVPESVTGPWKGDVNLPCTYDPLQGYTQVLVKWLVQRGSDPVTIFLRDSSGDHIQQAKYQGRLHVSHKVPGDVSLQLSTLEMDDRSHYTCEVTWQTPDGNQVVRDKITELRVQKLSVSKPTVTTGSGYGFTVPQGMRISLQCQARGSPPISYIWYKQQTNNQEPIKVATLSTLLFKPAVIADSGSYFCTAKGQVGSEQHSDIVKFVVKDSSKLLKTKTEAPTTMTYPLKATSTVKQSWDWTTDMDGYLGETSAGPGKSLPVFAIILIISLCCMVVFTMAYIMLCRKTSQQEHVYEAARAHAREANDSGETMRVAIFASGCSSDEPTSQNLGNNYSDEPCIGQEYQIIAQINGNYARLLDTVPLDYEFLATEGKSVC',
 'A8K8P3': 'MKNLLTEKCISSHNFHQKVIKQRMEKKVDSRYFKDGAVKKPYSAKTLSNKKSSASFGIRRELPSTSHLVQYRGTHTCTRQGRLRELRIRCVARKFLYLWIRMTFGRVFPSKARFYYEQRLLRKVFEEWKEEWWVFQHEWKLCVRADCHYRYYLYNLMFQTWKTYVRQQQEMRNKYIRAEVHDAKQKMRQAWKSWLIYVVVRRTKLQMQTTALEFRQRIILRVWWST

In [16]:
# Unique accessions occurring in either column of the mapped pairs.
# Sorted so that the list and the file written from it are reproducible.
pan_transformed_protein_id = sorted(
    set(pan_transformed['first'].values) | set(pan_transformed['second'].values)
)
# `with` guarantees the file is flushed and closed before it is read again
# later in the notebook. The loop variable is no longer called `id`, which
# used to rebind the builtin `id` at notebook level.
with open('pan_proteins.txt', 'w') as f:
    for protein_id in pan_transformed_protein_id:
        f.write(protein_id+'\n')


In [17]:
# Number of pairs whose label equals 0.
pan_transformed['label'].eq(0).sum()


34298

In [18]:
# Load the id / sequence table for the proteins in the pair set
# (headerless, tab-separated).
pan_seq = (
    pd.read_csv('pan_proteins.tsv', header=None, sep='\t')
    .set_axis(['id', 'seq'], axis=1)
)
# Disabled exploratory code (sequence-length inspection), kept for reference:
# pan_seq = list(pan_seq['seq'].values)
# import matplotlib.pyplot as plt
# pan_seq_len = [len(seq) for seq in pan_seq]
# (array([1615, 3060, 4568, 4641, 6759, 7188]


In [19]:
# Shuffle all pairs, then split them into a test set and a train set.
# A fixed seed makes the shuffle, and therefore both output files,
# reproducible across kernel restarts; without it every run produced a
# different split.
SPLIT_SEED = 42
TEST_SIZE = 6000  # number of shuffled pairs that go into the test set

pan_transformed_shuffle = (
    pan_transformed
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# Test set: the first TEST_SIZE shuffled pairs
pan_transformed_shuffle.iloc[:TEST_SIZE].to_csv('pan_test.tsv', sep='\t', header=False, index=False)

# Train set: all remaining pairs
pan_transformed_shuffle.iloc[TEST_SIZE:].to_csv('pan_train.tsv', sep='\t', header=False, index=False)


In [20]:
# Accessions present in the shuffled pair table (either column).
protein_shuffle = set(pan_transformed_shuffle['first']) | set(pan_transformed_shuffle['second'])

# Accessions previously written to pan_proteins.txt, one per line.
# `with` closes the handle after reading. Empty lines (such as the one produced
# by the trailing newline) are skipped; the earlier list.remove('') dropped
# only the first empty entry, raised ValueError when there was none, and its
# None return value was stored in an unused name.
with open('pan_proteins.txt', 'r') as f:
    protein = {line for line in f.read().split('\n') if line}

In [21]:
# Sanity check: shuffling must not add or drop any accession, so the set
# read back from pan_proteins.txt has to equal the set in the shuffled table.
protein_shuffle == protein

True