# Data preparation

### This code prepares the Genes and drugs for Affinity calculations

#### 1. Proteins data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#PPI data file paths
seq_file  = '..\PPI_seq.txt'
gene_file = '..\PPI_info.txt'
#net_path  = '..\PPI_net.txt'

gene_info = pd.read_csv(gene_file, sep='\t')
#net = pd.read_csv(net_path, sep='\t')

In [3]:
gene_info.head(-5)

Unnamed: 0,protein_external_id,preferred_name,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi..."
...,...,...,...,...
19556,9606.ENSP00000485656,ENSG00000280267,478,annotation not available
19557,9606.ENSP00000485659,MUC5AC,5654,Mucin-5AC; Gel-forming glycoprotein of gastric...
19558,9606.ENSP00000485663,EIF3L,564,Eukaryotic translation initiation factor 3 sub...
19559,9606.ENSP00000485664,ENSG00000279493,155,annotation not available


In [23]:
#The info file should contain 19566 proteins/genes
len(gene_info)

19566

In [24]:
#Adding sequence col. to the data
gene_info['Sequence'] = '-'
gene_info

Unnamed: 0,protein_external_id,preferred_name,protein_size,annotation,Sequence
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...,-
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...,-
2,9606.ENSP00000001008,FKBP4,459,Peptidyl-prolyl cis-trans isomerase FKBP4; Imm...,-
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...,-
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi...",-
...,...,...,...,...,...
19561,9606.ENSP00000485671,ENSG00000280273,120,HCG1991042,-
19562,9606.ENSP00000485672,ENSG00000279458,86,annotation not available,-
19563,9606.ENSP00000485673,ENSG00000279988,243,annotation not available,-
19564,9606.ENSP00000485675,ENSG00000280116,84,annotation not available,-


In [25]:
#Cleaning seq file
file = open(seq_file, "r")
seq  = ''
indx = -1

for line in file:
    if indx == -1:
        indx += 1
        continue
    elif '>' in line:
        #Adds merged seq to gene_info
        gene_info['Sequence'][indx] = seq.replace('\n', '')
        seq = ''
        indx += 1
    else:
        seq += line
    
file.close()
gene_info.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,protein_external_id,preferred_name,protein_size,annotation,Sequence
19561,9606.ENSP00000485671,ENSG00000280273,120,HCG1991042,MPPETQPDQKMCVPTRKKACLCLHALPLPFLTTYKYSLSEEVREEG...
19562,9606.ENSP00000485672,ENSG00000279458,86,annotation not available,MPRRLERKIEQEGVTVKSSSHFNPDPDAETLYKAMKGIGTNEQAII...
19563,9606.ENSP00000485673,ENSG00000279988,243,annotation not available,MEKLRKWVLWDVRYPSAAWSGGEHGRAHVALPHGIHHVGGVSIRIE...
19564,9606.ENSP00000485675,ENSG00000280116,84,annotation not available,MEPRTGGAANPKGSRGRPGPLPPACPSALPLLARLDARPLAARAAV...
19565,9606.ENSP00000485678,OR6Q1,317,Olfactory receptor 6Q1; Odorant receptor; Olfa...,-


In [26]:
#Last Gene was missing a sequence
gene_info['Sequence'][19565] = 'MQPYTKNWTQVTEFVMMGFAGIHEAHLLFFILFLTMYLFTLVENLAIILVVGLDHRLRRPMYFFLTHLSCLEIWYTSVTVPKMLAGFIGVDGGKNISYADCLSQLFIFTFLGATECFLLAAMAYDRYVAICMPLHYGAFVSWGTCIRLAAACWLVGFLTPILPIYLLSQLTFYGPNVIDHFSCDASPLLALSCSDVTWKETVDFLVSLAVLLASSMVIAVSYGNIVWTLLHIRSAAERWKAFSTCAAHLTVVSLFYGTLFFMYVQTKVTSSINFNKVVSVFYSVVTPMLNPLIYSLRNKEVKGALGRVFSLNFWKGQ'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
#Full version of genes with seq and annotation
gene_info.to_csv('..\gene_full.txt', index=False, sep = ',')

In [54]:
#Making shorter gene id table
gene_id = gene_info['protein_external_id'].copy()

#removing the 9606.ENSP000000
for gene in range(len(gene_id)):
    gene_id[gene] = gene_id[gene][-6:]

gene_id += ',' + gene_info['Sequence']
gene_id 

0        000233,MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKL...
1        000412,MFPFYSCWRTGLLLLLLAVAVRESWQTEEKTCDLVGEKG...
2        001008,MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKR...
3        001146,MLFEGLDLVSALATLAACLVSVTLLLAVSQQLWQLRWAA...
4        002125,MSVLLRSGLGPLCAVARAAIPFIWRGKYFSSGNEPAENP...
                               ...                        
19561    485671,MPPETQPDQKMCVPTRKKACLCLHALPLPFLTTYKYSLS...
19562    485672,MPRRLERKIEQEGVTVKSSSHFNPDPDAETLYKAMKGIG...
19563    485673,MEKLRKWVLWDVRYPSAAWSGGEHGRAHVALPHGIHHVG...
19564    485675,MEPRTGGAANPKGSRGRPGPLPPACPSALPLLARLDARP...
19565    485678,MQPYTKNWTQVTEFVMMGFAGIHEAHLLFFILFLTMYLF...
Name: protein_external_id, Length: 19566, dtype: object

In [55]:
gene_id.to_csv('..\gene_short.txt', index=False, sep = ',', header = '')

  """Entry point for launching an IPython kernel.


#### This code prepares the SIDERS dataset

In [87]:
#reduced data file paths
drug_file  = '..\drug_names.tsv'
SE_file  = '..\meddra_all_se.tsv'

drugs = pd.read_csv(drug_file, names=['DrugID','Name'], sep='\t')
SE = pd.read_csv(SE_file, names=['1','2','3','4','5','6','7'], sep='\t')

In [88]:
drugs

Unnamed: 0,DrugID,Name
0,CID100000085,carnitine
1,CID100000119,gamma-aminobutyric
2,CID100000137,5-aminolevulinic
3,CID100000143,leucovorin
4,CID100000146,5-methyltetrahydrofolate
...,...,...
1425,CID156603655,pegaptanib
1426,CID156842239,n-3
1427,CID170683024,x
1428,CID170695640,colestyramine


In [89]:
SE

Unnamed: 0,1,2,3,4,5,6,7
0,EMA/WC500020092.html,CID100216416,CID000216416,C0000737,LLT,C0000737,Abdominal pain
1,EMA/WC500020092.html,CID100216416,CID000216416,C0000737,PT,C0000737,Abdominal pain
2,EMA/WC500020092.html,CID100216416,CID000216416,C0000737,PT,C0687713,Gastrointestinal pain
3,EMA/WC500020092.html,CID100216416,CID000216416,C0002170,LLT,C0002170,Alopecia
4,EMA/WC500020092.html,CID100216416,CID000216416,C0002170,PT,C0002170,Alopecia
...,...,...,...,...,...,...,...
4753136,safety/2008_-_May_PI_-_Viread_PI.html,CID100119830,CID005481350,C1565489,LLT,C1565489,Renal impairment
4753137,safety/2008_-_May_PI_-_Viread_PI.html,CID100119830,CID005481350,C1565489,PT,C0035078,Renal failure
4753138,safety/2008_-_May_PI_-_Viread_PI.html,CID100119830,CID005481350,C1565489,PT,C1565489,Renal impairment
4753139,safety/2008_-_May_PI_-_Viread_PI.html,CID100119830,CID005481350,C1608945,LLT,C1608945,Exfoliative rash


In [90]:
len(drugs_list) #should be 1430

1430

In [91]:
SE = SE.loc[SE['5'] == 'LLT']
SE = SE[['2','6','7']]
SE.columns = ['DrugID', 'SEID', 'SE name']
SE

Unnamed: 0,DrugID,SEID,SE name
0,CID100216416,C0000737,Abdominal pain
3,CID100216416,C0002170,Alopecia
5,CID100216416,C0002395,Dementia Alzheimer's type
7,CID100216416,C0002622,Amnesia
9,CID100216416,C0002871,Anaemia
...,...,...,...
4753130,CID100119830,C0497365,Rash generalised
4753132,CID100119830,C0851341,Infestation NOS
4753134,CID100119830,C0917801,Insomnia
4753136,CID100119830,C1565489,Renal impairment


In [92]:
len(pd.unique(SE['DrugID'])) #should be 1430

1430

In [95]:
SE = SE.sort_values(by=['DrugID', 'SE name'])

In [103]:
SE = SE.drop_duplicates(subset=['DrugID','SEID','SE name'], 
                        keep='first', ignore_index=True)

In [107]:
SE.to_csv('se_all.csv', index=False)

In [104]:
SE

Unnamed: 0,DrugID,SEID,SE name
0,CID100000085,C0000729,Abdominal cramps
1,CID100000085,C0000737,Abdominal pain
2,CID100000085,C0151736,Accidental injury
3,CID100000085,C0002418,Amblyopia
4,CID100000085,C0002871,Anaemia
...,...,...,...
138894,CID171306834,C0042963,Vomiting
138895,CID171306834,C3665596,Warts
138896,CID171306834,C0043096,Weight decreased
138897,CID171306834,C0043094,Weight increased
