In [1]:
import numpy as np
import scipy as sc
import pandas as pd
import itertools
from Bio import Entrez
from urllib.error import HTTPError,URLError
import textwrap
import time
pd.set_option('display.max_columns', 500)

In [2]:
def swmatrix(a, b, match_score=3, gap_cost=2):
    H = np.zeros((len(a) + 1, len(b) + 1), np.int)
    for i, j in itertools.product(range(1, H.shape[0]), range(1, H.shape[1])):
        match = H[i - 1, j - 1] + (match_score if a[i - 1] == b[j - 1] else - match_score)
        delete = H[i - 1, j] - gap_cost
        insert = H[i, j - 1] - gap_cost
        H[i, j] = max(match, delete, insert, 0)
    return H
def swtraceback(H, b, b_='', old_i=0):
    # flip H to get index of **last** occurrence of H.max() with np.argmax()
    H_flip = np.flip(np.flip(H, 0), 1)
    i_, j_ = np.unravel_index(H_flip.argmax(), H_flip.shape)
    i, j = np.subtract(H.shape, (i_ + 1, j_ + 1))  # (i, j) are **last** indexes of H.max()
    if H[i, j] == 0:
        return b_, j
    b_ = b[j - 1] + '-' + b_ if old_i - i > 1 else b[j - 1] + b_
    return swtraceback(H[0:i, 0:j], b, b_, i)
def smith_waterman(a, b, match_score=3, gap_cost=2):
    a, b = a.upper(), b.upper()
    H = swmatrix(a, b, match_score, gap_cost)
    b_, pos = swtraceback(H, b)
    return pos, pos + len(b_)

In [3]:
s1='LGITYDGM'
s2 = 'MADSHNTQYCSLQESAQAQQELDNDQETMETSEEEEDTTTSNKVYGSGIPSPPQSPQRAYSPCVALASIPDSPSEEASIKGSGGLEDPLYLLHNAQNTKVYDLVDFLVLNYQMKAFTTKAEMLESIGREYEEYYPLIFSEASECLKMVFGLDMVEVDPSVHSYILVTALGITYDGMMTDVLGMPKTGILIAVLSVIFMKGNYVSEEIIWEMVNNIGLCGGRDPYIHKDPRKLISEEFVQEGCLKYRQVPNSDPPSYGFLWGPRAFAETSKMKVLQFFASINKTHPRAYPEKYAEALQDEIDRTKAWILNRCSNSSDLLTF'

smith_waterman(s1,s2)

(168, 176)

In [19]:
Entrez.email ="carloswertcarvajal@gmail.com"
IEDB = pd.read_csv('/Volumes/Maxtor/h2kball1.csv')
IEDB2 = pd.read_csv('/Volumes/Maxtor/h2kball2.csv')
IEDB = pd.concat([IEDB,IEDB2],ignore_index=True,sort=True)
IEDB = IEDB[['Epitope.2','Epitope.8','Assay.4']]
IEDB = IEDB[~(IEDB['Assay.4']=='Negative')]
IEDB = IEDB.drop_duplicates(subset='Epitope.2')
IEDB = IEDB.dropna(axis=0,subset=['Epitope.8']).reset_index(drop=True)
namprop = IEDB[['Epitope.8']].drop_duplicates(subset='Epitope.8').reset_index(drop=True)
namprop = namprop.drop(list(range(0,3614))).reset_index(drop=True)
proteins=pd.DataFrame(columns=['peptides','NB'])
cc=0
for index, row in namprop.iterrows():
    errorch = 0
    if index == 0:
        pass
    else:
        line = row['Epitope.8']
        print(line)
        try:
            handle = Entrez.efetch(db="protein", id=line, retmode="xml")
        except HTTPError:
            time.sleep(20)
            try:
                handle = Entrez.efetch(db="protein", id=line, retmode="xml")
            except HTTPError:
                errorch = 1
                print('Protein '+line+' not found')
        except URLError:
            time.sleep(20)
            handle = Entrez.efetch(db="protein", id=line, retmode="xml")
        if errorch == 1:
            pass
        else:
            records = Entrez.read(handle)
            time.sleep(1) # to make sure not many requests go per second to ncbi
            s2 = records[0]["GBSeq_sequence"].upper()
            peptss = IEDB[IEDB['Epitope.8']==line].reset_index(drop=True)
            for index, row in peptss.iterrows():
                s1 = row['Epitope.2']
                pos1, pos2 = smith_waterman(s1,s2)
                s2 = s2[:pos1] + s2[pos2+1:]
            peps = textwrap.wrap(s2,13)
            for n in range (0,len(peps)-1):
                cc=cc+1
                proteins.loc[cc]=[peps[n],int(0)]
proteins = proteins.drop_duplicates(subset='peptides')
proteins.to_csv("/Volumes/Maxtor/realnegatives4.csv",index=False)

Q08509
Q9EP66
E9PX09
Protein E9PX09 not found
Q6Q477
O35206
P58929
A2BGI8
Protein A2BGI8 not found
Q5DTT3
Q9D2C2
Q3V1M1
Q9QZ82
P0CW03
Q8R010
Q8BW56
P55200
O54990
O89050
Q8K0E8
Q8BFR1
P03953
Q80YD3
Q8VCW8
Q7TMS5
Q3UNX5
Q8K0L3
Q5SSZ5
Q5SYD0
O54804
P51557
F6SPQ1
Protein F6SPQ1 not found
Q6NZR2
Q9D1H9
P13516
P29812
Q64338
P52624
O35927
Q3U2P1
P80318
Q8VC49
P53995
O70423
Q62059
B1AWM4
Protein B1AWM4 not found
Q9D826
Q80W22
P28661
Q8R4D5
P58196
Q99JY9
Q99NH2
Q8BP86
P35584
Q8C181
Q3UQS8
Q91W64
Q8R5K2
P21447
Q2UY11
Q8BYF6
Q49B93
E9QAZ2
Protein E9QAZ2 not found
Q4PZA2
Q06194
Q5MJS3
Q3TUY3
Q8C0V9
Q8BZS9
O54824
Q8VDP6
Q9WV91
P09240
Q9WUU9
Q99JZ4
Q03717
A2A5R2
Q6ZPF4
Q78KK3
Q9Z0F8
Q9QZW0
Q9D6X5
P26516
B9EJX5
Protein B9EJX5 not found
Q920A1
Q8C7E9
Q62465
O54714
Q3TMC4
B2RVL6
Q9JMF7
Q9Z2V9
P28666
Q3UM18
P01029
Q3TJ91
Q9Z2D1
Q91VZ6
Q5SW15
Q9D7H3
Q7TS73
Q9ES07
P41251
Q8CIP4
O35658
Q8K009
Q8CGD2
H3BLL3
Protein H3BLL3 not found
Q8CII2
E9QKG6
Protein E9QKG6 not found
Q99JF5
Q6ZQ73
A2AL34
Protein A2AL34 n

In [102]:
hola = textwrap.wrap("123456789", 2)
print(hola)
#Q99JY0

['12', '34', '56', '78', '9']


In [20]:
print(proteins)
proteins = proteins.drop_duplicates(subset='peptides')
proteins.to_csv("/Volumes/Maxtor/realnegatives4.csv",index=False)

            peptides NB
1      MNGHMSNRSSGYG  0
2      VYPSQLNGYGREH  0
3      SSRTSAKALYEQR  0
4      KNYARDSVSSVSD  0
5      VSQYRVEHLTTFV  0
6      LDRKDAMITVEDG  0
7      IRKLKLLDAKGKV  0
8      WTQDMILQVDDRA  0
9      VSLIDLESKNELE  0
10     NFPLNTISHCQAV  0
11     VHACSYDSILALV  0
12     CKEPTQSKPDLHL  0
13     FQCDEVKANLISE  0
14     DIESAISDSKGGK  0
15     QKRRPEALRMIAK  0
16     ADPGIPPPPRAPA  0
17     PVPPGTVTQVDVR  0
18     SRVAAWSAWAADQ  0
19     GDFEKPRQYHEQE  0
20     ETPEMMAARIDRD  0
21     VQILNHILDDIEF  0
22     FITKLQKAAEAFS  0
23     ELSKRKKSKKSKR  0
24     KGPGEGVLTLRAK  0
25     PPPPDEFVDCFQK  0
26     FKHGFNLLAKLKS  0
27     HIQNPSASDLVHF  0
28     LFTPLNMVVQATG  0
29     GPELASSVLSPLL  0
30     TKDTVDFLNYTAT  0
...              ... ..
35323  VDSMKKIIFYSHA  0
35324  AEKYPADITAYLN  0
35325  VADDCQYFSGEWD  0
35326  STLPKERQIEIEK  0
35327  KVNVTCSKARSLQ  0
35328  EKLSVKYKKRQDL  0
35329  MKNRIAFLIGSFI  0
35330  IALLIGFDLYYHM  0
35331  IYFLYGSKAILAY  0
35332  DVFTLMTSN

In [21]:
IEDB = pd.read_csv('/Volumes/Maxtor/h2kball1.csv')
IEDB2 = pd.read_csv('/Volumes/Maxtor/h2kball2.csv')
IEDB = pd.concat([IEDB,IEDB2],ignore_index=True,sort=True)
IEDB = IEDB[['Epitope.2','Epitope.8','Assay.4']]
IEDB = IEDB[~(IEDB['Assay.4']=='Negative')]
IEDB = IEDB.drop_duplicates(subset='Epitope.2')
IEDB = IEDB.dropna(axis=0,subset=['Epitope.8']).reset_index(drop=True)
namprop = IEDB[['Epitope.8']].drop_duplicates(subset='Epitope.8').reset_index(drop=True)
#namprop = namprop.drop(list(range(132,180))).reset_index(drop=True)
#Q99JY0
print(namprop[namprop['Epitope.8']=='Q7ARG3'])

     Epitope.8
4349    Q7ARG3


In [94]:
namprop = IEDB[['Epitope.8']].drop_duplicates(subset='Epitope.8').reset_index(drop=True)
namprop = namprop.drop(list(range(132,180))).reset_index(drop=True)
print(namprop.loc[200:230])

    Epitope.8
200    Q9D902
201    P58749
202    Q8K4I3
203    O88845
204    P43247
205    Q8CH77
206    E9PVP1
207    P47753
208    Q99L48
209    Q8R574
210    Q9JHF5
211    Q8K4Z5
212    P62245
213    Q3UFY0
214    Q920Q4
215    Q91YE5
216    Q3UHQ6
217    P40201
218    Q8BH79
219    Q8BHK1
220    Q80W47
221    Q64343
222    P15307
223    Q9ERE3
224    Q91XB7
225    P28867
226    Q8C4B4
227    Q9CZM2
228    Q9CPV7
229    Q09014
230    Q6NS46


In [28]:
Neg1 = pd.read_csv('/Volumes/Maxtor/realnegatives.csv')
print(Neg1)

             peptides  NB
0       MKVKVLSLLVPAL   0
1       LVAGAANAAEIYN   0
2       KDGNKLDLFGKVD   0
3       GLHYFSDDKGSDG   0
4       DQTYMRIGFKGET   0
5       QVNDQLTGYGQWE   0
6       YQIQGNQTEGSND   0
7       SWFADAGSFDYGR   0
8       NYGVTYDVTSWTD   0
9       VLPEFGGDTYGAD   0
10      NFMQQRGNGYATY   0
11      DGLDFALQYQGKN   0
12      GSVSGENTNGRSL   0
13      LNQNGDGYGGSLT   0
14      YAIGEGFSVGGAI   0
15      TTSKRTADQNNTA   0
16      NARLYGNGDRATV   0
17      YTGGLKYDANNIY   0
18      LAAQYSQTYNATR   0
19      FGTSNGSNPSTSY   0
20      GFANKAQNFEVVA   0
21      QYQFDFGLRPSVA   0
22      YLQSKGKDISNGY   0
23      GASYGDQDIVKYV   0
24      DVGATYYFNKNMS   0
25      TYVDYKINLLDKN   0
26      DFTRDAGINTDDI   0
27      MADSHNTQYCSLQ   0
28      ESAQAQQELDNDQ   0
29      ETMETSEEEEDTT   0
...               ...  ..
224638  VDSMKKIIFYSHA   0
224639  AEKYPADITAYLN   0
224640  VADDCQYFSGEWD   0
224641  STLPKERQIEIEK   0
224642  KVNVTCSKARSLQ   0
224643  EKLSVKYKKRQDL   0
224644  MKNR

In [52]:
IEDB = pd.read_csv('/Volumes/Maxtor/secondtrain.csv')
Neg1 = pd.read_csv('/Volumes/Maxtor/realnegatives.csv')
Neg1 = Neg1.sample(frac=0.3).reset_index(drop=True)
allp = pd.concat([IEDB,Neg1],ignore_index=True,sort=True)
allp = allp[['peptides','NB']]
allp = allp.sample(frac=1).reset_index(drop=True)


In [53]:
negg = len(allp[allp['NB']==0])
poss = len(allp[allp['NB']==1])
total = negg+poss
print(negg/total)

0.9010762107370792


In [54]:
allp.to_csv("/Volumes/Maxtor/thirdtrain.csv",index=False)