In [142]:
import numpy as np
import pandas as pd
import urllib.request
import bs4
import re
import time

from google import search

In [143]:
df = pd.read_csv('../../result/final.annotations.txt', sep='\t')
grouped = df.groupby('pdb_id')

In [144]:
df.head()

Unnamed: 0,pdb_id,species,chain_mhc_a,mhc_a_allele,chain_mhc_b,mhc_b_allele,mhc_type,chain_antigen,antigen_seq,chain_tcr,tcr_v_allele,tcr_j_allele,tcr_region,tcr_region_start,tcr_region_end,tcr_region_seq
0,2f53,HomoSapiens,A,A*02:01:48,B,B2M,MHCI,C,SLLMWITQC,D,TRAV21*01,,CDR1,27,33,DSAIYN
1,2f53,HomoSapiens,A,A*02:01:48,B,B2M,MHCI,C,SLLMWITQC,D,TRAV21*01,,CDR2,50,53,IPF
2,2f53,HomoSapiens,A,A*02:01:48,B,B2M,MHCI,C,SLLMWITQC,D,TRAV21*01,,CDR3,90,105,CAVRPTSGGSYIPTF
3,2f53,HomoSapiens,A,A*02:01:48,B,B2M,MHCI,C,SLLMWITQC,E,TRBV6-5*01,,CDR1,24,31,QDMNHEY
4,2f53,HomoSapiens,A,A*02:01:48,B,B2M,MHCI,C,SLLMWITQC,E,TRBV6-5*01,,CDR2,48,51,SVS


In [145]:
columns = ['chunk.id', 'cdr3.alpha', 'v.alpha', 'j.alpha', 'cdr3.beta', 'v.beta',
       'd.beta', 'j.beta', 'species', 'mhc.a', 'mhc.b', 'mhc.class',
       'antigen.epitope', 'antigen.gene', 'antigen.species', 'reference.id',
       'method.identification', 'method.frequency', 'method.singlecell',
       'method.sequencing', 'method.verification', 'meta.study.id',
       'meta.cell.subset', 'meta.subset.frequency', 'meta.subject.cohort',
       'meta.subject.id', 'meta.replica.id', 'meta.clone.id',
       'meta.epitope.id', 'meta.tissue', 'meta.donor.MHC',
       'meta.donor.MHC.method', 'meta.structure.id']

In [146]:
# in grouped.groups.keys()

In [150]:
rows = []
for key, group in grouped:
    row = pd.Series([np.nan]*len(columns), columns)
    row['chunk.id'] = key
    row['cdr3.alpha'] = group.loc[(group.tcr_v_allele.str.startswith('TRA')) & 
                                  (group.tcr_region == 'CDR3'), 'tcr_region_seq'].iloc[0]
    row['v.alpha'] = group.loc[(group.tcr_v_allele.str.startswith('TRA')), 'tcr_v_allele'].iloc[0]
    row['j.alpha'] = group.loc[(group.tcr_v_allele.str.startswith('TRA')), 'tcr_j_allele'].iloc[0]
    row['cdr3.beta'] = group.loc[(group.tcr_v_allele.str.startswith('TRB')) & 
                                 (group.tcr_region == 'CDR3'), 'tcr_region_seq'].iloc[0]
    row['v.beta'] = group.loc[(group.tcr_v_allele.str.startswith('TRB')), 'tcr_v_allele'].iloc[0]
    row['j.beta'] = group.loc[(group.tcr_v_allele.str.startswith('TRB')), 'tcr_j_allele'].iloc[0]
    row['species'] = group['species'].iloc[0]
    row['mhc.a'] = group['mhc_a_allele'].iloc[0]
    row['mhc.b'] = group['mhc_b_allele'].iloc[0]
    row['mhc.class'] = group['mhc_type'].iloc[0]
    row['antigen.epitope'] = group['antigen_seq'].iloc[0]
    row['mhc.class'] = group['mhc_type'].iloc[0]
    rows.append(row)    
    
db = pd.concat(rows, axis=1).T

# Somehow id '4e41' transformed into float 4e+41
ind = db['chunk.id'].apply(type) == float
if any(ind):
    db.loc[ind, 'chunk.id'] = '4e41'

In [88]:
db.head()

Unnamed: 0,chunk.id,cdr3.alpha,v.alpha,j.alpha,cdr3.beta,v.beta,d.beta,j.beta,species,mhc.a,...,meta.subset.frequency,meta.subject.cohort,meta.subject.id,meta.replica.id,meta.clone.id,meta.epitope.id,meta.tissue,meta.donor.MHC,meta.donor.MHC.method,meta.structure.id
0,1ao7,CAVTTDSWGKLQF,TRAV12-2*02,.,CASRPGLAGGRPEQYF,TRBV6-5*01,,TRBJ2-3*01,HomoSapiens,A*02:01:59,...,,,,,,,,,,
1,1bd2,CAAMEGAQKLVF,TRAV29_DV5*01,.,CASSYPGGGFYEQYF,TRBV6-5*01,,TRBJ2-3*01,HomoSapiens,A*02:01:48,...,,,,,,,,,,
2,1d9k,CAATGSFNKLTF,TRAV14D-2*01,.,CASGGQGRAEQFF,TRBV13-2*03,,TRBJ2-5*01,MusMusculus,2944427,...,,,,,,,,,,
3,1fo0,CAMRGDYGGSGNKLIF,TRAV16*01,.,CSADRVGNTLYF,TRBV1*01,,.,MusMusculus,48425589,...,,,,,,,,,,
4,1fyt,CAVSESPFGNEKLTF,TRAV8-4*01,.,CASSSTGLPYGYTF,TRBV28*01,,.,HomoSapiens,DRA*01:02:03,...,,,,,,,,,,


In [89]:
def get_pubmed_id(pdb_id):
    try:
        url = 'http://www.rcsb.org/pdb/explore.do?structureId='+pdb_id.lower()
        source = urllib.request.urlopen(url).read()
        soup = bs4.BeautifulSoup(source, 'html5lib')
        string = str(soup.find('meta', {'name':'description'}))
        pattern = re.compile('<meta content="[0-9A-Za-z]{4}:\s+(.+)" name')
        article = pattern.match(string).group(1)
    except BaseException:
        print(pdb_id.lower()+': '+"Something's wrong")
        return np.nan
    for counter in range(1):
        try:
            if counter > 0:
                time.sleep(np.random.random_integers(10, 30))
            links = [url for url in search(article, stop=40)]
            pmids = [split[-1] if split[-2] == 'pubmed' else '' for split in [link.split('/') for link in links]]
            global pm
            pm = pmids[np.where(np.array(pmids) != '')[0][0]]
        except BaseException as e:
            if str(e).find('503'):
                raise Exception('Seems google blocked you')
            continue
        else:
            break
    print(pdb_id.lower()+': '+pm)
    return pm

In [90]:
#refs = pd.Series([get_pubmed_id(pdb_id) for pdb_id in db['chunk.id']], list(db['chunk.id']))

In [91]:
#get_pubmed_id('1ao7')

In [92]:
#refs = refs.apply(lambda x: 'PMID'+x)
#db['reference.id'] = refs
#db.to_csv('database.txt', sep='\t', index=False)

In [93]:
def get_pubmed_id2(pdb_id):
    try:
        url = 'http://www.rcsb.org/pdb/explore.do?structureId='+pdb_id
        source = urllib.request.urlopen(url).read()
        soup = bs4.BeautifulSoup(source, 'html5lib')
        string = str(soup.find('meta', {'name':'description'}))
        pattern = re.compile('<meta content="[0-9A-Za-z]{4}:\s+(.+)" name')
        article = pattern.match(string).group(1)

        mkquery = lambda x: 'http://www.ncbi.nlm.nih.gov/pubmed?term=%28'+'+'.join(x.split(' '))+'[Title]%29'
        url = mkquery(article)
        print('Article url: '+url)
        source = urllib.request.urlopen(url).read()
        soup = bs4.BeautifulSoup(source, 'html.parser')
        if any(np.array([x.text for x in soup.findAll('h3')]) == 'Abstract'):
            find_pmid = lambda x: x.findAll('div', id='maincontent')[0].findAll('div', 'resc')[0].find('dd').text
            res = find_pmid(soup)
            print('PMID: '+res+'\n')
            return res
        if any(np.array([x.text for x in soup.findAll('h2')]) == 'Search results'):
            find_pmid = lambda x: x.findAll('div', id='maincontent')[0].findAll('dl', 'rprtid')[0].find('dd').text
            res = find_pmid(soup)+'?'
            print('PMID: '+res+'\n')
            return res
        print('PMID: not found\n')
        return ''
    except BaseException:
        return ''

In [94]:
refs = pd.Series([get_pubmed_id2(pdb_id) for pdb_id in db['chunk.id']], list(db['chunk.id']))

http://www.ncbi.nlm.nih.gov/pubmed?term=%28Structure+of+the+complex+between+human+T-cell+receptor,+viral+peptide+and+HLA-A2.[Title]%29
PMID: 21084668?
http://www.ncbi.nlm.nih.gov/pubmed?term=%28The+crystal+structure+of+a+T+cell+receptor+in+complex+with+peptide+and+MHC+class+II.[Title]%29
PMID: 10583947
http://www.ncbi.nlm.nih.gov/pubmed?term=%28Crystal+structure+of+a+T+cell+receptor+bound+to+an+allogeneic+MHC+molecule.[Title]%29
http://www.ncbi.nlm.nih.gov/pubmed?term=%28A+functional+hot+spot+for+antigen+recognition+in+a+superagonist+TCR/MHC+complex.[Title]%29
PMID: 10755612
http://www.ncbi.nlm.nih.gov/pubmed?term=%28Structure+of+a+complex+of+the+human+alpha/beta+T+cell+receptor+(TCR)+HA1.7,+influenza+hemagglutinin+peptide,+and+major+histocompatibility+complex+class+II+molecule,+HLA-DR4+(DRA*0101+and+DRB1*0401):+insight+into+TCR+cross-restriction+and+alloreactivity.[Title]%29
PMID: 11877480
http://www.ncbi.nlm.nih.gov/pubmed?term=%28A+T+cell+receptor+CDR3beta+loop+undergoes+conformatio

In [None]:
print('Filling missed values')

In [166]:
refs[refs == '']

4eup    
4ftv    
dtype: object

In [154]:
refs['1bd2'] = '9586631'
refs['1fo0'] = '11017099'
refs['1fyt'] = '11060013'
refs['2f53'] = '16600963'
refs['2f54'] = '16600963'
refs['3d39'] = '19698083'
refs['4eup'] = ''
refs['4ftv'] = ''
refs['4e41'] = '17334368'
refs['5d2l'] = '26429912'
refs['5d2n'] = '26429912'

In [165]:
refs[refs.apply(lambda x: x[-1] == '?' if x else False)]

Series([], dtype: object)

In [164]:
refs['1ao7'] = '8906788'
refs['1oga'] = '12796775'

In [169]:
pretty_refs = refs.apply(lambda x: 'PMID:'+str(x))

In [172]:
db.index = db['chunk.id']
db.loc[:, 'reference.id'] = pretty_refs
db.to_csv('../../result/database.txt', sep='\t', index=False)