In [1]:
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize
import datetime
from bs4 import BeautifulSoup

In [2]:

resp = requests.get('http://docm.genome.wustl.edu/api/v1/variants.json?detailed_view=true')
docm_web = json_normalize(resp.json(), 'diseases', ['amino_acid', 
                                                    'cdna_change', 
                                                    'chromosome', 
                                                    'gene', 
                                                    'hgvs', 
                                                    'mutation_type', 
                                                    'reference', 
                                                    'reference_version', 
                                                    'start', 
                                                    'stop', 
                                                    'strand', 
                                                    'variant', 
                                                    'variant_type', 
                                                        ['transcript',
                                                         'name']])
docm_web.doid = 'DOID:' + docm_web.doid



In [3]:
doids = set(docm_web.doid.tolist())

## choose an arbitrary disease id compatible with ClinVar

#### The MSH id is given preference because most DOID map to only one MSHid, however when a MSHid is not available an arbitrary OMIMid or UMLSid is choosen instead.

In [4]:
def find_xrefs(doid):
    resp_json = requests.get('http://www.disease-ontology.org/api/metadata/' +
                             doid).json()
    if 'xrefs' in resp_json:
        return resp_json
    else:
        return find_xrefs(resp_json['parents'][0][2])

In [5]:
msh_lookup = {}
for doid in doids:
    resp_json = find_xrefs(doid)
    xrefs = [x for x in resp_json['xrefs'] if x.startswith(('MSH', 
                                                            'OMIM', 
                                                            'UMLS'))]
    msh_lookup[doid] = xrefs.pop(0)

In [6]:
docm_web[['Condition ID type', 'Condition ID value']] =pd.DataFrame(docm_web.doid.map(msh_lookup).str.split(':').tolist(), columns=['Condition ID type', 'Condition ID value'])
docm_web['Condition ID type'] = docm_web['Condition ID type'].str.rstrip('_CUI')
docm_web['Condition ID type'] = docm_web['Condition ID type'].replace(to_replace='MSH', value='MeSH')
docm_web['URL']='http://docm.genome.wustl.edu/variants/' + docm_web['hgvs']

In [7]:
docm_clinvar = docm_web[['hgvs', 'chromosome', 'start', 'stop', 'reference', 'variant','URL','Condition ID type', 'Condition ID value', 'source_pubmed_id']]

In [8]:
docm_clinvar.columns=['HGVS', 'Chromosome', 'Start', 'Stop', 'Reference allele','Alternate allele','URL', 'Condition ID type', 'Condition ID value','Clinical significance citations']
docm_clinvar['Clinical significance citations'] = 'PMID:' + docm_clinvar['Clinical significance citations'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [9]:
docm_clinvar

Unnamed: 0,HGVS,Chromosome,Start,Stop,Reference allele,Alternate allele,URL,Condition ID type,Condition ID value,Clinical significance citations
0,ENST00000361445:c.7514G>C,1,11169361,11169361,C,G,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24631838
1,ENST00000361445:c.7255G>A,1,11174420,11174420,C,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C0279680,PMID:24625776
2,ENST00000361445:c.6667C>A,1,11182179,11182179,G,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24622468
3,ENST00000361445:c.6644C>A,1,11184573,11184573,G,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24631838
4,ENST00000361445:c.6637C>T,1,11184580,11184580,G,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,D008545,PMID:26490311
5,ENST00000361445:c.6324C>A,1,11187094,11187094,G,T,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,D013964,PMID:25295501
6,ENST00000361445:c.6040G>A,1,11187857,11187857,C,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C0279680,PMID:24625776
7,ENST00000361445:c.5902C>T,1,11188519,11188519,G,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,D008545,PMID:26490311
8,ENST00000361445:c.4379T>C,1,11217299,11217299,A,G,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24631838
9,ENST00000358432:c.1171G>C,1,16464489,16464489,C,G,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C0149782,PMID:20360610


In [10]:
docm_clinvar['Clinical significance'] = 'likely pathogenic'
docm_clinvar['Collection method'] = 'literature only'
docm_clinvar['Allele origin'] = 'somatic'
docm_clinvar['Affected status'] ='yes'
docm_clinvar['Mode of inheritance'] = 'Somatic mutation'
docm_clinvar['Assertion method']='DoCM Curation'
docm_clinvar['Assertion method citation'] = 'http://docm.genome.wustl.edu/sources'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [11]:
grouped = docm_clinvar.groupby(['HGVS','Condition ID value'])
grouped_list = []
for name, group in grouped:
    grouped_list.append([name[0], name[1], ';'.join(group['Clinical significance citations'].tolist())] )


In [12]:
pubmed_grouped_df = pd.DataFrame(grouped_list, columns=['HGVS','Condition ID value','Clinical significance citations'])

In [13]:
docm_clinvar.columns

Index(['HGVS', 'Chromosome', 'Start', 'Stop', 'Reference allele',
       'Alternate allele', 'URL', 'Condition ID type', 'Condition ID value',
       'Clinical significance citations', 'Clinical significance',
       'Collection method', 'Allele origin', 'Affected status',
       'Mode of inheritance', 'Assertion method', 'Assertion method citation'],
      dtype='object')

In [14]:
docm_clinvar_noPM= docm_clinvar[['HGVS', 'Chromosome', 'Start', 'Stop', 'Reference allele',
       'Alternate allele', 'URL', 'Condition ID type', 'Condition ID value',
                   'Clinical significance',
       'Collection method', 'Allele origin', 'Affected status',
       'Mode of inheritance', 'Assertion method', 'Assertion method citation']]

In [15]:
docm_clinvar2 = pd.merge(pubmed_grouped_df,docm_clinvar_noPM,on=['HGVS','Condition ID value'],how='left').drop_duplicates()

In [16]:
docm_clinvar2

Unnamed: 0,HGVS,Condition ID value,Clinical significance citations,Chromosome,Start,Stop,Reference allele,Alternate allele,URL,Condition ID type,Clinical significance,Collection method,Allele origin,Affected status,Mode of inheritance,Assertion method,Assertion method citation
0,ENST00000078429:c.626A>C,C536494,PMID:25157968;PMID:22733540,19,3118942,3118942,A,C,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
2,ENST00000078429:c.626A>C,D008545,PMID:21083380;PMID:2549426;PMID:1328859;PMID:2...,19,3118942,3118942,A,C,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
7,ENST00000078429:c.626A>T,C536494,PMID:25157968;PMID:22733540;PMID:24141786;PMID...,19,3118942,3118942,A,T,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
11,ENST00000078429:c.626A>T,D008545,PMID:21083380;PMID:2549426;PMID:1328859;PMID:2...,19,3118942,3118942,A,T,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
16,ENST00000206249:c.1138G>C,D001943,PMID:24185512;PMID:24185510;PMID:24398047,6,152332832,152332832,G,C,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
19,ENST00000206249:c.1387T>C,D001943,PMID:24185512,6,152415537,152415537,T,C,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
20,ENST00000206249:c.1601T>A,D001943,PMID:24185512,6,152419914,152419914,T,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
21,ENST00000206249:c.1604C>A,D001943,PMID:24185512,6,152419917,152419917,C,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
22,ENST00000206249:c.1607T>G,D001943,PMID:24185512;PMID:24185510,6,152419920,152419920,T,G,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
24,ENST00000206249:c.1609T>A,D001943,PMID:24185512;PMID:24398047;PMID:24185512;PMID...,6,152419922,152419922,T,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,likely pathogenic,literature only,somatic,yes,Somatic mutation,DoCM Curation,http://docm.genome.wustl.edu/sources
