In [9]:
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize

In [5]:
response = requests.get('http://docm.genome.wustl.edu/api/v1/variants.json?detailed_view=true')

In [7]:
response.json()

[{'amino_acid': 'p.R2505P',
  'cdna_change': 'c.7514',
  'chromosome': '1',
  'diseases': [{'disease': 'renal carcinoma',
    'doid': '4451',
    'source_pubmed_id': 24631838}],
  'drug_interactions': [{'aggregated_by': 'Dienstman Knowledge Database - https://www.synapse.org/#!Synapse:syn2370773',
    'association': 'sensitivity',
    'drug': 'rapamycin',
    'effect': 'gain-of-function',
    'evidence_type': 'emerging',
    'pathway': 'activation',
    'source_pubmed_id': 24631838,
    'status': 'preclinical'}],
  'gene': 'MTOR',
  'hgvs': 'ENST00000361445:c.7514G>C',
  'mutation_type': 'missense',
  'reference': 'C',
  'reference_version': 'GRCh37',
  'start': 11169361,
  'stop': 11169361,
  'strand': '-1',
  'transcript': {'name': 'ENST00000361445',
   'source': 'ensembl',
   'version': '74_37'},
  'variant': 'G',
  'variant_type': 'SNP'},
 {'amino_acid': 'p.E2419K',
  'cdna_change': 'c.7255',
  'chromosome': '1',
  'diseases': [{'disease': 'urinary bladder urothelial carcinoma',
  

In [12]:

resp = requests.get('http://docm.genome.wustl.edu/api/v1/variants.json?detailed_view=true')
docm_web = json_normalize(resp.json(), 'diseases', ['amino_acid', 
                                                    'cdna_change', 
                                                    'chromosome', 
                                                    'gene', 
                                                    'hgvs', 
                                                    'mutation_type', 
                                                    'reference', 
                                                    'reference_version', 
                                                    'start', 
                                                    'stop', 
                                                    'strand', 
                                                    'variant', 
                                                    'variant_type', 
                                                        ['transcript',
                                                         'name']])



In [13]:
docm_web

Unnamed: 0,disease,doid,source_pubmed_id,transcript.name,stop,variant_type,variant,gene,strand,reference_version,hgvs,reference,amino_acid,cdna_change,chromosome,start,mutation_type
0,renal carcinoma,4451,24631838,ENST00000361445,11169361,SNP,G,MTOR,-1,GRCh37,ENST00000361445:c.7514G>C,C,p.R2505P,c.7514,1,11169361,missense
1,urinary bladder urothelial carcinoma,4006,24625776,ENST00000361445,11174420,SNP,T,MTOR,-1,GRCh37,ENST00000361445:c.7255G>A,C,p.E2419K,c.7255,1,11174420,missense
2,renal carcinoma,4451,24622468,ENST00000361445,11182179,SNP,T,MTOR,-1,GRCh37,ENST00000361445:c.6667C>A,G,p.Q2223K,c.6667,1,11182179,missense
3,renal carcinoma,4451,24631838,ENST00000361445,11184573,SNP,T,MTOR,-1,GRCh37,ENST00000361445:c.6644C>A,G,p.S2215Y,c.6644,1,11184573,missense
4,melanoma,1909,26490311,ENST00000361445,11184580,SNP,A,MTOR,-1,GRCh37,ENST00000361445:c.6637C>T,G,p.P2213S,c.6637,1,11184580,missense
5,thyroid carcinoma,3963,25295501,ENST00000361445,11187094,SNP,T,MTOR,-1,GRCh37,ENST00000361445:c.6324C>A,G,p.F2108L,c.6324,1,11187094,missense
6,urinary bladder urothelial carcinoma,4006,24625776,ENST00000361445,11187857,SNP,T,MTOR,-1,GRCh37,ENST00000361445:c.6040G>A,C,p.E2014K,c.6040,1,11187857,missense
7,melanoma,1909,26490311,ENST00000361445,11188519,SNP,A,MTOR,-1,GRCh37,ENST00000361445:c.5902C>T,G,p.H1968Y,c.5902,1,11188519,missense
8,renal carcinoma,4451,24631838,ENST00000361445,11217299,SNP,G,MTOR,-1,GRCh37,ENST00000361445:c.4379T>C,A,p.L1460P,c.4379,1,11217299,missense
9,lung squamous cell carcinoma,3907,20360610,ENST00000358432,16464489,SNP,G,EPHA2,-1,GRCh37,ENST00000358432:c.1171G>C,C,p.G391R,c.1171,1,16464489,missense


In [20]:
doids = set(('DOID:' + docm_web.doid).tolist())

## choose an arbitrary disease id compatible with ClinVar

#### The MSH id is given preference because most DOID map to only one MSHid, however when a MSHid is not available an arbitrary OMIMid or UMLSid is choosen instead.

In [90]:
def find_xrefs(doid):
    resp_json = requests.get('http://www.disease-ontology.org/api/metadata/' +
                             doid).json()
    if 'xrefs' in resp_json:
        return resp_json
    else:
        return find_xrefs(resp_json['parents'][0][2])

In [95]:

doid

'5374'

In [108]:
msh_lookup = {}
for doid in doids:
    resp_json = find_xrefs(doid)
    xrefs = [x for x in resp_json['xrefs'] if x.startswith(('MSH', 
                                                            'OMIM', 
                                                            'UMLS'))]
    msh_lookup[doid.lstrip('DOID:')] = xrefs.pop(0)

In [110]:
docm_web[['Condition ID type', 'Condition ID value']] =pd.DataFrame(docm_web.doid.map(msh_lookup).str.split(':').tolist(), columns=['Condition ID type', 'Condition ID value'])
docm_web['Condition ID type'] = docm_web['Condition ID type'].str.rstrip('_CUI')
docm_web['Condition ID type'] = docm_web['Condition ID type'].replace(to_replace='MSH', value='MeSH')
docm_web['URL']='http://docm.genome.wustl.edu/variants/' + docm_web['hgvs']

In [109]:
docm_web.columns

Index(['disease', 'doid', 'source_pubmed_id', 'transcript.name', 'stop',
       'variant_type', 'variant', 'gene', 'strand', 'reference_version',
       'hgvs', 'reference', 'amino_acid', 'cdna_change', 'chromosome', 'start',
       'mutation_type', 'Condition ID type', 'Condidion ID value'],
      dtype='object')

In [121]:
docm_clinvar = docm_web[['hgvs', 'chromosome', 'start', 'stop', 'reference', 'variant','URL','Condition ID type', 'Condition ID value', 'source_pubmed_id']]

In [129]:
docm_clinvar.columns=['HGVS', 'Chromosome', 'Start', 'Stop', 'Reference allele','Alternate allele','URL', 'Condition ID type', 'Condition ID value','Assertion method citation']
docm_clinvar['Assertion method citation'] = 'PMID:' + docm_clinvar['Assertion method citation'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [130]:
docm_clinvar

Unnamed: 0,HGVS,Chromosome,Start,Stop,Reference allele,Alternate allele,URL,Condition ID type,Condidion ID value,Assertion method citation
0,ENST00000361445:c.7514G>C,1,11169361,11169361,C,G,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24631838
1,ENST00000361445:c.7255G>A,1,11174420,11174420,C,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C0279680,PMID:24625776
2,ENST00000361445:c.6667C>A,1,11182179,11182179,G,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24622468
3,ENST00000361445:c.6644C>A,1,11184573,11184573,G,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24631838
4,ENST00000361445:c.6637C>T,1,11184580,11184580,G,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,D008545,PMID:26490311
5,ENST00000361445:c.6324C>A,1,11187094,11187094,G,T,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,D013964,PMID:25295501
6,ENST00000361445:c.6040G>A,1,11187857,11187857,C,T,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C0279680,PMID:24625776
7,ENST00000361445:c.5902C>T,1,11188519,11188519,G,A,http://docm.genome.wustl.edu/variants/ENST0000...,MeSH,D008545,PMID:26490311
8,ENST00000361445:c.4379T>C,1,11217299,11217299,A,G,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C1378703,PMID:24631838
9,ENST00000358432:c.1171G>C,1,16464489,16464489,C,G,http://docm.genome.wustl.edu/variants/ENST0000...,UMLS,C0149782,PMID:20360610


In [134]:
docm_clinvar['Clinical significance'] = 'pathogenic or other?'
docm_clinvar['Collection method'] = 'literature only'
docm_clinvar['Allele origin'] = 'somatic'
docm_clinvar['Affected status'] ='yes'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [136]:
docm_clinvar.to_csv('docm_clinvar.tsv',sep='\t', index=False)