fix syntax errors for parsers
kevinxin committed Jul 31, 2015
1 parent 3a1c628 commit 2832de7
Showing 24 changed files with 132 additions and 234 deletions.
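
Most of the hunks below are PEP 8 cleanups: doubled '##' comment markers become '#' with a space after the hash, spacing around keyword arguments is normalized, unused imports are dropped, and leftover merge-conflict markers are removed from dbnsfp/__init__.py. The one literal Python syntax error is the assignment-as-comparison in dbnsfp_parser.py; a minimal standalone sketch of that fix, with an illustrative value standing in for the real fields[0]:

chrom = 'M'  # illustrative; the parser reads this from fields[0]
# Before the fix, "if chrom = 'M':" raised SyntaxError: '=' is
# assignment, which Python forbids inside an if test; '==' compares.
if chrom == 'M':
    chrom = 'MT'  # normalize the mitochondrial contig name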
3 changes: 0 additions & 3 deletions src/dataload/contrib/cadd/__init__.py
@@ -1,6 +1,3 @@
-from .cadd_parser import load_contig
-
-
__METADATA__ = {
"src_name": 'CADD',
"src_url": 'http://cadd.gs.washington.edu/',
20 changes: 10 additions & 10 deletions src/dataload/contrib/cadd/cadd_parser.py
@@ -10,19 +10,19 @@
from utils.common import timesofar
from utils.mongo import get_src_db
from utils.hgvs import get_hgvs_from_vcf
-## tabix file links from CADD http://cadd.gs.washington.edu/download
-## whole genome SNVs including annotations
+# tabix file links from CADD http://cadd.gs.washington.edu/download
+# whole genome SNVs including annotations
whole_genome = 'http://krishna.gs.washington.edu/download/CADD/v1.2/whole_genome_SNVs_inclAnno.tsv.gz'
-## SNV variants on Illumina Exome BeadChip
+# SNV variants on Illumina Exome BeadChip
exome = '/opt/myvariant.info/load_archive/cadd/HumanExome-12v1-1_A_inclAnno.tsv.gz'
-## 1000 Genomes variants SNVs and InDels including all annotations
+# 1000 Genomes variants SNVs and InDels including all annotations
thousandgp = '/opt/myvariant.info/load_archive/cadd/1000G_inclAnno.tsv.gz'
-## Exome Aggreation Consortium variants including all annotations
+# Exome Aggreation Consortium variants including all annotations
exac = 'opt/myvariant.info/load_archive/cadd/ExAC.r0.2_inclAnno.tsv.gz'
-## ESP6500 variants SNVs and InDels including all annotations
+# ESP6500 variants SNVs and InDels including all annotations
esp = 'opt/myvariant.info/load_archive/cadd/ESP6500SI_inclAnno.tsv.gz'

-## number of fields/annotations
+# number of fields/annotations
VALID_COLUMN_NO = 116

DEPENDENCIES = ["pysam", "pymongo"]
@@ -217,8 +217,8 @@ def load_contig(contig):
'''save cadd contig into mongodb collection.
should be an iterable.
'''
-#if CADD_INPUT == "exome":
-#CADD_INPUT = exome
+# if CADD_INPUT == "exome":
+# CADD_INPUT = exome
tabix = pysam.Tabixfile(whole_genome)
src_db = get_src_db()
target_coll = src_db["cadd"]
@@ -235,6 +235,6 @@ def load_contig(contig):
if cnt % 100000 == 0:
print(cnt, timesofar(t0))
if doc_list:
-target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
+target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
print("successfully loaded cadd chromosome %s into mongodb" % contig)
print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
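
For reference, a minimal usage sketch of this parser's entry point; the function name and module path come from the diff above, but the contig value, a running MongoDB instance, and network access to the CADD tabix file are assumptions:

# Hypothetical invocation: stream CADD annotations for one chromosome
# into the local MongoDB "cadd" collection (requires pysam and pymongo,
# per DEPENDENCIES above).
from dataload.contrib.cadd.cadd_parser import load_contig

load_contig("22")  # contig name as it appears in the tabix file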
2 changes: 0 additions & 2 deletions src/dataload/contrib/clinvar/clinvar_parser.py
@@ -47,8 +47,6 @@ def _map_line_to_json(fields):
HGVS = None
cds = fields[18].split(":")
cds = cds[1]

-seq = re.findall(r'[ATCGMNHYR]+|[0-9]+', cds)[-1]
replace = re.findall(r'[ATCGMNYR=]+', cds)
sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
6 changes: 1 addition & 5 deletions src/dataload/contrib/clinvar/clinvar_xml_parser.py
@@ -1,15 +1,11 @@
import re
from itertools import imap, groupby
import os

# Generated Mon Mar 30 11:14:08 2015 by generateDS.py version 2.15a.
# Command line:
# /home/cwu/opt/devpy/bin/generateDS.py -o\
# "clinvar.py" -s "clinvarsubs.py" /home/cwu/Desktop/clinvar_public.xsd
import clinvar

from utils.dataload import unlist, dict_sweep, \
-value_convert, merge_duplicate_rows, rec_handler
+value_convert, rec_handler


def _map_line_to_json(cp):
15 changes: 7 additions & 8 deletions src/dataload/contrib/cosmic2/cosmic_parser.py
@@ -1,10 +1,9 @@
# -*- coding: utf-8 -*-
import os
import csv
import re
from itertools import imap, groupby, ifilter
import operator
import collections
from utils.dataload import dict_sweep, value_convert, merge_duplicate_rows


VALID_COLUMN_NO = 29 + 1
@@ -34,7 +33,7 @@ def _map_line_to_json(fields):
HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
elif del_ins:
HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, comp.group())
-#elif comp:
+# elif comp:
# HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, comp.group())
else:
HGVS = fields[12]
@@ -45,7 +44,7 @@ def _map_line_to_json(fields):
return

one_snp_json = {
-"sorter" : fields[17] + fields[13],
+"sorter": fields[17] + fields[13],
"_id": HGVS,
"cosmic":
{
@@ -93,8 +92,8 @@ def _map_line_to_json(fields):

# open file, parse, pass to json mapper
def load_data(input_file):
-#os.system("sort -t$'\t' -k18 -k14 %s > %s_sorted.tsv" % (input_file, input_file))
-#open_file = open("%s_sorted.tsv" % (input_file))
+# os.system("sort -t$'\t' -k18 -k14 %s > %s_sorted.tsv" % (input_file, input_file))
+# open_file = open("%s_sorted.tsv" % (input_file))
open_file = open(input_file)
open_file = csv.reader(open_file, delimiter="\t")
cosmic = []
@@ -109,8 +108,8 @@ def load_data(input_file):
print row[-1]
cosmic = sorted(cosmic, key=operator.itemgetter(17), reverse=True)
cosmic = ifilter(lambda row:
-row[17] != "" and
-row[13] != "", cosmic)
+row[17] != "" and
+row[13] != "", cosmic)
json_rows = imap(_map_line_to_json, cosmic)
json_rows = (row for row in json_rows if row)
row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
10 changes: 2 additions & 8 deletions src/dataload/contrib/dbnsfp/__init__.py
@@ -1,9 +1,7 @@
from .dbnsfp_parser import load_data as _load_data


-<<<<<<< HEAD
DBNSFP_INPUT_FILE = '/opt/myvariant.info/load_archive/dbnsfp/dbNSFP3.0b2c_variant.chr*'
-=======

__METADATA__ = {
"src_name": 'dbNSFP',
"src_url": 'https://sites.google.com/site/jpopgen/dbNSFP',
@@ -12,10 +10,6 @@
}


DBNSFP_INPUT_FILE = '/opt/myvariant.info/load_archive/dbnsfp/dbNSFP2.9_variant*'
->>>>>>> 4cdff9ae14fa4c74acd237520a17ae1927f8aafd


def load_data():
dbnsfp_data = _load_data(DBNSFP_INPUT_FILE)
return dbnsfp_data
@@ -103,7 +97,7 @@ def get_mapping():
"type": "string"
}
}
-},
+},
"interpro_domain": {
"type": "string"
},
10 changes: 6 additions & 4 deletions src/dataload/contrib/dbnsfp/dbnsfp_parser.py
@@ -7,11 +7,13 @@

'''this parser is for dbNSFP v3.0 beta2 downloaded from
https://sites.google.com/site/jpopgen/dbNSFP'''


# convert one snp to json
-def _map_line_to_json(fields, version = 'hg19'):
+def _map_line_to_json(fields, version='hg19'):
# specific variable treatment
chrom = fields[0]
-if chrom = 'M':
+if chrom == 'M':
chrom = 'MT'
# fields[7] in version 2, represent hg18_pos
if fields[10] == ".":
@@ -209,14 +211,14 @@ def _map_line_to_json(fields, version = 'hg19'):


# open file, parse, pass to json mapper
-def data_generator(input_file, version = 'hg19'):
+def data_generator(input_file, version='hg19'):
open_file = open(input_file)
db_nsfp = csv.reader(open_file, delimiter="\t")
db_nsfp.next() # skip header
previous_row = None
for row in db_nsfp:
assert len(row) == VALID_COLUMN_NO
-current_row = _map_line_to_json(row, version = 'hg19')
+current_row = _map_line_to_json(row, version='hg19')
if previous_row:
if current_row["_id"] == previous_row["_id"]:
aa = previous_row["dbnsfp"]["aa"]
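
A hypothetical usage sketch of the generator fixed above, written for Python 2 to match the csv.reader .next() call in the diff; the file path is modeled on DBNSFP_INPUT_FILE, and the assumption is that data_generator yields one merged JSON document per variant:

# Assumed usage; the path and chromosome are illustrative only.
from dataload.contrib.dbnsfp.dbnsfp_parser import data_generator

for doc in data_generator('dbNSFP3.0b2c_variant.chr22', version='hg19'):
    print doc['_id']  # HGVS-style id assembled by _map_line_to_json
    break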
6 changes: 3 additions & 3 deletions src/dataload/contrib/dbsnp/dbsnp_asn1flat.py
@@ -124,9 +124,9 @@ def _parse_SNP(self, rec_lines):
}
if len(snp_line) == 5:
allele_origin = snp_line[4][1]
-#pat = "(.+)\((.+)\)/(.+)\((.+)\)"
-#grp = re.match(pat, allele_orgin).groups()
-#assert len(grp) == 4
+# pat = "(.+)\((.+)\)/(.+)\((.+)\)"
+# grp = re.match(pat, allele_orgin).groups()
+# assert len(grp) == 4
pat = "(.*)\((.+)\)"
d = []
for x in allele_origin.split('/'):
2 changes: 1 addition & 1 deletion src/dataload/contrib/dbsnp/dbsnp_vcf_parser.py
@@ -59,7 +59,7 @@ def get_hgvs_name(record, as_list=False):
# something ambigious here
pass
else:
-#other cases of deletion currently been ignored
+# other cases of deletion currently been ignored
# e.g. rs369371434, rs386822484
pass
else:
61 changes: 30 additions & 31 deletions src/dataload/contrib/drugbank/__init__.py
@@ -1,38 +1,37 @@
# -*- coding: utf-8 -*-

from .drugbank_parser import load_data #as _load_data
+# DRUGBANK_URL = "http://www.drugbank.ca/genobrowse/snp-adr?page="

-#DRUGBANK_URL = "http://www.drugbank.ca/genobrowse/snp-adr?page="

-#def load_data():
+# def load_data():
# drugbank_data = _load_data(DRUGBANK_URL)
# return drugbank_data


def get_mapping():
-mapping = {
-"drugbank": {
-"properties": {
-"adverse_reaction": {
-"type": "string",
-"analyzer": "string_lowercase"
-},
-"defining_change": {
-"type": "string",
-"analyzer": "string_lowercase"
-},
-"drug": {
-"type": "string",
-"analyzer": "string_lowercase"
-},
-"interacting_gene_or_enzyme": {
-"type": "string",
-"analyzer": "string_lowercase"
-},
-"snp_rs_id": {
-"type": "string",
-"analyzer": "string_lowercase"
-}
-}
-}
-}
-return mapping
+mapping = {
+"drugbank": {
+"properties": {
+"adverse_reaction": {
+"type": "string",
+"analyzer": "string_lowercase"
+},
+"defining_change": {
+"type": "string",
+"analyzer": "string_lowercase"
+},
+"drug": {
+"type": "string",
+"analyzer": "string_lowercase"
+},
+"interacting_gene_or_enzyme": {
+"type": "string",
+"analyzer": "string_lowercase"
+},
+"snp_rs_id": {
+"type": "string",
+"analyzer": "string_lowercase"
+}
+}
+}
+}
+return mapping
9 changes: 3 additions & 6 deletions src/dataload/contrib/drugbank/drugbank_parser.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
-from dataload.contrib.drugbank.drugbank_write_file import write_file
from itertools import imap, groupby
from utils.dataload import dict_sweep, merge_duplicate_rows
import csv
@@ -15,9 +14,8 @@ def _map_line_to_json(fields):
if HGVS is None:
return
one_snp_json = {

"_id": HGVS,
-'drugbank':
+'drugbank':
{
'drug': fields[2],
'interacting_gene_or_enzyme': fields[3],
@@ -28,11 +26,10 @@ def _map_line_to_json(fields):
'references': fields[7]
}
}

return dict_sweep(one_snp_json, ['Not Available'])


-def load_data(input_file):
+def load_data(input_file):
"""
write_file output and csv.reader input_file
'/opt/myvariant.info/load_archive/drugbank/drugbank.csv'
@@ -42,4 +39,4 @@ def load_data(input_file):
drugbank.next()
json_rows = imap(_map_line_to_json, drugbank)
row_groups = (it for (key, it) in groupby(json_rows, lambda row: row['_id']))
-return (merge_duplicate_rows(rg, 'drugbank') for rg in row_groups)
+return (merge_duplicate_rows(rg, 'drugbank') for rg in row_groups)
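
A hypothetical usage sketch of load_data as left by this commit (Python 2, matching the imap and drugbank.next() calls); the CSV path comes from the docstring above, and the assumption is that the returned generator yields one merged document per HGVS id:

# Assumed usage; requires the archived DrugBank CSV to exist locally.
from dataload.contrib.drugbank.drugbank_parser import load_data

for doc in load_data('/opt/myvariant.info/load_archive/drugbank/drugbank.csv'):
    print doc['_id']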
13 changes: 7 additions & 6 deletions src/dataload/contrib/drugbank/drugbank_write_file.py
@@ -1,25 +1,26 @@
# -*- coding: utf-8 -*-
import pandas as pd


def rs2hgvs(mv, rs_id):
var = mv.queryvariant(rs_id)
hgvs = [(rs_id, variant['_id']) for variant in var['hits'] if variant]
return hgvs


def write_file(url):
-pages = range(1,5)
+pages = range(1, 5)
urls = map(lambda x: url + str(x), pages)
drugbank_df = pd.concat(map(lambda url: pd.read_html(url)[0], urls))

rs_ids = drugbank_df['SNP RS ID']
mv = MyVariantInfo()
hgvs = [rs2hgvs(mv, rs_id) for rs_id in rs_ids]
id_df = pd.concat(map(pd.DataFrame, hgvs))

id_df.columns = "rs_id", "hgvs_id"
drugbank_df.rename(columns={"SNP RS ID": "rs_id"}, inplace=True)

drugbank_df.set_index("rs_id", inplace=True)
id_df.set_index("rs_id", inplace=True)
df = id_df.join(drugbank_df, how='right').drop_duplicates()
4 changes: 2 additions & 2 deletions src/dataload/contrib/emv/__init__.py
@@ -9,8 +9,8 @@
}


-## must convert column 3, the coding HGVS nomenclature, to genomic.
-## paste new column to file before loading data
+# must convert column 3, the coding HGVS nomenclature, to genomic.
+# paste new column to file before loading data
EMV_INPUT_FILE = '/opt/myvariant.info/load_archive/emv/emv.csv'


4 changes: 2 additions & 2 deletions src/dataload/contrib/emv/emv_parser.py
@@ -13,8 +13,8 @@

from utils.dataload import dict_sweep, value_convert, unlist, merge_duplicate_rows

-## merge EMV file with genomic ID file
-#def file_merge(emv_file, id_file):
+# merge EMV file with genomic ID file
+# def file_merge(emv_file, id_file):
# os.system("cut -f3 genomic_id.txt > genomic_id3.txt")
# os.system("paste -d"," genomic_id3.txt EmVClass.2014-3.csv > emv.csv")
