Merge branch 'master' of github.com:arq5x/gemini

arq5x · Jan 12, 2016 · ba5066c · ba5066c
2 parents 73032f5 + 6d52627
commit ba5066c
Show file tree

Hide file tree

Showing 39 changed files with 502 additions and 164 deletions.
diff --git a/docs/content/database_schema.rst b/docs/content/database_schema.rst
@@ -201,6 +201,9 @@ clinvar_dsdbid            STRING        Variant disease database ID
 clinvar_disease_acc       STRING        Variant Accession and Versions
 clinvar_in_locus_spec_db  BOOL          Submitted from a locus-specific database?
 clinvar_on_diag_assay     BOOL          Variation is interrogated in a clinical diagnostic assay?
+clinvar_gene_phenotype    STRING        '|' delimited list of phenotypes associated with this gene (includes any variant in the same 
+                                        gene in clinvar not just the current variant).
+geno2mp_hpo_ct            INTEGER       Value from geno2mp indicating count of HPO profiles. Set to -1 if missing
 ========================  ========      ==============================================================================================
 
 

diff --git a/docs/content/history.rst b/docs/content/history.rst
@@ -8,6 +8,19 @@ Release History
 #. Use SQLAlchemy for table definitions in an effort to support different RDBMS backends.
 #. X-linked recessive and dominant tools.
 
+0.18.1
+======
+1. Don't user order by when not needed for built in tools. Speeds up queries when min-kindreds is None or 1.
+2. Document --min-gq
+3. Set missing AF to -1 (instead of NULL) for unknown for all ESP, 1KG, ExAC allele frequency columns.
+4. Fix gemini load with -t all when only VEP is present.
+5. Add clinvar_gene_phenotype column which can be used to limit candidates to those that are in the same
+   gene as a gene with the known phenotype from clinvar. Phenotypes are all lower case.
+   Likely usage is: --filter "... and clinvar_gene_phenotype LIKE '%dysplasia%' where the '%' are needed
+   because the column is a '|' delimited list of all disease for that gene.
+6. Fix bug in gemini annotate where numeric operations on integers did not work for VCF.
+7. Add `geno2mp_hpo_ct` column which will be > 0 if that variant is present in geno2mp (http://geno2mp.gs.washington.edu/Geno2MP/#/)
+
 0.18.0
 ======
 0. Improved installation and update via conda (via Brad Chapman!).

diff --git a/docs/content/preprocessing.rst b/docs/content/preprocessing.rst
@@ -9,7 +9,7 @@ Step 1. split, left-align, and trim variants
 =============================================
 
 Variants with multiple alternate alleles will not be handled correctly by gemini (or by the tools
-used to annotate the variants). As projects get more samples it is likely that a non-negligible 
+used to annotate the variants). As projects get more samples it is likely that a non-negligible
 percentage of site will have multiple alternate alleles.
 
 In addition, variants that are not left-aligned and trimmed can be incorrectly (or not)
@@ -23,7 +23,7 @@ left-align, and trim their variants**. The tools we recommend for this are eithe
     vt decompose -s $VCF | vt normalize -r $REFERENCE - > $NEW_VCF
 
 gemini uses the allele depths from the AD tag. In order for `vt` to decompose correctly, users will have
-to change the #INFO field for AD in the header from Number=. to Number=R. 
+to change the #INFO field for AD in the header from Number=. to Number=R.
 
 Then the `$NEW_VCF` can be annotated with snpEff or VEP.
 
@@ -45,8 +45,8 @@ into GEMINI.
 
 .. note::
 	Choose the annotator as per your requirement!
-	Some gene/transcript annotations are available with only one tool (e.g. 
-	``Polyphen/Sift`` with VEP). As such these values would be set to None, 
+	Some gene/transcript annotations are available with only one tool (e.g.
+	``Polyphen/Sift`` with VEP). As such these values would be set to None,
 	if an alternate annotator is used during the load step.
 
 Instructions for installing and running these tools can be found in the following section:
@@ -156,17 +156,18 @@ The PED file format is documented here: PED_. An example PED file looks like thi
 |	1 M10500     -9    -9    2    2
 |	1 M128215    M10475  M10500    1    1
 
-The columns are family_id, name, paternal_id, maternal_id, sex and phenotype.
-For gemini, you can use either tabs or spaces, but not both.
+The columns are ``family_id``, ``name``, ``paternal_id``, ``maternal_id``,
+``sex`` and ``phenotype``. For GEMINI, you can use either tabs or spaces, but
+not both.
 
 You can also provide a PED file with a heading starting with #, and include extra
 fields, like this:
 
-|	#family_id name paternal_id maternal_id sex phenotype hair_color 
-| 	1 M10475    -9       -9  1    1 brown 
-| 	1 M10478     M10475  M10500    2    2 brown 
-| 	1 M10500     -9      -9    2    2 black 
-| 	1 M128215    M10475  M10500    1    1 blue 
+|	#family_id name paternal_id maternal_id sex phenotype hair_color
+| 	1 M10475    -9       -9  1    1 brown
+| 	1 M10478     M10475  M10500    2    2 brown
+| 	1 M10500     -9      -9    2    2 black
+| 	1 M128215    M10475  M10500    1    1 blue
 
 This will add the extra columns to the ``samples`` table and allow for you to
 use those extra columns during queries.
@@ -217,4 +218,4 @@ Loading VCFs without genotypes.
 To do.
 
 .. _PED: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
-.. _vt-paper: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped 
+.. _vt-paper: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
diff --git a/docs/content/tools.rst b/docs/content/tools.rst
@@ -92,6 +92,14 @@ the GEMINI database.
 Filter variants that do not have at least this depth for all members in a
 a family. Default is 0.
 
+
+---------------------
+``--min-gq [0]``
+---------------------
+
+Filter variants that do not have at least this genotype quality for each sample in a family.
+Default is 0. Higher values are more stringent.
+
 ----------------------
 ``--allow-unaffected``
 ----------------------
@@ -166,7 +174,7 @@ Genotype Requirements
 - Sites are automatically phased by transmission when parents are present in order to remove false positive candidates.
 
   a. If data from one or both parents are unavailable and the child's data was not phased prior to loading into GEMINI,
-     all comp_het variant pairs will automatically be given priority == 3
+     all comp_het variant pairs will automatically be given priority == 2
 
   b. `--max-priority x` can be used to set the maximum allowed priority level at which candidate pairs are included in the output.
 
@@ -177,7 +185,7 @@ we prioritize with these rules:
 mom   dad      kid       phaseable   priority   notes
 ===   ===      ====      =========   ========   ================================================
 R-H   H-R      H-H       both        1          both sites phaseable and alts on opposite chroms
-               H-H       NO          2          singleton (unphaseable) HETs have priority 2.
+n/a   n/a      H-H       NO          2          singleton (unphaseable) HETs have priority 2.
 R-H   H-H      H-H       one         3          should be a rare occurrence
 H-H   H-H      H-H       NO          3          should be a rare occurrence
 A-R   H-H      H-H       both        NA         exclude hom-alts from un-affecteds
@@ -186,7 +194,7 @@ R-R   H-H      H-H       both        NA         phaseable, but alts are on the s
 
 .. note::
 
-   candidates of priority == 3 are very unlikely (< 1%) to be real
+   candidates of priority == 3 are very unlikely (< 1%) to be causal for a rare Mendelian condition
    (see: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3734130/); we report them
    for completeness, but strongly recommend using priority 1 and 2 only.
    Priority 2 is useful when there are multiple families, some of which consist of

diff --git a/docs/index.rst b/docs/index.rst
@@ -60,7 +60,7 @@ should make the installation more reliable.
 For users with an existing installation with any trouble using `gemini update --devel`,
 we suggest to do a fresh install using a command like:
 
-.. code-block::
+.. code-block:: bash
 
     wget https://github.com/arq5x/gemini/raw/master/gemini/scripts/gemini_install.py
     python gemini_install.py $tools $data
@@ -70,7 +70,7 @@ where `$tools` and `$data` are paths writable on your system.
 
 With an existing `$tool` and `$data` directory from a previous install, you can use the installer to re-install the Python code with the new version, but leave the existing data in place. To do this, first remove the old anaconda directory:
 
-.. code-block::
+.. code-block:: bash
 
     rm -rf $data/anaconda
 

diff --git a/gemini/GeminiQuery.py b/gemini/GeminiQuery.py
@@ -801,6 +801,7 @@ def _connect_to_database(self):
         # open up a new database
         if os.path.exists(self.db):
             self.conn = sqlite3.connect(self.db)
+            self.conn.text_factory = str
             self.conn.isolation_level = None
             # allow us to refer to columns by name
             #self.conn.row_factory = RowFactory

diff --git a/gemini/annotation_provenance/make-geno2mp.sh b/gemini/annotation_provenance/make-geno2mp.sh
@@ -0,0 +1,6 @@
+# 0.18.1
+wget -O - http://geno2mp.gs.washington.edu/download/Geno2MP.variants.vcf \
+	 | vt decompose -s - \
+	 | vt normalize -r /data/human/b37/human_g1k_v37_decoy.fasta - \
+	 | bgzip -c > geno2mp.variants.tidy.vcf.gz
+tabix geno2mp.variants.tidy.vcf.gz
diff --git a/gemini/annotations.py b/gemini/annotations.py
@@ -47,7 +47,8 @@ def get_anno_files( args ):
      'vista_enhancers': os.path.join(anno_dirname, 'hg19.vista.enhancers.20131108.bed.gz'),
      'fitcons': os.path.join(anno_dirname, "hg19_fitcons_fc-i6-0_V1-01.bed.gz"),
      'cosmic': os.path.join(anno_dirname, 'cosmic-v68-GRCh37.tidy.vcf.gz'),
-     'exac': os.path.join(anno_dirname, 'ExAC.r0.3.sites.vep.tidy.vcf.gz')
+     'exac': os.path.join(anno_dirname, 'ExAC.r0.3.sites.vep.tidy.vcf.gz'),
+     'geno2mp': os.path.join(anno_dirname, 'geno2mp.variants.tidy.vcf.gz'),
     }
     # optional annotations
     if os.path.exists(os.path.join(anno_dirname, 'hg19.gerp.bw')):
@@ -165,6 +166,9 @@ def lookup_clinvar_significance(self, sig_code):
                                    num_hom_alt \
                                    num_chroms")
 
+EXAC_EMPTY = ExacInfo(False, -1, -1, -1, -1, -1,
+                     -1, -1, -1, -1, -1, -1, -1)
+
 def load_annos(args):
     """
     Populate a dictionary of Tabixfile handles for
@@ -697,6 +701,8 @@ def get_esp_info(var):
                         else:
                             # alt allele is stored as 2nd.
                             acs[key] = float(lines[1]) / denom
+                    else:
+                        acs[key] = -1
 
                 # Is the SNP on an human exome chip?
                 if info_map.get('EXOME_CHIP') is not None and \
@@ -706,10 +712,11 @@ def get_esp_info(var):
                         info_map['EXOME_CHIP'] == "yes":
                     exome_chip = 1
                 break
-    return ESPInfo(found, acs.get('EA_AC'), acs.get("AA_AC"), acs.get("TAC"), exome_chip)
+    return ESPInfo(found, acs.get('EA_AC', -1), acs.get("AA_AC", -1),
+            acs.get("TAC", -1), exome_chip)
 
 
-EMPTY_1000G = ThousandGInfo(False, None, None, None, None, None, None)
+EMPTY_1000G = ThousandGInfo(False, -1, -1, -1, -1, -1, -1)
 def get_1000G_info(var, empty=EMPTY_1000G):
     """
     Returns a suite of annotations from the 1000 Genomes project
@@ -731,14 +738,25 @@ def get_1000G_info(var, empty=EMPTY_1000G):
                     (key, value) = info.split("=", 1)
                     info_map[key] = value
 
-            return ThousandGInfo(True, info_map.get('AF'), info_map.get('AMR_AF'),
-                         info_map.get('EAS_AF'), info_map.get('SAS_AF'),
-                         info_map.get('AFR_AF'), info_map.get('EUR_AF'))
+            return ThousandGInfo(True, info_map.get('AF', -1), info_map.get('AMR_AF', -1),
+                         info_map.get('EAS_AF', -1), info_map.get('SAS_AF', -1),
+                         info_map.get('AFR_AF', -1), info_map.get('EUR_AF', -1))
     return empty
 
-EXAC_EMTPY = ExacInfo(False, None, None, None, None, None,
-                     None, None, None, None, None, None, None)
-def get_exac_info(var, empty=EXAC_EMTPY):
+def get_geno2mp_ct(var):
+
+    for hit in annotations_in_vcf(var, "geno2mp", "vcf", "grch37"):
+        if not (var.start == hit.pos and var.REF == hit.ref):
+            continue
+        if not var.ALT[0] in hit.alt.split(","): continue
+
+        ct = next(x for x in hit.info.split(";") if x.startswith("HPO_CT="))
+        val = int(ct.split("=")[1])
+        return val
+    # missing is -1
+    return -1
+
+def get_exac_info(var, empty=EXAC_EMPTY):
     """
     Returns the allele frequencies from the Exac data (Broad)
     """
@@ -769,7 +787,7 @@ def get_exac_info(var, empty=EXAC_EMTPY):
             if info_map.get('AF') is not None:
                 aaf_ALL = info_map['AF'].split(",")[allele_num]
             else:
-                aaf_ALL = None
+                aaf_ALL = -1
 
             for grp in ('Adj', 'AFR', 'AMR', 'EAS', 'FIN', 'NFE', 'OTH', 'SAS'):
                 ac = info_map.get('AC_%s' % grp)
@@ -785,9 +803,9 @@ def get_exac_info(var, empty=EXAC_EMTPY):
                 ac_list = ac.split(",")
                 afs[grp] = float(ac_list[allele_num]) / float(an)
 
-            num_hets = info_map.get("AC_Het")
-            num_homs = info_map.get("AC_Hom")
-            called_chroms = info_map.get('AN_Adj')
+            num_hets = info_map.get("AC_Het", -1)
+            num_homs = info_map.get("AC_Hom", -1)
+            called_chroms = info_map.get('AN_Adj', -1)
 
             return ExacInfo(True, aaf_ALL, afs['Adj'], afs['AFR'], afs['AMR'],
                                 afs['EAS'], afs['FIN'], afs['NFE'], afs['OTH'],

diff --git a/gemini/config.py b/gemini/config.py
@@ -58,10 +58,10 @@ def read_gemini_config(dirs=None, allow_missing=False, use_globals=True, args=No
     if fname:
         with open(fname) as in_handle:
             config = yaml.load(in_handle)
-    if args and hasattr(args, "annotationdir") and args.annotationdir:
+    if args and hasattr(args, "annotation_dir") and args.annotation_dir:
         # If --annotation-dir is given via commandline interface, we will overwrite the
         # location from the config file
-        config["annotation_dir"] = args.annotationdir
+        config["annotation_dir"] = args.annotation_dir
     return config
 
 def _find_best_config_file(dirs=None):

diff --git a/gemini/database.py b/gemini/database.py
@@ -151,6 +151,8 @@ def create_tables(cursor, effect_fields=None):
                     clinvar_in_locus_spec_db bool,
                     clinvar_on_diag_assay bool,
                     clinvar_causal_allele text,
+                    clinvar_gene_phenotype text,
+                    geno2mp_hpo_ct integer,
                     pfam_domain text,
                     cyto_band text default NULL,
                     rmsk text default NULL,
@@ -462,6 +464,7 @@ def update_gene_summary_w_cancer_census(cursor, genes):
 @contextlib.contextmanager
 def database_transaction(db):
     conn = sqlite3.connect(db)
+    conn.text_factory = str
     conn.isolation_level = None
     cursor = conn.cursor()
     cursor.execute('PRAGMA synchronous = OFF')

diff --git a/gemini/gemini_annotate.py b/gemini/gemini_annotate.py
@@ -161,7 +161,7 @@ def get_hit_count(hits):
 def _map_list_types(hit_list, col_type):
     # TODO: handle missing because of VCF.
     try:
-        if col_type == "int":
+        if col_type in ("int", "integer"):
             return [int(h) for h in hit_list if not h in (None, 'nan')]
         elif col_type == "float":
             return [float(h) for h in hit_list if not h in (None, 'nan')]

diff --git a/gemini/gemini_load.py b/gemini/gemini_load.py
@@ -111,6 +111,8 @@ def finalize_merged_db(tmp_db, db):
     print st, "indexing final database."
 
     main_conn = sqlite3.connect(tmp_db)
+    main_conn.text_factory = str
+
     main_conn.isolation_level = None
     main_curr = main_conn.cursor()
     main_curr.execute('PRAGMA synchronous = OFF')