Implement gapped sequences in genotype. Extend sequence viewer.

airr-community · Jan 6, 2020 · 6860e07 · 6860e07
1 parent e5b8998
commit 6860e07
Show file tree

Hide file tree

Showing 14 changed files with 273 additions and 165 deletions.
diff --git a/app.py b/app.py
@@ -1223,7 +1223,6 @@ def new_sequence(species):
             gene_description.sequence_name = form.new_name.data
             gene_description.inferred_sequences.append(seq)
             gene_description.sequence = seq.sequence_details.nt_sequence
-            gene_description.coding_seq_imgt = ''
             gene_description.organism = species
             gene_description.status = 'draft'
             gene_description.author = current_user.name
@@ -1241,6 +1240,7 @@ def new_sequence(species):
             gene_description.end_5prime_ext = seq.end_5prime_ext
             gene_description.locus = seq.genotype_description.locus
             gene_description.sequence_type = seq.genotype_description.sequence_type
+            gene_description.coding_seq_imgt = seq.sequence_details.nt_sequence_gapped if gene_description.sequence_type == 'V' else seq.sequence_details.nt_sequence
 
             copy_acknowledgements(seq, gene_description)
 
@@ -2059,13 +2059,16 @@ def descs_to_airr(descs):
     filename = 'affirmed_germlines_%s_%s.%s' % (species, format, ext)
     return Response(dl, mimetype="application/octet-stream", headers={"Content-disposition": "attachment; filename=%s" % filename})
 
-# Temp route to fetch ENA details for existing submissions
-@app.route('/ena_details', methods=['GET'])
+from imgt.imgt_ref import gap_sequence
+
+# Temp route to add gapped sequences to genotypes
+@app.route('/add_gapped', methods=['GET'])
 @login_required
-def tidy_genotype():
+def add_gapped():
     if not current_user.has_role('Admin'):
         return redirect('/')
 
+    refs = get_imgt_gapped_reference_genes()
     subs = db.session.query(Submission).all()
     if subs is None:
         flash('Submissions not found')
@@ -2074,86 +2077,31 @@ def tidy_genotype():
     report = ''
 
     for sub in subs:
-        if len(sub.repertoire) > 0 and sub.repertoire[0].repository_name == 'ENA':
+        if len(sub.repertoire) > 0:
             report += 'Processing submission ' + sub.submission_id + '<br>'
 
+            for gd in sub.genotype_descriptions:
+                report += 'Processing genotype ' + gd.genotype_name + '<br>'
+
+                if gd.sequence_type == 'V':
+                    for gen in gd.genotypes:
+                        #if gen.nt_sequence_gapped is None or len(gen.nt_sequence_gapped) < 1:
+                        if gen.sequence_id in refs[sub.species]:
+                            gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.sequence_id].upper())
+                        elif gen.closest_reference in refs[sub.species]:
+                            gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.closest_reference])
+                            report += 'Gapping ' + gen.sequence_id + ' against ' + gen.closest_reference + '<br>'
+                        elif '_' in gen.sequence_id and gen.sequence_id.split('_')[0] in refs[sub.species]:
+                            gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.sequence_id.split('_')[0]])
+                            report += 'Gapping ' + gen.sequence_id + ' against ' + gen.sequence_id.split('_')[0] + '<br>'
+                        elif '+' in gen.sequence_id and gen.sequence_id.split('+')[0] in refs[sub.species]:
+                            gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.sequence_id.split('+')[0]])
+                            report += 'Gapping ' + gen.sequence_id + ' against ' + gen.sequence_id.split('+')[0] + '<br>'
+                        else:
+                            report += 'Not sure how to gap ' + gen.sequence_id + ' - skipping <br>'
 
-            rep = sub.repertoire[0]
-            try:
-                details = get_ena_project_details(rep.rep_accession_no)
-                rep.rep_title = details['title']
-                rep.dataset_url = details['url']
-            except:
-                report += 'project accession number ' + rep.rep_accession_no + 'not found.<br>'
-
-
-            for inf in sub.inferred_sequences:
-                report += 'Processing inferred sequence' + str(inf.sequence_id) + '<br>'
-                try:
-                    resp = get_ena_nuc_details(inf.seq_accession_no)
-                    inf.seq_record_title = resp['title']
-                except:
-                    report += 'sequence accession number ' + inf.seq_accession_no + 'not found.<br>'
-
-                for rec in inf.record_set:
-                    db.session.delete(rec)
-
-                run_ids = inf.run_ids
-                run_ids = run_ids.replace(',', ' ')
-                run_ids = run_ids.replace(';', ' ')
-                run_ids = run_ids.split()
-
-                for run_id in run_ids:
-                    try:
-                        resp = get_ena_srr_details(run_id)
-
-                        rec = RecordSet()
-                        rec.rec_accession_no = run_id
-                        rec.rec_record_title = resp['title']
-                        rec.rec_url = resp['url']
-                        inf.record_set.append(rec)
-                    except:
-                        report += 'Cant fetch run id details for accession ' + run_id + '<br>'
+                db.session.commit()
 
-            for gen in sub.genotype_descriptions:
-                report += 'Processing genotype' + gen.genotype_name + '<br>'
-                for rec in gen.sample_names:
-                    db.session.delete(rec)
+    return report
 
-                sam_ids = gen.genotype_biosample_ids
-                sam_ids = sam_ids.replace(',', ' ')
-                sam_ids = sam_ids.replace(';', ' ')
-                sam_ids = sam_ids.split()
 
-                for sam_id in sam_ids:
-                    try:
-                        resp = get_ena_samn_details(sam_id)
-                        rec = SampleName()
-                        rec.sam_accession_no = sam_id
-                        rec.sam_record_title = resp['title']
-                        rec.sam_url = resp['url']
-                        gen.sample_names.append(rec)
-                    except ValueError as e:
-                        report += 'Cant fetch sample details for accession ' + sam_id + '<br>'
-
-                for rec in gen.record_set:
-                    db.session.delete(rec)
-
-                run_ids = gen.genotype_run_ids
-                run_ids = run_ids.replace(',', ' ')
-                run_ids = run_ids.replace(';', ' ')
-                run_ids = run_ids.split()
-
-                for run_id in run_ids:
-                    try:
-                        resp = get_ena_srr_details(run_id)
-                        rec = RecordSet()
-                        rec.rec_accession_no = run_id
-                        rec.rec_record_title = resp['title']
-                        rec.rec_url = resp['url']
-                        gen.record_set.append(rec)
-
-                    except ValueError as e:
-                        report += 'Cant fetch recordset details for accession ' + run_id + '<br>'
-                db.session.commit()
-    return(report)
diff --git a/db/genotype_db.py b/db/genotype_db.py
@@ -38,6 +38,7 @@ class Genotype(db.Model, GenotypeMixin):
     haplotyping_gene = db.Column(db.String(1000))
     haplotyping_ratio = db.Column(db.String(1000))
     nt_sequence = db.Column(db.Text())
+    nt_sequence_gapped = db.Column(db.Text())
     description_id = db.Column(db.Integer, db.ForeignKey('genotype_description.id'))
     genotype_description = db.relationship('GenotypeDescription', backref = 'genotypes')
 
@@ -68,6 +69,7 @@ def save_Genotype(db, object, form, new=False):
     object.haplotyping_gene = form.haplotyping_gene.data
     object.haplotyping_ratio = form.haplotyping_ratio.data
     object.nt_sequence = form.nt_sequence.data
+    object.nt_sequence_gapped = form.nt_sequence_gapped.data
 
     if new:
         db.session.add(object)
@@ -102,6 +104,7 @@ def populate_Genotype(db, object, form):
     form.haplotyping_gene.data = object.haplotyping_gene
     form.haplotyping_ratio.data = object.haplotyping_ratio
     form.nt_sequence.data = object.nt_sequence
+    form.nt_sequence_gapped.data = object.nt_sequence_gapped
 
 
 
@@ -132,6 +135,7 @@ def copy_Genotype(c_from, c_to):
     c_to.haplotyping_gene = c_from.haplotyping_gene
     c_to.haplotyping_ratio = c_from.haplotyping_ratio
     c_to.nt_sequence = c_from.nt_sequence
+    c_to.nt_sequence_gapped = c_from.nt_sequence_gapped
 
 
 
@@ -200,5 +204,6 @@ def make_Genotype_view(sub, private = False):
     ret.items.append({"item": "Haplotyping Gene", "value": sub.haplotyping_gene, "tooltip": "The gene (or genes) from which haplotyping was inferred (e.g. IGHJ6)", "field": "haplotyping_gene"})
     ret.items.append({"item": "Haplotyping Ratio", "value": sub.haplotyping_ratio, "tooltip": "The ratio (expressed as two percentages) with which the two inferred haplotypes were found (e.g. 60:40)", "field": "haplotyping_ratio"})
     ret.items.append({"item": "NT Sequence", "value": sub.nt_sequence, "tooltip": "The consensus sequence provided by the inference tool", "field": "nt_sequence"})
+    ret.items.append({"item": "Gapped NT Sequence", "value": sub.nt_sequence_gapped, "tooltip": "IMGT-gapped sequence (V-gene only)", "field": "nt_sequence_gapped"})
     return ret
 
diff --git a/db/genotype_upload.py b/db/genotype_upload.py
@@ -13,10 +13,13 @@
 from wtforms import ValidationError
 from sqlalchemy.exc import OperationalError
 from db.genotype_tables import *
+from imgt.imgt_ref import gap_sequence
+from imgt.imgt_ref import get_imgt_gapped_reference_genes
 
 light_loci = ['IGK', 'IGL', 'TRA', 'TRG']
 
 def file_to_genotype(name, desc, db):
+    refs = get_imgt_gapped_reference_genes()
     line = 2
     try:
         fi = open(name, 'r')
@@ -68,6 +71,16 @@ def file_to_genotype(name, desc, db):
 
             if has_data:
                 rec.description_id = desc.id
+
+                # For the time being, we'll gap V records that don't have a gapped sequence provided.
+                # We can phase this out after a few months, once we're confident that everyone is using an ogrdbstats script which provides gapped sequences
+
+                if desc.sequence_type == 'V' and not rec.nt_sequence_gapped:
+                    if rec.sequence_id in refs[desc.submission.species]:
+                        rec.nt_sequence_gapped = gap_sequence(rec.nt_sequence, refs[desc.submission.species][rec.sequence_id].upper())
+                    else:
+                        rec.nt_sequence_gapped = gap_sequence(rec.nt_sequence, refs[desc.submission.species][rec.closest_reference].upper())
+
                 db.session.add(rec)
                 db.session.commit()
 

diff --git a/db/genotype_view_table.py b/db/genotype_view_table.py
@@ -17,9 +17,6 @@ def td_contents(self, item, attr_list):
         imgt_ref_gapped = get_imgt_gapped_reference_genes()
         ref_codon_usage = get_reference_v_codon_usage()
 
-        bt_view = '<button type="button" id="btn_view_seq" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s" data-toggle="tooltip" title="View"><span class="glyphicon glyphicon-search"></span>&nbsp;</button>' \
-               % (format_nuc_sequence(item.nt_sequence, 50), item.sequence_id, format_fasta_sequence(item.sequence_id, item.nt_sequence, 50))
-
         if item.sequence_id in imgt_ref[item.genotype_description.submission.species]:
             if item.nt_sequence.lower() == imgt_ref[item.genotype_description.submission.species][item.sequence_id]:
                 icon = 'glyphicon-ok'
@@ -65,6 +62,7 @@ def td_contents(self, item, attr_list):
         bt_runs = ''
         bt_hotspots = ''
         bt_ref_found = ''
+        annots = []
 
         if item.sequence_id not in imgt_ref[item.genotype_description.submission.species]:
             if item.closest_reference not in imgt_ref[item.genotype_description.submission.species]:
@@ -85,7 +83,7 @@ def td_contents(self, item, attr_list):
                 mismatch = 0
                 aligned = True
 
-                for (r,s) in zip(list(ref_nt), list(seq_nt)):
+                for (r,s) in zip(ref_nt, seq_nt):
                     if r != s:
                         mismatch += 1
 
@@ -99,17 +97,18 @@ def td_contents(self, item, attr_list):
                     if item.genotype_description.sequence_type == 'V' and item.genotype_description.locus+'V' in ref_codon_usage[item.genotype_description.submission.species]:
                         try:
                             q_codons = []
-                            ref_aa_gapped = list(imgt_ref_gapped[item.genotype_description.submission.species][item.closest_reference].upper().translate(gap='.'))
+                            ref_aa_gapped = imgt_ref_gapped[item.genotype_description.submission.species][item.closest_reference].upper().translate(gap='.')
                             seq_aa = Seq(item.nt_sequence.upper(), generic_dna).translate()
 
                             seq_aa_gapped = gap_sequence(seq_aa, ref_aa_gapped)
-                            seq_aa_gapped = list(seq_aa_gapped)
                             family = find_family(item.closest_reference)
 
                             for i in range( min(len(ref_aa_gapped), len(seq_aa_gapped))):
                                 if ref_aa_gapped[i] != seq_aa_gapped[i] and '*' not in (ref_aa_gapped[i], seq_aa_gapped[i]) and '.' not in (ref_aa_gapped[i], seq_aa_gapped[i]):
                                     if seq_aa_gapped[i] not in ref_codon_usage[item.genotype_description.submission.species][item.genotype_description.locus + 'V'][family][i+1]:
                                         q_codons.append("%s%d" % (seq_aa_gapped[i], i+1))
+                                        j = len(seq_aa_gapped[:i].replace('.', ''))
+                                        annots.append((3*j, 3, '%s%d previously unreported in this family' % (seq_aa_gapped[i], i+1)))
 
                             if len(q_codons) > 0:
                                 bt_codon_usage = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Amino Acid(s) previously unreported in this family: %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(q_codons)
@@ -119,39 +118,46 @@ def td_contents(self, item, attr_list):
 
                     # Check for lengthened strings of the same base
 
-                    ref_qpos = [m.start() for m in re.finditer('(.)\\1+\\1+\\1+', str(ref_nt))]
                     seq_qpos = [m.start() for m in re.finditer('(.)\\1+\\1+\\1+', str(seq_nt))]
-
                     q_runs = []
 
+                    # walk up each identified repeat of 4nt or more, flag any differences
                     for p in seq_qpos:
-                        if p not in ref_qpos:
-                            q_runs.append("%d" % find_gapped_index(p, item.genotype_description.submission.species, item.closest_reference))
+                        rep_c = seq_nt[p]
+                        i = p
+                        while i < len(seq_nt) and i < len(ref_nt) and seq_nt[i] == rep_c:
+                            if ref_nt[i] != rep_c:
+                                q_runs.append("%d" % find_gapped_index(i, item.genotype_description.submission.species, item.closest_reference))
+                                annots.append((i, 1, 'Possible repeated read error'))
+                                break
+                            i += 1
 
                     if len(q_runs) > 0:
                         bt_runs = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Possible repeated read errors at IMGT position(s) %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(q_runs)
 
                     # Check for RGYW/WRCY hotspot change
 
-                    ref_qpos = [m.start() for m in re.finditer('[AG][GC][CT][AT]', str(ref_nt))]
-                    seq_qpos = [m.start() for m in re.finditer('[AG][GC][CT][AT]', str(seq_nt))]
+                    ref_qpos = [m.start() for m in re.finditer('[AG][G][CT][AT]', str(ref_nt))]
 
                     q_hotspots= []
 
-                    for p in seq_qpos:
-                        if p in ref_qpos and list(ref_nt)[p+1] != list(seq_nt)[p+1]:
+                    for p in ref_qpos:
+                        if ref_nt[p+1] != seq_nt[p+1]:
                             q_hotspots.append("%d" % find_gapped_index(p+1, item.genotype_description.submission.species, item.closest_reference))
+                            annots.append((p+1, 1, 'G/C SNP in RGYW hotspot'))
 
-                    ref_qpos = [m.start() for m in re.finditer('[AT][AG][GC][CT]', str(ref_nt))]
-                    seq_qpos = [m.start() for m in re.finditer('[AT][AG][GC][CT]', str(seq_nt))]
+                    ref_qpos = [m.start() for m in re.finditer('[AT][AG][C][CT]', str(ref_nt))]
 
-                    for p in seq_qpos:
-                        if p in ref_qpos and list(ref_nt)[p+2] != list(seq_nt)[p+2]:
+                    for p in ref_qpos:
+                        if ref_nt[p+2] != seq_nt[p+2]:
                             q_hotspots.append("%d" % find_gapped_index(p+2, item.genotype_description.submission.species, item.closest_reference))
+                            annots.append((p+2, 1, 'C/G SNP in WRCY hotspot'))
 
                     if len(q_hotspots) > 0:
                         bt_hotspots = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="G/C SNP in RGYW/WRCY hotspot at IMGT position(s) %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(q_hotspots)
 
+        bt_view = popup_seq_button(item.sequence_id, item.nt_sequence, item.nt_sequence_gapped, annots=annots)
+
         return bt_view + bt_check + bt_imgt + bt_igpdb + bt_vdjbase + bt_indels + bt_codon_usage + bt_runs + bt_hotspots + bt_ref_found
 
 class GenTitleCol(StyledCol):

diff --git a/forms/sequence_view_form.py b/forms/sequence_view_form.py
@@ -85,11 +85,11 @@ def setup_sequence_view_tables(db, seq, private):
         elif fn == 'coding_seq_imgt':
             if field['value'] is not None and len(field['value']) > 0:
                 if seq.sequence_type == 'V':
-                    field['value'] =  Markup('<button id="seq_coding_view" name="seq_coding_view" type="button" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s" data-toggle="tooltip" title="View"><span class="glyphicon glyphicon-search"></span>&nbsp;</button>' \
-                        % (format_imgt_v(seq.coding_seq_imgt, 52) + trailer_text, seq.sequence_name, format_fasta_sequence(seq.sequence_name, seq.coding_seq_imgt, 50)))
+                    field['value'] = Markup(popup_seq_button(seq.sequence_name, seq.coding_seq_imgt.replace('.', ''), seq.coding_seq_imgt))
                 else:
-                    field['value'] =  Markup('<button id="seq_coding_view" name="seq_coding_view" type="button" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s"><span class="glyphicon glyphicon-search" data-toggle="tooltip" title="View"></span>&nbsp;</button>' \
-                        % (format_nuc_sequence(seq.sequence, 50) + trailer_text, seq.sequence_name, format_fasta_sequence(seq.sequence_name, seq.sequence, 50)))
+                    field['value'] = Markup(popup_seq_button(seq.sequence_name, seq.coding_seq_imgt, ''))
+#                    field['value'] =  Markup('<button id="seq_coding_view" name="seq_coding_view" type="button" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s"><span class="glyphicon glyphicon-search" data-toggle="tooltip" title="View"></span>&nbsp;</button>' \
+#                        % (format_nuc_sequence(seq.sequence, 50) + trailer_text, seq.sequence_name, format_fasta_sequence(seq.sequence_name, seq.sequence, 50)))
             else:
                 field['value'] = 'None'
         elif fn == 'release_description':
@@ -127,4 +127,4 @@ def setup_sequence_view_tables(db, seq, private):
         t._cols['body'].th_html_attrs['class'] += ' row-80'
         tables['history'].append(t)
 
-    return tables
+    return tables