Skip to content

Commit

Permalink
Implement gapped sequences in genotype. Extend sequence viewer.
Browse files Browse the repository at this point in the history
  • Loading branch information
williamdlees committed Jan 6, 2020
1 parent e5b8998 commit 6860e07
Show file tree
Hide file tree
Showing 14 changed files with 273 additions and 165 deletions.
110 changes: 29 additions & 81 deletions app.py
Expand Up @@ -1223,7 +1223,6 @@ def new_sequence(species):
gene_description.sequence_name = form.new_name.data
gene_description.inferred_sequences.append(seq)
gene_description.sequence = seq.sequence_details.nt_sequence
gene_description.coding_seq_imgt = ''
gene_description.organism = species
gene_description.status = 'draft'
gene_description.author = current_user.name
Expand All @@ -1241,6 +1240,7 @@ def new_sequence(species):
gene_description.end_5prime_ext = seq.end_5prime_ext
gene_description.locus = seq.genotype_description.locus
gene_description.sequence_type = seq.genotype_description.sequence_type
gene_description.coding_seq_imgt = seq.sequence_details.nt_sequence_gapped if gene_description.sequence_type == 'V' else seq.sequence_details.nt_sequence

copy_acknowledgements(seq, gene_description)

Expand Down Expand Up @@ -2059,13 +2059,16 @@ def descs_to_airr(descs):
filename = 'affirmed_germlines_%s_%s.%s' % (species, format, ext)
return Response(dl, mimetype="application/octet-stream", headers={"Content-disposition": "attachment; filename=%s" % filename})

# Temp route to fetch ENA details for existing submissions
@app.route('/ena_details', methods=['GET'])
from imgt.imgt_ref import gap_sequence

# Temp route to add gapped sequences to genotypes
@app.route('/add_gapped', methods=['GET'])
@login_required
def tidy_genotype():
def add_gapped():
if not current_user.has_role('Admin'):
return redirect('/')

refs = get_imgt_gapped_reference_genes()
subs = db.session.query(Submission).all()
if subs is None:
flash('Submissions not found')
Expand All @@ -2074,86 +2077,31 @@ def tidy_genotype():
report = ''

for sub in subs:
if len(sub.repertoire) > 0 and sub.repertoire[0].repository_name == 'ENA':
if len(sub.repertoire) > 0:
report += 'Processing submission ' + sub.submission_id + '<br>'

for gd in sub.genotype_descriptions:
report += 'Processing genotype ' + gd.genotype_name + '<br>'

if gd.sequence_type == 'V':
for gen in gd.genotypes:
#if gen.nt_sequence_gapped is None or len(gen.nt_sequence_gapped) < 1:
if gen.sequence_id in refs[sub.species]:
gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.sequence_id].upper())
elif gen.closest_reference in refs[sub.species]:
gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.closest_reference])
report += 'Gapping ' + gen.sequence_id + ' against ' + gen.closest_reference + '<br>'
elif '_' in gen.sequence_id and gen.sequence_id.split('_')[0] in refs[sub.species]:
gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.sequence_id.split('_')[0]])
report += 'Gapping ' + gen.sequence_id + ' against ' + gen.sequence_id.split('_')[0] + '<br>'
elif '+' in gen.sequence_id and gen.sequence_id.split('+')[0] in refs[sub.species]:
gen.nt_sequence_gapped = gap_sequence(gen.nt_sequence, refs[sub.species][gen.sequence_id.split('+')[0]])
report += 'Gapping ' + gen.sequence_id + ' against ' + gen.sequence_id.split('+')[0] + '<br>'
else:
report += 'Not sure how to gap ' + gen.sequence_id + ' - skipping <br>'

rep = sub.repertoire[0]
try:
details = get_ena_project_details(rep.rep_accession_no)
rep.rep_title = details['title']
rep.dataset_url = details['url']
except:
report += 'project accession number ' + rep.rep_accession_no + 'not found.<br>'


for inf in sub.inferred_sequences:
report += 'Processing inferred sequence' + str(inf.sequence_id) + '<br>'
try:
resp = get_ena_nuc_details(inf.seq_accession_no)
inf.seq_record_title = resp['title']
except:
report += 'sequence accession number ' + inf.seq_accession_no + 'not found.<br>'

for rec in inf.record_set:
db.session.delete(rec)

run_ids = inf.run_ids
run_ids = run_ids.replace(',', ' ')
run_ids = run_ids.replace(';', ' ')
run_ids = run_ids.split()

for run_id in run_ids:
try:
resp = get_ena_srr_details(run_id)

rec = RecordSet()
rec.rec_accession_no = run_id
rec.rec_record_title = resp['title']
rec.rec_url = resp['url']
inf.record_set.append(rec)
except:
report += 'Cant fetch run id details for accession ' + run_id + '<br>'
db.session.commit()

for gen in sub.genotype_descriptions:
report += 'Processing genotype' + gen.genotype_name + '<br>'
for rec in gen.sample_names:
db.session.delete(rec)
return report

sam_ids = gen.genotype_biosample_ids
sam_ids = sam_ids.replace(',', ' ')
sam_ids = sam_ids.replace(';', ' ')
sam_ids = sam_ids.split()

for sam_id in sam_ids:
try:
resp = get_ena_samn_details(sam_id)
rec = SampleName()
rec.sam_accession_no = sam_id
rec.sam_record_title = resp['title']
rec.sam_url = resp['url']
gen.sample_names.append(rec)
except ValueError as e:
report += 'Cant fetch sample details for accession ' + sam_id + '<br>'

for rec in gen.record_set:
db.session.delete(rec)

run_ids = gen.genotype_run_ids
run_ids = run_ids.replace(',', ' ')
run_ids = run_ids.replace(';', ' ')
run_ids = run_ids.split()

for run_id in run_ids:
try:
resp = get_ena_srr_details(run_id)
rec = RecordSet()
rec.rec_accession_no = run_id
rec.rec_record_title = resp['title']
rec.rec_url = resp['url']
gen.record_set.append(rec)

except ValueError as e:
report += 'Cant fetch recordset details for accession ' + run_id + '<br>'
db.session.commit()
return(report)
5 changes: 5 additions & 0 deletions db/genotype_db.py
Expand Up @@ -38,6 +38,7 @@ class Genotype(db.Model, GenotypeMixin):
haplotyping_gene = db.Column(db.String(1000))
haplotyping_ratio = db.Column(db.String(1000))
nt_sequence = db.Column(db.Text())
nt_sequence_gapped = db.Column(db.Text())
description_id = db.Column(db.Integer, db.ForeignKey('genotype_description.id'))
genotype_description = db.relationship('GenotypeDescription', backref = 'genotypes')

Expand Down Expand Up @@ -68,6 +69,7 @@ def save_Genotype(db, object, form, new=False):
object.haplotyping_gene = form.haplotyping_gene.data
object.haplotyping_ratio = form.haplotyping_ratio.data
object.nt_sequence = form.nt_sequence.data
object.nt_sequence_gapped = form.nt_sequence_gapped.data

if new:
db.session.add(object)
Expand Down Expand Up @@ -102,6 +104,7 @@ def populate_Genotype(db, object, form):
form.haplotyping_gene.data = object.haplotyping_gene
form.haplotyping_ratio.data = object.haplotyping_ratio
form.nt_sequence.data = object.nt_sequence
form.nt_sequence_gapped.data = object.nt_sequence_gapped



Expand Down Expand Up @@ -132,6 +135,7 @@ def copy_Genotype(c_from, c_to):
c_to.haplotyping_gene = c_from.haplotyping_gene
c_to.haplotyping_ratio = c_from.haplotyping_ratio
c_to.nt_sequence = c_from.nt_sequence
c_to.nt_sequence_gapped = c_from.nt_sequence_gapped



Expand Down Expand Up @@ -200,5 +204,6 @@ def make_Genotype_view(sub, private = False):
ret.items.append({"item": "Haplotyping Gene", "value": sub.haplotyping_gene, "tooltip": "The gene (or genes) from which haplotyping was inferred (e.g. IGHJ6)", "field": "haplotyping_gene"})
ret.items.append({"item": "Haplotyping Ratio", "value": sub.haplotyping_ratio, "tooltip": "The ratio (expressed as two percentages) with which the two inferred haplotypes were found (e.g. 60:40)", "field": "haplotyping_ratio"})
ret.items.append({"item": "NT Sequence", "value": sub.nt_sequence, "tooltip": "The consensus sequence provided by the inference tool", "field": "nt_sequence"})
ret.items.append({"item": "Gapped NT Sequence", "value": sub.nt_sequence_gapped, "tooltip": "IMGT-gapped sequence (V-gene only)", "field": "nt_sequence_gapped"})
return ret

13 changes: 13 additions & 0 deletions db/genotype_upload.py
Expand Up @@ -13,10 +13,13 @@
from wtforms import ValidationError
from sqlalchemy.exc import OperationalError
from db.genotype_tables import *
from imgt.imgt_ref import gap_sequence
from imgt.imgt_ref import get_imgt_gapped_reference_genes

light_loci = ['IGK', 'IGL', 'TRA', 'TRG']

def file_to_genotype(name, desc, db):
refs = get_imgt_gapped_reference_genes()
line = 2
try:
fi = open(name, 'r')
Expand Down Expand Up @@ -68,6 +71,16 @@ def file_to_genotype(name, desc, db):

if has_data:
rec.description_id = desc.id

# For the time being, we'll gap V records that don't have a gapped sequence provided.
# We can phase this out after a few months, once we're confident that everyone is using an ogrdbstats script which provides gapped sequences

if desc.sequence_type == 'V' and not rec.nt_sequence_gapped:
if rec.sequence_id in refs[desc.submission.species]:
rec.nt_sequence_gapped = gap_sequence(rec.nt_sequence, refs[desc.submission.species][rec.sequence_id].upper())
else:
rec.nt_sequence_gapped = gap_sequence(rec.nt_sequence, refs[desc.submission.species][rec.closest_reference].upper())

db.session.add(rec)
db.session.commit()

Expand Down
42 changes: 24 additions & 18 deletions db/genotype_view_table.py
Expand Up @@ -17,9 +17,6 @@ def td_contents(self, item, attr_list):
imgt_ref_gapped = get_imgt_gapped_reference_genes()
ref_codon_usage = get_reference_v_codon_usage()

bt_view = '<button type="button" id="btn_view_seq" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s" data-toggle="tooltip" title="View"><span class="glyphicon glyphicon-search"></span>&nbsp;</button>' \
% (format_nuc_sequence(item.nt_sequence, 50), item.sequence_id, format_fasta_sequence(item.sequence_id, item.nt_sequence, 50))

if item.sequence_id in imgt_ref[item.genotype_description.submission.species]:
if item.nt_sequence.lower() == imgt_ref[item.genotype_description.submission.species][item.sequence_id]:
icon = 'glyphicon-ok'
Expand Down Expand Up @@ -65,6 +62,7 @@ def td_contents(self, item, attr_list):
bt_runs = ''
bt_hotspots = ''
bt_ref_found = ''
annots = []

if item.sequence_id not in imgt_ref[item.genotype_description.submission.species]:
if item.closest_reference not in imgt_ref[item.genotype_description.submission.species]:
Expand All @@ -85,7 +83,7 @@ def td_contents(self, item, attr_list):
mismatch = 0
aligned = True

for (r,s) in zip(list(ref_nt), list(seq_nt)):
for (r,s) in zip(ref_nt, seq_nt):
if r != s:
mismatch += 1

Expand All @@ -99,17 +97,18 @@ def td_contents(self, item, attr_list):
if item.genotype_description.sequence_type == 'V' and item.genotype_description.locus+'V' in ref_codon_usage[item.genotype_description.submission.species]:
try:
q_codons = []
ref_aa_gapped = list(imgt_ref_gapped[item.genotype_description.submission.species][item.closest_reference].upper().translate(gap='.'))
ref_aa_gapped = imgt_ref_gapped[item.genotype_description.submission.species][item.closest_reference].upper().translate(gap='.')
seq_aa = Seq(item.nt_sequence.upper(), generic_dna).translate()

seq_aa_gapped = gap_sequence(seq_aa, ref_aa_gapped)
seq_aa_gapped = list(seq_aa_gapped)
family = find_family(item.closest_reference)

for i in range( min(len(ref_aa_gapped), len(seq_aa_gapped))):
if ref_aa_gapped[i] != seq_aa_gapped[i] and '*' not in (ref_aa_gapped[i], seq_aa_gapped[i]) and '.' not in (ref_aa_gapped[i], seq_aa_gapped[i]):
if seq_aa_gapped[i] not in ref_codon_usage[item.genotype_description.submission.species][item.genotype_description.locus + 'V'][family][i+1]:
q_codons.append("%s%d" % (seq_aa_gapped[i], i+1))
j = len(seq_aa_gapped[:i].replace('.', ''))
annots.append((3*j, 3, '%s%d previously unreported in this family' % (seq_aa_gapped[i], i+1)))

if len(q_codons) > 0:
bt_codon_usage = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Amino Acid(s) previously unreported in this family: %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(q_codons)
Expand All @@ -119,39 +118,46 @@ def td_contents(self, item, attr_list):

# Check for lengthened strings of the same base

ref_qpos = [m.start() for m in re.finditer('(.)\\1+\\1+\\1+', str(ref_nt))]
seq_qpos = [m.start() for m in re.finditer('(.)\\1+\\1+\\1+', str(seq_nt))]

q_runs = []

# walk up each identified repeat of 4nt or more, flag any differences
for p in seq_qpos:
if p not in ref_qpos:
q_runs.append("%d" % find_gapped_index(p, item.genotype_description.submission.species, item.closest_reference))
rep_c = seq_nt[p]
i = p
while i < len(seq_nt) and i < len(ref_nt) and seq_nt[i] == rep_c:
if ref_nt[i] != rep_c:
q_runs.append("%d" % find_gapped_index(i, item.genotype_description.submission.species, item.closest_reference))
annots.append((i, 1, 'Possible repeated read error'))
break
i += 1

if len(q_runs) > 0:
bt_runs = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Possible repeated read errors at IMGT position(s) %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(q_runs)

# Check for RGYW/WRCY hotspot change

ref_qpos = [m.start() for m in re.finditer('[AG][GC][CT][AT]', str(ref_nt))]
seq_qpos = [m.start() for m in re.finditer('[AG][GC][CT][AT]', str(seq_nt))]
ref_qpos = [m.start() for m in re.finditer('[AG][G][CT][AT]', str(ref_nt))]

q_hotspots= []

for p in seq_qpos:
if p in ref_qpos and list(ref_nt)[p+1] != list(seq_nt)[p+1]:
for p in ref_qpos:
if ref_nt[p+1] != seq_nt[p+1]:
q_hotspots.append("%d" % find_gapped_index(p+1, item.genotype_description.submission.species, item.closest_reference))
annots.append((p+1, 1, 'G/C SNP in RGYW hotspot'))

ref_qpos = [m.start() for m in re.finditer('[AT][AG][GC][CT]', str(ref_nt))]
seq_qpos = [m.start() for m in re.finditer('[AT][AG][GC][CT]', str(seq_nt))]
ref_qpos = [m.start() for m in re.finditer('[AT][AG][C][CT]', str(ref_nt))]

for p in seq_qpos:
if p in ref_qpos and list(ref_nt)[p+2] != list(seq_nt)[p+2]:
for p in ref_qpos:
if ref_nt[p+2] != seq_nt[p+2]:
q_hotspots.append("%d" % find_gapped_index(p+2, item.genotype_description.submission.species, item.closest_reference))
annots.append((p+2, 1, 'C/G SNP in WRCY hotspot'))

if len(q_hotspots) > 0:
bt_hotspots = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="G/C SNP in RGYW/WRCY hotspot at IMGT position(s) %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(q_hotspots)

bt_view = popup_seq_button(item.sequence_id, item.nt_sequence, item.nt_sequence_gapped, annots=annots)

return bt_view + bt_check + bt_imgt + bt_igpdb + bt_vdjbase + bt_indels + bt_codon_usage + bt_runs + bt_hotspots + bt_ref_found

class GenTitleCol(StyledCol):
Expand Down
10 changes: 5 additions & 5 deletions forms/sequence_view_form.py
Expand Up @@ -85,11 +85,11 @@ def setup_sequence_view_tables(db, seq, private):
elif fn == 'coding_seq_imgt':
if field['value'] is not None and len(field['value']) > 0:
if seq.sequence_type == 'V':
field['value'] = Markup('<button id="seq_coding_view" name="seq_coding_view" type="button" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s" data-toggle="tooltip" title="View"><span class="glyphicon glyphicon-search"></span>&nbsp;</button>' \
% (format_imgt_v(seq.coding_seq_imgt, 52) + trailer_text, seq.sequence_name, format_fasta_sequence(seq.sequence_name, seq.coding_seq_imgt, 50)))
field['value'] = Markup(popup_seq_button(seq.sequence_name, seq.coding_seq_imgt.replace('.', ''), seq.coding_seq_imgt))
else:
field['value'] = Markup('<button id="seq_coding_view" name="seq_coding_view" type="button" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s"><span class="glyphicon glyphicon-search" data-toggle="tooltip" title="View"></span>&nbsp;</button>' \
% (format_nuc_sequence(seq.sequence, 50) + trailer_text, seq.sequence_name, format_fasta_sequence(seq.sequence_name, seq.sequence, 50)))
field['value'] = Markup(popup_seq_button(seq.sequence_name, seq.coding_seq_imgt, ''))
# field['value'] = Markup('<button id="seq_coding_view" name="seq_coding_view" type="button" class="btn btn-xs text-info icon_back" data-toggle="modal" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s"><span class="glyphicon glyphicon-search" data-toggle="tooltip" title="View"></span>&nbsp;</button>' \
# % (format_nuc_sequence(seq.sequence, 50) + trailer_text, seq.sequence_name, format_fasta_sequence(seq.sequence_name, seq.sequence, 50)))
else:
field['value'] = 'None'
elif fn == 'release_description':
Expand Down Expand Up @@ -127,4 +127,4 @@ def setup_sequence_view_tables(db, seq, private):
t._cols['body'].th_html_attrs['class'] += ' row-80'
tables['history'].append(t)

return tables
return tables

0 comments on commit 6860e07

Please sign in to comment.