In [None]:
import binascii
import gzip
import json
import os
import sys
import re

from io import BytesIO

import pandas as pd
import json
import altair as alt

from altair_saver import save
from Bio import SeqIO
from IPython.display import HTML
from onecodex import Api
from onecodex.notebooks.report import set_style, title

In [None]:
# One Codex API client; used below to look up the sample and download its file.
ocx = Api()

In [None]:
# Report environment; falls back to "draft" when ONE_CODEX_REPORT_ENV is unset.
ENVIRONMENT = os.environ.get("ONE_CODEX_REPORT_ENV", "draft")

if ENVIRONMENT != "production":
    # Non-production runs have no sample UUID. The filename set here is only a
    # placeholder: it is replaced by the sample's real filename further down.
    sample_uuid, sample_filename = None, "sample.fastq"
else:
    # Production runs must provide the sample UUID (KeyError if it is missing).
    sample_uuid = os.environ["ONE_CODEX_SAMPLE_UUID"]

sample = ocx.Samples.get(sample_uuid)
assert sample is not None, "Sample does not exist"

# Download the sample unless a file with its name already exists locally.
sample_filename = sample.filename
if not os.path.exists(sample_filename):
    sample.download()

In [None]:
# output paths
VARIANTS_TSV_PATH = "variants.tsv"
NEXTCLADE_JSON = "nextclade.json"
NEXTCLADE_TSV_PATH = "nextclade.tsv"
PANGOLIN_CSV_PATH = "pangolin.csv"
BAM_PATH = "covid19.bam"
CONSENSUS_PATH = "consensus.fa"

# input paths (each can be overridden through the environment)
REFERENCE_PATH = os.environ.get(
    "FASTA_REFERENCE", "/share/nCoV-2019.reference.fasta"
)

# default illumina + ivar pipeline
BED_FILE_PATH = os.environ.get("BED_FILE_PATH", "/share/ARTIC-V1.bed")


def reference_name_from_path(path, suffix=".fasta"):
    """Return the base file name of ``path`` with a trailing ``suffix`` removed.

    ``str.rstrip(".fasta")`` must not be used for this: it strips any trailing run
    of the *characters* ``.``, ``f``, ``a``, ``s``, ``t`` rather than the literal
    suffix, so e.g. ``delta.fasta`` would become ``del``.

    Args:
        path: Path to the reference FASTA file.
        suffix: Literal suffix to remove when the file name ends with it.

    Returns:
        The file name without its directory and, if present, without ``suffix``.
        Names that do not end with ``suffix`` are returned unchanged.
    """
    name = os.path.basename(path)
    if suffix and name.endswith(suffix):
        return name[: -len(suffix)]
    return name


# Human-readable reference name, used as part of the coverage plot title.
REFERENCE_NAME = reference_name_from_path(REFERENCE_PATH)

In [None]:
if os.getenv("SEQUENCING_PLATFORM") == "Oxford Nanopore":
#     print("Using ONT ARTIC v3 pipeline to call variants")
    MIN_DEPTH = 50
    !/usr/local/bin/covid19_call_variants.artic.sh {sample_filename} > variants.log 2>&1
else:
#     print("Using short-read ARTIC v1 pipeline to call variants")
    MIN_DEPTH = 10
    !/usr/local/bin/covid19_call_variants.sh {REFERENCE_PATH} {sample_filename} {BED_FILE_PATH} > variants.log 2>&1

In [None]:
### Before proceeding, do QC on the consensus sequence.
# Two checks per consensus record: the number of N bases, and the length of the
# longest stretch made up only of A/T/G/C.

error_messages = []
for record in SeqIO.parse(CONSENSUS_PATH, "fasta"):
    n_count = record.seq.count("N")
    if n_count > 20000:
        error_messages.append(
            "The consensus sequence has too many ambiguous bases: "
            + str(n_count)
            + " N's against the 29,903 base reference sequence."
        )

    # Every character other than A/T/G/C acts as a separator between unambiguous runs
    longest_unambiguous = max(len(run) for run in re.split(r"[^ATGC]", str(record.seq)))
    if longest_unambiguous < 10000:
        error_messages.append(
            "The consensus sequence is too incomplete for GISAID submission: the longest stretch of unambiguous bases is only "
            + str(longest_unambiguous)
            + " bases (must be over 10,000)."
        )

# Snapshot of the messages collected so far, joined into a single string
error_dict = {"msg": " ".join(error_messages)}

In [None]:
# post-process variants
!post_process_variants.sh consensus.fa > variants.log 2>&1

In [None]:
# Parse the reference FASTA; the length of its first record is the genome length
# used for coverage calculations and plotting.
reference = list(SeqIO.parse(REFERENCE_PATH, "fasta"))
first_reference_record = reference[0]
reference_length = len(first_reference_record)

In [None]:
# Write per-position read depth for the BAM to snps.depth (stderr is discarded).
# The file is parsed below as tab-separated reference / position / depth rows.
!samtools depth $BAM_PATH > snps.depth 2> /dev/null

In [None]:
# Total read count, taken from the sample's primary classification results.
classification_results = sample.primary_classification.results()
n_reads = classification_results["n_reads"]

In [None]:
# Count BAM records, skipping any record that has a flag bit in 2308 set
# (2308 = 4 + 256 + 2048: unmapped, secondary and supplementary alignments).
samtools_view_output = !samtools view -F 2308 $BAM_PATH | wc -l
# The captured shell output is a list of lines; the first line holds the count.
n_mapped_reads = int(samtools_view_output[0])
# Fraction of all reads that mapped.
# NOTE(review): raises ZeroDivisionError if n_reads is 0 — confirm that cannot happen upstream.
proportion_mapped_reads = n_mapped_reads / n_reads

In [None]:
# Parse snps.depth: one tab-separated row per position (reference, position, depth).
with open("snps.depth") as depth_file:
    depth_records = [
        {"reference": fields[0], "position": int(fields[1]), "depth": int(fields[2])}
        for fields in (raw_line.strip().split("\t") for raw_line in depth_file)
    ]

# Explicit columns keep the frame's layout stable even when the file has no rows.
depth_table = pd.DataFrame(depth_records, columns=["reference", "position", "depth"])

In [None]:
# Calculate genome coverage: the fraction of reference positions covered at >= 1x
# and at >= MIN_DEPTH. The denominator is the fixed reference length, so positions
# that do not appear in depth_table count as uncovered.

has_any_depth = depth_table["depth"] >= 1
has_min_depth = depth_table["depth"] >= MIN_DEPTH

# Sets de-duplicate positions, so a position is counted once however many rows it has
covered_sites = set(depth_table.loc[has_any_depth, "position"])
covered_sites_mindepth = set(depth_table.loc[has_min_depth, "position"])

cov = len(covered_sites) / reference_length
cov_mindepth = len(covered_sites_mindepth) / reference_length

if cov <= 0.9:
    error_messages.append("The consensus sequence is too incomplete for GISAID submission (reads must span >90% of the reference).")

In [None]:
# Get the mean depth over fixed-width windows because altair can't handle > 5k points.
# max(1, ...) keeps the step valid for references shorter than 4,500 bp, where the
# integer division alone would give 0 and make range() raise.
window_width = max(1, reference_length // 4500)

window_means = []
for window_start in range(1, reference_length, window_width):
    # Half-open window [window_start, window_start + window_width): the first position
    # of each window is included, so every position falls into exactly one window.
    in_window = (depth_table["position"] >= window_start) & (
        depth_table["position"] < window_start + window_width
    )
    # Windows with no rows in depth_table get a NaN mean
    window_means.append(
        {"position": window_start, "depth": depth_table.loc[in_window, "depth"].mean()}
    )

# Explicit columns so the frame is well-formed even if there are no windows at all
binned_depths = pd.DataFrame(window_means, columns=["position", "depth"])
# Convert position from bp to kbp, to improve how the coverage plot looks
binned_depths["position"] = binned_depths["position"] / 1000

# Summary statistics over the rows present in depth_table (0 when it is empty)
mean_depth = depth_table["depth"].mean() if not depth_table.empty else 0
median_depth = depth_table["depth"].median() if not depth_table.empty else 0

In [None]:
# Load the Pangolin results CSV (comma-separated, pandas' default delimiter).
# The Nextclade TSV is not read: everything needed comes from the Nextclade JSON.
pangolin_table = pd.read_csv(PANGOLIN_CSV_PATH)

# NOTE(review): the original note here said "Add to results.json"; the Pangolin table
# is not part of the results dictionary written at the end of the notebook.

In [None]:
# Load the Nextclade JSON and keep its single result.
# Everything in the Nextclade JSON (nt positions, ranges, codon positions) is 0-indexed,
# whereas SARS-CoV-2 variants (and most things) are reported 1-indexed.

with open(NEXTCLADE_JSON) as json_file:
    nextclade_results = json.load(json_file)

# The report describes one sample, so exactly one result is expected
assert len(nextclade_results) == 1, f"expected exactly 1 result in: {nextclade_results}"
nextclade_json = nextclade_results[0]

In [None]:
# Collect a warning for each kind of indel Nextclade reports
# (ONT does not reliably detect these), then show the warnings.
warnings = [
    message
    for field, message in (
        ("insertions", "Insertions are detected."),
        ("deletions", "Deletions are detected."),
    )
    if nextclade_json[field] != []
]

for warning in warnings:
    display(warning)

In [None]:
!grep -v "^#" variants.vcf > variants.vcf.noheaders

if os.path.getsize("variants.vcf.noheaders") == 0: # If there are no variants
    n_snps = 0
    n_snps_mindepth = 0
    variant_table = pd.DataFrame()
else: # If there are variants, generate a variants table
    rows_list = []
    for subst in nextclade_json['substitutions']: # Each substitution is a dictionary
        dict1 = {}
        dict1['Position'] = subst['pos'] + 1 # JSON positions are 0-indexed; convert to 1-index
        dict1['Ref'] = subst['refNuc']
        dict1['Alt'] = subst['queryNuc']
        if len(subst['aaSubstitutions']) != 0:
            for mutation in subst['aaSubstitutions']: # JSON codons are 0-indexed; convert to 1-index
                dict1['Amino acid mutation'] = mutation['refAA'] + str(mutation['codon']+1) + mutation['queryAA']
        else:
            dict1['Amino acid mutation'] = ''
        rows_list.append(dict1)

    variant_table = pd.DataFrame(rows_list)
    
    # Add in gene info
    df_orfs = pd.read_csv("./annot_table.orfs.txt", \
        sep="\t", \
        header=None, \
        usecols=[0, 1, 2], \
        names=["gene", "start", "stop"])

    for i in variant_table.index:
        for j in df_orfs.index:
            if df_orfs.loc[j, "start"] <= variant_table.loc[i, "Position"] <= df_orfs.loc[j, "stop"]:
                variant_table.loc[i, "Gene"] = df_orfs.loc[j, "gene"]
                
    # Add in depth info
    variant_table = variant_table.set_index('Position')
    df_vcf = pd.read_csv("variants.vcf.noheaders", \
                         sep='\t', \
                         usecols=[1,7], \
                         names=['position','info'], \
                         index_col=['position']\
                        )

    sr = [ [ int(n) for n in x[1].split(';')[0].split(',') ] for x in df_vcf['info'].str.rsplit(";SR=") ] 
    assert { len(x) for x in sr } == {4}
    df_vcf['Ref depth'] = [ sum(n[:2]) for n in sr ] 
    df_vcf['Alt depth'] = [ sum(n[2:]) for n in sr ]
    df_vcf['Total depth'] = df_vcf['Ref depth'] + df_vcf['Alt depth']
    
    summed = df_vcf.reset_index()[['position', 'Ref depth', 'Alt depth', 'Total depth']].groupby('position').agg(sum)
    
    summed['Alt frequency (%)'] = summed['Alt depth']/(summed['Alt depth'] + summed['Ref depth'])*100


In [None]:
# Render floats with thousands separators and two decimals in displayed tables.
pd.set_option("display.float_format", '{:,.2f}'.format)

# Left-join the per-position depth summary onto the variants; both are indexed by position,
# and variants without a depth row keep missing values.
combo = variant_table.merge(summed, left_index=True, right_index=True, how='left')

# Report columns in display order; missing values are shown as '-'.
display_columns = ['Ref', 'Alt', 'Alt depth', 'Total depth', 'Alt frequency (%)', 'Gene', 'Amino acid mutation']
display_table = combo.fillna('-').loc[:, display_columns]

In [None]:
# Number of variants whose total depth exceeds MIN_DEPTH; variants with no depth
# value compare False and are therefore not counted.
n_snps_mindepth = int((combo['Total depth'] > MIN_DEPTH).sum())

In [None]:
# Nextclade: total private-mutation count from the QC block, and the clade assignment.
nextclade_qc = nextclade_json['qc']
nextclade_pm_count = nextclade_qc['privateMutations']['total']
nextclade_lineage = nextclade_json['clade']

# Pangolin: lineage and PangoLEARN version, both read from the first row of the table.
pangolin_lineage, pangolin_version = (
    pangolin_table[column].iloc[0] for column in ('lineage', 'pangoLEARN_version')
)

In [None]:
# Report title, rendered with the `title` helper imported from onecodex.notebooks.report.
title("SARS-CoV-2 (COVID-19) Sequencing Overview")

In [None]:
# Introductory summary paragraph(s) of the report.

# The benchmarking note describes the ONT-specific depth threshold, so it is only
# included for Oxford Nanopore runs, and it quotes MIN_DEPTH instead of a literal 50
# so the text cannot drift from the threshold that was actually applied.
min_depth_note = ""
if os.getenv("SEQUENCING_PLATFORM") == "Oxford Nanopore":
    min_depth_note = f"""
<p>A minimum depth of {MIN_DEPTH}x was chosen for confident SNV detection based on <a href="https://doi.org/10.1038/s41467-020-20075-6">benchmarking</a> of SARS-CoV-2 sequencing data generated with ARTIC network amplicon protocols and ONT sequencing. This benchmarking study also concludes that ONT sequencing is unsuitable for detection of small indel variants, which we do not report here.</p>
"""

# Note the leading space in ' was': without it the singular case renders as "variantwas".
text = f"""
This report summarizes the detection of SARS-CoV-2 single-nucleotide variants (SNVs) in sample 
<strong>{sample_filename}</strong>.
{min_depth_note}
<p>This sample contained <strong>{n_reads:,}</strong> reads, with
<strong>{proportion_mapped_reads:.1%}</strong> mapping to the 
<a href='https://www.ncbi.nlm.nih.gov/nuccore/MN908947.3/' target='_blank'>Wuhan-Hu-1 reference</a>.
Reads span <strong>{cov:.0%}</strong> of the genome, with a mean depth of <strong>{mean_depth:.0f}x</strong>, and {cov_mindepth:.0%} of the genome covered at depths &gt;{MIN_DEPTH}x.</p>

<p>A total of <strong>{n_snps_mindepth}</strong> variant{'s were' if n_snps_mindepth != 1 else ' was'} detected at depths &gt;{MIN_DEPTH}x.
This genome is classified as Pangolin lineage <strong>{pangolin_lineage}</strong> using PangoLEARN version {pangolin_version} and Nextclade lineage <strong>{nextclade_lineage}</strong> with {nextclade_pm_count} private mutation{'s' if nextclade_pm_count != 1 else ''}.</p>"""

HTML(text)

In [None]:
# Coverage plot: smoothed depth along the genome.
# The x values in binned_depths are in kb, so the axis domain must be in kb as well.
# True division keeps the fractional part: integer division would cut the domain at
# 29 kb for a ~29.9 kb genome and push the end of the coverage track off the axis.
reference_length_kb = reference_length / 1000

plot = (
    alt.Chart(binned_depths)
    .mark_area()
    # Smooth the binned depths with a centred rolling mean (50 bins on either side)
    .transform_window(rolling_mean="mean(depth)", frame=[-50, 50])
    .encode(
        x=alt.X(
            "position",
            title="Genomic Coordinate (kb)",
            scale=alt.Scale(domain=[0, reference_length_kb]),
        ),
        y=alt.Y("rolling_mean:Q", scale=alt.Scale(type="linear"), title="Depth"),
    )
    .properties(
        title=f"SARS-CoV-2 ({REFERENCE_NAME})",
        width=550,
        height=150,
    )
)
plot

In [None]:
# Show the per-variant table built above ('-' marks missing values).
display(display_table)

In [None]:
if os.path.getsize("variants.vcf.noheaders") != 0:  # If there are variants
    # Variants with total depth above MIN_DEPTH. Variants without a depth value
    # compare False, so they count as "not shown".
    is_confident = combo["Total depth"] > MIN_DEPTH

    # An HTML(...) object is only rendered automatically when it is the last expression
    # of a cell; inside this `if` block it has to be passed to display() explicitly.
    # The position is the table index, so it is turned back into a regular column
    # before rendering (index=False would otherwise drop it from the output).
    shown_variants = (
        combo.loc[is_confident, list(variant_table.columns)]
        .rename_axis("Position")
        .reset_index()
    )
    display(HTML(shown_variants.to_html(index=False)))
    legend_text = "SARS-CoV-2 variants."

    # Variants left OUT of the table above (the complement of the displayed set)
    n_extra_variants = int((~is_confident).sum())

    if n_extra_variants > 0:
        # "&le;" because the table keeps depth > MIN_DEPTH; the entity also avoids
        # emitting a raw "<" into the HTML.
        legend_text += f" An additional {n_extra_variants} variant{'s' if n_extra_variants > 1 else ''} &le;{MIN_DEPTH}× depth {'are' if n_extra_variants > 1 else 'is'} not shown."

    # Link to the downloadable files when the report UUID is known
    report_uuid = os.environ.get("ONE_CODEX_REPORT_UUID")
    if report_uuid:
        files_url = 'https://app.onecodex.com/report/' + report_uuid + '/files'
        legend_text += f""" 
             A variants TSV and consensus FASTA is available <a target="_blank" href="{files_url}">here</a>.
            """
    display(
        HTML(
            '<div style="text-align: center; padding-top: 10px; font-size: 0.7em; color: #777;"><em>'
            + legend_text
            + "</em></div>"
        )
    )

### Additional Resources

- Additional bioinformatics pipeline details are [available on GitHub](https://github.com/onecodex/sars-cov-2)
- [Nextstrain](https://nextstrain.org/ncov) maintains an up-to-date analysis of SARS-CoV-2 (HCoV-19).
- The [Global Initiative on Sharing All Influenza Data (GISAID)](https://www.gisaid.org/) hosts viral genomes from ongoing outbreaks. Please [contact us](mailto:hello@onecodex.com) for help submitting your data.

In [None]:
# Add One Codex report ID to footer for reproducibility/data provenance (not yet in v0.7.2)
# The ID prefix is only added when ONE_CODEX_REPORT_UUID is set and non-empty.
report_uuid = os.environ.get('ONE_CODEX_REPORT_UUID')
footer_prefix = report_uuid + ' -' if report_uuid else ''

HTML(
    f"""
<style type='text/css'>
@page {{
    @bottom-center {{
        content: "{footer_prefix} NOT FOR DIAGNOSTIC USE" !important;
    }}
}}
</style>
"""
)

In [None]:
# Save a JSON too, including the variants filtered out of the report for low depth.
# The position is the index of both variant tables and `to_dict(orient="records")`
# drops the index, so it is turned into a regular "Position" column first; otherwise
# the exported variants would carry no position at all.
results = {
    "n_reads": n_reads,
    "n_mapped_reads": n_mapped_reads,
    "report_id": os.environ.get("ONE_CODEX_REPORT_UUID"),
    "sample_id": os.environ.get("ONE_CODEX_SAMPLE_UUID"),
    "variants": combo.rename_axis("Position").reset_index().to_dict(orient="records"),
    "coverage": cov,
    # Key name kept for existing consumers; the value is coverage at >= MIN_DEPTH,
    # which is not 50x for every pipeline.
    "coverage_over_50x": cov_mindepth,
    "mean_depth": mean_depth,
    "median_depth": median_depth,
    "nextclade_results": nextclade_json,
    "variant_table": variant_table.rename_axis("Position").reset_index().to_dict(orient="records"),
    "warnings": warnings,
}

with gzip.open(f"{sample.filename}.report.json.gz", "w") as f:
    f.write(json.dumps(results).encode())

if error_messages:
    # Build the payload from the current message list: messages are also appended
    # after the consensus QC step (the coverage check), so a dictionary created
    # during QC would miss them and could even be written with an empty "msg".
    error_dict = {"msg": " ".join(error_messages)}
    with open("error.json", "w") as handle:
        json.dump(error_dict, handle)

In [None]:
# Clean up files: the sample file plus the intermediate depth, log and BAM index files.
# NOTE(review): {sample.filename} is interpolated into the shell command unquoted — a name
# containing spaces or shell metacharacters would break it; confirm filenames are safe.
!rm -f {sample.filename} snps.depth variants.log covid19.bam.bai