# Inspect Sample with IGV
### Author: Jonn Smith
### Date: 2023/03/15
Enter a cohort ID and inspect that cohort's reads and variants in an IGV Notebook session.
***

## Input Your Cohort ID:

In [1]:
# ID of the cohort to inspect; it is matched against the
# `entity:sample_set_set_id` column of the cohort table further down.
cohort_id = "Broad_2019_Senegal_Dataset_With_VCFs_2"

***

## Setup

In [2]:
import os
import json
import math

import firecloud.api as fapi
import pandas as pd
import numpy as np

from collections import namedtuple
from IPython.display import HTML

from tqdm.notebook import tqdm

#################################################################

## From: https://stackoverflow.com/a/55804230
# Obtain the default Google credentials for this environment so that an OAuth
# token is available (it is handed to the IGV browser settings later on).
import google.auth
import google.auth.transport.requests
creds, project = google.auth.default()

# creds.valid is False, and creds.token is None
# Need to refresh credentials to populate those

auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

#################################################################

# Workspace coordinates are read from environment variables; a missing variable
# raises KeyError here. (Presumably these are set by the hosted notebook
# runtime -- confirm.)
bucket = os.environ['WORKSPACE_BUCKET']
workspace = os.environ['WORKSPACE_NAME']
namespace = os.environ['WORKSPACE_NAMESPACE']

# Names of the workspace entity tables. Note the mapping: flowcells live in
# `sample`, samples in `sample_set`, and cohorts in `sample_set_set`.
flowcell_table = "sample"
sample_table = "sample_set"
cohort_table = "sample_set_set"

#################################################################

# Echo the resolved configuration; the empty string yields the blank separator line.
print(
    f"Namespace: {namespace}",
    f"Workspace: {workspace}",
    f"Bucket:    {bucket}",
    "",
    f"Flowcell table: {flowcell_table}",
    f"Sample table:   {sample_table}",
    f"Cohort table:   {cohort_table}",
    sep="\n",
)

Namespace: broad-firecloud-dsde-methods
Workspace: sr-malaria
Bucket:    gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd

Flowcell table: sample
Sample table:   sample_set
Cohort table:   sample_set_set


## Install IGV

***

In [3]:
!pip install igv_notebook



## Load data

In [4]:
def load_table(namespace, workspace, table_name, store_membership=False, membership_column="samples"):
    """Fetch a workspace entity table and return it as a DataFrame of strings.

    Args:
        namespace: Workspace namespace passed to ``fapi.get_entities``.
        workspace: Workspace name passed to ``fapi.get_entities``.
        table_name: Entity type to fetch; also used to build the ID column
            name ``entity:<table_name>_id``.
        store_membership: When True, pull ``membership_column`` out of the
            table and return it separately as a list of sets of entity names.
        membership_column: Name of the column holding the member references.

    Returns:
        ``(table, membership)``. ``table`` is a DataFrame with the ID column
        first and every value cast to ``str``, or ``None`` when no entities
        were returned. ``membership`` is ``None`` unless ``store_membership``
        is True and entities were returned.
    """
    id_column = f"entity:{table_name}_id"
    entities = fapi.get_entities(namespace, workspace, table_name).json()

    if not len(entities) > 0:
        return None, None

    table = pd.DataFrame([entity['attributes'] for entity in entities])
    table[id_column] = [entity['name'] for entity in entities]

    membership = None
    if store_membership:
        membership = [
            {item['entityName'] for item in cell['items']}
            for cell in table[membership_column]
        ]
        del table[membership_column]

    # Put the ID column first, then everything else in its existing order.
    remaining_columns = [col for col in table.columns if col != id_column]
    table = table[[id_column] + remaining_columns].astype(str)

    return table, membership

# We also have to remove any `nan` values here:
def fix_nans(df, quiet=True):
    """Return a copy of ``df`` with every missing value replaced by an empty string.

    Two kinds of missing value are handled: the literal string ``"nan"`` and
    real NaN values.

    Args:
        df: DataFrame to clean. Cells equal to the string ``"nan"`` are also
            blanked in place on this object, as in the previous implementation.
        quiet: When False, print a per-column count of replaced values.

    Returns:
        A new DataFrame in which ``"nan"`` strings and NaN values are ``""``.
    """
    if not quiet: print("Replacing all `nan` values with empty strings: ")
    for c in df.columns.values:
        column = df[c]
        is_nan_string = column == "nan"
        # `x == float('nan')` is never True, so real NaNs must be found with isna().
        num_denaned = int(is_nan_string.sum()) + int(column.isna().sum())
        if is_nan_string.any():
            df.loc[is_nan_string, c] = ""
        if num_denaned > 0 and not quiet:
            print(f"\t{c}: {num_denaned}")

    if not quiet: print("Replacing numpy nan values...")
    cleaned = df.replace(np.nan, "")
    # Report completion only after the replacement has actually happened.
    if not quiet: print("Done.")
    return cleaned

In [5]:
print("Loading Cohort Table...\t", end="")
tbl_cohort, _ = load_table(namespace, workspace, cohort_table)
# load_table returns None for the table when no entities come back; fail here
# with a clear message instead of an AttributeError inside fix_nans.
if tbl_cohort is None:
    raise RuntimeError(f"Table is empty: {cohort_table}")
print("DONE")

print("Cleaning NaN values from data...\t", end="")
tbl_cohort = fix_nans(tbl_cohort)
print("DONE")

Loading Sample Table...	DONE
Cleaning NaN values from data...	DONE


In [6]:
# Select the row for the requested cohort; exactly one row must match.
cohort_row = tbl_cohort[tbl_cohort[f"entity:{cohort_table}_id"] == cohort_id]
if len(cohort_row) == 0:
    message = f"Error: cohort does not exist in the data: {cohort_id}"
    print(message)
    print()
    raise RuntimeError(message)
elif len(cohort_row) > 1:
    # The lookup is for a cohort, so the message says "cohort" rather than "sample".
    message = f"Error: cohort has multiple rows in the data: {cohort_id} (len={len(cohort_row)})"
    print(message)
    print()
    raise RuntimeError(message)

In [7]:
# Display the full cohort table for reference.
tbl_cohort

Unnamed: 0,entity:sample_set_set_id,raw_joint_vcf,genomicsDB,joint_mt,joint_recalibrated_vcf_tbi,raw_joint_vcf_tbi,sample_sets,joint_recalibrated_vcf,joint_zarr,annotated_joint_vcf,annotated_joint_vcf_tbi,snpEff_summary,snpEff_genes,drug_res_report
0,test_450_samples,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/s...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,"{'itemsType': 'EntityReference', 'items': [{'e...",gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/s...,,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/s...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/s...,,,
1,test_98_samples,,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",,,,,,,
2,MalariaGEN_Crosses,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,"{'itemsType': 'EntityReference', 'items': [{'e...",gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,,,,,,
3,Broad_2019_Senegal_Dataset_All_Samples,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,"{'itemsType': 'EntityReference', 'items': [{'e...",gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,,,
4,Broad_2019_Senegal_Dataset_With_VCFs,,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",,,,,,,
5,Broad_2022_Senegal_Dataset,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,"{'itemsType': 'EntityReference', 'items': [{'e...",gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...
6,Broad_2019_Senegal_Dataset_With_VCFs_2,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,"{'itemsType': 'EntityReference', 'items': [{'e...",gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
7,Broad_2022_Senegal_Dataset_2,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,"{'itemsType': 'EntityReference', 'items': [{'e...",gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/r...,
8,Pf7_qc_passed_chunk_0_passed,,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",,,,,,,
9,Pf7_qc_passed_chunk_1_passed,,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",,,,,,,


## Setup IGV

In [8]:
# IgvData describes one IGV track: its type, display name, and the names of
# the table columns holding the data path and the index path.
IgvData = namedtuple('IgvData', 'type name data_field index_field')
# GenomicLocus is a named interval on a contig.
GenomicLocus = namedtuple('GenomicLocus', 'name contig start end')

In [9]:
# Tracks to show in IGV; each entry names the cohort-table columns that hold
# the data file path and its index path.
data_for_igv = [
    IgvData(
        type='vcf',
        name='Variants (Joint Called / Rescored)',
        data_field="annotated_joint_vcf",
        index_field="annotated_joint_vcf_tbi",
    ),
]

In [10]:
# Do some validation here:
# Every configured track needs a non-empty data path for this cohort, and a
# non-empty index path whenever an index field is configured. The two failures
# get distinct messages that name the missing column.
for f in data_for_igv:
    if cohort_row[f.data_field].iloc[0] == "":
        raise RuntimeError(f"Cohort does not have data for '{f.name}' (field: {f.data_field}): {cohort_id}")
    if f.index_field and cohort_row[f.index_field].iloc[0] == "":
        raise RuntimeError(f"Cohort does not have an index for '{f.name}' (field: {f.index_field}): {cohort_id}")

In [11]:
# Named loci as (name, contig, start, end); they are turned into IGV regions
# of interest further down.
loci = [
    GenomicLocus(*fields)
    for fields in (
        ('dhfr', "Pf3D7_04_v3", 747897, 750065),
        ('mdr1', "Pf3D7_05_v3", 955955, 963095),
        ('pfcrt', "Pf3D7_07_v3", 402385, 406341),
        ('dhps', "Pf3D7_08_v3", 547896, 551057),
        ('hrp2', "Pf3D7_08_v3", 1373212, 1376988),
        ('kelch13', "Pf3D7_13_v3", 1724600, 1727877),
        ('hrp3', "Pf3D7_13_v3", 2840236, 2842840),
    )
]

In [12]:
# Defaults for variant tracks; these are passed through to the IGV track
# options `squishedCallHeight`, `expandedCallHeight`, `visibilityWindow` and
# `displayMode` respectively.
v_squished_height = 24
v_expanded_height = 400
v_viz_window = 10000
v_state = "expanded"

# Alpha (opacity) component of the rgba() colours used for regions of interest.
roi_alpha = 0.05

In [13]:
def generate_tracks_for_igv(data_for_igv, sample_row_df,
                           v_squished_height=v_squished_height,
                           v_expanded_height=v_expanded_height,
                           v_viz_window=v_viz_window,
                           v_state=v_state):
    """Build the list of IGV track configuration dicts for one table row.

    Args:
        data_for_igv: Iterable of ``IgvData`` entries describing the tracks.
        sample_row_df: DataFrame whose first row supplies the file paths; it is
            indexed by each entry's ``data_field`` / ``index_field`` column.
        v_squished_height: ``squishedCallHeight`` for variant tracks.
        v_expanded_height: ``expandedCallHeight`` for variant tracks.
        v_viz_window: ``visibilityWindow`` for variant tracks.
        v_state: ``displayMode`` for variant tracks.

    Returns:
        A list with one dict per entry, in input order. ``indexURL`` is only
        included when the entry has an ``index_field``.

    Raises:
        RuntimeError: If an entry's ``type`` is neither ``"bam"`` nor ``"vcf"``.
    """
    tracks = []
    for d in data_for_igv:
        if d.type == "bam":
            track = {
                "name": d.name,
                "url": sample_row_df[d.data_field].iloc[0],
                "format": "bam",
                "type": "alignment"
            }
        elif d.type == "vcf":
            track = {
                "type": "variant",
                "format": "vcf",
                "url": sample_row_df[d.data_field].iloc[0],
                "name": d.name,
                "squishedCallHeight": v_squished_height,
                "expandedCallHeight": v_expanded_height,
                "displayMode": v_state,
                "visibilityWindow": v_viz_window
            }
        else:
            raise RuntimeError(f"Unknown IgvData.type: {d.type}")

        # The index is optional (the validation step only checks it when set),
        # so do not index the row with an empty/None column name.
        if d.index_field:
            track["indexURL"] = sample_row_df[d.index_field].iloc[0]

        tracks.append(track)
    return tracks

In [14]:
def get_roi_entries_for_loci(loci, color=[3,52,249], alpha=roi_alpha):
    """Build one IGV region-of-interest entry per locus.

    Args:
        loci: Iterable of objects with ``name``, ``contig``, ``start`` and
            ``end`` attributes.
        color: Three-element RGB sequence used for every region. It is only
            read, never modified.
        alpha: Opacity used in the ``rgba(...)`` colour string.

    Returns:
        A list of dicts, each with ``name``, ``color`` and a single-entry
        ``features`` list holding ``chr``, ``start`` and ``end``.
    """
    # Honour the `alpha` argument; the global `roi_alpha` is only its default.
    rgba = f"rgba({color[0]},{color[1]},{color[2]},{alpha})"
    return [
        {
            "name": locus.name,
            "color": rgba,
            "features": [
                {
                    "chr": locus.contig,
                    "start": locus.start,
                    "end": locus.end
                },
            ]
        }
        for locus in loci
    ]
    

In [15]:
# Regions of interest: the core-genome BED file first, followed by one entry
# per named locus.
rois = [
    {
        "name": "Core Genome",
        "url": "gs://broad-dsp-lrma-pfcrosses/regions-20130225.onebased.Core.bed",
        "indexed": False,
        "color": f"rgba(94,255,1,{roi_alpha})"
    },
    *get_roi_entries_for_loci(loci),
]

In [16]:
# Full configuration handed to the IGV browser: credentials, reference genome
# (with its gene annotation track), the data tracks, and regions of interest.
igv_browser_settings = {
    # OAuth token from the refreshed Google credentials.
    # NOTE(review): the token becomes part of the browser configuration; check
    # whether it ends up in saved cell outputs before sharing the notebook.
    "oauthToken": creds.token,
    "reference": {
        "id": "Pf3D7",
        "name": "Plasmodium falciparum 3D7 (v61)",
        "fastaURL": "gs://broad-dsde-methods-long-reads/resources/references/plasmodb_release-61/Pfalciparum3D7/fasta/data/PlasmoDB-61_Pfalciparum3D7_Genome.fasta",
        "indexURL": "gs://broad-dsde-methods-long-reads/resources/references/plasmodb_release-61/Pfalciparum3D7/fasta/data/PlasmoDB-61_Pfalciparum3D7_Genome.fasta.fai",
        # Gene annotation track attached to the reference.
        "tracks": [{
            "name": "Genes",
            "url": "gs://broad-dsde-methods-long-reads-public/resources/malaria/pfalciparum3D7/PlasmoDB-61_Pfalciparum3D7.gff",
            # Presumably a large value so this track sorts after the data tracks -- confirm.
            "order": 1000000,
            "displayMode": "expanded",
            "nameField": "Name",
            "searchableFields": ["ID", "Name", "description", "ebi_biotype", "Parent", "gene_id", "protein_source_id"],
            # NOTE(review): marked indexed but no indexURL is given -- confirm an index is found.
            "indexed": True
        }]
    },
    # Data tracks built from the selected cohort row.
    "tracks": generate_tracks_for_igv(data_for_igv, cohort_row),
    "roi": rois
}

## Run IGV

In [17]:
# igv_notebook is imported here rather than at the top of the notebook because
# it is installed by an earlier cell.
import igv_notebook
igv_notebook.init()
igv_browser = igv_notebook.Browser(igv_browser_settings)

# Start at pfcrt:
# (these coordinates match the 'pfcrt' entry in `loci`)
igv_browser.search("Pf3D7_07_v3:402385-406341")

# Make our cell real wide:
display(HTML("<style>.container { width:100% !important; }</style>"))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>