Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
6dd16b7
bqsr sketch
lucidtronix May 23, 2018
7d086f3
write labeled tensors
lucidtronix Sep 17, 2018
d3f1f60
bqsr label tensors model
lucidtronix Sep 18, 2018
919aad6
anno model
lucidtronix Sep 18, 2018
0a60c8e
bqsring
lucidtronix Sep 18, 2018
89958f9
cleanup
lucidtronix Sep 18, 2018
666ad57
bqsr args
lucidtronix Sep 21, 2018
d8c45cc
change margin, skip soft clips, etc.
takutosato Sep 26, 2018
37b25bf
fix conflict
lucidtronix Sep 26, 2018
81981f4
bqsr rev complement, no more reverse annotation
lucidtronix Sep 26, 2018
e1bde7c
use lambda for metrics
takutosato Oct 11, 2018
16dae77
Revert "use lambda for metrics" aka put the lambda dream on hold
takutosato Oct 11, 2018
39d6831
next
takutosato Oct 11, 2018
35e8e98
visualize before changes
takutosato Oct 11, 2018
f8443a7
oh yea
takutosato Oct 11, 2018
677ff41
option to use input in log space
takutosato Oct 17, 2018
18411b6
support multiple chromosomes
takutosato Oct 23, 2018
30a0f33
handle bq 2 and write stats to a file
takutosato Oct 23, 2018
5010fe8
log args
takutosato Oct 23, 2018
9f2062e
log args for training mode
takutosato Oct 24, 2018
a940551
small changes
takutosato Oct 24, 2018
42e5ad1
support oq
takutosato Oct 29, 2018
cf4cbb4
option to skip q2
takutosato Oct 31, 2018
80a590b
KL divergence metric
takutosato Nov 1, 2018
afeb7ab
analysis notebook
takutosato Nov 1, 2018
a00742b
KL
takutosato Nov 1, 2018
c5e4101
abandon KL divergence metrics for now
takutosato Nov 6, 2018
c53a3d8
small
takutosato Nov 14, 2018
87481f9
data_dir
takutosato Nov 26, 2018
b870aba
generator supports oq bqsr coexistence
takutosato Nov 26, 2018
30469e7
move some funcitons to a utils file
takutosato Nov 26, 2018
cce3527
indentation mishap
takutosato Nov 26, 2018
0b7f61a
remove reads with q6 or lower from training data, and more flexble re…
takutosato Nov 29, 2018
7415c85
lesss gooooooooooo
takutosato Dec 4, 2018
c8531d9
comoeonnnnnnnnn
takutosato Dec 4, 2018
d59da01
kwickkkk
takutosato Dec 4, 2018
c435c1d
yup
takutosato Dec 6, 2018
1465dd2
sam wise well
takutosato Dec 7, 2018
1f40e24
get next batch in utils
takutosato Dec 7, 2018
c6f3114
add distnace in mean metric and kl divergence metric. the latter is s…
takutosato Dec 9, 2018
7b4c2b6
fixed KL divergence lets goooooo
takutosato Dec 9, 2018
1262e7e
distance in log space
takutosato Dec 9, 2018
d144480
fix bugs
takutosato Dec 10, 2018
beaed4d
clenaup
takutosato Dec 10, 2018
6df072f
indentation nightmare
takutosato Dec 10, 2018
1d4911f
tab hell
takutosato Dec 10, 2018
809fb7c
tabs to 4 spaces curteosy of sublime
takutosato Dec 10, 2018
5c938d5
variance in read
takutosato Dec 10, 2018
adfde4c
changes changes
takutosato Dec 11, 2018
66015fe
dna
takutosato Dec 11, 2018
34a776e
use spaces not tabs
takutosato Dec 11, 2018
a352230
plots notebook
takutosato Dec 17, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,282 changes: 1,282 additions & 0 deletions api_tutorials/bqsr_cnn.py

Large diffs are not rendered by default.

Empty file added api_tutorials/howd_we_do.py
Empty file.
69 changes: 69 additions & 0 deletions api_tutorials/recal_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np
import numpy.ma as ma
from scipy.stats import entropy


def get_next_batch(generator, quality_correction=0.0):
batch = next(generator)
tensor = batch[0][OQ_TENSOR_NAME]
bqsr = batch[0][BQSR_TENSOR_NAME]
label = batch[1]
pred = model.predict_on_batch(tensor)
pred_qscores = -10 * np.log10(
pred[:, :, args.labels['BAD_BASE']]) + quality_correction # +10 only if the tensor is generated with a bias

orig_qscores = -10 * np.log10(1 - np.max(tensor[:, :, :4], axis=2))
annot = tensor[:, 0, (args.input_symbols['pair'], args.input_symbols['mq'])]

return pred_qscores, orig_qscores, bqsr, label, annot


def tensor_to_quality_array(tensor):
'''
tensor : (batch_size, 151, 7)

returns : (batch_size, 151)
'''
return -10 * np.log10(1 - np.max(tensor[:, :, :4], axis=2))

def KL_divergence_metric(y_true, y_pred):
''' KL divergence metrics for Keras - still under construction '''

# maybe scikit learn
predicted_qs = -10*np.log10(y_pred[:,:, args.labels['BAD_BASE']])
match_qs = (predicted_qs[:,:,np.newaxis] * y_true)[:,:, args.labels['GOOD_BASE']]
mismatch_qs = (predicted_qs[:,:,np.newaxis] * y_true)[:,:, args.labels['BAD_BASE']]
match_qs = match_qs[match_qs > 0]
mismatch_qs = mismatch_qs[mismatch_qs > 0]

# bins are half open: 1 will go in the [1,2) bin
max_quality=50
match_hist, match_bins = np.histogram(np.round(match_qs), bins=max_quality, range = (0,max_quality))
mismatch_hist, mismatch_bins = np.histogram(np.round(mismatch_qs), bins=max_quality, range = (0,max_quality))

# compute the KL divergence KL(match||mismatch) - the order chosen arbitrariliy i.e. could've easily chosen KL(mismatch||match)
# mask bins with 0 probability mass because numpy doens't know 0*log(0)=0
ma_match_hist = ma.array(match_hist/np.sum(match_hist), mask=match_hist == 0)
ma_mismatch_hist = ma.array(mismatch_hist/np.sum(mismatch_hist), mask=match_hist == 0)
print(ma_match_hist)
print(ma_mismatch_hist)
print(entropy(ma_match_hist, ma_mismatch_hist))
KL = -ma.sum(ma_match_hist * ma.log(ma_mismatch_hist)) - ma.sum(- ma_match_hist * ma.log(ma_match_hist))
return KL

def KL_divergence(match_qs, mismatch_qs):
''' compute the KL divergence between the predicted qualities of bases that match the reference and those that don't
match_qs and mismatch_qs are both arrays of qualities, unsorted and unrounded, straight out of the CNN or SAM.
greater the KL divergence, the greater the separation between the two distributions
'''
# bins are half open: 1 will go in the [1,2) bin
max_quality=50
match_hist, match_bins = np.histogram(np.round(match_qs), bins=max_quality, range = (0,max_quality))
mismatch_hist, mismatch_bins = np.histogram(np.round(mismatch_qs), bins=max_quality, range = (0,max_quality))

# compute the KL divergence KL(match||mismatch) - the order chosen arbitrariliy i.e. could've easily chosen KL(mismatch||match)
# mask bins with 0 probability mass because numpy doens't know 0*log(0)=0
ma_match_hist = ma.array(match_hist, mask=match_hist == 0)
ma_mismatch_hist = ma.array(mismatch_hist, mask=match_hist == 0)
KL = -ma.sum(ma_match_hist * ma.log(ma_mismatch_hist)) - ma.sum(- ma_match_hist * ma.log(ma_match_hist))
return KL
4 changes: 3 additions & 1 deletion arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def is_broad_cluster():
import keras.backend as K

def parse_args():
import sys
print(sys.argv)
parser = argparse.ArgumentParser()

# Required mode argument: what would you like to do?
Expand Down Expand Up @@ -207,7 +209,7 @@ def weight_path_from_args(args):
Arguments:
args: puts arguments into the file name skips args in the ignore array
'''
save_weight_hd5 = args.output_dir + args.id + '.hd5'
save_weight_hd5 = args.output_dir + args.id + '.hd5'
print('save weight path:' , save_weight_hd5)
return save_weight_hd5

Expand Down
720 changes: 720 additions & 0 deletions quality/analyze.ipynb

Large diffs are not rendered by default.

643 changes: 643 additions & 0 deletions quality/quick_tests.ipynb

Large diffs are not rendered by default.

406 changes: 406 additions & 0 deletions quality/visualize.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scripts/hc_vqsr_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ SCATTER=200
# REFERENCE=/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta
# INTERVAL=/seq/references/Homo_sapiens_assembly38/v0/variant_calling/wgs_calling_regions.v1.interval_list

# Clinical project g947x NA12878 HG38
# Clinical project illumina NA12878 HG38
BAM=/dsde/data/deep/vqsr/bams/NA12878_S1.bam
BAMOUT=/dsde/data/deep/vqsr/bams/illumina_na12878_s1_bamout.bam
VCF=/dsde/data/deep/vqsr/vcfs/illumina_na12878_s1.vcf.gz
Expand Down
46 changes: 23 additions & 23 deletions scripts/write_tensors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -308,50 +308,50 @@ SPLIT_INTERVALS=/dsde/data/deep/vqsr/beds/wgs_10m_split_genome.interval_list
# BED_FILE=/dsde/data/deep/vqsr/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed

# Clinical NA12878 1ug
SAMPLE_NAME=SM-G9481
DOWNSAMPLE_SNPS=0.003
DOWNSAMPLE_INDELS=0.025
DOWNSAMPLE_NOT_SNPS=0.5
CHANNEL_ORDER=channels_last
MODE=write_paired_read_tensors
DATA_DIR=/dsde/data/deep/vqsr/tensors/g947_balanced_paired_read_channels_last/
TRAIN_VCF=/dsde/data/deep/vqsr/vcfs/nist_na12878_giab_hg38.vcf.gz
BAM_FILE=/dsde/data/deep/vqsr/bams/g94781_lod_1ug_na12878_bamout.bam
NEGATIVE_VCF=/dsde/data/deep/vqsr/vcfs/g94781_lod_1ug_na12878_hc4_merged.vcf.gz
SPLIT_INTERVALS=/dsde/data/deep/vqsr/beds/wgs_10m_split_genome_hg38.interval_list
REFERENCE=/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta
BED_FILE=/dsde/data/deep/vqsr/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed

# Clinical NA12878 g947m
# SAMPLE_NAME=SM-G9481
# DOWNSAMPLE_SNPS=0.003
# DOWNSAMPLE_INDELS=0.025
# DOWNSAMPLE_NOT_SNPS=0.5
# MODE=write_paired_read_tensors
# SAMPLE_NAME=SM-G947M
# CHANNEL_ORDER=channels_last
# MODE=write_paired_read_tensors
# DATA_DIR=/dsde/data/deep/vqsr/tensors/g947_balanced_paired_read_channels_last/
# TRAIN_VCF=/dsde/data/deep/vqsr/vcfs/nist_na12878_giab_hg38.vcf.gz
# BAM_FILE=/dsde/data/deep/vqsr/bams/g947m_o1d2v1_na12878_bamout.bam
# NEGATIVE_VCF=/dsde/data/deep/vqsr/vcfs/g947m_o1d2v1_na12878_hc4_merged.vcf.gz
# BAM_FILE=/dsde/data/deep/vqsr/bams/g94781_lod_1ug_na12878_bamout.bam
# NEGATIVE_VCF=/dsde/data/deep/vqsr/vcfs/g94781_lod_1ug_na12878_hc4_merged.vcf.gz
# SPLIT_INTERVALS=/dsde/data/deep/vqsr/beds/wgs_10m_split_genome_hg38.interval_list
# REFERENCE=/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta
# BED_FILE=/dsde/data/deep/vqsr/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed

# Clinical NA12878 g947x
# Clinical NA12878 g947m
# DOWNSAMPLE_SNPS=0.003
# DOWNSAMPLE_INDELS=0.025
# DOWNSAMPLE_NOT_SNPS=0.5
# MODE=write_paired_read_tensors
# SAMPLE_NAME=SM-G947X
# SAMPLE_NAME=SM-G947M
# CHANNEL_ORDER=channels_last
# DATA_DIR=/dsde/data/deep/vqsr/tensors/g947_balanced_paired_read_channels_last/
# TRAIN_VCF=/dsde/data/deep/vqsr/vcfs/nist_na12878_giab_hg38.vcf.gz
# BAM_FILE=/dsde/data/deep/vqsr/bams/g947x_o2d1v1_na12878_bamout.bam
# NEGATIVE_VCF=/dsde/data/deep/vqsr/vcfs/g947x_o2d1v1_na12878_cnn_scored.vcf.gz
# BAM_FILE=/dsde/data/deep/vqsr/bams/g947m_o1d2v1_na12878_bamout.bam
# NEGATIVE_VCF=/dsde/data/deep/vqsr/vcfs/g947m_o1d2v1_na12878_hc4_merged.vcf.gz
# SPLIT_INTERVALS=/dsde/data/deep/vqsr/beds/wgs_10m_split_genome_hg38.interval_list
# REFERENCE=/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta
# BED_FILE=/dsde/data/deep/vqsr/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed

# Clinical NA12878 g947x
DOWNSAMPLE_SNPS=0.005
DOWNSAMPLE_INDELS=0.025
DOWNSAMPLE_NOT_SNPS=0.5
MODE=write_paired_read_tensors
SAMPLE_NAME=SM-G947X
DATA_DIR=/dsde/data/deep/vqsr/tensors/g947x_balanced_paired_read_ws128_na12878/
TRAIN_VCF=/dsde/data/deep/vqsr/vcfs/nist_na12878_giab_hg38.vcf.gz
BAM_FILE=/dsde/data/deep/vqsr/bams/g947x_o2d1v1_na12878_bamout.bam
NEGATIVE_VCF=/dsde/data/deep/vqsr/vcfs/g947x_o2d1v1_na12878_cnn_scored.vcf.gz
SPLIT_INTERVALS=/dsde/data/deep/vqsr/beds/wgs_10m_split_genome_hg38.interval_list
REFERENCE=/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta
BED_FILE=/dsde/data/deep/vqsr/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed


# Clinical NA12878 g947z
# SAMPLE_NAME=SM-G947Z
# DOWNSAMPLE_SNPS=0.003
Expand Down