Skip to content

Commit

Permalink
Update version number in NanoSim and requirements.txt, add normalize …
Browse files Browse the repository at this point in the history
…by transcript length option
  • Loading branch information
cheny19 committed Oct 27, 2021
1 parent e87380d commit 3d046e3
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 67 deletions.
25 changes: 17 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
HTSeq
numpy
pybedtools
pysam
scipy
six
scikit-learn
joblib
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
_anaconda_depends=2019.03=py37_0
_ipyw_jlab_nb_ext_conf=0.1.0=py37_0
_libgcc_mutex=0.1=main
htseq=0.11.2=py37h637b7d7_1
joblib=0.14.1=py_0
last=874=h8b12597_3
minimap2=2.17=h8b12597_1
numpy=1.17.2=py37haad9e8e_0
numpy-base=1.17.2=py37hde5b4d6_0
pybedtools=0.8.1=py37he513fc3_0
pysam=0.15.3=py37hda2845c_1
scikit-learn=0.21.3=py37hd81dba3_0
scipy=1.4.1=py37h0b6359f_0
six=1.12.0=py37_0
101 changes: 85 additions & 16 deletions src/get_primary_sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ def not_overlap(interval, interval_lst, interval_name=None, interval_name_list=N
return True


def EM(read_list, all_species):
def EM_meta(read_list, all_species):
"""
:param read_list: {(read1, interval1): [species1, species2]), ((read1, interval2)), [species1, species2]) ...}
:param read_list: {(read1, interval1): [species1, species2], (read1, interval2): [species1, species2] ...}
:return: abundance_list: {species1: abundance1, species2: abundance2, ...}
"""
sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Starting EM for quantification\n")
Expand All @@ -63,13 +63,11 @@ def EM(read_list, all_species):
for iter in range(0, 100):
# Expectation
base_count_tmp = {k: v for k, v in base_count_unique.items()}
added = 0
for read, species_list in multi_mapping_list.items():
length = read[1]
total_abun_species = sum([abundance_list[species] for species in species_list])
perc_for_species = [abundance_list[species] / total_abun_species for species in species_list]
for i in range(len(species_list)):
added += length * perc_for_species[i]
base_count_tmp[species_list[i]] += length * perc_for_species[i]

# Maximization
Expand All @@ -86,6 +84,61 @@ def EM(read_list, all_species):
return abundance_list


def EM_trans(read_list, all_trans, normalize):
"""
:param read_list: {(read1, interval1): [trans1, trans2]), (read1, interval2), [trans1, trans2]) ...}
:return: tpm_list: {species1: tpm1, species2: tpm2, ...}
"""
sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Starting EM for quantification\n")
read_count_unique = dict.fromkeys(all_trans, 0)
multi_mapping_list = {}
total_reads = 0
for read, trans_list in read_list.items():
total_reads += 1
if len(trans_list) == 1:
read_count_unique[trans_list[0]] += 1
else:
multi_mapping_list[read[0]] = trans_list
abundance_list = {trans: 100 / len(all_trans) for trans in all_trans}

diff = 100 * len(all_trans)
for iter in range(0, 1000):
# Expectation
read_count_tmp = {k: v for k, v in read_count_unique.items()}
for read, trans_list in multi_mapping_list.items():
all_trans_in_read = sum([abundance_list[trans] for trans in trans_list])
perc_for_trans = [abundance_list[trans] / all_trans_in_read for trans in trans_list]
for i in range(len(trans_list)):
read_count_tmp[trans_list[i]] += perc_for_trans[i]

# Maximization
abundance_list_tmp = {trans: reads * 100 / total_reads for trans, reads in read_count_tmp.items()}
diff_tmp = sum([abs(abundance_list_tmp[i] - abundance_list[i]) for i in abundance_list.keys()])
if iter % 10 == 0:
sys.stdout.write("Iteration: " + str(iter) + ", Diff: " + str(diff_tmp) + '\n')
sys.stdout.flush()

abundance_list = abundance_list_tmp
if diff_tmp <= min(abundance_list.values()) * 0.001 or diff - diff_tmp < min(abundance_list.values()) * 0.001:
break
diff = diff_tmp

# calculate TPM
tpm_list = {}
total_rpk = 0
if normalize:
for trans, count in read_count_tmp.items():
total_rpk += count / all_trans[trans] * 1e3 # read per kilobase
else:
total_rpk = sum(read_count_tmp.values())
for trans, count in read_count_tmp.items():
rpk = count / all_trans[trans] * 1e3 if normalize else count
tpm = rpk * 1e6 / total_rpk
tpm_list[trans] = (count, tpm)

return tpm_list


def primary_and_unaligned(sam_alnm_file, prefix, metagenome_list=None):
in_sam_file = pysam.AlignmentFile(sam_alnm_file)
out_sam_file = pysam.AlignmentFile(prefix + "_primary.bam", 'wb', template=in_sam_file, add_sam_header=True)
Expand Down Expand Up @@ -132,7 +185,7 @@ def primary_and_unaligned(sam_alnm_file, prefix, metagenome_list=None):
quantification_file = open(prefix + "_quantification.tsv", 'w')
quantification_file.write("Species\tAbundance\n")

abundance_list = EM(quant_dic, all_species)
abundance_list = EM_meta(quant_dic, all_species)
for k, v in abundance_list.items():
quantification_file.write(k + '\t' + str(v) + '\n')
metagenome_list[k]["real"] = v
Expand All @@ -157,7 +210,7 @@ def primary_and_unaligned(sam_alnm_file, prefix, metagenome_list=None):
return unaligned_len, strandness


def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None, q_mode=False):
def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None, q_mode=False, normalize=True):
"""
Function to extract alignments of reads at extremities of circular genome references
:param sam_alnm_file: path of input SAM file
Expand All @@ -174,6 +227,8 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
in_sam_file = pysam.AlignmentFile(sam_alnm_file)
if metagenome_list:
quant_dic = {}
is_trans = True if 'tpm' in metagenome_list else False

if not q_mode:
out_sam_file = pysam.AlignmentFile(prefix + "_primary.bam", 'wb', template=in_sam_file, add_sam_header=True)
chimeric_file = open(prefix + "_chimeric_info", 'w')
Expand All @@ -189,8 +244,11 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
for info in in_sam_file.header['SQ']:
species_chrom = info['SN']
ref_lengths[species_chrom] = info['LN']
species = '_'.join(species_chrom.split('_')[:-1])
all_species[species] = 0
if metagenome_list and is_trans:
species = species_chrom
else:
species = '_'.join(species_chrom.split('_')[:-1])
all_species[species] = info['LN']
chimeric_species_count[species] = [0, 0] # the succedent segment source [same species, other species]

if not q_mode:
Expand Down Expand Up @@ -256,7 +314,10 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
if not q_mode:
out_sam_file.write(aln)
# Quantification
if metagenome_list:
if metagenome_list and is_trans:
species = aln.reference_name
quant_dic[(aln.query_name, (seg["query"][0][0], seg["query"][0][1]))] = [species]
elif metagenome_list:
species = '_'.join(aln.reference_name.split('_')[:-1])
quant_dic[(aln.query_name, (seg["query"][0][0], seg["query"][0][1]))] = [species]
if primary_direction == '+':
Expand All @@ -277,8 +338,8 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
interval = seg["query"][i]
ref_interval = seg["ref"][i]
is_edge = edge_checker(ref_interval[0], ref_interval[1], ref_lengths[seg["rname"][i]])
species = '_'.join(seg["rname"][i].split('_')[:-1])
if metagenome_list:
species = seg["rname"][i] if is_trans else '_'.join(seg["rname"][i].split('_')[:-1])
quant_dic[(aln.query_name, (interval[0], interval[1]))] = [species]
# Record the gap
if i > 0:
Expand Down Expand Up @@ -316,7 +377,7 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
out_sam_file.write(aln)
# Quantification
if metagenome_list:
species = '_'.join(aln.reference_name.split('_')[:-1])
species = aln.reference_name if is_trans else '_'.join(aln.reference_name.split('_')[:-1])
quant_dic[(aln.query_name, (aln.query_alignment_start, aln.query_alignment_end))] = [species]
if primary_direction == '+':
pos_strand += 1
Expand All @@ -328,7 +389,7 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
if (aln.reference_name, qstart, qend, aln.reference_start) == supplementary_to_be_added[i]:
aln_queue[i] = aln
if metagenome_list and (aln.query_name, (qstart, qend)) in quant_dic:
species = '_'.join(aln.reference_name.split('_')[:-1])
species = aln.reference_name if is_trans else '_'.join(aln.reference_name.split('_')[:-1])
quant_dic[(aln.query_name, (qstart, qend))].append(species)

if not q_mode:
Expand All @@ -339,12 +400,20 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
in_sam_file.close()

# Write quantification information
if metagenome_list:
variation_flag = False
if is_trans:
quantification_file = open(prefix + "_quantification.tsv", 'w')
tpm_list = EM_trans(quant_dic, all_species, normalize)
quantification_file.write("ID\tcount\tTPM\n")
for trans, info in tpm_list.items():
quantification_file.write(trans + '\t' + str(info[0]) + '\t' + str(info[1]) + '\n')
quantification_file.close()

elif metagenome_list:
quantification_file = open(prefix + "_quantification.tsv", 'w')
variation_flag = False
quantification_file.write("Species\tAbundance\n")

abundance_list = EM(quant_dic, all_species)
abundance_list = EM_meta(quant_dic, all_species)
for k, v in abundance_list.items():
quantification_file.write(k + '\t' + str(v) + '\n')
metagenome_list[k]["real"] = v
Expand Down Expand Up @@ -390,7 +459,7 @@ def primary_and_unaligned_chimeric(sam_alnm_file, prefix, metagenome_list=None,
# Compute chimeric information
mean_segments = (len(gap_length) + num_aligned) / num_aligned
chimeric_file.write("Mean segments for each aligned read:\t" + str(mean_segments) + '\n')
if metagenome_list:
if metagenome_list and not is_trans:
chimeric_file.write("Shrinkage rate (beta):\t" + str(median(beta_list)))
chimeric_file.close()

Expand Down
5 changes: 3 additions & 2 deletions src/head_align_tail_dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,11 @@ def head_align_tail(prefix, alnm_ext, mode):
head = min(head, head_new)
tail = min(tail, tail_new)
read_len_total = max(read_len_total, alnm.infer_read_length())
is_edge = edge_checker(alnm.reference_start, alnm.reference_end, dict_ref_len[ref])
if mode != "transcriptome":
is_edge = edge_checker(alnm.reference_start, alnm.reference_end, dict_ref_len[ref])

# Check for "circular" reads
if ref == last_ref and \
if mode != "transcriptome" and ref == last_ref and \
((last_is_edge[0] and is_edge[1]) or (last_is_edge[1] and is_edge[0])):
aligned_ref += alnm.reference_length
middle += alnm.query_alignment_length
Expand Down
16 changes: 5 additions & 11 deletions src/nanopore_transcript_abundance.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from file_handler import gzopen as open



def parse_paf(line):
out = dict()
fields = line.rstrip().split()
Expand All @@ -22,8 +21,9 @@ def parse_paf(line):
# that read to the set of transcripts the read is "compatible" with.
# Compatibility is defined by the length of an alignment relative
# to the longest alignment for the read.
def get_compatibility(records):


def get_compatibility(records):
full_length_min_distance = 20
min_read_length = 0

Expand Down Expand Up @@ -71,7 +71,6 @@ def calculate_abundance(compatibility):
abundance = collections.defaultdict(float)
total = 0
for read in compatibility:

if args.verbose > 1:
sys.stderr.write("[compatibility] %s: %s\n" % (read, compatibility[read]))

Expand All @@ -85,6 +84,7 @@ def calculate_abundance(compatibility):
sys.stderr.write("[abundance] %s: %.4f\n" % (transcript, abundance[transcript]))
return abundance


# Update read-transcript compatibility based on transcript abundances
def update_compatibility(compatibility, abundance):
for read in compatibility:
Expand All @@ -98,19 +98,17 @@ def update_compatibility(compatibility, abundance):
compatibility[read] = list()
for i in ids:
compatibility[read].append((i, abundance[i] / total))
#


# Read arguments
#
parser = argparse.ArgumentParser( description='Calculate transcript abundance from minimap2 alignment to transcripts')
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-c', '--compatibility', type=str, required=False, default="")
parser.add_argument('-n', '--em-iterations', type=int, default=10)
parser.add_argument('-v', '--verbose', type=int, default=0)
args = parser.parse_args()

#
# Read the input PAF file and calculate the initial read-transcript compatibility
#
transcript_compatibility = collections.defaultdict(list)
prev_read_name = ""
curr_records = list()
Expand All @@ -127,9 +125,7 @@ def update_compatibility(compatibility, abundance):
# Process last batch
get_compatibility(curr_records)

#
# Run EM to calculate abundance and update read-transcript compatibility
#
for i in range(0, args.em_iterations):
sys.stderr.write("EM iteration %d\n" % (i))

Expand All @@ -153,13 +149,11 @@ def update_compatibility(compatibility, abundance):
if args.compatibility != "":
compatibility_writer = open(args.compatibility, "w")
for read in transcript_compatibility:

num_compat = len(transcript_compatibility[read])
ids = list()
probs = list()
tpms = list()
for t in transcript_compatibility[read]:

if t[1] > 0.1:
ids.append(t[0])
probs.append(t[1])
Expand Down
Loading

0 comments on commit 3d046e3

Please sign in to comment.