# NanoCount python API

### Import the package

In [3]:
from NanoCount.NanoCount import NanoCount
from NanoCount.common import jhelp, head

### Running NanoCount 

In [4]:
# Render the NanoCount signature and the description/default of every option
jhelp(NanoCount)

**NanoCount** (alignment_file, count_file, filter_bam_out, min_read_length, discard_suplementary, min_query_fraction_aligned, equivalent_threshold, scoring_value, convergence_target, max_em_rounds, extra_tx_info, primary_score, max_dist_3_prime, max_dist_5_prime, verbose, quiet)

Estimate abundance of transcripts using an EM

---

* **alignment_file** (required) [str]

BAM or SAM file containing aligned ONT dRNA-Seq reads including secondary and supplementary alignment

* **count_file** (default: "") [str]

Output file path where to write estimated counts (TSV format)

* **filter_bam_out** (default: "") [str]

Optional output file path where to write filtered reads selected by NanoCount to perform quantification estimation (BAM format)

* **min_read_length** (default: 50) [int]

Minimal length of the read to be considered valid

* **discard_suplementary** (default: False) [bool]

Discard any supplementary alignment. Otherwise they are considered like secondary alignments

* **min_query_fraction_aligned** (default: 0.5) [float]

Minimal fraction of the primary alignment query aligned to consider the read valid

* **equivalent_threshold** (default: 0.95) [float]

Fraction of the alignment score or the alignment length of secondary alignments compared to the primary alignment to be considered valid alignments

* **scoring_value** (default: alignment_score) [str]

Value to use for score thresholding of secondary alignments either "alignment_score" or "alignment_length"

* **convergence_target** (default: 0.005) [float]

Convergence target value of the cummulative difference between abundance values of successive EM round to trigger the end of the EM loop.

* **max_em_rounds** (default: 100) [int]

Maximum number of EM rounds before triggering stop

* **extra_tx_info** (default: False) [bool]

Add transcripts length and zero coverage transcripts to the output file (required valid bam/sam header)

* **primary_score** (default: primary) [str]

Method to pick the best alignment for each read. By default ("primary") it uses the primary read defined by the aligner but it can be changed to use either the best alignment score ("align_score") or the best alignment length ("align_len"). choices = [primary, align_score, align_len]

* **max_dist_3_prime** (default: 50) [int]

Maximum distance of alignment end to 3 prime of transcript. In ONT dRNA-Seq reads are assumed to start from the polyA tail (-1 to deactivate)

* **max_dist_5_prime** (default: -1) [int]

Maximum distance of alignment start to 5 prime of transcript. In conjunction with max_dist_3_prime it can be used to select near full transcript length reads only (-1 to deactivate).

* **verbose** (default: False) [bool]

Increase verbosity for QC and debugging

* **quiet** (default: False) [bool]

Reduce verbosity



#### Basic command

In [5]:
# Quantify transcripts with the default options and write the counts to a TSV file
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
)
# Preview the first lines of the counts table
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,779[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,205[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,200[0m
[32m		Invalid secondary alignments: 60,168[0m
[32m		Valid secondary alignments: 2,626[0m
[32m		Reads with low query fraction aligned: 1,544[0m
[32m		Reads too short: 817[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 7.82 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.002000099856041238[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Co

transcript_name raw                   est_count          tpm                
YHR174W_mRNA    0.5929928027283197    50522.98679245284  592992.8027283197  
YGR192C_mRNA    0.021005944664969526  1789.7064854554037 21005.944664969527 
YLR110C_mRNA    0.009671361502347417  823.9999999999999  9671.361502347418  
YOL086C_mRNA    0.008368544600938967  713.0              8368.544600938967  
YKL060C_mRNA    0.006572769953051643  560.0              6572.769953051643  
YPR080W_mRNA    0.005299295774647887  451.5              5299.295774647888  
YBR118W_mRNA    0.005299295774647887  451.5              5299.295774647888  
YLR044C_mRNA    0.0051056338028169015 435.0              5105.633802816901  
YKL152C_mRNA    0.004237089201877934  361.0              4237.089201877934  



#### Using the best alignment score rather than the primary alignment to pick the best alignment of each read

In [6]:
# Pick the best alignment of each read by alignment score instead of
# relying on the primary alignment flagged by the aligner
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
    primary_score="align_score",
)
# Preview the first lines of the counts table
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,779[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,205[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 84,978[0m
[32m		Invalid secondary alignments: 60,150[0m
[32m		Valid secondary alignments: 2,269[0m
[32m		Reads with low query fraction aligned: 1,532[0m
[32m		Reads too short: 1,051[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 8.52 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0019612500009223723[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m

transcript_name raw                   est_count          tpm                
YHR174W_mRNA    0.5945419613600325    50522.98679245284  594541.9613600324  
YGR192C_mRNA    0.02106082145326324   1789.7064854554037 21060.82145326324  
YLR110C_mRNA    0.009696627362376144  824.0              9696.627362376144  
YOL086C_mRNA    0.008390406928852174  713.0              8390.406928852173  
YKL060C_mRNA    0.0065899409258867005 560.0              6589.940925886701  
YLR044C_mRNA    0.005118971969215562  435.0              5118.971969215562  
YPR080W_mRNA    0.004589423144813952  390.0              4589.423144813952  
YBR118W_mRNA    0.004589423144813952  390.0              4589.423144813952  
YKL152C_mRNA    0.004248158346866248  361.0              4248.1583468662475 



#### Write selected alignments to a BAM file

In [7]:
# Same quantification as above, additionally writing the reads selected
# by NanoCount to a BAM file via `filter_bam_out`
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    primary_score="align_score",
    filter_bam_out="./output/aligned_reads_selected.bam",
    count_file="./output/tx_counts.tsv",
)

# Preview the first lines of the counts table
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,779[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,205[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 84,978[0m
[32m		Invalid secondary alignments: 60,150[0m
[32m		Valid secondary alignments: 2,269[0m
[32m		Reads with low query fraction aligned: 1,532[0m
[32m		Reads too short: 1,051[0m
[32m	Write selected alignments to BAM file[0m
[32m	Summary of alignments written to bam[0m
[32m		Alignments to select: 87,247[0m
[32m		Alignments written: 87,247[0m
[32m		Alignments skipped: 83,797[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	

transcript_name raw                   est_count          tpm                
YHR174W_mRNA    0.5945419613600325    50522.98679245284  594541.9613600324  
YGR192C_mRNA    0.02106082145326324   1789.7064854554037 21060.82145326324  
YLR110C_mRNA    0.009696627362376144  824.0              9696.627362376144  
YOL086C_mRNA    0.008390406928852174  713.0              8390.406928852173  
YKL060C_mRNA    0.0065899409258867005 560.0              6589.940925886701  
YLR044C_mRNA    0.005118971969215562  435.0              5118.971969215562  
YPR080W_mRNA    0.004589423144813952  390.0              4589.423144813952  
YBR118W_mRNA    0.004589423144813952  390.0              4589.423144813952  
YKL152C_mRNA    0.004248158346866248  361.0              4248.1583468662475 



#### Basic command without file writing, with DataFrame output

In interactive mode, it is also possible to skip writing the results to a file and instead access the data directly as a pandas DataFrame.

In [8]:
# No `count_file` is given: keep the NanoCount object and read the
# results from its `count_df` attribute instead
nc = NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
)
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,779[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,205[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,200[0m
[32m		Invalid secondary alignments: 60,168[0m
[32m		Valid secondary alignments: 2,626[0m
[32m		Reads with low query fraction aligned: 1,544[0m
[32m		Reads too short: 817[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 8.30 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.002000099856041238[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Co

Unnamed: 0_level_0,raw,est_count,tpm
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YHR174W_mRNA,5.929928e-01,50522.986792,592992.802728
YGR192C_mRNA,2.100594e-02,1789.706485,21005.944665
YLR110C_mRNA,9.671362e-03,824.000000,9671.361502
YOL086C_mRNA,8.368545e-03,713.000000,8368.544601
YKL060C_mRNA,6.572770e-03,560.000000,6572.769953
...,...,...,...
YHR219W_mRNA,2.987545e-06,0.254539,2.987545
YDR433W_mRNA,2.347418e-06,0.200000,2.347418
YHL050C_mRNA,1.684364e-06,0.143508,1.684364
YPR204W_mRNA,1.684364e-06,0.143508,1.684364


#### Adding extra transcripts information

The `extra_tx_info` option adds a column with the transcript lengths and also includes all the zero-coverage transcripts in the results.

In [9]:
# `extra_tx_info=True` adds the transcript lengths and the zero-coverage
# transcripts to the results
nc = NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    extra_tx_info=True,
)
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,779[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,205[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,200[0m
[32m		Invalid secondary alignments: 60,168[0m
[32m		Valid secondary alignments: 2,626[0m
[32m		Reads with low query fraction aligned: 1,544[0m
[32m		Reads too short: 817[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 7.88 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.002000099856041238[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Co

Unnamed: 0_level_0,raw,est_count,tpm,transcript_length
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YHR174W_mRNA,0.592993,50522.986792,592992.802728,1314
YGR192C_mRNA,0.021006,1789.706485,21005.944665,999
YLR110C_mRNA,0.009671,824.000000,9671.361502,402
YOL086C_mRNA,0.008369,713.000000,8368.544601,1047
YKL060C_mRNA,0.006573,560.000000,6572.769953,1080
...,...,...,...,...
YPR200C_mRNA,0.000000,0.000000,0.000000,393
YPR201W_mRNA,0.000000,0.000000,0.000000,1215
YPR202W_mRNA,0.000000,0.000000,0.000000,717
YPR203W_mRNA,0.000000,0.000000,0.000000,309


#### Relaxing the equivalence threshold

The default value is 0.95 (95% of the alignment score of the primary alignment), but this value can be lowered to allow more secondary alignments to be included in the uncertainty calculation.
Lowering the value below 0.75 might not be relevant and will considerably increase the computation time.

In [10]:
# Lower the equivalence threshold to 0.8 so that more secondary alignments
# are considered valid, and add the extra transcript information
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
    equivalent_threshold=0.8,
    extra_tx_info=True,
)
# Preview the first lines of the counts table
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,779[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,205[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,200[0m
[32m		Valid secondary alignments: 49,096[0m
[32m		Invalid secondary alignments: 13,698[0m
[32m		Reads with low query fraction aligned: 1,544[0m
[32m		Reads too short: 817[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 17.0 rounds [00:02, 6.41 rounds/s]
[32m	Exit EM loop after 17 rounds[0m
[32m	Convergence value: 0.004896640500345573[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	

transcript_name raw                  est_count          tpm                transcript_length 
YHR174W_mRNA    0.5827903493290055   49653.73776283127  582790.3493290056  1314              
YGR192C_mRNA    0.015110182037764753 1287.387509617557  15110.182037764753 999               
YGR254W_mRNA    0.011282420623667128 961.2622371364394  11282.420623667127 1314              
YLR110C_mRNA    0.009671361502347452 824.0000000000028  9671.361502347452  402               
YJR009C_mRNA    0.009496349770534818 809.0890004495665  9496.349770534818  999               
YOL086C_mRNA    0.008368544600938995 713.0000000000024  8368.544600938994  1047              
YKL060C_mRNA    0.006572769953051665 560.0000000000019  6572.769953051666  1080              
YPR080W_mRNA    0.005299295774647906 451.50000000000153 5299.295774647906  1377              
YBR118W_mRNA    0.005299295774647906 451.50000000000153 5299.295774647906  1377              

