# NanoCount python API

### Import NanoCount (after activating its virtual environment)

In [3]:
from NanoCount.NanoCount import NanoCount
from NanoCount.common import jhelp, head

### Running NanoCount 

In [4]:
# Render the NanoCount signature and the description of each option
jhelp(NanoCount)

**NanoCount** (alignment_file, count_file, filter_bam_out, min_alignment_length, keep_suplementary, min_query_fraction_aligned, sec_scoring_threshold, sec_scoring_value, convergence_target, max_em_rounds, extra_tx_info, primary_score, max_dist_3_prime, max_dist_5_prime, verbose, quiet)

Estimate abundance of transcripts using an EM

---

* **alignment_file** (required) [str]

BAM or SAM file containing aligned ONT dRNA-Seq reads including secondary and supplementary alignment

* **count_file** (default: "") [str]

Output file path where to write estimated counts (TSV format)

* **filter_bam_out** (default: "") [str]

Optional output file path where to write filtered reads selected by NanoCount to perform quantification estimation (BAM format)

* **min_alignment_length** (default: 50) [int]

Minimal length of the alignment to be considered valid

* **keep_suplementary** (default: False) [bool]

Retain supplementary alignments and consider them like secondary alignments. Discarded by default.

* **min_query_fraction_aligned** (default: 0.5) [float]

Minimal fraction of the primary alignment query aligned to consider the read valid

* **sec_scoring_threshold** (default: 0.95) [float]

Fraction of the alignment score or the alignment length of secondary alignments compared to the primary alignment to be considered valid alignments

* **sec_scoring_value** (default: alignment_score) [str]

Value to use for score thresholding of secondary alignments either "alignment_score" or "alignment_length"

* **convergence_target** (default: 0.005) [float]

Convergence target value of the cumulative difference between abundance values of successive EM rounds to trigger the end of the EM loop.

* **max_em_rounds** (default: 100) [int]

Maximum number of EM rounds before triggering stop

* **extra_tx_info** (default: False) [bool]

Add transcripts length and zero coverage transcripts to the output file (required valid bam/sam header)

* **primary_score** (default: primary) [str]

Method to pick the best alignment for each read. By default ("primary") it uses the primary read defined by the aligner but it can be changed to use either the best alignment score ("align_score") or the best alignment length ("align_len"). choices = [primary, align_score, align_len]

* **max_dist_3_prime** (default: 50) [int]

Maximum distance of alignment end to 3 prime of transcript. In ONT dRNA-Seq reads are assumed to start from the polyA tail (-1 to deactivate)

* **max_dist_5_prime** (default: -1) [int]

Maximum distance of alignment start to 5 prime of transcript. In conjunction with max_dist_3_prime it can be used to select near full transcript length reads only (-1 to deactivate).

* **verbose** (default: False) [bool]

Increase verbosity for QC and debugging

* **quiet** (default: False) [bool]

Reduce verbosity



#### Basic command

In [5]:
# Quantify transcripts with the default options and write the counts to a TSV file
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
)
# Show the beginning of the count table that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,517[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,133[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded supplementary alignments: 334[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,174[0m
[32m		Invalid secondary alignments: 59,993[0m
[32m		Valid secondary alignments: 2,618[0m
[32m		Reads with low query fraction aligned: 2,362[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 7.40 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0019459404177855882[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to da

transcript_name raw                  est_count          tpm                
YHR174W_mRNA    0.5931738182127508   50522.98679245284  593173.8182127508  
YGR192C_mRNA    0.021012356886554624 1789.7064854554035 21012.356886554626 
YLR110C_mRNA    0.009674313757719492 824.0              9674.313757719492  
YOL086C_mRNA    0.008371099161716017 713.0              8371.099161716018  
YKL060C_mRNA    0.006574776340197713 560.0              6574.776340197713  
YPR080W_mRNA    0.005300913424284406 451.5              5300.913424284407  
YBR118W_mRNA    0.005300913424284406 451.5              5300.913424284407  
YLR044C_mRNA    0.005107192335689295 435.0              5107.192335689295  
YKL152C_mRNA    0.004238382605020311 361.0              4238.3826050203115 



#### Using the best alignment score rather than the primary alignment to pick the best alignment of each read

In [6]:
# Pick the best alignment of each read by alignment score
# instead of relying on the primary alignment defined by the aligner
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
    primary_score="align_score",
)
# Show the beginning of the count table that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,517[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,133[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded supplementary alignments: 334[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 84,953[0m
[32m		Invalid secondary alignments: 59,975[0m
[32m		Reads with low query fraction aligned: 2,583[0m
[32m		Valid secondary alignments: 2,262[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 8.33 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.001906212433901406[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dat

transcript_name raw                   est_count          tpm                
YHR174W_mRNA    0.5947169233864942    50522.98679245284  594716.9233864942  
YGR192C_mRNA    0.021067019239525427  1789.7064854554035 21067.019239525427 
YLR110C_mRNA    0.00969948088943298   824.0              9699.48088943298   
YOL086C_mRNA    0.008392876060880723  713.0              8392.876060880722  
YKL060C_mRNA    0.006591880216119501  560.0              6591.880216119501  
YLR044C_mRNA    0.0051204783821642555 435.0              5120.478382164256  
YPR080W_mRNA    0.004590773721940367  390.0              4590.773721940367  
YBR118W_mRNA    0.004590773721940367  390.0              4590.773721940367  
YKL152C_mRNA    0.00424940849646275   361.0              4249.40849646275   



#### Write selected alignments to a BAM file

In [7]:
# In addition to the counts, save the alignments selected by NanoCount
# for the quantification to a separate BAM file
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
    filter_bam_out="./output/aligned_reads_selected.bam",
    primary_score="align_score",
)

# Show the beginning of the count table that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,517[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,133[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded supplementary alignments: 334[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 84,953[0m
[32m		Invalid secondary alignments: 59,975[0m
[32m		Reads with low query fraction aligned: 2,583[0m
[32m		Valid secondary alignments: 2,262[0m
[32m	Write selected alignments to BAM file[0m
[32m	Summary of alignments written to bam[0m
[32m		Alignments to select: 87,215[0m
[32m		Alignments written: 87,215[0m
[32m		Alignments skipped: 83,829[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance 

transcript_name raw                   est_count          tpm                
YHR174W_mRNA    0.5947169233864942    50522.98679245284  594716.9233864942  
YGR192C_mRNA    0.021067019239525427  1789.7064854554035 21067.019239525427 
YLR110C_mRNA    0.00969948088943298   824.0              9699.48088943298   
YOL086C_mRNA    0.008392876060880723  713.0              8392.876060880722  
YKL060C_mRNA    0.006591880216119501  560.0              6591.880216119501  
YLR044C_mRNA    0.0051204783821642555 435.0              5120.478382164256  
YPR080W_mRNA    0.004590773721940367  390.0              4590.773721940367  
YBR118W_mRNA    0.004590773721940367  390.0              4590.773721940367  
YKL152C_mRNA    0.00424940849646275   361.0              4249.40849646275   



#### Basic command without file writing, with DataFrame output

In interactive mode, it is also possible to skip writing the results to a file and instead access the data directly as a pandas DataFrame through the `count_df` attribute.

In [8]:
# No count_file is given: keep the NanoCount object to read the results in memory
nc = NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
)
# The estimated counts are exposed through the count_df attribute
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,517[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,133[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded supplementary alignments: 334[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,174[0m
[32m		Invalid secondary alignments: 59,993[0m
[32m		Valid secondary alignments: 2,618[0m
[32m		Reads with low query fraction aligned: 2,362[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 8.32 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0019459404177855882[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to da

Unnamed: 0_level_0,raw,est_count,tpm
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YHR174W_mRNA,5.931738e-01,50522.986792,593173.818213
YGR192C_mRNA,2.101236e-02,1789.706485,21012.356887
YLR110C_mRNA,9.674314e-03,824.000000,9674.313758
YOL086C_mRNA,8.371099e-03,713.000000,8371.099162
YKL060C_mRNA,6.574776e-03,560.000000,6574.776340
...,...,...,...
YDR433W_mRNA,2.348134e-06,0.200000,2.348134
YHL050C_mRNA,1.684879e-06,0.143508,1.684879
YPR204W_mRNA,1.684879e-06,0.143508,1.684879
YEL077C_mRNA,3.453139e-07,0.029412,0.345314


#### Adding extra transcripts information

The `extra_tx_info` option adds a column with the transcript lengths and also includes all the zero-coverage transcripts in the results.

In [9]:
# Request the transcript length column and the zero-coverage transcripts
# (requires a valid BAM/SAM header according to the option description)
nc = NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    extra_tx_info=True,
)
# The estimated counts are exposed through the count_df attribute
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,517[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,133[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded supplementary alignments: 334[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,174[0m
[32m		Invalid secondary alignments: 59,993[0m
[32m		Valid secondary alignments: 2,618[0m
[32m		Reads with low query fraction aligned: 2,362[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 7.95 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0019459404177855882[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to da

Unnamed: 0_level_0,raw,est_count,tpm,transcript_length
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YHR174W_mRNA,0.593174,50522.986792,593173.818213,1314
YGR192C_mRNA,0.021012,1789.706485,21012.356887,999
YLR110C_mRNA,0.009674,824.000000,9674.313758,402
YOL086C_mRNA,0.008371,713.000000,8371.099162,1047
YKL060C_mRNA,0.006575,560.000000,6574.776340,1080
...,...,...,...,...
YPR200C_mRNA,0.000000,0.000000,0.000000,393
YPR201W_mRNA,0.000000,0.000000,0.000000,1215
YPR202W_mRNA,0.000000,0.000000,0.000000,717
YPR203W_mRNA,0.000000,0.000000,0.000000,309


#### Relaxing the secondary alignment scoring threshold

The default value is 0.95 (95% of the alignment score of the primary alignment), but this value can be lowered to allow more secondary alignments to be included in the uncertainty calculation.
Lowering the value below 0.75 might not be relevant and will considerably increase the computation time.

In [11]:
# Lower the secondary alignment scoring threshold from its default (0.95) to 0.8
# and add the extra transcript information to the output file
NanoCount(
    alignment_file="./data/aligned_reads_sorted.bam",
    count_file="./output/tx_counts.tsv",
    sec_scoring_threshold=0.8,
    extra_tx_info=True,
)
# Show the beginning of the count table that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 150,517[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded alignment with invalid 3 prime end: 6,133[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded supplementary alignments: 334[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,174[0m
[32m		Valid secondary alignments: 49,061[0m
[32m		Invalid secondary alignments: 13,550[0m
[32m		Reads with low query fraction aligned: 2,362[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 17.0 rounds [00:02, 6.18 rounds/s]
[32m	Exit EM loop after 17 rounds[0m
[32m	Convergence value: 0.004898135236430615[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to d

transcript_name raw                   est_count          tpm               transcript_length 
YHR174W_mRNA    0.5829682504382943    49653.737762831275 582968.2504382943 1314              
YGR192C_mRNA    0.01511479453374923   1287.3875096175568 15114.79453374923 999               
YGR254W_mRNA    0.01128586466687533   961.2622371364395  11285.86466687533 1314              
YLR110C_mRNA    0.009674313757719525  824.0000000000028  9674.313757719525 402               
YJR009C_mRNA    0.009499248602267904  809.0890004495665  9499.248602267904 999               
YOL086C_mRNA    0.008371099161716045  713.0000000000024  8371.099161716045 1047              
YKL060C_mRNA    0.0065747763401977355 560.0000000000019  6574.776340197735 1080              
YBR118W_mRNA    0.005300913424284424  451.5000000000016  5300.913424284425 1377              
YPR080W_mRNA    0.005300913424284424  451.5000000000016  5300.913424284425 1377              

