# NanoCount python API

### Import the package

In [5]:
from NanoCount.NanoCount import NanoCount
from NanoCount.common import jhelp, head

### Running NanoCount 

In [6]:
# Render the NanoCount signature together with the description of every option
jhelp(NanoCount)

**NanoCount** (alignment_file, count_file, min_read_length, discard_suplementary, min_query_fraction_aligned, equivalent_threshold, scoring_value, convergence_target, max_em_rounds, extra_tx_info, primary_score, max_dist_3_prime, max_dist_5_prime, verbose, quiet)

Estimate abundance of transcripts using an EM

---

* **alignment_file** (required) [str]

BAM or SAM file containing aligned ONT dRNA-Seq reads including secondary and supplementary alignment

* **count_file** (default: "") [str]

Output file path where to write estimated counts (TSV format)

* **min_read_length** (default: 50) [int]

Minimal length of the read to be considered valid

* **discard_suplementary** (default: False) [bool]

Discard any supplementary alignment. Otherwise they are considered like secondary alignments

* **min_query_fraction_aligned** (default: 0.5) [float]

Minimal fraction of the primary alignment query aligned to consider the read valid

* **equivalent_threshold** (default: 0.9) [float]

Fraction of the alignment score or the alignment length of secondary alignments compared to the primary alignment to be considered valid alignments

* **scoring_value** (default: alignment_score) [str]

Value to use for score thresholding of secondary alignments either "alignment_score" or "alignment_length"

* **convergence_target** (default: 0.005) [float]

Convergence target value of the cumulative difference between abundance values of successive EM rounds to trigger the end of the EM loop.

* **max_em_rounds** (default: 100) [int]

Maximum number of EM rounds before triggering stop

* **extra_tx_info** (default: False) [bool]

Add transcripts length and zero coverage transcripts to the output file (required valid bam/sam header)

* **primary_score** (default: primary) [str]

Method to pick the best alignment for each read. By default ("primary") it uses the primary read defined by the aligner but it can be changed to use either the best alignment score ("align_score") or the best alignment length ("align_len"). choices = [primary, align_score, align_len]

* **max_dist_3_prime** (default: 100) [int]

Maximum distance of alignment end to 3 prime of transcript. In ONT dRNA-Seq reads are assumed to start from the polyA tail (-1 to deactivate)

* **max_dist_5_prime** (default: -1) [int]

Maximum distance of alignment start to 5 prime of transcript. In conjunction with max_dist_3_prime it can be used to select near full-length reads only (-1 to deactivate).

* **verbose** (default: False) [bool]

Increase verbosity for QC and debugging

* **quiet** (default: False) [bool]

Reduce verbosity



#### Basic command

In [7]:
# Run NanoCount with only the input and output paths set; the estimated
# counts are written to the TSV file given as count_file.
NanoCount(
    alignment_file="./data/aligned_reads.bam",
    count_file="./output/tx_counts.tsv",
)
# Preview the beginning of the count file that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 153,201[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded alignment with invalid 3 prime end: 3,783[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,899[0m
[32m		Invalid secondary alignments: 60,113[0m
[32m		Valid secondary alignments: 4,374[0m
[32m		Reads with low query fraction aligned: 1,580[0m
[32m		Reads too short: 798[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 3.00 rounds [00:00, 5.37 rounds/s]
[32m	Exit EM loop after 3 rounds[0m
[32m	Convergence value: 0.0019319339005333133[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	C

transcript_name raw                  est_count          tpm                
YHR174W_mRNA    0.5932131975393656   50956.42045543397  593213.1975393656  
YGR192C_mRNA    0.020661735359370926 1774.8224056346032 20661.735359370927 
YLR110C_mRNA    0.009615944306685765 826.0000000000006  9615.944306685766  
YOL086C_mRNA    0.00835865376779707  718.0000000000005  8358.65376779707   
YKL060C_mRNA    0.006635700066357005 570.0000000000005  6635.700066357005  
YPR080W_mRNA    0.005273635315894251 453.0000000000003  5273.635315894251  
YBR118W_mRNA    0.005273635315894251 453.0000000000003  5273.635315894251  
YLR044C_mRNA    0.005087370050873704 437.0000000000003  5087.370050873704  
YKL152C_mRNA    0.004237534779217456 364.0000000000002  4237.534779217455  



#### Using the best alignment score rather than the primary alignment to pick the best alignment of each read

In [8]:
# Pick the best alignment of each read by alignment score ("align_score")
# instead of relying on the primary alignment defined by the aligner.
NanoCount(
    alignment_file="./data/aligned_reads.bam",
    count_file="./output/tx_counts.tsv",
    primary_score="align_score",
)
# Preview the beginning of the count file that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 153,201[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded alignment with invalid 3 prime end: 3,783[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,899[0m
[32m		Invalid secondary alignments: 60,113[0m
[32m		Valid secondary alignments: 4,374[0m
[32m		Reads with low query fraction aligned: 1,580[0m
[32m		Reads too short: 798[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 3.00 rounds [00:00, 5.78 rounds/s]
[32m	Exit EM loop after 3 rounds[0m
[32m	Convergence value: 0.0019319339005333133[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	C

transcript_name raw                  est_count          tpm                
YHR174W_mRNA    0.5932131975393656   50956.42045543397  593213.1975393656  
YGR192C_mRNA    0.020661735359370926 1774.8224056346032 20661.735359370927 
YLR110C_mRNA    0.009615944306685765 826.0000000000006  9615.944306685766  
YOL086C_mRNA    0.00835865376779707  718.0000000000005  8358.65376779707   
YKL060C_mRNA    0.006635700066357005 570.0000000000005  6635.700066357005  
YPR080W_mRNA    0.005273635315894251 453.0000000000003  5273.635315894251  
YBR118W_mRNA    0.005273635315894251 453.0000000000003  5273.635315894251  
YLR044C_mRNA    0.005087370050873704 437.0000000000003  5087.370050873704  
YKL152C_mRNA    0.004237534779217456 364.0000000000002  4237.534779217455  



#### Basic command without file writing, with DataFrame output

In interactive mode, it is also possible to skip writing the results to a file and instead access the data directly as a pandas DataFrame through the `count_df` attribute.

In [9]:
# No count_file is passed here, so the results are read from the returned
# object instead of from an output file.
nc = NanoCount(
    alignment_file="./data/aligned_reads.bam",
)
# Show the count table stored on the NanoCount object
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 153,201[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded alignment with invalid 3 prime end: 3,783[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,899[0m
[32m		Invalid secondary alignments: 60,113[0m
[32m		Valid secondary alignments: 4,374[0m
[32m		Reads with low query fraction aligned: 1,580[0m
[32m		Reads too short: 798[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 3.00 rounds [00:00, 5.27 rounds/s]
[32m	Exit EM loop after 3 rounds[0m
[32m	Convergence value: 0.0019319339005333133[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	C

Unnamed: 0_level_0,raw,est_count,tpm
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YHR174W_mRNA,5.932132e-01,50956.420455,593213.197539
YGR192C_mRNA,2.066174e-02,1774.822406,20661.735359
YLR110C_mRNA,9.615944e-03,826.000000,9615.944307
YOL086C_mRNA,8.358654e-03,718.000000,8358.653768
YKL060C_mRNA,6.635700e-03,570.000000,6635.700066
...,...,...,...
YBL083C_mRNA,6.467544e-07,0.055556,0.646754
YFL062W_mRNA,3.637993e-07,0.031250,0.363799
YIL047C-A_mRNA,3.637993e-07,0.031250,0.363799
YOR309C_mRNA,3.444254e-08,0.002959,0.034443


#### Adding extra transcripts information

The `extra_tx_info` option adds a column with the transcript lengths and also includes all the zero-coverage transcripts in the results.

In [10]:
# Request the extra transcript information (transcript lengths and
# zero-coverage transcripts) in the results.
nc = NanoCount(
    alignment_file="./data/aligned_reads.bam",
    extra_tx_info=True,
)
# Show the count table stored on the NanoCount object
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 153,201[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded alignment with invalid 3 prime end: 3,783[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,899[0m
[32m		Invalid secondary alignments: 60,113[0m
[32m		Valid secondary alignments: 4,374[0m
[32m		Reads with low query fraction aligned: 1,580[0m
[32m		Reads too short: 798[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 3.00 rounds [00:00, 4.51 rounds/s]
[32m	Exit EM loop after 3 rounds[0m
[32m	Convergence value: 0.0019319339005333133[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	C

Unnamed: 0_level_0,raw,est_count,tpm,transcript_length
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YHR174W_mRNA,0.593213,50956.420455,593213.197539,1314
YGR192C_mRNA,0.020662,1774.822406,20661.735359,999
YLR110C_mRNA,0.009616,826.000000,9615.944307,402
YOL086C_mRNA,0.008359,718.000000,8358.653768,1047
YKL060C_mRNA,0.006636,570.000000,6635.700066,1080
...,...,...,...,...
YPR200C_mRNA,0.000000,0.000000,0.000000,393
YPR201W_mRNA,0.000000,0.000000,0.000000,1215
YPR202W_mRNA,0.000000,0.000000,0.000000,717
YPR203W_mRNA,0.000000,0.000000,0.000000,309


#### Relaxing the equivalence threshold

The default value is 0.9 (90% of the alignment score of the primary alignment), but this value can be lowered to allow more secondary alignments to be included in the uncertainty calculation.
Lowering the value below 0.75 might not be relevant and will considerably increase the computation time.

In [11]:
# Lower the equivalence threshold from its default to 0.8 so that more
# secondary alignments are kept, and add the extra transcript information.
NanoCount(
    alignment_file="./data/aligned_reads.bam",
    count_file="./output/tx_counts.tsv",
    equivalent_threshold=0.8,
    extra_tx_info=True,
)
# Preview the beginning of the count file that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality alignments[0m
[32m	Summary of alignments parsed in input bam file[0m
[32m		Valid alignments: 153,201[0m
[32m		Discarded unmapped alignments: 9,545[0m
[32m		Discarded negative strand alignments: 4,515[0m
[32m		Discarded alignment with invalid 3 prime end: 3,783[0m
[32m	Summary of reads filtered[0m
[32m		Reads with valid best alignment: 85,899[0m
[32m		Valid secondary alignments: 50,019[0m
[32m		Invalid secondary alignments: 14,468[0m
[32m		Reads with low query fraction aligned: 1,580[0m
[32m		Reads too short: 798[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 18.0 rounds [00:04, 3.92 rounds/s]
[32m	Exit EM loop after 18 rounds[0m
[32m	Convergence value: 0.00462674973518923[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	C

transcript_name raw                   est_count          tpm                transcript_length 
YHR174W_mRNA    0.5828216847430426    50063.79989774262  582821.6847430427  1314              
YGR192C_mRNA    0.015176437715561169  1303.6408233289887 15176.43771556117  999               
YGR254W_mRNA    0.011446001726217545  983.200102280361   11446.001726217546 1314              
YLR110C_mRNA    0.009615944306685751  825.9999999999993  9615.944306685751  402               
YJR009C_mRNA    0.009392668850428238  806.8208615829352  9392.668850428237  999               
YOL086C_mRNA    0.008358653767797057  717.9999999999994  8358.653767797057  1047              
YKL060C_mRNA    0.006635700066356995  569.9999999999995  6635.700066356995  1080              
YBR118W_mRNA    0.0052736353158942435 452.9999999999996  5273.635315894244  1377              
YPR080W_mRNA    0.0052736353158942435 452.9999999999996  5273.635315894244  1377              

