# NanoCount python API

### Import the package (from within the activated virtual environment)

In [2]:
from NanoCount.NanoCount import NanoCount
from NanoCount.common import jhelp, head

### Running NanoCount 

In [3]:
# Render the NanoCount signature and the description of every option inline
jhelp(NanoCount)

**NanoCount** (alignment_file, count_file, min_read_length, min_query_fraction_aligned, equivalent_threshold, scoring_value, convergence_target, max_em_rounds, extra_tx_info, primary_score, verbose, quiet)

Estimate abundance of transcripts using an EM

---

* **alignment_file** (required) [str]

BAM or SAM file containing aligned ONT dRNA-Seq reads including secondary and supplementary alignment

* **count_file** (default: "") [str]

Output file path where to write estimated counts (TSV format)

* **min_read_length** (default: 50) [int]

Minimal length of the read to be considered valid

* **min_query_fraction_aligned** (default: 0.5) [float]

Minimal fraction of the primary hit query aligned to consider the read valid

* **equivalent_threshold** (default: 0.9) [float]

Fraction of the alignment score or the alignment length of secondary hits compared to the primary hit to be considered valid hits

* **scoring_value** (default: alignment_score) [str]

Value to use for score thresholding of secondary hits either "alignment_score" or "alignment_length"

* **convergence_target** (default: 0.005) [float]

Convergence target value of the cummulative difference between abundance values of successive EM round to trigger the end of the EM loop.

* **max_em_rounds** (default: 100) [int]

Maximum number of EM rounds before triggering stop

* **extra_tx_info** (default: False) [bool]

Add transcripts length and zero coverage transcripts to the output file (required valid bam/sam header)

* **primary_score** (default: "") [str]

Method to pick the best alignment for each read. By default it uses the primary read defined by the aligner but it can be changed to use either the best alignment score ("align_score") or the best alignment length ("align_len").

* **verbose** (default: False) [bool]

Increase verbosity for QC and debugging

* **quiet** (default: False) [bool]

Reduce verbosity



#### Basic command

In [12]:
# Default run: every option other than the input/output paths keeps its default value
NanoCount(
    alignment_file="./data/aligned_reads.bam",
    count_file="./output/tx_counts.tsv",
)
# Preview the first lines of the TSV file that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality hits[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 14.0 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0026556625233718663[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Compute estimated counts and TPM[0m
[32m	Write file[0m


transcript_name raw                  est_count          tpm                
YHR174W_mRNA    0.037525253303454735 921.3950696130275  37525.25330345474  
YLR110C_mRNA    0.032926045534605486 808.466122056703   32926.04553460549  
YKL060C_mRNA    0.023539952757188228 577.9999999999998  23539.952757188228 
YKL152C_mRNA    0.014865195080231321 364.99999999999983 14865.19508023132  
YCR012W_mRNA    0.014539382585322141 356.9999999999999  14539.38258532214  
YDR050C_mRNA    0.014539382585322141 356.9999999999999  14539.38258532214  
YOR369C_mRNA    0.013165221113714695 323.2588392261506  13165.221113714695 
YMR116C_mRNA    0.012340148244685179 302.9999999999999  12340.148244685179 
YLR340W_mRNA    0.011810702940457761 289.9999999999999  11810.70294045776  



#### Using the best alignment score rather than the aligner's primary alignment to select the best hit + verbose mode

In [11]:
# Pick the best hit of each read by alignment score instead of the aligner's
# primary flag, and turn on the DEBUG messages
NanoCount(
    alignment_file="./data/aligned_reads.bam",
    count_file="./output/tx_counts.tsv",
    verbose=True,
    primary_score="align_score",
)
# Preview the first lines of the TSV file that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[37m	[DEBUG]: Options summary[0m
[37m	[DEBUG]: 	Package name: NanoCount[0m
[37m	[DEBUG]: 	Package version: 0.2.2[0m
[37m	[DEBUG]: 	Timestamp: 2020-06-08 15:31:54.276602[0m
[37m	[DEBUG]: 	quiet: False[0m
[37m	[DEBUG]: 	verbose: True[0m
[37m	[DEBUG]: 	primary_score: align_score[0m
[37m	[DEBUG]: 	extra_tx_info: False[0m
[37m	[DEBUG]: 	max_em_rounds: 100[0m
[37m	[DEBUG]: 	convergence_target: 0.005[0m
[37m	[DEBUG]: 	scoring_value: alignment_score[0m
[37m	[DEBUG]: 	equivalent_threshold: 0.9[0m
[37m	[DEBUG]: 	min_query_fraction_aligned: 0.5[0m
[37m	[DEBUG]: 	min_read_length: 50[0m
[37m	[DEBUG]: 	count_file: ./output/tx_counts.tsv[0m
[37m	[DEBUG]: 	alignment_file: ./data/aligned_reads.bam[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality hits[0m
[37m	[DEBUG]: Summary of reads parsed in input bam file[0m
[37m	[DEBUG]: 	Mapped hits: 156,984[0m
[37m	[DEBUG]: 	Unmapped hits:

transcript_name raw                  est_count          tpm                
YHR174W_mRNA    0.037525253303454735 921.3950696130275  37525.25330345474  
YLR110C_mRNA    0.032926045534605486 808.466122056703   32926.04553460549  
YKL060C_mRNA    0.023539952757188228 577.9999999999998  23539.952757188228 
YKL152C_mRNA    0.014865195080231321 364.99999999999983 14865.19508023132  
YCR012W_mRNA    0.014539382585322141 356.9999999999999  14539.38258532214  
YDR050C_mRNA    0.014539382585322141 356.9999999999999  14539.38258532214  
YOR369C_mRNA    0.013165221113714695 323.2588392261506  13165.221113714695 
YMR116C_mRNA    0.012340148244685179 302.9999999999999  12340.148244685179 
YLR340W_mRNA    0.011810702940457761 289.9999999999999  11810.70294045776  



#### Basic command without file writing, with DataFrame output

In interactive mode it is also possible to skip writing the results to a file and instead access the data directly as a pandas DataFrame through the `count_df` attribute.

In [7]:
# No count_file is passed, so the results are kept on the returned object
nc = NanoCount(
    alignment_file="./data/aligned_reads.bam",
)
# Show the estimated counts table held in the count_df attribute
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality hits[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 13.7 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0026556625233718663[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Compute estimated counts and TPM[0m


Unnamed: 0_level_0,raw,est_count,tpm
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YHR174W_mRNA,3.752525e-02,921.395070,37525.253303
YLR110C_mRNA,3.292605e-02,808.466122,32926.045535
YKL060C_mRNA,2.353995e-02,578.000000,23539.952757
YKL152C_mRNA,1.486520e-02,365.000000,14865.195080
YCR012W_mRNA,1.453938e-02,357.000000,14539.382585
...,...,...,...
YLR285C-A_mRNA,1.480966e-06,0.036364,1.480966
YCL041C_mRNA,7.272600e-07,0.017857,0.727260
YCL008C_mRNA,6.787760e-07,0.016667,0.678776
YOR192C-C_mRNA,5.656467e-07,0.013889,0.565647


#### Adding extra transcripts information

The `extra_tx_info` option adds a column with the transcript lengths and also includes all the zero-coverage transcripts in the results.

In [8]:
# Request the transcript_length column and the zero-coverage transcripts
nc = NanoCount(
    alignment_file="./data/aligned_reads.bam",
    extra_tx_info=True,
)
# Show the extended counts table held in the count_df attribute
display(nc.count_df)

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality hits[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 13.2 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0026556625233718663[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Compute estimated counts and TPM[0m


Unnamed: 0_level_0,raw,est_count,tpm,transcript_length
transcript_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YHR174W_mRNA,0.037525,921.395070,37525.253303,1314
YLR110C_mRNA,0.032926,808.466122,32926.045535,402
YKL060C_mRNA,0.023540,578.000000,23539.952757,1080
YKL152C_mRNA,0.014865,365.000000,14865.195080,744
YCR012W_mRNA,0.014539,357.000000,14539.382585,1251
...,...,...,...,...
YPR201W_mRNA,0.000000,0.000000,0.000000,1215
YPR202W_mRNA,0.000000,0.000000,0.000000,717
YPR203W_mRNA,0.000000,0.000000,0.000000,309
YPR204C-A_mRNA,0.000000,0.000000,0.000000,483


#### Relaxing the equivalence threshold

The default value is 0.9 (90% of the alignment score of the primary alignment), but this value can be lowered to allow more secondary alignments to be included in the uncertainty calculation.
Lowering the value below 0.75 might not be relevant and will considerably increase the computation time.

In [9]:
# Lower the secondary-hit threshold from the default 0.9 to 0.8 and add the
# extra transcript information to the output file
NanoCount(
    alignment_file="./data/aligned_reads.bam",
    count_file="./output/tx_counts.tsv",
    equivalent_threshold=0.8,
    extra_tx_info=True,
)
# Preview the first lines of the TSV file that was just written
head("./output/tx_counts.tsv")

[01;34m## Checking options and input files ##[0m
[01;34m## Initialise Nanocount ##[0m
[32m	Parse Bam file and filter low quality hits[0m
[32m	Generate initial read/transcript compatibility index[0m
[01;34m## Start EM abundance estimate ##[0m
	Progress: 2.00 rounds [00:00, 14.6 rounds/s]
[32m	Exit EM loop after 2 rounds[0m
[32m	Convergence value: 0.0026556625233718663[0m
[01;34m## Summarize data ##[0m
[32m	Convert results to dataframe[0m
[32m	Compute estimated counts and TPM[0m
[32m	Write file[0m


transcript_name raw                  est_count          tpm                transcript_length 
YHR174W_mRNA    0.037525253303454735 921.3950696130275  37525.25330345474  1314              
YLR110C_mRNA    0.032926045534605486 808.466122056703   32926.04553460549  402               
YKL060C_mRNA    0.023539952757188228 577.9999999999998  23539.952757188228 1080              
YKL152C_mRNA    0.014865195080231321 364.99999999999983 14865.19508023132  744               
YCR012W_mRNA    0.014539382585322141 356.9999999999999  14539.38258532214  1251              
YDR050C_mRNA    0.014539382585322141 356.9999999999999  14539.38258532214  747               
YOR369C_mRNA    0.013165221113714695 323.2588392261506  13165.221113714695 432               
YMR116C_mRNA    0.012340148244685179 302.9999999999999  12340.148244685179 960               
YLR340W_mRNA    0.011810702940457761 289.9999999999999  11810.70294045776  939               

