# CpG_Aggregate API usage

## Import module

In [10]:
# Import main module 
from pycoMeth.CpG_Aggregate import CpG_Aggregate

# Optionally import Jupyter helper functions
from pycoMeth.common import head, jhelp

## Getting help

In [3]:
# Render the signature and the per-parameter documentation of CpG_Aggregate in the notebook
jhelp(CpG_Aggregate)

**CpG_Aggregate** (nanopolish_fn, ref_fasta_fn, output_bed_fn, output_tsv_fn, min_depth, sample_id, min_llr, verbose, quiet, progress, kwargs)

Calculate methylation frequency at genomic CpG sites from the output of `nanopolish call-methylation`

---

* **nanopolish_fn** (required) [list(str)]

Path to a nanopolish call_methylation tsv output file or a list of files or a regex matching several files

* **ref_fasta_fn** (required) [str]

Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)

* **output_bed_fn** (default: "") [str]

Path to write a summary result file in BED format (At least 1 output file is required)

* **output_tsv_fn** (default: "") [str]

Path to write an more extensive result report in TSV format (At least 1 output file is required)

* **min_depth** (default: 10) [int]

Minimal number of reads covering a site to be reported

* **sample_id** (default: "") [str]

Sample ID to be used for the BED track header

* **min_llr** (default: 2) [float]

Minimal log likelyhood ratio to consider a site significantly methylated or unmethylated in output BED file

* **verbose** (default: False) [bool]

* **quiet** (default: False) [bool]

* **progress** (default: False) [bool]

* **kwargs**



## Example usage

### Basic usage

In [4]:
# Result files for sample 1: an extensive TSV report and a summary BED track
tsv_out = "./results/CpG_Aggregate_sample_1.tsv"
bed_out = "./results/CpG_Aggregate_sample_1.bed"

# Aggregate the nanopolish calls of a single sample.
# progress=True turns on the progress bars visible in the output below.
ff = CpG_Aggregate(
    nanopolish_fn="./data/nanopolish_sample_1.tsv",
    ref_fasta_fn="./data/ref.fa",
    output_bed_fn=bed_out,
    output_tsv_fn=tsv_out,
    sample_id="sample_1",
    progress=True,
)

# Show the beginning of each result file
head(tsv_out)
head(bed_out)

## Checking options and input files ##
## Parsing methylation_calls file ##
	Starting to parse file Nanopolish methylation call file
	: 100%|██████████| 51.9M/51.9M [00:04<00:00, 11.2M bytes/s]
	Filtering out low coverage sites
	Sorting each chromosome by coordinates
	Parsing summary
		Input files: 1
		Lines Parsed: 543,135
		Line successfully parsed: 543,135
		Initial Sites: 218,353
		Total Valid Lines: 543,135
		Low Count Sites: 218,114
		Valid Sites Found: 239
## Processing valid sites found and write to file ##
	: 100%|██████████| 239/239 [00:00<00:00, 5.66k sites/s]
	Results summary
		Total Sites Writen: 239
		Unmethylated sites: 162
		Ambiguous sites: 77


chromosome start  end    sequence       num_motifs median_llr llr_list                                                                            
VIII       138415 138416 GGTCTCGCTTT    1          -2.355     [-9.42,-5.49,-5.18,-5.11,-2.43,-1.1,0.46,-0.68,1.07,-2.28]                          
VIII       138429 138430 AGCTTCGAGGA    1          -4.525     [-3.62,-5.58,1.12,-2.5,-10.4,-2.39,-8.33,-7.29,-0.44,-5.43]                         
VIII       212351 212352 TGGGGCGACAT    1          -2.770     [-2.95,-11.55,-9.31,-0.07,-11.21,-4.14,0.66,-2.54,2.05,0.54,-2.77]                  
VIII       212392 212393 ATTAACGTATA    1          -2.510     [-6.76,3.04,0.11,-2.51,0.32,-3.7,-2.92,-2.01,-3.52,-4.71,-1.2]                      
VIII       212457 212461 AGAATCGTCGATTA 2          -6.080     [-6.08,-13.01,-3.52,-1.3,-8.11,-8.88,-1.47,-4.78,-6.83,-3.04,-6.32,-0.17,-10.75]    
VIII       212530 212531 CTATTCGTTTC    1          -1.270     [-5.33,-1.27,1.12,-3.72,0.48,-4.4,-0.48,-1.02,-0.07,-5.5

### Example usage with a wildcard file pattern and a lower depth threshold

In [5]:
# Result files for the combined run over all matching samples
tsv_out = "./results/CpG_Aggregate_sample_all.tsv"
bed_out = "./results/CpG_Aggregate_sample_all.bed"

# The "*" in nanopolish_fn selects several input files at once (4 in the run below).
# min_depth=5 lowers the coverage threshold from its default of 10.
ff = CpG_Aggregate(
    nanopolish_fn="./data/nanopolish_sample_*.tsv",
    ref_fasta_fn="./data/ref.fa",
    output_bed_fn=bed_out,
    output_tsv_fn=tsv_out,
    min_depth=5,
    sample_id="sample_all",
    progress=True,
)

# Show the beginning of each result file
head(tsv_out)
head(bed_out)

## Checking options and input files ##
## Parsing methylation_calls file ##
	Starting to parse file Nanopolish methylation call file
	: 100%|██████████| 209M/209M [00:17<00:00, 12.0M bytes/s] 
	Filtering out low coverage sites
	Sorting each chromosome by coordinates
	Parsing summary
		Input files: 4
		Lines Parsed: 2,180,231
		Line successfully parsed: 2,180,231
		Initial Sites: 251,674
		Total Valid Lines: 2,180,231
		Valid Sites Found: 228,163
		Low Count Sites: 23,511
## Processing valid sites found and write to file ##
	: 100%|██████████| 228k/228k [00:22<00:00, 10.3k sites/s] 
	Results summary
		Total Sites Writen: 228,163
		Unmethylated sites: 168,018
		Ambiguous sites: 60,129
		Methylated sites: 16


chromosome start end sequence              num_motifs median_llr llr_list                                  
I          144   145 CCACTCGTTAC           1          -2.200     [-0.7,2.77,-3.01,-2.2,-8.42]              
I          175   176 CACTCCGAACC           1          -1.350     [-1.35,-8.02,-1.07,1.94,-2.01]            
I          216   217 CCCACCGTTAC           1          -2.160     [-6.62,-2.16,-2.85,-0.27,-0.41]           
I          325   326 TGAAACGCTAA           1          -2.660     [-0.41,0.01,-5.79,-4.93,-2.66]            
I          339   340 ATGATCGTAAA           1          -1.210     [-0.02,-2.85,-4.49,-1.21,-1.08]           
I          354   355 ACACACGTGCT           1          -1.390     [-1.11,-4.6,-1.63,-1.39,-1.2]             
I          422   433 TTTTACGTACGCACACGGATG 3          -10.520    [-2.49,-7.21,-10.79,-13.29,-10.52]        
I          542   543 ATGCACGGCAC           1          -0.780     [-2.03,-3.57,0.47,-3.81,2.14,2.59]        
I          557   558 CTCAGCG

### Example with multiple files

In [9]:
import sys

# Process samples 1 to 4 one by one, each with its own pair of output files.
for i in range(1, 5):
    # Banner announcing the sample. It is newline-terminated so it can never run
    # into the next line of output when both end up on the same stream, and it
    # is flushed right away so it is emitted before the call below prints anything.
    sys.stdout.write(f"##### SAMPLE {i} #####\n")
    sys.stdout.flush()

    # quiet=True keeps the log down to the section headers seen in the output below;
    # min_depth and min_llr are lowered from their defaults (10 and 2).
    ff = CpG_Aggregate(
        nanopolish_fn=f"./data/nanopolish_sample_{i}.tsv",
        ref_fasta_fn="./data/ref.fa",
        output_bed_fn=f"./results/CpG_Aggregate_sample_{i}.bed",
        output_tsv_fn=f"./results/CpG_Aggregate_sample_{i}.tsv",
        sample_id=f"sample_{i}",
        min_depth=3,
        min_llr=1,
        quiet=True,
    )

##### SAMPLE 1 #####

## Checking options and input files ##
## Parsing methylation_calls file ##
## Processing valid sites found and write to file ##


##### SAMPLE 2 #####

## Checking options and input files ##
## Parsing methylation_calls file ##
## Processing valid sites found and write to file ##


##### SAMPLE 3 #####

## Checking options and input files ##
## Parsing methylation_calls file ##
## Processing valid sites found and write to file ##


##### SAMPLE 4 #####

## Checking options and input files ##
## Parsing methylation_calls file ##
## Processing valid sites found and write to file ##
