# Interval_Aggregate API usage

## Import module

In [2]:
# Import main module 
from pycoMeth.Interval_Aggregate import Interval_Aggregate

# optionally import jupyter helper functions
from pycoMeth.common import head, jhelp, stdout_print

## Getting help

In [3]:
# Render the Interval_Aggregate signature and per-option documentation inline
jhelp(Interval_Aggregate)

**Interval_Aggregate** (cpg_aggregate_fn, ref_fasta_fn, interval_bed_fn, output_bed_fn, output_tsv_fn, interval_size, min_cpg_per_interval, sample_id, min_llr, verbose, quiet, progress, kwargs)

Bin the output of `pycoMeth CpG_Aggregate` in genomic intervals, using either an annotation file containing intervals or a sliding window.

---

* **cpg_aggregate_fn** (required) [str]

Output tsv file generated by CpG_Aggregate (can be gzipped)

* **ref_fasta_fn** (required) [str]

Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)

* **interval_bed_fn** (default: None) [str]

SORTED bed file containing **non-overlapping** intervals to bin CpG data into (Optional) (can be gzipped)

* **output_bed_fn** (default: None) [str]

Path to write a summary result file in BED format (At least 1 output file is required) (can be gzipped)

* **output_tsv_fn** (default: None) [str]

Path to write a more extensive result report in TSV format (At least 1 output file is required) (can be gzipped)

* **interval_size** (default: 1000) [int]

Size of the sliding window in which to aggregate CpG sites data from if no BED file is provided

* **min_cpg_per_interval** (default: 5) [int]

Minimal number of CpG sites per interval.

* **sample_id** (default: "") [str]

Sample ID to be used for the BED track header

* **min_llr** (default: 2) [float]

Minimal log likelyhood ratio to consider a site significantly methylated or unmethylated in output BED file

* **verbose** (default: False) [bool]

* **quiet** (default: False) [bool]

* **progress** (default: False) [bool]

* **kwargs**



## Example usage

### Default usage with sliding windows

In [4]:
# Sliding-window mode: no interval_bed_fn is given, so interval_size sets the
# size of the windows the CpG sites are binned into.
Interval_Aggregate(
    # Inputs
    cpg_aggregate_fn="./data/CpG_Aggregate_sample_1.tsv",
    ref_fasta_fn="./data/ref.fa",
    # Outputs (both the TSV report and the BED summary are requested)
    output_tsv_fn="./results/Interval_Aggregate_sample_1.tsv",
    output_bed_fn="./results/Interval_Aggregate_sample_1.bed",
    # Options
    sample_id="sample_1",
    interval_size=500,
    min_cpg_per_interval=3,
    progress=True,
)

# Preview the beginning of each result file
head("./results/Interval_Aggregate_sample_1.tsv")
head("./results/Interval_Aggregate_sample_1.bed")

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 100%|██████████| 5.82M/5.82M [00:02<00:00, 2.30M bytes/s]
[32m	Results summary[0m
[32m		Lines parsed: 89,392[0m
[32m		Total number of intervals: 24,319[0m
[32m	Writter summary[0m
[32m		Empty intervals skipped: 14,390[0m
[32m		Valid intervals written: 9,195[0m
[32m		Low CpG intervals skipped: 734[0m


chromosome start end  num_motifs median_llr llr_list                                           pos_list                                           
I          500   1000 12         -3.35      [-1.14,-3.54,-7.24,-4.3,0.56,-0.65,-4.37,-3.78,...[557,587,628,665,834,868,890,936,955,967,988]      
I          1000  1500 22         -3.65      [-2.48,-5.035,-4.16,-3.315,-3.295,-1.69,-9.885,...[1036,1095,1119,1136,1158,1178,1199,1217,1345,1...
I          1500  2000 19         -3.4       [-5.71,-6.05,-0.925,-7.165,-3.975,0.56,-1.78,-1...[1523,1584,1630,1654,1707,1755,1784,1797,1814,1...
I          2000  2500 15         -4.272     [-5.24,-3.07,-4.33,-19.055,-7.55,-1.255,-2.565,...[2003,2051,2084,2137,2302,2396,2421,2445,2462,2...
I          2500  3000 19         -2.5       [-0.705,0.385,-6.685,-10.175,-4.27,-2.3,-2.5,-2...[2546,2563,2584,2634,2666,2680,2694,2729,2752,2...
I          3000  3500 9          -1.4       [-2.34,-2.19,-1.4,-1.0,0.29,-1.3,-6.9,-1.22,-1.92] [3000,3024,3044,3056,3071,3148,3

### Usage with a CpG Islands annotation Bed file

In [5]:
# Annotation mode: CpG sites are binned into the intervals listed in the BED file
# passed as interval_bed_fn instead of fixed-size sliding windows.
# NOTE(review): the return value is bound to `ff`, but nothing in the visible
# cells reads it afterwards -- confirm whether the binding is still needed.
ff = Interval_Aggregate(
    # Inputs
    cpg_aggregate_fn="./data/CpG_Aggregate_sample_1.tsv",
    ref_fasta_fn="./data/ref.fa",
    interval_bed_fn="./data/Yeast_CGI.bed",
    # Outputs
    output_tsv_fn="./results/CGI_Aggregate_sample_1.tsv",
    output_bed_fn="./results/CGI_Aggregate_sample_1.bed",
    # Options: min_cpg_per_interval is lowered from its default of 5 to 1
    sample_id="sample_1",
    min_cpg_per_interval=1,
    progress=True,
)

# Preview the beginning of each result file
head("./results/CGI_Aggregate_sample_1.tsv")
head("./results/CGI_Aggregate_sample_1.bed")

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
	Progress: 100%|█████████▉| 5.81M/5.82M [00:01<00:00, 5.35M bytes/s]
[32m	Results summary[0m
[32m		Lines parsed: 89,235[0m
[32m		Total number of intervals: 2,041[0m
[32m	Writter summary[0m
[32m		Empty intervals skipped: 1,323[0m
[32m		Valid intervals written: 718[0m


chromosome start end   num_motifs median_llr llr_list                                           pos_list                                           
I          1804  2170  14         -3.67      [-3.67,-3.4,-5.53,-1.06,-1.79,-1.94,-6.22,-5.24...[1814,1829,1889,1925,1949,1961,1976,2003,2051,2...
I          31835 32949 10         -5.65      [-2.925,-6.055,-1.785,-5.65,-6.83,-1.695,-12.32]   [31867,31889,31937,31960,32006,32031,32056]        
I          33497 34371 19         -3.295     [-4.38,-3.32,-1.29,-3.27,-5.89,-8.96,-6.88,-2.2...[33947,33967,34001,34021,34049,34068,34099,3416...
I          44730 44988 9          -3.2       [-2.37,-4.9,-1.63,-1.69,-8.09,-4.03]               [44748,44789,44808,44841,44877,44930]              
I          47889 48187 13         -4.55      [-4.55,-9.41,-3.37,-4.66,-3.24,-4.66,-4.535]       [47897,48003,48036,48050,48084,48100,48115]        
I          57175 57391 9          -4.76      [-7.96,-4.76,-0.33,-3.77,-1.03,-7.68,-6.66]        [57200,57255,57274,5

### Example with multiple files

In [6]:
# Run the same sliding-window aggregation on samples 1 to 4; every input and
# output path is derived from the sample number.
for i in range(1, 5):
    stdout_print(f"##### SAMPLE {i} #####")
    Interval_Aggregate(
        # Inputs
        cpg_aggregate_fn=f"./data/CpG_Aggregate_sample_{i}.tsv",
        ref_fasta_fn="./data/ref.fa",
        # Outputs
        output_tsv_fn=f"./results/Interval_Aggregate_sample_{i}.tsv",
        output_bed_fn=f"./results/Interval_Aggregate_sample_{i}.bed",
        # Options: min_llr is lowered from its default of 2 to 1
        sample_id=f"sample_{i}",
        interval_size=500,
        min_cpg_per_interval=3,
        min_llr=1,
        quiet=True,
    )

##### SAMPLE 1 #####

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m


##### SAMPLE 2 #####

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m


##### SAMPLE 3 #####

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m


##### SAMPLE 4 #####

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing CpG_aggregate file ##[0m
