# Meth_Comp API usage

## Import module

In [2]:
# Import main module 
from pycoMeth.Meth_Comp import Meth_Comp

# Optionally import Jupyter helper functions
from pycoMeth.common import head, jhelp

## Getting help

In [3]:
# Display the Meth_Comp signature and the description of each option in the notebook
jhelp(Meth_Comp)

**Meth_Comp** (aggregate_fn_list, ref_fasta_fn, output_tsv_fn, output_bed_fn, max_missing, min_diff_llr, sample_id_list, pvalue_adj_method, pvalue_threshold, only_tested_sites, verbose, quiet, progress, kwargs)

Compare methylation values for each CpG position or interval between n samples and perform a statistical test to evaluate whether the positions are significantly different. For 2 samples a Mann-Whitney test is performed; otherwise multiple samples are compared with a Kruskal-Wallis test. pValues are adjusted for multiple tests using the Benjamini & Hochberg procedure for controlling the false discovery rate.

---

* **aggregate_fn_list** (required) [list(str)]

A list of output tsv files corresponding to several samples to compare generated by either CpG_Aggregate or Interval_Aggregate. (can be gzipped)

* **ref_fasta_fn** (required) [str]

Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)

* **output_tsv_fn** (default: None) [str]

Path to write a more extensive result report in TSV format (At least 1 output file is required) (can be gzipped)

* **output_bed_fn** (default: None) [str]

Path to write a summary result file in BED format (At least 1 output file is required) (can be gzipped)

* **max_missing** (default: 0) [int]

Max number of missing samples to perform the test

* **min_diff_llr** (default: 2) [float]

Minimal llr boundary for negative and positive median llr. The test is only performed if at least one sample has a median llr above (methylated) and 1 sample has a median llr below (unmethylated)

* **sample_id_list** (default: None) [list(str)]

list of sample ids to annotate results in tsv file

* **pvalue_adj_method** (default: fdr_bh) [str]

Method to use for pValue multiple test adjustment

* **pvalue_threshold** (default: 0.01) [float]

Alpha parameter (family-wise error rate) for pValue adjustment

* **only_tested_sites** (default: False) [bool]

Do not include sites that were not tested because of insufficient samples or effect size in the report

* **verbose** (default: False) [bool]

* **quiet** (default: False) [bool]

* **progress** (default: False) [bool]

* **kwargs**



## Example usage

#### Usage with CpG Aggregate output

In [7]:
# One CpG Aggregate result file per sample to compare
cpg_fn_list = [
    "./data/Yeast_CpG_1.tsv.gz",
    "./data/Yeast_CpG_2.tsv.gz",
    "./data/Yeast_CpG_3.tsv.gz",
    "./data/Yeast_CpG_4.tsv.gz",
]

# Output locations, named once so the previews below read the same files
cpg_bed_fn = "./results/Yeast_CpG_meth_comp.bed"
cpg_tsv_fn = "./results/Yeast_CpG_meth_comp.tsv.gz"

# Run the comparison; only_tested_sites=True leaves untested sites out of the reports
ff = Meth_Comp(
    aggregate_fn_list=cpg_fn_list,
    ref_fasta_fn="./data/yeast.fa",
    output_bed_fn=cpg_bed_fn,
    output_tsv_fn=cpg_tsv_fn,
    sample_id_list=["S1", "S2", "S3", "S4"],
    only_tested_sites=True,
    progress=True,
)

# Preview the TSV report, then the BED summary
head(cpg_tsv_fn)
head(cpg_bed_fn)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 37.1M bytes [00:07, 5.12M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 344/344 [00:00<00:00, 18.8k sites/s]
[32m	Results summary[0m
[32m		Insufficient samples: 211,400[0m
[32m		Insufficient effect size: 33,071[0m
[32m		Valid: 344[0m
[32m		Non-significant pvalue: 344[0m


chromosome start  end    n_samples pvalue              adj_pvalue          neg_med pos_med ambiguous_med labels                med_llr_list                raw_llr_list                                       comment                
I          118799 118800 4         0.29978058859571194 0.4279805197751711  3       1       0             ["S1","S2","S3","S4"] [2.005,-2.645,-6.64,-5.51]  [[-1.82,5.83],[-0.09,-5.2],[-2.67,-10.61],[-6.7...Non-significant pvalue 
I          141415 141416 4         0.11800901597381579 0.3742064977444858  1       1       2             ["S1","S2","S3","S4"] [-4.175,0.135,3.02,-0.02]   [[-1.33,-4.52,-3.83,-4.67],[-1.42,1.69],[6.62,-...Non-significant pvalue 
I          151819 151820 4         0.1315327225513005  0.3742064977444858  1       1       2             ["S1","S2","S3","S4"] [-2.98,-1.95,2.93,-1.48]    [[-3.68,-4.4,0.54,-2.28],[-3.45,-1.95,-1.21],[4...Non-significant pvalue 
I          167230 167237 4         0.10985439586067039 0.3742064977444858  3       

#### Usage with Interval Aggregate output with a single significant result

In [6]:
# One Interval Aggregate result file per sample to compare
cgi_fn_list = [
    "./data/Yeast_CGI_1.tsv.gz",
    "./data/Yeast_CGI_2.tsv.gz",
    "./data/Yeast_CGI_3.tsv.gz",
    "./data/Yeast_CGI_4.tsv.gz",
]

# Output locations, named once so the previews below read the same files
cgi_bed_fn = "./results/Yeast_CGI_meth_comp.bed"
cgi_tsv_fn = "./results/Yeast_CGI_meth_comp.tsv.gz"

# Relaxed settings: allow one missing sample, no minimal llr boundary,
# and keep the sites that were not tested in the reports
ff = Meth_Comp(
    aggregate_fn_list=cgi_fn_list,
    ref_fasta_fn="./data/yeast.fa",
    output_bed_fn=cgi_bed_fn,
    output_tsv_fn=cgi_tsv_fn,
    sample_id_list=["S1", "S2", "S3", "S4"],
    max_missing=1,
    min_diff_llr=0,
    only_tested_sites=False,
    progress=True,
)

# Preview the TSV report, then the BED summary
head(cgi_tsv_fn)
head(cgi_bed_fn)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 776k bytes [00:00, 6.48M bytes/s]              
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 1.86k/1.86k [00:00<00:00, 36.0k sites/s]
[32m	Results summary[0m
[32m		Insufficient effect size: 938[0m
[32m		Insufficient samples: 921[0m
[32m		Valid: 4[0m
[32m		Non-significant pvalue: 4[0m


chromosome start end   n_samples pvalue adj_pvalue neg_med pos_med ambiguous_med unique_cpg_pos labels           med_llr_list raw_llr_list raw_pos_list comment                  
I          17    333   2         nan    nan        2       0       0             0              ["S1","S4"]      []           []           []           Insufficient samples     
I          1804  2170  3         nan    nan        3       0       0             0              ["S1","S2","S4"] []           []           []           Insufficient effect size 
I          31835 32949 1         nan    nan        1       0       0             0              ["S1"]           []           []           []           Insufficient samples     
I          33497 34371 2         nan    nan        2       0       0             0              ["S1","S2"]      []           []           []           Insufficient samples     
I          38163 38471 2         nan    nan        2       0       0             0              ["S1","S2"]   

#### Usage with Interval Aggregate output and larger dataset

In [8]:
# Generate list of file paths and sample ids from source directory
from glob import glob
import os


def sample_id_from_path(fn, prefix="medaka_CGI_", suffixes=(".tsv.gz", ".tsv")):
    """Return the sample id embedded in an aggregate file name.

    The id is the file's base name with the leading `prefix` and the first
    matching entry of `suffixes` removed.

    NOTE: `str.strip(chars)` must not be used for this: it removes any of the
    given *characters* from both ends rather than a literal prefix/suffix, so
    ids starting or ending with one of those characters would be truncated
    (e.g. "medaka_CGI_a_control.tsv.gz" would yield "control").

    * fn: path to an aggregate file
    * prefix: literal text to remove from the start of the base name, if present
    * suffixes: candidate file extensions, most specific first
    """
    name = os.path.basename(fn)
    if prefix and name.startswith(prefix):
        name = name[len(prefix):]
    for suffix in suffixes:
        if suffix and name.endswith(suffix):
            name = name[:-len(suffix)]
            break
    return name


# Sorted so that file paths and sample ids are listed in a reproducible, matching order
fn_list = sorted(glob("./data/medaka_CGI_*"))
id_list = [sample_id_from_path(fn) for fn in fn_list]

Meth_Comp (
    aggregate_fn_list=fn_list,
    ref_fasta_fn="./data/medaka.fa",
    output_bed_fn="./results/Medaka_CGI_meth_comp.bed",
    output_tsv_fn="./results/Medaka_CGI_meth_comp.tsv.gz",
    sample_id_list = id_list,
    max_missing = 1,
    min_diff_llr = 1,
    progress=True)

head("./results/Medaka_CGI_meth_comp.tsv.gz", max_char_col=40)
head("./results/Medaka_CGI_meth_comp.bed", max_char_col=40)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 556M bytes [00:47, 11.7M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 266k/266k [00:08<00:00, 31.1k sites/s] 
[32m	Results summary[0m
[32m		Insufficient effect size: 201,365[0m
[32m		Insufficient samples: 54,136[0m
[32m		Valid: 10,784[0m
[32m		Non-significant pvalue: 7,252[0m
[32m		Significant pvalue: 3,532[0m


chromosome start end   n_samples pvalue adj_pvalue neg_med pos_med ambiguous_med unique_cpg_pos labels                                   med_llr_list raw_llr_list raw_pos_list comment                  
1          1657  1963  1         nan    nan        0       1       0             0              ["7-2_F2"]                               []           []           []           Insufficient samples     
1          15653 15966 12        nan    nan        0       12      0             0              ["11-1_A3","117-2_C4","131-1_F4","134...[]           []           []           Insufficient effect size 
1          17092 17597 12        nan    nan        0       12      0             0              ["11-1_A3","117-2_C4","131-1_F4","134...[]           []           []           Insufficient effect size 
1          18071 18621 12        nan    nan        0       12      0             0              ["11-1_A3","117-2_C4","131-1_F4","134...[]           []           []           Insufficient effect