# Meth_Comp API usage

## Import module

In [2]:
# Import main module 
from pycoMeth.Meth_Comp import Meth_Comp

# optionally import jupyter helper functions
from pycoMeth.common import head, jhelp

## Getting help

In [3]:
# Display the signature and per-argument documentation of Meth_Comp
jhelp(Meth_Comp)

**Meth_Comp** (aggregate_fn_list, ref_fasta_fn, output_tsv_fn, output_bed_fn, max_missing, min_diff_llr, sample_id_list, pvalue_adj_method, pvalue_threshold, verbose, quiet, progress, kwargs)

Compare methylation values for each CpG position or interval between n samples and perform a statistical test to evaluate if the positions are significantly different. For 2 samples a Mann-Whitney test is performed; otherwise multiple samples are compared with a Kruskal-Wallis test. pValues are adjusted for multiple tests using the Benjamini & Hochberg procedure for controlling the false discovery rate.

---

* **aggregate_fn_list** (required) [list(str)]

A list of output tsv files corresponding to several samples to compare generated by either CpG_Aggregate or Interval_Aggregate. (can be gzipped)

* **ref_fasta_fn** (required) [str]

Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)

* **output_tsv_fn** (default: None) [str]

Path to write a more extensive result report in TSV format (At least 1 output file is required) (can be gzipped)

* **output_bed_fn** (default: None) [str]

Path to write a summary result file in BED format (At least 1 output file is required) (can be gzipped)

* **max_missing** (default: 0) [int]

Max number of missing samples to perform the test

* **min_diff_llr** (default: 2) [float]

Minimal llr boundary for negative and positive median llr. The test is only performed if at least one sample has a median llr above (methylated) and 1 sample has a median llr below (unmethylated)

* **sample_id_list** (default: None) [list(str)]

list of sample ids to annotate results in tsv file

* **pvalue_adj_method** (default: fdr_bh) [str]

Method to use for pValue multiple test adjustment

* **pvalue_threshold** (default: 0.01) [float]

Alpha parameter (family-wise error rate) for pValue adjustment

* **verbose** (default: False) [bool]

* **quiet** (default: False) [bool]

* **progress** (default: False) [bool]

* **kwargs**



## Example usage

#### Usage with CpG Aggregate output

In [4]:
# Inputs: one CpG_Aggregate result file per sample, plus the matching sample ids
cpg_inputs = [
    "./data/Yeast_CpG_1.tsv.gz",
    "./data/Yeast_CpG_2.tsv.gz",
    "./data/Yeast_CpG_3.tsv.gz",
    "./data/Yeast_CpG_4.tsv.gz",
]
cpg_sample_ids = ["S1", "S2", "S3", "S4"]

# Outputs: extensive TSV report and summary BED file
cpg_tsv_out = "./results/Yeast_CpG_meth_comp.tsv.gz"
cpg_bed_out = "./results/Yeast_CpG_meth_comp.bed"

# Compare the 4 samples, keeping the default max_missing / min_diff_llr filters
ff = Meth_Comp(
    aggregate_fn_list=cpg_inputs,
    ref_fasta_fn="./data/yeast.fa",
    output_bed_fn=cpg_bed_out,
    output_tsv_fn=cpg_tsv_out,
    sample_id_list=cpg_sample_ids,
    progress=True,
)

# Preview the first lines of both reports (TSV first, then BED)
for report_fn in (cpg_tsv_out, cpg_bed_out):
    head(report_fn)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 37.1M bytes [00:07, 4.80M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 344/344 [00:00<00:00, 20.0k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient samples: 211,400[0m
[32m		Sites with insufficient effect size: 33,071[0m
[32m		Valid sites: 344[0m
[32m		Sites with non-significant pvalue: 344[0m
[32m		Sites with non-significant adjusted pvalue: 344[0m


chromosome start  end    n_samples pvalue              adj_pvalue          neg_med pos_med ambiguous_med labels                med_llr_list                raw_llr_list                                       
I          118799 118800 4         0.29978058859571194 0.4279805197751711  3       1       0             ["S1","S2","S3","S4"] [2.005,-2.645,-6.64,-5.51]  [[-1.82,5.83],[-0.09,-5.2],[-2.67,-10.61],[-6.7...
I          141415 141416 4         0.11800901597381579 0.3742064977444858  1       1       2             ["S1","S2","S3","S4"] [-4.175,0.135,3.02,-0.02]   [[-1.33,-4.52,-3.83,-4.67],[-1.42,1.69],[6.62,-...
I          151819 151820 4         0.1315327225513005  0.3742064977444858  1       1       2             ["S1","S2","S3","S4"] [-2.98,-1.95,2.93,-1.48]    [[-3.68,-4.4,0.54,-2.28],[-3.45,-1.95,-1.21],[4...
I          167230 167237 4         0.10985439586067039 0.3742064977444858  3       1       0             ["S1","S2","S3","S4"] [-6.305,-6.94,3.6,-4.17]    [[-5.06,-7.55],[-6.9

#### Usage with Interval Aggregate output with a single significant result

In [5]:
# Inputs: one Interval_Aggregate result file per sample, plus the matching sample ids
cgi_inputs = [
    "./data/Yeast_CGI_1.tsv.gz",
    "./data/Yeast_CGI_2.tsv.gz",
    "./data/Yeast_CGI_3.tsv.gz",
    "./data/Yeast_CGI_4.tsv.gz",
]
cgi_sample_ids = ["S1", "S2", "S3", "S4"]

# Outputs: extensive TSV report and summary BED file
cgi_tsv_out = "./results/Yeast_CGI_meth_comp.tsv.gz"
cgi_bed_out = "./results/Yeast_CGI_meth_comp.bed"

# Relaxed filters: tolerate 1 missing sample per interval and use a
# median llr boundary of 0 instead of the default
ff = Meth_Comp(
    aggregate_fn_list=cgi_inputs,
    ref_fasta_fn="./data/yeast.fa",
    output_bed_fn=cgi_bed_out,
    output_tsv_fn=cgi_tsv_out,
    sample_id_list=cgi_sample_ids,
    max_missing=1,
    min_diff_llr=0,
    progress=True,
)

# Preview the first lines of both reports (TSV first, then BED)
for report_fn in (cgi_tsv_out, cgi_bed_out):
    head(report_fn)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 776k bytes [00:00, 7.05M bytes/s]              
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 4.00/4.00 [00:00<00:00, 9.99k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient effect size: 938[0m
[32m		Sites with insufficient samples: 921[0m
[32m		Valid sites: 4[0m
[32m		Sites with non-significant adjusted pvalue: 4[0m
[32m		Sites with non-significant pvalue: 3[0m
[32m		Sites with significant pvalue: 1[0m


chromosome start   end     n_samples pvalue               adj_pvalue           neg_med pos_med ambiguous_med unique_cpg_pos labels                med_llr_list               raw_llr_list                                       raw_pos_list                                       
V          65380   65612   3         0.007390942610433672 0.029563770441734687 2       1       0             8              ["S1","S3","S4"]      [-2.28,0.542,-2.125]       [[-2.515,-0.06,-3.84,-0.475,-3.905,-11.445,-2.0...[[65382,65416,65434,65470,65482,65497,65527,655...
X          745486  745750  3         0.1931198217867195   0.257493095715626    2       1       0             3              ["S1","S3","S4"]      [-3.42,-2.475,0.28]        [[-3.65,-3.42,-0.755],[-2.475,-9.35,-1.88],[0.2...[[745492,745563,745592],[745492,745563,745592],...
XV         108248  108543  3         0.7757164275739235   0.7757164275739235   2       1       0             9              ["S2","S3","S4"]      [0.425,-2.445,-2.77]       [[0

#### Usage with Interval Aggregate output and larger dataset

In [6]:
# Generate list of file paths and sample ids from source directory
from glob import glob
import os


def _sample_id_from_path(fn, prefix="medaka_CGI_", suffixes=(".tsv.gz", ".tsv")):
    """Return the sample id embedded in an aggregate file name.

    The id is the file's base name with the leading `prefix` and the first
    matching trailing entry of `suffixes` removed.

    `str.strip(chars)` is deliberately avoided: it removes any leading and
    trailing characters contained in `chars` rather than a literal
    prefix/suffix, so ids beginning or ending with one of those characters
    (e.g. "a1" or "kit_s") would be silently truncated.
    """
    name = os.path.basename(fn)
    if name.startswith(prefix):
        name = name[len(prefix):]
    for suffix in suffixes:
        if name.endswith(suffix):
            name = name[:-len(suffix)]
            break
    return name


# Sorted so that file order, and therefore sample order, is deterministic
fn_list = sorted(glob("./data/medaka_CGI_*"))
id_list = [_sample_id_from_path(fn) for fn in fn_list]

# Tolerate 1 missing sample per interval and use a median llr boundary of 1
Meth_Comp (
    aggregate_fn_list=fn_list,
    ref_fasta_fn="./data/medaka.fa",
    output_bed_fn="./results/Medaka_CGI_meth_comp.bed",
    output_tsv_fn="./results/Medaka_CGI_meth_comp.tsv.gz",
    sample_id_list = id_list,
    max_missing = 1,
    min_diff_llr = 1,
    progress=True)

# Preview both reports, truncating wide columns to 40 characters
head("./results/Medaka_CGI_meth_comp.tsv.gz", max_char_col=40)
head("./results/Medaka_CGI_meth_comp.bed", max_char_col=40)

[01;34m## Checking options and input files ##[0m
[01;34m## Parsing files ##[0m
[32m	Reading input files header and checking consistancy between headers[0m
[32m	Starting asynchronous file parsing[0m
	Progress: 556M bytes [00:49, 11.2M bytes/s]                       
[32m	Adjust pvalues[0m
[32m	Writing output file[0m
	Progress: 100%|██████████| 10.8k/10.8k [00:02<00:00, 3.60k sites/s]
[32m	Results summary[0m
[32m		Sites with insufficient effect size: 201,365[0m
[32m		Sites with insufficient samples: 54,136[0m
[32m		Valid sites: 10,784[0m
[32m		Sites with non-significant adjusted pvalue: 7,252[0m
[32m		Sites with non-significant pvalue: 6,506[0m
[32m		Sites with significant pvalue: 4,278[0m
[32m		Sites with significant adjusted pvalue: 3,532[0m


chromosome start  end    n_samples pvalue                adj_pvalue             neg_med pos_med ambiguous_med unique_cpg_pos labels                                   med_llr_list                             raw_llr_list                             raw_pos_list                             
1          136203 136504 12        0.7664606178319608    0.8325424217694418     1       5       6             9              ["11-1_A3","117-2_C4","131-1_F4","134...[0.46,0.47,0.66,1.6,1.14,1.01,0.18,0....[[1.19,-0.05,4.49,-1.22,-3.13,0.46,-1...[[136270,136281,136306,136348,136368,...
1          308880 309181 12        0.0006937034588303196 0.002644361293752621   8       1       3             6              ["11-1_A3","117-2_C4","131-1_F4","134...[0.655,-1.875,0.57,-1.152,2.145,-2.92...[[-0.1,1.41,5.22,-1.51,-0.23,1.59],[-...[[308888,308972,309024,309052,309091,...
1          416519 416885 11        0.4452430035616395    0.5407095214424235     2       3       6             12             ["11-1_A3","1