In [6]:
from pathlib import Path
import polars as pl


# Directory that holds the formatted per-lab '*lz4' files (read with
# pl.scan_parquet further down).
uniformed_dir: Path = Path('/mnt/eqa/zhangyuanfeng/methylation/unified_informed/formatted')

# Files whose name starts with BS, EM or RR.
rev_files: list[Path] = [
    lz4_path
    for lz4_path in uniformed_dir.glob('*lz4')
    if lz4_path.name.startswith(('BS', 'EM', 'RR'))
]

# Files whose name starts with PS.
dir_files: list[Path] = [
    lz4_path
    for lz4_path in uniformed_dir.glob('*lz4')
    if lz4_path.name.startswith('PS')
]

In [10]:
# Minimum value of the 'depth' column for a row to be kept.
MIN_DEPTH = 10

# Control contigs used for every lab except those whose label contains 'RR'.
ALL_CONTROLS = ['lambda', 'pUC19', 'chrM']

# chrom -> 'type' label for files in rev_files.
# NOTE: the spelling 'insufficency' is kept on purpose; the label is written
# to the output CSV and anything reading that file may match on it.
REV_TYPE_BY_CHROM = {'chrM': 'conversion insufficency',
                     'lambda': 'conversion insufficency',
                     'pUC19': 'excessive conversion'}

# chrom -> 'type' label for files in dir_files (labels swapped relative to
# REV_TYPE_BY_CHROM).
DIR_TYPE_BY_CHROM = {'chrM': 'excessive conversion',
                     'lambda': 'excessive conversion',
                     'pUC19': 'conversion insufficency'}


def _lab_of(parquet_f: Path) -> str:
    """Return the lab label: file name up to the first '.', then up to the first '_'."""
    return parquet_f.name.split('.')[0].split('_')[0]


def _load_control_cr(parquet_f: Path,
                     lab: str,
                     controls: list[str],
                     type_by_chrom: dict[str, str]) -> pl.DataFrame:
    """Load one file and compute the 'cr' value on its control contigs.

    Args:
        parquet_f: file to scan with ``pl.scan_parquet``.
        lab: label stored in the ``lab`` column of every returned row.
        controls: ``chrom`` values to keep.
        type_by_chrom: mapping applied to ``chrom`` to build the ``type`` column.

    Returns:
        A frame with the columns ``lab``, ``chrom``, ``type`` and ``cr``,
        restricted to rows with ``depth >= MIN_DEPTH`` on the given controls.
    """
    return (pl.scan_parquet(parquet_f)
              .filter(pl.col('depth') >= MIN_DEPTH,
                      pl.col('chrom').is_in(controls))
              .with_columns(pl.lit(lab).alias('lab'),
                            pl.col('chrom')
                              .replace(old=list(type_by_chrom.keys()),
                                       new=list(type_by_chrom.values()))
                              .alias('type'))
              # pUC19 is reported as 100 - beta, every other contig as beta.
              # NOTE(review): presumably beta is a percentage and pUC19 is the
              # methylated control -- confirm.
              .with_columns(pl.when(pl.col('chrom') == 'pUC19')
                              .then(100 - pl.col('beta'))
                              .otherwise(pl.col('beta'))
                              .alias('cr'))
              .select('lab', 'chrom', 'type', 'cr')
              .collect())


# Collects the frames of BOTH file groups (rev_files first, then dir_files);
# the name is kept because the following cell concatenates this list.
rev_df_list: list[pl.DataFrame] = []

for rev_f in rev_files:
    lab = _lab_of(rev_f)
    # Labs whose label contains 'RR' are evaluated on chrM only.
    controls = ['chrM'] if 'RR' in lab else ALL_CONTROLS
    rev_df_list.append(_load_control_cr(rev_f, lab, controls, REV_TYPE_BY_CHROM))

for dir_f in dir_files:
    lab = _lab_of(dir_f)
    controls = ALL_CONTROLS
    rev_df_list.append(_load_control_cr(dir_f, lab, controls, DIR_TYPE_BY_CHROM))

In [11]:
# Stack every per-file frame into one table and order the rows by
# lab, then type, then chrom, then cr.
sort_keys = ['lab', 'type', 'chrom', 'cr']
total_df = pl.concat(rev_df_list).sort(sort_keys)

In [12]:
# Spot-check the combined table: show only the rows of lab 'BS1'.
bs1_only = pl.col('lab') == 'BS1'
total_df.filter(bs1_only)

lab,chrom,type,cr
str,str,str,f64
"""BS1""","""chrM""","""conversion insufficency""",0.0
"""BS1""","""chrM""","""conversion insufficency""",0.0
"""BS1""","""chrM""","""conversion insufficency""",0.0
"""BS1""","""chrM""","""conversion insufficency""",0.0
"""BS1""","""chrM""","""conversion insufficency""",0.0
…,…,…,…
"""BS1""","""pUC19""","""excessive conversion""",59.139785
"""BS1""","""pUC19""","""excessive conversion""",61.585366
"""BS1""","""pUC19""","""excessive conversion""",61.96319
"""BS1""","""pUC19""","""excessive conversion""",62.758621


In [14]:
# Write the combined per-cytosine 'cr' table to the data_for_plot tree.
cr_csv_path = '/mnt/eqa/zhangyuanfeng/methylation/data_for_plot/2_conversion_rate_bias/cr_cytosines.csv'
total_df.write_csv(cr_csv_path)

In [13]:
# CSV read below; only its lab / direction / bias columns are used.
hf_f: Path = Path('/mnt/eqa/zhangyuanfeng/methylation/evaluated/hf_bias_choosed.csv')

# select() already restricts the result to the three listed columns, so the
# drop('chrom') that used to precede it was redundant (and would fail on a
# CSV without a 'chrom' column). It has been removed; the output is unchanged.
hf_df: pl.DataFrame = (pl.scan_csv(hf_f)
                         .select('lab', 'direction', 'bias')
                         .collect())

In [14]:
# Display the loaded frame (columns lab, direction, bias) as the cell output.
hf_df

lab,direction,bias
str,str,f64
"""MA1""","""over-estimate""",12.7706
"""MA1""","""over-estimate""",16.5098
"""MA1""","""over-estimate""",4.1842
"""MA1""","""over-estimate""",8.1982
"""MA1""","""over-estimate""",3.4307
…,…,…
"""PS1""","""under-estimate""",-10.0
"""PS2""","""no bias""",0.0
"""PS2""","""over-estimate""",8.064516
"""PS3""","""no bias""",0.0


In [16]:
# Write into the same directory as cr_cytosines.csv. The directory used to be
# spelled '2_conversion rate_bias' (with a space) here, which does not match
# the '2_conversion_rate_bias' directory the other CSV of this notebook goes to.
# NOTE(review): if a directory with the space really exists and is read
# downstream, move the file or revert this path.
hf_df.write_csv('/mnt/eqa/zhangyuanfeng/methylation/data_for_plot/2_conversion_rate_bias/hf_choosed_cytosines.csv')