In [1]:
import os
import pandas as pd
import yaml

# from tsv2bedRMod import bid_mouse2bedRMod, bid_human2bedRMod, etam2bedRMod, csv2bedRMod
from tsv2bedRMod import csv2bedRMod

from helper import write_header
from helper import get_modification_color
from helper import parse_excel


In [None]:
# Exploratory cell: load the supplementary Excel table and list its column labels.
jacusa_file = "/home/annebusch/anne02/euf-data/mepme/Supp2.xlsx"
# header=1: the column labels are taken from the second row of the sheet (row index 1)
file = pd.read_excel(jacusa_file, header=1)
# last expression of the cell -> the column labels are displayed as the cell output
file.columns

## columns that I need for data
    chrom       'arrest Chr'
    chromStart  'modification'
    chromEnd    
    name        m6A or m5C - manually for each file
    score       convert "Score" column  -> reps 1 and 2
    strand      'Strand'
    thickStart   
    thickEnd    
    itemRGB     figure it out
    coverage    'coverage sample'  -> reps 1 and 2
    frequency   'delta arrest rate'  -> reps 1 and 2
    
## columns that I need for header
    general: rep1 or 2
    organism human
    modification_type RNA
    assembly GRCh38
    annotation_source None
    annotation_version None
    sequencing_platform Illumina NovaSeq 6000
    basecalling None
    bioinformatics_workflow https://github.com/dieterich-lab/JACUSA2
    experiment https://doi.org/10.1038/s41467-023-42832-z
    external_source SRA:PRJNA811414
    methods MePMe-seq
    references pubmed_id:37935679
    conversion_information the original "Score" column is normalized by dividing each value by the maximum "Score" value of its replicate, multiplying by 1000 and rounding to an integer

In [None]:
# Split the supplementary table into one csv file per replicate.
jacusa_file = "/home/annebusch/anne02/euf-data/mepme/Supp2.xlsx"
# header=1: the column labels are taken from the second row of the sheet (row index 1)
file = pd.read_excel(jacusa_file, header=1)

# save the first 5 columns, as they are identical for rep1 and rep2
keep_cols = file.iloc[:, :5]

# split rep1 and rep2 into own dataframes (12 columns each, selected by position)
rep1_cols = file.iloc[:, 20:32]
rep2_cols = file.iloc[:, 32:44]

# concatenate the chromosome, position etc. columns with the values
rep1_df = pd.concat([keep_cols, rep1_cols], axis=1)
rep2_df = pd.concat([keep_cols, rep2_cols], axis=1)
print(len(rep2_df))

# rename columns, so they can be accessed by name
# (rep2 reuses the labels of rep1, so both frames expose e.g. "Score")
rep2_df.columns = rep1_df.columns

# drop rows, where no score is set to avoid having sparse data
# (list form of `subset` is accepted by every pandas version, a bare string is not)
rep1_df = rep1_df.dropna(subset=["Score"])
rep2_df = rep2_df.dropna(subset=["Score"])

# save maximum score of each df in an extra column, so the bedrmod score can be calculated, later
# the maximum is computed once per replicate and broadcast to every row, instead of
# re-scanning the whole column for each row
rep1_df["max_score"] = rep1_df["Score"].max()
rep2_df["max_score"] = rep2_df["Score"].max()

# save rep1 and rep2 independently in a csv
rep1_df.to_csv("/home/annebusch/anne02/euf-data/mepme/supp2_mepme_m6a_rep1.csv", index=False)
rep2_df.to_csv("/home/annebusch/anne02/euf-data/mepme/supp2_mepme_m6a_rep2.csv", index=False)

In [2]:
def convert_mepme():
    """
    Function to convert individual csv files from mepme into bedrmod

    Changes the working directory to the mepme data directory and calls
    ``csv2bedRMod`` once for every ``*.csv`` file found there, using
    ``hela.yaml`` from the same directory as config file. The modification
    name is derived from the file name prefix (``supp2``/``supp5`` -> m6A,
    ``supp6`` -> m5C). csv files with any other prefix are skipped with a
    message, because no modification can be derived for them.

    NOTE: the working directory of the process stays changed after the call.
    """
    def score_func(score_value):
        # Unpacks a (score, max_score) pair; the call below requests the columns
        # ["Score", "max_score"], so csv2bedRMod presumably passes both values
        # per row -- confirm against tsv2bedRMod.
        score, max_score = score_value
        # scale the score relative to the maximum score to an integer 0..1000
        return round((score / max_score) * 1000)

    def frequency_func(ratio):
        # round the "delta arrest rate" value to the nearest integer
        return round(ratio)

    def start_func(pos):
        # shift the position by one; presumably 1-based -> 0-based start -- confirm
        return pos - 1

    # file name prefix -> modification name written to the bedrmod file
    modification_by_prefix = {
        "supp2": "m6A",
        "supp5": "m6A",
        "supp6": "m5C",
    }

    dirpath = "/home/annebusch/anne02/euf-data/mepme/"
    conf = "hela.yaml"
    os.chdir(dirpath)
    # os.listdir() returns entries in arbitrary order; sort for a reproducible run
    for file in sorted(os.listdir()):
        if not file.endswith(".csv"):
            continue
        modification = next(
            (modi for prefix, modi in modification_by_prefix.items()
             if file.startswith(prefix)),
            None,
        )
        if modification is None:
            # never reuse the modification of a previously processed file
            print(f"skipping {file}: cannot derive modification from file name prefix")
            continue
        csv2bedRMod(file, conf,
                    delimiter=",",
                    ref_seg="arrest Chr", start="modification",
                    start_function=start_func,
                    strand="Strand",
                    modi=modification,
                    coverage="coverage sample",
                    score=["Score", "max_score"],
                    score_function=score_func,
                    frequency="delta arrest rate",
                    frequency_function=frequency_func)
    

    

In [3]:
# Run the conversion for every csv file in the mepme data directory
convert_mepme()

output file: supp6_mepme_m5c_rep2.bedrmod
output file: supp5_mettl16_m6a_rep2.bedrmod
output file: supp2_mepme_m6a_rep1.bedrmod
output file: supp5_mettl16_m6a_rep1.bedrmod
output file: supp2_mepme_m6a_rep2.bedrmod
output file: supp5_mettl16_m6a_rep4.bedrmod
output file: supp6_mepme_m5c_rep1.bedrmod
output file: supp5_mettl16_m6a_rep3.bedrmod
