In [2]:
import os
import pandas as pd
import yaml

# from tsv2bedRMod import bid_mouse2bedRMod, bid_human2bedRMod, etam2bedRMod, csv2bedRMod
from tsv2bedRMod import csv2bedRMod

from helper import write_header
from helper import get_modification_color
from helper import parse_excel


## columns that I need for data
    chrom       'chrM'
    chromStart  number
    chromEnd    
    name        m1G or m22G - manually for each file
    score       none given. 
    strand      strand
    thickStart   
    thickEnd    
    itemRGB     figure it out
    coverage    sum of the A, C, G and T counts
    frequency   mu
    
## columns that I need for header
    organism human
    modification_type RNA
    assembly GRCh37
    annotation_source None
    annotation_version None
    sequencing_platform Illumina NovaSeq 6000
    basecalling None
    bioinformatics_workflow 
    experiment https://doi.org/10.1038/s41467-023-42832-z
    external_source SRA:PRJNA623501
    methods DAMM-seq
    references pubmed_id:34253897
    conversion_information the original "Score" column is normalized by dividing each value by the maximum value

In [312]:
def convert_damm_mt_tRNA():
    """Convert the DAMM-seq mt-tRNA workbook into one bedRMod file per sample.

    Every sheet name returned by ``parse_excel`` is read with pandas. A sheet
    is treated as one table, or as two tables when it contains a fully empty
    row (only the first empty row is used as separator). Within a table,
    row 0 supplies the position (column 1) and the modification label
    (column 2), row 1 supplies the strand (column 1), and rows 1-11 from
    column 3 onwards supply the sample name, the base counts and ``Mu``.

    One record per sample row is written to
    ``<directory><sample>DAMM-seq_mt-tRNA_AllMethyl.bedrmod``. A file that
    does not exist yet first receives the header built from
    ``hela_config.yaml`` plus the column line; an existing file is only
    appended to, so running the conversion twice duplicates its records.

    Raises:
        ValueError: if a modification label cannot be split at A, G or C.
    """
    def score_func(mu_col):
        """Scale the ``Mu`` fraction by 1000 and truncate to an integer."""
        return int(float(mu_col) * 1000)

    def frequency_func(mu_col):
        """Scale the ``Mu`` fraction by 100 and truncate to an integer."""
        return int(float(mu_col) * 100)

    def modification_name(label):
        """Cut a label such as ``m22G26`` down to prefix + base letter."""
        base = "N"
        # Later candidates override earlier ones: C wins over G, G over A.
        for candidate in ("A", "G", "C"):
            if candidate in label:
                base = candidate
        if base not in label:
            raise ValueError(f"cannot find a base letter in modification label {label!r}")
        prefix = label.partition(base)[0]
        # Write "22" as "2,2", as the other converters in this notebook do.
        return (prefix + base).replace("22", "2,2")

    directory = "/home/annebusch/anne02/euf-data/damm-seq/"
    workbook = directory + "GSE148202_DAMM-seq_mt-tRNA_AllMethyl.xlsx"
    config_yaml = directory + "hela_config.yaml"
    suffix = "DAMM-seq_mt-tRNA_AllMethyl.bedrmod"
    column_line = "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency\n"

    # The header configuration is the same for every output file: read it once
    # and close the handle right away.
    with open(config_yaml, "r") as config_handle:
        config = yaml.safe_load(config_handle)

    def write_record(sample, line):
        """Append ``line`` to the sample's file, writing the header if it is new."""
        path = directory + sample + suffix
        is_new_file = not os.path.isfile(path)
        with open(path, "a") as data:
            if is_new_file:
                write_header(config, data)
                data.write(column_line)
            data.write(line)

    for key in parse_excel(workbook):
        print(key)
        df = pd.read_excel(workbook, sheet_name=key)

        # Split the sheet at its first fully empty row, if there is one.
        blank_rows = df.index[df.isna().all(axis=1)]
        if len(blank_rows):
            first_blank = blank_rows[0]
            tables = [df.iloc[:first_blank], df.iloc[first_blank + 1:]]
        else:
            tables = [df]

        for table in tables:
            chrom = "MT"
            pos = table.iloc[0, 1]
            strand = table.iloc[1, 1]
            modi = modification_name(table.iloc[0, 2])

            # 12-column tables carry three extra count columns before Mu.
            if table.shape[1] == 12:
                samples = table.iloc[1:12, 3:12].copy()
                samples.columns = ["Input", "A", "C", "G", "T", "GT", "GC", "GA", "Mu"]
            else:
                samples = table.iloc[1:12, 3:9].copy()
                samples.columns = ["Input", "A", "C", "G", "T", "Mu"]

            for _, row in samples.iterrows():
                # Sample names become part of a file name: drop spaces and
                # path separators (e.g. "HeLa o/e Control-1").
                sample = row.Input.replace(" ", "").replace("/", "")
                # row["T"] because row.T is the pandas transpose accessor.
                coverage = int(row.A) + int(row.C) + int(row.G) + int(row["T"])
                frequency = frequency_func(row.Mu)
                score = score_func(row.Mu)
                # NOTE(review): pos is written as chromStart and pos+1 as
                # chromEnd - confirm the spreadsheet position is 0-based.
                # itemRgb is the literal text "0,0,0"; the former f-string
                # field {0,0,0} rendered the tuple "(0, 0, 0)".
                line = (
                    f"{chrom}\t{pos}\t{pos+1}\t{modi}\t{score}\t{strand}"
                    f"\t{pos}\t{pos+1}\t0,0,0\t{coverage}\t{frequency}\n"
                )
                write_record(sample, line)

    
convert_damm_mt_tRNA()
    
    
    
    

Ile
Arg
Asp
Lys
Leu1(CUN)
Trp
His
Phe
Gly
Val
Thr
Leu2(UUR)
Ala
Tyr
Ser2(UCN)
Cys
Asn
Gln
Glu
Pro


In [353]:
import re

In [369]:
def convert_damm_vivo_ds():
    """Convert the in-vivo dsRNA misincorporation workbook into bedRMod files.

    Every sheet name returned by ``parse_excel`` is read with pandas. A sheet
    may hold several tables; it is cut into sub-tables at fully empty rows and
    fully empty columns. Sub-tables that do not start at the first column
    block get the first three rows/columns of their row block (position,
    strand, modification label) prepended so that all tables share one layout.

    Within a table, row 0 supplies the position (column 1) and the
    modification label (column 2), row 1 supplies the strand (column 1), and
    rows 1-11 from column 3 onwards supply the sample name, the base counts
    and ``Mu``. Rows whose ``A`` value is not an int/float or is NaN are
    skipped.

    One record per remaining sample row is written to
    ``<directory><sample>Misincorporation_in-vivo_dsRNA_AllMethyl.bedrmod``.
    A file that does not exist yet first receives the header built from
    ``hela_config.yaml`` plus the column line; an existing file is only
    appended to, so running the conversion twice duplicates its records.

    Raises:
        ValueError: if a modification label cannot be split at A, G or C.
    """
    def score_func(mu_col):
        """Scale the ``Mu`` fraction by 1000 and truncate to an integer."""
        return int(float(mu_col) * 1000)

    def frequency_func(mu_col):
        """Scale the ``Mu`` fraction by 100 and truncate to an integer."""
        return int(float(mu_col) * 100)

    def modification_name(label):
        """Cut a label such as ``m22G26`` down to prefix + base letter."""
        base = "N"
        # Later candidates override earlier ones: C wins over G, G over A.
        for candidate in ("A", "G", "C"):
            if candidate in label:
                base = candidate
        if base not in label:
            raise ValueError(f"cannot find a base letter in modification label {label!r}")
        prefix = label.partition(base)[0]
        # "22" is written as "2,2" (e.g. "m22G" -> "m2,2G").
        return (prefix + base).replace("22", "2,2")

    def split_tables(df):
        """Return the non-empty sub-tables of a sheet, metadata columns included."""
        blank_cols = [i for i, is_blank in enumerate(df.isna().all()) if is_blank]
        blank_rows = df.index[df.isna().all(axis=1)].tolist()
        row_splits = [-1] + blank_rows + [df.shape[0]]
        col_splits = [-1] + blank_cols + [df.shape[1]]
        tables = []
        # One sheet contains multiple tables; this separates them into sub-tables.
        for top, bottom in zip(row_splits, row_splits[1:]):
            for j, (left, right) in enumerate(zip(col_splits, col_splits[1:])):
                sub_df = df.iloc[top + 1:bottom, left + 1:right]
                if sub_df.empty or not sub_df.notna().any().any():
                    continue
                if j != 0:
                    # Tables to the right of the first one lack the
                    # position/strand/label cells: borrow them from the first
                    # three rows and columns of this row block.
                    meta_info_df = df.iloc[top + 1:top + 4, 0:3]
                    sub_df = pd.concat([meta_info_df, sub_df], ignore_index=True, axis=1)
                tables.append(sub_df.dropna(how='all'))
        return tables

    directory = "/home/annebusch/anne02/euf-data/damm-seq/"
    workbook = directory + "GSE148202_Misincorporation_in-vivo_dsRNA_AllMethyl.xlsx"
    config_yaml = directory + "hela_config.yaml"
    suffix = "Misincorporation_in-vivo_dsRNA_AllMethyl.bedrmod"
    column_line = "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency\n"

    # The header configuration is the same for every output file: read it once
    # and close the handle right away.
    with open(config_yaml, "r") as config_handle:
        config = yaml.safe_load(config_handle)

    def write_record(sample, line):
        """Append ``line`` to the sample's file, writing the header if it is new."""
        path = directory + sample + suffix
        is_new_file = not os.path.isfile(path)
        with open(path, "a") as data:
            if is_new_file:
                write_header(config, data)
                data.write(column_line)
            data.write(line)

    for key in parse_excel(workbook):
        print(key)
        df = pd.read_excel(workbook, sheet_name=key)
        df_list = split_tables(df)

        print(df_list)
        for table in df_list:
            print(table)
            chrom = "MT"
            pos = table.iloc[0, 1]
            strand = table.iloc[1, 1]
            modi = modification_name(table.iloc[0, 2])

            # 12-column tables carry three extra count columns before Mu.
            if table.shape[1] == 12:
                samples = table.iloc[1:12, 3:12].copy()
                samples.columns = ["Input", "A", "C", "G", "T", "GT", "GC", "GA", "Mu"]
            else:
                samples = table.iloc[1:12, 3:9].copy()
                samples.columns = ["Input", "A", "C", "G", "T", "Mu"]
            print(samples)

            for _, row in samples.iterrows():
                # Skip rows without numeric counts before touching any other
                # cell, so that a non-data row cannot fail on its sample name.
                if not isinstance(row.A, (int, float)) or pd.isnull(row.A):
                    continue
                # Sample names become part of a file name: drop spaces and
                # path separators (e.g. "HeLa o/e Control-1").
                sample = row.Input.replace(" ", "").replace("/", "")
                # row["T"] because row.T is the pandas transpose accessor.
                coverage = int(row.A) + int(row.C) + int(row.G) + int(row["T"])
                frequency = frequency_func(row.Mu)
                score = score_func(row.Mu)
                print(sample)
                # NOTE(review): pos is written as chromStart and pos+1 as
                # chromEnd - confirm the spreadsheet position is 0-based.
                # itemRgb is the literal text "0,0,0"; the former f-string
                # field {0,0,0} rendered the tuple "(0, 0, 0)".
                line = (
                    f"{chrom}\t{pos}\t{pos+1}\t{modi}\t{score}\t{strand}"
                    f"\t{pos}\t{pos+1}\t0,0,0\t{coverage}\t{frequency}\n"
                )
                write_record(sample, line)

convert_damm_vivo_ds()
    

Ile
[  mt-Ile region Unnamed: 1 Unnamed: 2          Unnamed: 3 Unnamed: 4  \
0          chrM       4285     m22G26                 NaN          A   
1           NaN          +        NaN  HeLa o/e Control-1          0   
2           NaN        NaN        NaN  HeLa o/e Control-2          1   
3           NaN        NaN        NaN   HeLa o/e ALKBH7-1          0   
4           NaN        NaN        NaN   HeLa o/e ALKBH7-2          1   

  Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8  
0          C          G          T         Mu  
1         21         45         15   0.444444  
2         15         40         10   0.393939  
3         20         59          9   0.329545  
4         14         60         13   0.318182  ,       0     1       2                  3  4   5   6   7         8
0  chrM  4285  m22G26                NaN  A   C   G   T        Mu
1   NaN     +     NaN         HepG2 WT-1  1  16  31  12  0.483333
2   NaN   NaN     NaN         HepG2 WT-2  2  12  22  12  0.541667
3   NaN  

[  mt-His region Unnamed: 1 Unnamed: 2          Unnamed: 3 Unnamed: 4  \
0          chrM      12147       m1A9                 NaN          A   
1           NaN          +        NaN  HeLa o/e Control-1          7   
2           NaN        NaN        NaN  HeLa o/e Control-2         10   
3           NaN        NaN        NaN   HeLa o/e ALKBH7-1         15   
4           NaN        NaN        NaN   HeLa o/e ALKBH7-2          9   

  Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8  
0          C          G          T         Mu  
1          7          4         36    0.87037  
2          2          1         24    0.72973  
3          7          1         43   0.772727  
4          3          6         25   0.790698  ,       0      1     2                  3   4  5  6   7         8
0  chrM  12147  m1A9                NaN   A  C  G   T        Mu
1   NaN      +   NaN         HepG2 WT-1   4  1  1  22  0.857143
2   NaN    NaN   NaN         HepG2 WT-2  12  0  1  20  0.636364
3   NaN    NaN   NaN 

HepG2WT-2
HepG2ALKBH7KO-1
HepG2ALKBH7KO-2
HepG2siControl-1
HepG2siControl-2
HepG2siALKBH7-1
HepG2siALKBH7-2
Ser2(UCN)
[  mt-Ser2 region Unnamed: 1 Unnamed: 2          Unnamed: 3 Unnamed: 4  \
0           chrM       7462      m1A58                 NaN          A   
1            NaN          -        NaN  HeLa o/e Control-1       2482   
2            NaN        NaN        NaN  HeLa o/e Control-2       2241   
3            NaN        NaN        NaN   HeLa o/e ALKBH7-1       1382   
4            NaN        NaN        NaN   HeLa o/e ALKBH7-2       1096   

  Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8  
0          C          G          T         Mu  
1         24         23        244    0.10494  
2         18         21        199   0.096006  
3         16         14        147   0.113534  
4         12          8        125   0.116841  ,       0     1      2                  3     4   5  6    7         8
0  chrM  7462  m1A58                NaN     A   C  G    T        Mu
1   NaN     -    

In [370]:
def convert_damm_vivo_mt():
    """Convert the in-vivo mt-tRNA misincorporation workbook into bedRMod files.

    Every sheet name returned by ``parse_excel`` is read with pandas. A sheet
    may hold several tables; it is cut into sub-tables at fully empty rows and
    fully empty columns. Sub-tables that do not start at the first column
    block get the first three rows/columns of their row block (position,
    strand, modification label) prepended so that all tables share one layout.

    Within a table, row 0 supplies the position (column 1) and the
    modification label (column 2), row 1 supplies the strand (column 1), and
    rows 1-11 from column 3 onwards supply the sample name, the base counts
    and ``Mu``. Rows whose ``A`` value is not an int/float or is NaN are
    skipped.

    One record per remaining sample row is written to
    ``<directory><sample>Misincorporation_in-vivo_mt-tRNA_AllMethyl.bedrmod``.
    A file that does not exist yet first receives the header built from
    ``hela_config.yaml`` plus the column line; an existing file is only
    appended to, so running the conversion twice duplicates its records.

    Raises:
        ValueError: if a modification label cannot be split at A, G or C.
    """
    def score_func(mu_col):
        """Scale the ``Mu`` fraction by 1000 and truncate to an integer."""
        return int(float(mu_col) * 1000)

    def frequency_func(mu_col):
        """Scale the ``Mu`` fraction by 100 and truncate to an integer."""
        return int(float(mu_col) * 100)

    def modification_name(label):
        """Cut a label such as ``m22G26`` down to prefix + base letter."""
        base = "N"
        # Later candidates override earlier ones: C wins over G, G over A.
        for candidate in ("A", "G", "C"):
            if candidate in label:
                base = candidate
        if base not in label:
            raise ValueError(f"cannot find a base letter in modification label {label!r}")
        prefix = label.partition(base)[0]
        # "22" is written as "2,2" (e.g. "m22G" -> "m2,2G").
        return (prefix + base).replace("22", "2,2")

    def split_tables(df):
        """Return the non-empty sub-tables of a sheet, metadata columns included."""
        blank_cols = [i for i, is_blank in enumerate(df.isna().all()) if is_blank]
        blank_rows = df.index[df.isna().all(axis=1)].tolist()
        row_splits = [-1] + blank_rows + [df.shape[0]]
        col_splits = [-1] + blank_cols + [df.shape[1]]
        tables = []
        # One sheet contains multiple tables; this separates them into sub-tables.
        for top, bottom in zip(row_splits, row_splits[1:]):
            for j, (left, right) in enumerate(zip(col_splits, col_splits[1:])):
                sub_df = df.iloc[top + 1:bottom, left + 1:right]
                if sub_df.empty or not sub_df.notna().any().any():
                    continue
                if j != 0:
                    # Tables to the right of the first one lack the
                    # position/strand/label cells: borrow them from the first
                    # three rows and columns of this row block.
                    meta_info_df = df.iloc[top + 1:top + 4, 0:3]
                    sub_df = pd.concat([meta_info_df, sub_df], ignore_index=True, axis=1)
                tables.append(sub_df.dropna(how='all'))
        return tables

    directory = "/home/annebusch/anne02/euf-data/damm-seq/"
    workbook = directory + "GSE148202_Misincorporation_in-vivo_mt-tRNA_AllMethyl.xlsx"
    config_yaml = directory + "hela_config.yaml"
    suffix = "Misincorporation_in-vivo_mt-tRNA_AllMethyl.bedrmod"
    column_line = "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency\n"

    # The header configuration is the same for every output file: read it once
    # and close the handle right away.
    with open(config_yaml, "r") as config_handle:
        config = yaml.safe_load(config_handle)

    def write_record(sample, line):
        """Append ``line`` to the sample's file, writing the header if it is new."""
        path = directory + sample + suffix
        is_new_file = not os.path.isfile(path)
        with open(path, "a") as data:
            if is_new_file:
                write_header(config, data)
                data.write(column_line)
            data.write(line)

    for key in parse_excel(workbook):
        print(key)
        df = pd.read_excel(workbook, sheet_name=key)
        df_list = split_tables(df)

        print(df_list)
        for table in df_list:
            print(table)
            chrom = "MT"
            pos = table.iloc[0, 1]
            strand = table.iloc[1, 1]
            modi = modification_name(table.iloc[0, 2])

            # 12-column tables carry three extra count columns before Mu.
            if table.shape[1] == 12:
                samples = table.iloc[1:12, 3:12].copy()
                samples.columns = ["Input", "A", "C", "G", "T", "GT", "GC", "GA", "Mu"]
            else:
                samples = table.iloc[1:12, 3:9].copy()
                samples.columns = ["Input", "A", "C", "G", "T", "Mu"]
            print(samples)

            for _, row in samples.iterrows():
                # Skip rows without numeric counts before touching any other
                # cell, so that a non-data row cannot fail on its sample name.
                if not isinstance(row.A, (int, float)) or pd.isnull(row.A):
                    continue
                # Sample names become part of a file name: drop spaces and
                # path separators (e.g. "HeLa o/e Control-1").
                sample = row.Input.replace(" ", "").replace("/", "")
                # row["T"] because row.T is the pandas transpose accessor.
                coverage = int(row.A) + int(row.C) + int(row.G) + int(row["T"])
                frequency = frequency_func(row.Mu)
                score = score_func(row.Mu)
                print(sample)
                # NOTE(review): pos is written as chromStart and pos+1 as
                # chromEnd - confirm the spreadsheet position is 0-based.
                # itemRgb is the literal text "0,0,0"; the former f-string
                # field {0,0,0} rendered the tuple "(0, 0, 0)".
                line = (
                    f"{chrom}\t{pos}\t{pos+1}\t{modi}\t{score}\t{strand}"
                    f"\t{pos}\t{pos+1}\t0,0,0\t{coverage}\t{frequency}\n"
                )
                write_record(sample, line)

convert_damm_vivo_mt()
    





Ile
[  mt-Ile Unnamed: 1 Unnamed: 2          Unnamed: 3 Unnamed: 4 Unnamed: 5  \
0   chrM       4285     m22G26                 NaN          A          C   
1    NaN          +        NaN   HepG2 siControl-1         91       1079   
2    NaN        NaN        NaN   HepG2 siControl-2        107       1367   
3    NaN        NaN        NaN    HepG2 siALKBH7-1        149       1637   
4    NaN        NaN        NaN    HepG2 siALKBH7-2        114       1434   
5    NaN        NaN        NaN  HeLa o/e Control-1         67       1318   
6    NaN        NaN        NaN  HeLa o/e Control-2         71       1367   
7    NaN        NaN        NaN   HeLa o/e ALKBH7-1         46       1218   
8    NaN        NaN        NaN   HeLa o/e ALKBH7-2         59       1212   

  Unnamed: 6 Unnamed: 7 Unnamed: 8  
0          G          T         Mu  
1        332       1003   0.867465  
2        328       1115   0.887556  
3        460       1472   0.876278  
4        348       1258   0.889664  
5        151

[  mt-Thr Unnamed: 1 Unnamed: 2          Unnamed: 3 Unnamed: 4 Unnamed: 5  \
0   chrM      15897       m1A9                 NaN          A          C   
1    NaN          +        NaN   HepG2 siControl-1        476        139   
2    NaN        NaN        NaN   HepG2 siControl-2        419        138   
3    NaN        NaN        NaN    HepG2 siALKBH7-1        584        177   
4    NaN        NaN        NaN    HepG2 siALKBH7-2        476        139   
5    NaN        NaN        NaN  HeLa o/e Control-1        341        175   
6    NaN        NaN        NaN  HeLa o/e Control-2        378        172   
7    NaN        NaN        NaN   HeLa o/e ALKBH7-1        294        161   
8    NaN        NaN        NaN   HeLa o/e ALKBH7-2        347        156   

  Unnamed: 6 Unnamed: 7 Unnamed: 8  
0          G          T         Mu  
1        345       3322   0.888837  
2        299       2923   0.889124  
3        396       3856   0.883503  
4        370       3330   0.889687  
5        311    