In [85]:
import pandas as pd
import os
# Input table of modification sites; read with pd.read_csv in a later cell.
mod_file = "/Users/andrewbailey/CLionProjects/personal/projects/ares_rRNA/mod_files/misc/mod_file.csv"

# Modification name -> canonical base letter. Uridine-derived names map to "T".
# NOTE: several names contain a typographic apostrophe (’) and two contain a
# straight one ('); the lookups below are exact string matches, so the two
# characters are not interchangeable.
exp_bases = {
    "2’-O-Methyladenosine": "A",
    "2’-O-Methylcytidine": "C",
    "2’-O-Methylguanosine": "G",
    "2’-O-Methyluridine": "T",
    "C5-Methylcytidine": "C",
    "N1-Methyladenosine": "A",
    "N1-methyl-N3-aminocarboxypropyl_pseudouridine": "T",
    "N3-Methyluridine": "T",
    "N4-Acetylcytidine": "C",
    "N6-N6-Dimethyladenosine": "A",
    "N7-Methylguanosine": "G",
    "Pseudouridine": "T",
    "2’-O-Methyluridine_pseudouridine": "T",
    "2'-O-methylated_pseudouridine_2’-O-Methyluridine": "T",
}

# Modification name -> single lowercase letter standing for the modified base.
# NOTE(review): the key sets of the three dicts do not line up:
#   * "2'-O-methylated_pseudouridine" appears only here;
#   * "2'-O-methylated_pseudouridine_2’-O-Methyluridine" is in exp_bases and
#     mod_variants but not here;
#   * "2’-O-Methyluridine_pseudouridine" is only in exp_bases.
# A name missing from any dict maps to NaN and the row is removed by the later
# dropna() cell -- confirm that dropping those sites is intended.
mod_bases = {
    "2’-O-Methyladenosine": "a",
    "2’-O-Methylcytidine": "b",
    "2’-O-Methylguanosine": "c",
    "2’-O-Methyluridine": "d",
    "C5-Methylcytidine": "e",
    "N1-Methyladenosine": "f",
    "N1-methyl-N3-aminocarboxypropyl_pseudouridine": "g",
    "N3-Methyluridine": "h",
    "N4-Acetylcytidine": "i",
    "N6-N6-Dimethyladenosine": "j",
    "N7-Methylguanosine": "k",
    "Pseudouridine": "l",
    "2'-O-methylated_pseudouridine": "m",
}

# Modification name -> canonical base letter followed by the modified-base
# letter(s) for that site (e.g. "A" + "a" -> "Aa"; the combined entry is "Tmd").
mod_variants = {
    "2’-O-Methyladenosine": "Aa",
    "2’-O-Methylcytidine": "Cb",
    "2’-O-Methylguanosine": "Gc",
    "2’-O-Methyluridine": "Td",
    "C5-Methylcytidine": "Ce",
    "N1-Methyladenosine": "Af",
    "N1-methyl-N3-aminocarboxypropyl_pseudouridine": "Tg",
    "N3-Methyluridine": "Th",
    "N4-Acetylcytidine": "Ci",
    "N6-N6-Dimethyladenosine": "Aj",
    "N7-Methylguanosine": "Gk",
    "Pseudouridine": "Tl",
    "2'-O-methylated_pseudouridine_2’-O-Methyluridine": "Tmd",
}




In [88]:
# Load the modification table and shift every 'pos' value down by one
# (presumably converting 1-based positions to 0-based -- confirm against the CSV).
mod_data = pd.read_csv(mod_file).assign(pos=lambda table: table['pos'] - 1)


In [89]:
# Derive one lookup column per dictionary from the modification name.
# 'base_char' and 'find' are deliberately filled from the same table (exp_bases).
# Names absent from a dictionary become NaN in that column.
mod_names = mod_data['mod']
for column_name, lookup in (
    ('variants', mod_variants),
    ('base_char', exp_bases),
    ('find', exp_bases),
    ('mod_char', mod_bases),
):
    mod_data[column_name] = mod_names.map(lookup)

In [90]:
# Drop every row that has a missing value in ANY column. This includes rows
# whose modification name was not found in one of the lookup dictionaries,
# as well as rows with gaps in columns that came from the CSV itself.
mod_data = mod_data.dropna(axis=0, how="any")


In [91]:
def write_filtered_positions_file(mod_data, path, percent_threshold, ignore_mods, geq=True):
    """Write a headerless, tab-separated positions file for a subset of sites.

    Rows are first filtered on the "percent" column, then written with five
    columns: contig, pos, strand, find and a replacement character. The
    replacement character is "mod_char" for rows whose mod_char is NOT in
    ``ignore_mods`` and "base_char" for rows whose mod_char IS in
    ``ignore_mods``. All non-ignored rows are written first, followed by the
    ignored ones.

    :param mod_data: DataFrame with columns "percent", "contig", "pos",
        "strand", "find", "mod_char" and "base_char"
    :param path: destination file path
    :param percent_threshold: cut-off applied to the "percent" column
    :param ignore_mods: collection of mod_char values to write as base_char
    :param geq: if truthy keep rows with percent >= threshold, otherwise keep
        rows with percent <= threshold
    """
    shared_columns = ["contig", "pos", "strand", "find"]

    percents = mod_data["percent"]
    passes_threshold = percents >= percent_threshold if geq else percents <= percent_threshold
    selected = mod_data[passes_threshold]

    is_ignored = selected["mod_char"].isin(ignore_mods)

    # Both pieces get the same name for their last column so concat lines them up.
    kept_as_modified = selected.loc[~is_ignored, shared_columns + ["mod_char"]].rename(
        columns={"mod_char": "a"})
    kept_as_canonical = selected.loc[is_ignored, shared_columns + ["base_char"]].rename(
        columns={"base_char": "a"})

    pd.concat([kept_as_modified, kept_as_canonical]).to_csv(
        path, sep="\t", header=False, index=False)
    

In [92]:
# Directory that receives every positions file written below.
HOME = "/Users/andrewbailey/CLionProjects/personal/projects/ares_rRNA/mod_files/misc_position_files"

# Groups of mod_char values. Sites whose mod_char is in the chosen group are
# written with their canonical base instead of the modified-base letter.
NO_MODS_IGNORED = []
PSEUDOURIDINE_ONLY = ["l"]
TWO_PRIME_O_METHYL = ["a", "b", "c", "d"]
# NOTE(review): "md" does not correspond to any value in mod_bases -- confirm
# whether it is still needed.
EVERY_MOD = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "md"]

# (file name, percent threshold, ignored mod_chars, geq)
# NOTE(review): "pseduoU" in the file names looks like a misspelling of
# "pseudoU"; it is kept verbatim because these are the names written to disk.
POSITION_FILE_SPECS = (
    # Write high prob (>=95) position files
    ("high_prob_95_all_mods.positions", 95, NO_MODS_IGNORED, True),
    ("high_prob_95_no_pseduoU.positions", 95, PSEUDOURIDINE_ONLY, True),
    ("high_prob_95_no_2methyl.positions", 95, TWO_PRIME_O_METHYL, True),
    ("high_prob_95_canonical.positions", 95, EVERY_MOD, True),
    # Write low prob (<=80) position files
    ("low_prob_80_all_mods.positions", 80, NO_MODS_IGNORED, False),
    ("low_prob_80_no_pseduoU.positions", 80, PSEUDOURIDINE_ONLY, False),
    ("low_prob_80_no_2methyl.positions", 80, TWO_PRIME_O_METHYL, False),
    ("low_prob_80_canonical.positions", 80, EVERY_MOD, False),
)

for file_name, threshold, ignored_chars, keep_geq in POSITION_FILE_SPECS:
    write_filtered_positions_file(mod_data, os.path.join(HOME, file_name),
                                  threshold, ignored_chars, geq=keep_geq)
