In [None]:
#  structure modeling for aggregate data 

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as spstats
import matplotlib.cm
from cairosvg import svg2pdf

In [None]:
## 1. Supp_Figure_7c --- --- 

In [2]:
def parse_struct(struct):
    """Turn a dot-bracket string into a list of paired index tuples.

    Every "(" is remembered on a stack; every ")" is matched with the most
    recently remembered "(".  All other characters are ignored.

    Parameters
    ----------
    struct : str
        Dot-bracket structure string.

    Returns
    -------
    list of (int, int)
        Zero-based ``(opening_index, closing_index)`` tuples, in the order in
        which the closing brackets are encountered.  Opening brackets that are
        never closed produce no tuple.

    Raises
    ------
    IndexError
        If a ")" appears with no unmatched "(" before it.
    """
    open_stack = []
    pairs = []
    for idx, symbol in enumerate(struct):
        if symbol == "(":
            open_stack.append(idx)
            continue
        if symbol == ")":
            pairs.append((open_stack.pop(), idx))
    return pairs


def parse_svg(f_svg):
    """Extract nucleotides, position labels and label tick lines from an SVG.

    The file is scanned once, line by line, using plain substring tests (it is
    not parsed as XML), so each element is expected to sit on its own line:

    * A line containing ``</text>`` and the fill ``rgb(0%, 0%, 0%)`` is a
      nucleotide letter.  "T" is rewritten to "U"; text that is not exactly
      one character long is skipped.
    * Any other line containing ``</text>`` is a position label whose text
      must be an integer.
    * A line without ``</text>`` that contains BOTH ``stroke-width="1.0"``
      and ``rgb(25%, 25%, 25%)`` is a label tick line.

    Parameters
    ----------
    f_svg : str
        Path of the SVG file to read.

    Returns
    -------
    tuple
        ``(nuc_positions, nuc_type, pos_position, pos_count,
        pos_position_line_start, pos_position_line_end)`` where the
        ``*_position*`` entries are lists of ``[x, y]`` floats, ``nuc_type``
        is a list of one-letter strings and ``pos_count`` a list of ints.

    Raises
    ------
    ValueError
        If a label text is not an integer or a coordinate is not a float.
    IndexError
        If an expected coordinate attribute is missing from a matched line.
    """

    def _attr(text, name):
        # Float value of the first ``name="..."`` attribute found in ``text``.
        return float(text.split('%s="' % name)[1].split('"')[0])

    nuc_positions, nuc_type = [], []
    pos_position, pos_count = [], []
    pos_position_line_start, pos_position_line_end = [], []

    with open(f_svg, "r") as f:
        for line in f:
            if "</text>" in line:
                label = line.split(">")[1].split("<")[0]
                if "rgb(0%, 0%, 0%)" in line:
                    base = label.replace("T", "U")
                    if len(base) == 1:
                        nuc_type.append(base)
                        nuc_positions.append([_attr(line, "x"), _attr(line, "y")])
                else:
                    # Parse everything before appending so the two label
                    # lists cannot get out of step if a value is malformed.
                    posmark = int(label)
                    xy = [_attr(line, "x"), _attr(line, "y")]
                    pos_position.append(xy)
                    # No offset needed here: the SVG labels already carry
                    # the offset coordinates.
                    pos_count.append(posmark)
            elif 'stroke-width="1.0"' in line and "rgb(25%, 25%, 25%)" in line:
                # Both substrings must be present.  The previous test
                # (`"..." and "..." in line`) only checked the colour, because
                # a non-empty string literal is always truthy.
                # Coordinates are looked up by attribute name rather than by
                # token position, so indentation or attribute order in the
                # SVG does not matter.
                pos_position_line_start.append([_attr(line, "x1"), _attr(line, "y1")])
                pos_position_line_end.append([_attr(line, "x2"), _attr(line, "y2")])

    return nuc_positions, nuc_type, pos_position, pos_count, pos_position_line_start, pos_position_line_end


def append_header(list_svg_output, span_x, span_y, margin):
    """Append the XML prolog and the opening ``<svg>`` tag to the output list.

    Two entries are appended to ``list_svg_output`` (which is modified in
    place): the XML declaration with the SVG 1.1 DOCTYPE, then the ``<svg>``
    tag.  The canvas is sized to the drawing span plus ``margin`` on every
    side, i.e. ``span + 2*margin`` in each direction.
    """
    xml_prolog = '''<?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" 
    "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
    '''
    list_svg_output.append(xml_prolog)

    canvas_width = span_x + 2*margin
    canvas_height = span_y + 2*margin
    svg_open_tag = '<svg width="%f" height="%f" version="1.1" xmlns="http://www.w3.org/2000/svg">' % (canvas_width, canvas_height)
    list_svg_output.append(svg_open_tag)

def append_bp_line(list_svg_output, pairs, nuc_positions, margin, nuc_type, pair_circle_radius):
    """Append one connector line per base pair, plus a dot for non-G/C pairs.

    For every ``(i, j)`` in ``pairs`` (zero-based indices into
    ``nuc_positions`` / ``nuc_type``) a black line is drawn between the two
    nucleotides, with ``margin`` added to every coordinate.  When the two
    letters are NOT both drawn from {"G", "C"}, a filled black circle of
    radius ``pair_circle_radius`` is additionally drawn at the midpoint of
    the line.  ``list_svg_output`` is modified in place.
    """
    line_tpl = '<line x1="%f" y1="%f" x2="%f" y2="%f" stroke="rgb(0%%, 0%%, 0%%)" stroke-width="1.0" />'
    dot_tpl = '<circle cx="%f" cy="%f" r="%f" stroke="rgb(0%%, 0%%, 0%%)" stroke-width="1.0" fill="rgb(0%%, 0%%, 0%%)" />'
    gc_letters = ("G", "C")

    for pair in pairs:
        left, right = pair[0], pair[1]  # already zero-based
        x1 = nuc_positions[left][0] + margin
        y1 = nuc_positions[left][1] + margin
        x2 = nuc_positions[right][0] + margin
        y2 = nuc_positions[right][1] + margin

        both_gc = nuc_type[left] in gc_letters and nuc_type[right] in gc_letters

        # The connector is drawn for every pair ...
        list_svg_output.append(line_tpl % (x1, y1, x2, y2))
        # ... and only pairs that are not G/C-only get the midpoint dot.
        if not both_gc:
            list_svg_output.append(dot_tpl % ((x1+x2)/2, (y1+y2)/2, pair_circle_radius))


def append_nt_line(list_svg_output, nuc_positions, margin):
    """Append a black line between every pair of consecutive nucleotides.

    ``nuc_positions`` is a sequence of ``[x, y]`` entries in drawing order;
    ``margin`` is added to every coordinate.  Nothing is appended when there
    are fewer than two positions.  ``list_svg_output`` is modified in place.
    """
    segment_tpl = '<line x1="%f" y1="%f" x2="%f" y2="%f" stroke="rgb(0%%, 0%%, 0%%)" stroke-width="1.0" />'

    # Walk the backbone as (current, next) neighbours.
    for current, following in zip(nuc_positions[:-1], nuc_positions[1:]):
        start_x, start_y = current[0] + margin, current[1] + margin
        end_x, end_y = following[0] + margin, following[1] + margin
        list_svg_output.append(segment_tpl % (start_x, start_y, end_x, end_y))

def append_circles(list_svg_output, nuc_positions, nuc_type, nuc_color, margin, nt_circle_radius):
    """Append a filled circle and a centred letter for every nucleotide.

    For index ``n`` the circle is centred on ``nuc_positions[n]`` (plus
    ``margin``), has radius ``nt_circle_radius`` and is filled with
    ``nuc_color[n]``.  The letter ``nuc_type[n]`` is drawn on top, 3.5 units
    below the circle centre so that the glyph appears vertically centred.
    ``list_svg_output`` is modified in place.
    """
    disc_tpl = '<circle cx="%f" cy="%f" r="%f" stroke="None" fill="%s"/>'
    letter_tpl = '<text x="%f" y="%f" text-anchor="middle" font-family="Verdana" font-size="10.5" >%s</text>'

    for idx, letter in enumerate(nuc_type):
        centre_x = nuc_positions[idx][0] + margin
        centre_y = nuc_positions[idx][1] + margin
        disc = disc_tpl % (centre_x, centre_y, nt_circle_radius, nuc_color[idx])
        list_svg_output.append(disc)
        glyph = letter_tpl % (centre_x, centre_y + 3.5, letter)
        list_svg_output.append(glyph)



def append_positions(list_svg_output, pos_count, pos_position, margin, offset):
    """Append one text label per numbered position.

    Label ``p`` is placed at ``pos_position[p]`` shifted by ``margin/1.5`` in
    both x and y, and shows ``pos_count[p] + offset``.  The text is anchored
    at its start (change ``text-anchor`` to "end" to right-align instead).
    ``list_svg_output`` is modified in place.
    """
    label_tpl = '<text x="%f" y="%f" text-anchor="start" font-family="Verdana" font-size="10" >%s</text>'

    for idx, number in enumerate(pos_count):
        anchor = pos_position[idx]
        label_x = anchor[0] + margin/1.5
        label_y = anchor[1] + margin/1.5
        list_svg_output.append(label_tpl % (label_x, label_y, number+offset))

def append_footer(list_svg_output):
    """Close the SVG document by appending the ``</svg>`` end tag in place."""
    closing_tag = '</svg>'
    list_svg_output.append(closing_tag)


def shape_to_color_v5(shape):
    """Map a single value to one of four fixed SVG colour strings.

    * below -998 (the missing-value range)  -> white
    * above 0.85                            -> red
    * above 0.4 (and at most 0.85)          -> orange
    * everything else                       -> grey
    """
    if shape < -998:
        return "rgb(100%, 100%, 100%)"

    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    thresholds = (
        (0.85, 'rgb(90%,0%,20%)'),
        (0.4, 'rgb(255,127,0)'),
    )
    for lower_bound, colour in thresholds:
        if shape > lower_bound:
            return colour
    return 'rgb(128,128,128)'



def annotate_svg(f_svg, pairs, shape, f_out_svg, offset, p_colormap=True):
    """Re-render a structure SVG with per-nucleotide colours and write it out.

    The layout (nucleotide letters, their coordinates and the position labels)
    is read from ``f_svg`` with ``parse_svg``.  A new SVG containing base-pair
    lines, backbone lines, coloured nucleotide circles and position labels is
    then written to ``f_out_svg``.

    Parameters
    ----------
    f_svg : str
        Path of the input SVG.
    pairs : list of (int, int)
        Zero-based nucleotide index pairs to connect with a base-pair line.
    shape : sequence of float
        One value per nucleotide, in drawing order.  ``-999`` marks a missing
        value.
    f_out_svg : str
        Path of the SVG file to write.
    offset : int
        Added to every position label.
    p_colormap : bool, default True
        If True, colour with the matplotlib "Reds" colormap scaled to the
        min..max of the non-missing values, and draw missing values white.
        If False, use the fixed bins of ``shape_to_color_v5``.

    Raises
    ------
    ValueError
        If no nucleotide is found in ``f_svg`` or ``shape`` has fewer entries
        than there are nucleotides.
    """
    nuc_positions, nuc_type, pos_position, pos_count, _line_start, _line_end = parse_svg(f_svg)

    if not p_colormap:
        nuc_color = [shape_to_color_v5(s) for s in shape]
    else:
        # Colormap output is RGB(A) in the 0..1 range, so "white" for missing
        # values must be expressed on the same scale before the *100 below
        # (the former (100,100,100) rendered as "rgb(10000%, ...)").
        na_rgb = (1.0, 1.0, 1.0)
        valid = [s for s in shape if s != -999]
        if not valid:
            rgb = [na_rgb] * len(shape)
        else:
            from matplotlib.colors import Normalize

            # matplotlib.cm.get_cmap was removed in recent Matplotlib
            # releases; prefer the colormap registry when it exists.
            registry = getattr(matplotlib, "colormaps", None)
            cmap = registry["Reds"] if registry is not None else matplotlib.cm.get_cmap("Reds")
            # Scale only the non-missing values.
            norm = Normalize(vmin=min(valid), vmax=max(valid))
            rgb = [cmap(norm(s)) if s != -999 else na_rgb for s in shape]
        nuc_color = [f"rgb({c[0]*100}%, {c[1]*100}%, {c[2]*100}%)" for c in rgb]

    if not nuc_type:
        raise ValueError("no nucleotide letters found in %r" % (f_svg,))
    if len(nuc_color) < len(nuc_type):
        raise ValueError(
            "shape has %d values but %r contains %d nucleotides"
            % (len(nuc_color), f_svg, len(nuc_type))
        )

    # Bounding box of the nucleotides defines the canvas size.
    nuc_positions = np.array(nuc_positions, dtype=float)
    min_x, min_y = nuc_positions.min(axis=0)
    max_x, max_y = nuc_positions.max(axis=0)
    span_x = max_x - min_x
    span_y = max_y - min_y

    # Move the drawing so its top-left corner is at the origin.  The origin
    # must be passed as ONE array: np.array(min_x, min_y) treats min_y as the
    # dtype argument, so the y origin was never subtracted.
    nuc_positions = nuc_positions - np.array([min_x, min_y])
    # NOTE(review): the label coordinates from parse_svg are passed on
    # unshifted (append_positions only adds margin/1.5).  Confirm the labels
    # still line up with the nucleotides now that the y origin is corrected.

    # draw setting
    nt_circle_radius = 6
    pair_circle_radius = 2.75  # same as varna, for the AT pairing
    margin = 50  # the edges of the svg always get cut off; the margin avoids this

    list_svg_output = []
    append_header(list_svg_output, span_x, span_y, margin)
    append_bp_line(list_svg_output, pairs, nuc_positions, margin, nuc_type, pair_circle_radius)
    append_nt_line(list_svg_output, nuc_positions, margin)
    append_circles(list_svg_output, nuc_positions, nuc_type, nuc_color, margin, nt_circle_radius)
    append_positions(list_svg_output, pos_count, pos_position, margin, offset)
    append_footer(list_svg_output)

    with open(f_out_svg, "w") as f:
        f.write("".join("%s\n" % line for line in list_svg_output))

In [3]:
def get_structure_model(transcript, C0, C1, order, offset,
                        data_dir='/home/han/proj_het_AC/rerun_analysis/0_Manuscript_codes_submission_20250723/Supp_Fig7/Data/svg/',
                        output_dir=None):
    """Render the coloured structure figures (SVG + PDF) for both clusters.

    For cluster 0 and cluster 1 in turn:

    1. read ``<transcript>_EPI_ISL_407987_full_length_observed_cluster<k>.shape``
       (tab-separated, no header, columns position / mod_rate) from ``data_dir``;
    2. divide mod_rate by ``max(1.5 * IQR, 90th percentile)`` computed over
       the non-NaN values, and print that factor;
    3. call ``annotate_svg`` on ``<transcript>_cluster<k>_centroid<order>.svg``
       with the cluster's base pairs and the fixed colour bins;
    4. convert the resulting SVG to PDF at 300 dpi.

    A failure in one cluster is reported with ``warnings.warn`` and does not
    prevent the other cluster from being processed.

    Parameters
    ----------
    transcript : str
        Name used as the prefix of all input and output file names.
    C0, C1 : str
        Dot-bracket structures of cluster 0 and cluster 1.  An empty string
        yields no base pairs for that cluster.
    order : int
        Centroid index used in the input and output file names.
    offset : int
        Added to the position labels (forwarded to ``annotate_svg``).
    data_dir : str, optional
        Directory (with trailing slash) holding the ``.shape`` and centroid
        ``.svg`` inputs.
    output_dir : str, optional
        Directory (with trailing slash) for the outputs; defaults to
        ``data_dir + "res/"``.

    Raises
    ------
    IndexError
        If ``C0`` or ``C1`` contains a ")" without a matching "(".
    """
    import warnings

    if output_dir is None:
        output_dir = data_dir + "res/"

    # Both structures are parsed up front so a malformed string fails loudly
    # before any file is written.
    cluster_pairs = (parse_struct(C0), parse_struct(C1))

    for cluster, pairs in enumerate(cluster_pairs):
        try:
            shape_file = f"{data_dir}{transcript}_EPI_ISL_407987_full_length_observed_cluster{cluster}.shape"
            pred = pd.read_csv(shape_file, header=None, sep='\t')
            pred.columns = ['position', 'mod_rate']
            rates = pred['mod_rate']

            # NOTE(review): only NaN is excluded here.  If the .shape files
            # encode missing values as -999, those entries take part in the
            # IQR / percentile and are themselves rescaled - confirm.
            valid = rates.dropna()
            norm_factor = max(spstats.iqr(valid) * 1.5, np.percentile(valid, 90))
            print(norm_factor)
            modrate_pred = list(rates / norm_factor)

            in_svg = f"{data_dir}{transcript}_cluster{cluster}_centroid{order}.svg"
            out_stem = f"{output_dir}{transcript}_cluster{cluster}.color.{order}"
            # offset is forwarded; previously a literal 0 was passed and the
            # caller's value was ignored.
            annotate_svg(in_svg, pairs, modrate_pred, out_stem + ".svg", offset, p_colormap=False)
            svg2pdf(url=out_stem + ".svg", write_to=out_stem + ".pdf", dpi=300)
        except Exception as exc:
            # Best effort per cluster, but never silently: a missing input or
            # a rendering error must be visible to whoever runs the notebook.
            warnings.warn(
                f"{transcript} cluster {cluster} (centroid {order}) was not rendered: {exc!r}"
            )

In [4]:
# ORF8, centroid 1: both clusters are given a dot-bracket structure.
# Arguments: transcript, cluster-0 structure, cluster-1 structure, centroid order, offset.
get_structure_model('ORF8',
                    '.......(((............((((((............)))))).(((.....)))......((((.(((.((((......(((((((((.((....))..)))))))))..)))).)))))))...((((((((..(((((((...))))).))))))).))).....)))....(((...((((((....((.....))....))).)))....))).',
                    '..((((((..((.(((((((((((..(((...((......))...))).)))))))))))))))))))...............(((((((((.((....))..)))))))))..........(((.(((......))).((.(((((........((..((.(((....))))).))....((((...)))).))))).))(((((......))))).))).',
                    1,
                    0)

1.0
1.0


In [6]:
# N, centroid 1: only the cluster-1 structure is supplied.  The cluster-0
# string is empty, so parse_struct yields no base pairs for that cluster.
get_structure_model('N',
                    '',
                    '..((((((..((.(((((((((((..(((...((......))...))).)))))))))))))))))))...............(((((((((.((....))..))))))))).........((((.(((......)))....(((((........((..((.(((....))))).))....((((...)))).)))))...(((((......))))).))))',
                    1,
                    0)


# N, centroid 4: only the cluster-0 structure is supplied.  The cluster-1
# string is empty, so parse_struct yields no base pairs for that cluster.
get_structure_model('N',
                    '.......(((....................................................((((((.(((.((((......(((((((((.((....))..)))))))))..)))).))))))).))...((((((((...........))))))))............)))..........((((((....((.....))....))).)))........',
                    '',
                    4,
                    0)

1.0
1.0
1.0
1.0


In [7]:
# ORF7a, centroid 1: only the cluster-1 structure is supplied.  The cluster-0
# string is empty, so parse_struct yields no base pairs for that cluster.
get_structure_model('ORF7a',
                    '',
                    '..((((((..((.(((((((((((..(((...((......))...))).)))))))))))))))))))...............(((((((((.((....))..))))))))).........((((......(((..((.((.(((((........((..((.(((....))))).))....((((...)))).))))).)).)).)))..........))))',
                    1,
                    0)  


# ORF7a, centroid 2: only the cluster-0 structure is supplied.  The cluster-1
# string is empty, so parse_struct yields no base pairs for that cluster.
get_structure_model('ORF7a',
                    '.......(((.....................................(((.....))).......(((.((((((........(((((((((.((....))..))))))))).(((((((...(((((...............)))))...))))))).((.(((....)))))....)))))))))....))).......(((((......))))).....',
                    '',
                    2,
                    0)

1.0
1.0000000000000002
1.0
1.0000000000000002
