In [2]:
import os
import sys
import pandas as pd 
import argparse
import numpy as np 
import glob
import time 
import logging
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle

In [34]:
def filter_bed(bed, outname, p5nt = ['G', 'A'], min_len = 21, max_len = 23, orientation = '-') : 
    
    output = open(outname, 'w')
    lines = ''
    with open(bed, 'r') as f : 
        for line in f :
            if not line.startswith("gene") : 
                info = line.strip().split('\t')
                
                chrom = info[0]
                start = int(info[1])
                end = int(info[2])
                seq = str(info[3])
                count = float(info[7])
                strand = info[5]

                if len(seq) >= min_len and len(seq) <= max_len : 
                    if seq[0] in p5nt : 
                        if strand == orientation : 
                            lines += f"{chrom}\t{start}\t{end}\t{seq}\t{count}\t{strand}\n"
    f.close() 
    
    output.write(lines)
    
                    

In [35]:
def intersect_bed(bed1, bed2, output) : 
    
    os.system(f"sh intersect.sh {bed1} {bed2} {output}")
    
    return output
    

In [36]:
def return_total_lines(file) : 
    
    i = 0
    with open(file, 'r') as f : 
        for line in f : 
            i += 1
    f.close() 
    
    return i

In [37]:
def group_and_agg_dist(df, dist_col) : 
    
    grped = df.groupby(
        [dist_col]
    ).agg(
        sense_rpm = ('sense_rpm', 'sum'), 
        anti_rpm = ('anti_rpm', 'sum')
    ).reset_index()
    
    
    grped['sense_zscore'] = 2**((grped['sense_rpm'] - grped['sense_rpm'].mean())/grped['sense_rpm'].std(ddof=0))
    
    grped['anti_zscore'] = 2**((grped['anti_rpm'] - grped['anti_rpm'].mean())/grped['anti_rpm'].std(ddof=0))
    
    grped = grped.rename(columns = {f'{dist_col}' : 'dist'})
    
    grped['id'] = dist_col
    
    return grped
    
    

In [38]:
def calc_p_distance(intersect, targets = None) :
    
    # calculate 
    # 5p to 5p anchored at 22G 5' end
    # 5p to 3p
    dat = pd.read_csv(intersect,
                     usecols = [0, 1, 2, 4, 7, 8, 10, 12],
                     names = ['gene', 'sense_5p', 'sense_3p', 'sense_rpm', 'anti_3p', 'anti_5p', 'anti_rpm', 'overlap'],
                     header = None,
                     sep = "\t"
                    )
    
    if targets : 
        df_filt = dat.query('gene in @targets').reset_index()
    else : 
        df_filt = dat
    
    df_filt['5pto5p'] = df_filt['anti_5p'] - df_filt['sense_5p']
    
    df_filt['5pto3p'] = df_filt['anti_5p'] - df_filt['sense_3p']
    
    grouped_5p5p = group_and_agg_dist(df_filt, '5pto5p')
    
    grouped_5p3p = group_and_agg_dist(df_filt, '5pto3p')
    
    grouped = pd.concat([grouped_5p5p, grouped_5p3p])
    
    return grouped


In [39]:
def measure_dist_adj_reads(bedfile, targets = None) : 
    
    df = pd.read_csv(bedfile, names = ['gene', 'start', 'end', 'seq', 'rpm', 'strand'], sep = "\t", header = None)
    
    df = df.sort_values(by = ['end'], ascending = False).reset_index(drop = True)
    
    df['seq_len'] = df['seq'].str.len()
    
    df = df.query('seq_len == 22').reset_index(drop = True)
    
    if targets : 
        df = df.query('gene in @targets').reset_index(drop = True) 
        
    genes = df.gene.unique().tolist()
    tot_gene = len(genes)
    
    for i,gene in enumerate(genes) : 
        
        sub = df.query('gene == @gene').reset_index(drop = True)

        sub['3p'] = sub['start']

        sub['5p_next'] = sub['end'].shift(-1)

        sub['3p_5p'] = sub['3p'] - sub['5p_next']

        sub = sub.query('`3p_5p` >= -25 and `3p_5p` <= 25')

        if i == 0 : 
            res = sub
        else : 
            res = pd.concat([res, sub], ignore_index = True)

        print(f'{i+1} of {tot_gene} ({ round(( (i+1)/(tot_gene) )*100,2) }%) total genes', end='\r')

    res_group = res.groupby(
        ['3p_5p']
    ).agg(
        count = ('rpm', 'sum')
    ).reset_index() 
    
    res_group['zscore'] = 2**((res_group['count'] - res_group['count'].mean())/res_group['count'].std(ddof=0))
    
    return res_group

In [71]:
def calc_phasing(bedfile, targets = None) :
    
    outfile = './tmp/phasing_intersect.bed'
    os.system(f"sh ./phasing.sh {bedfile} {outfile}")
    
    dat = pd.read_csv(outfile,
                     usecols = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13],
                     names = ['gene', 'up_3p', 'up_5p', 'up_seq', 'up_rpm', 'up_strand',
                              'down_3p', 'down_5p', 'down_seq', 'down_rpm', 'down_strand',
                              'overlap', 'p3p5_distance'],
                     header = None,
                     sep = "\t"
                    )
    print(dat.head())
    dat['rpm'] = dat['up_rpm'] + dat['down_rpm']
    
    if targets : 
        dat = dat.query('gene in @targets').reset_index(drop = True)
    
    dat_grouped = dat.groupby(
        ['p3p5_distance']
    ).agg(
        count = ('rpm', 'sum')
    ).reset_index()
                    
    dat_grouped['zscore'] = 2**((dat_grouped['count'] - dat_grouped['count'].mean())/dat_grouped['count'].std(ddof=0))
    
    return dat_grouped
    

In [72]:
def run(bed, targets = None, output = "./results",  d = False) :
    
    mt1 = time.time()
    
    if not os.path.exists(output) : 
        os.mkdir(output)
        
    if not os.path.exists("./tmp") : 
        os.mkdir("./tmp")
    
    if d : 
        BEDFILES = glob.glob(f"{bed}/*csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_IPArgonaute*.bed.tsv")
    else :
        BEDFILES = [ bed ] 

    print(BEDFILES)

    runtime = []
    #BEDFILES = BEDFILES[0:2] + [ i for i in BEDFILES if "csr" in i ]
    for i,B in enumerate(BEDFILES) : 
        
        t1 = time.time() 
        
        print(f"Processing {os.path.basename(B)}...\n")
        
        sen = './tmp/sense.bed'
        ant = './tmp/anti.bed'
        inter = './tmp/inter.bed'
        
        #print("Filtering sense reads...")
        #filter_bed(B, sen, p5nt = ['A', 'G', 'C', 'T'], min_len = 15, max_len = 25, orientation = '+')
        
        #print("Filtering antisense reads...")
        filter_bed(B, ant, p5nt = ['A', 'G'], min_len = 21, max_len = 23, orientation = '-')
        
        #print("Intersecting sense to antisense reads...")
        #intersect = intersect_bed(sen, ant, inter)
        
        #anti_to_sense = calc_p_distance(intersect, targets)
        
        print("Measuring 5' to 3' distance of adjacent antisense reads...")
        phasing = calc_phasing(ant, targets)
        
        phasing['id'] = '3pto5p_anti_phasing'
        #phasing['sense_rpm'] = 0
        #phasing['sense_zscore'] = 0
        restmp = phasing.rename(columns = {'count' : 'anti_rpm', 'zscore' : 'anti_zscore', 'p3p5_distance':'dist'})
        
        #restmp = pd.concat([anti_to_sense, phasing])
        
        restmp['sample'] = os.path.basename(B).replace(".bed.tsv", "")
        
        if i == 0 : 
            res = restmp
        else : 
            res = pd.concat([res, restmp])
        
        t2 = time.time() 
        
        runtime.append(round(t2-t1,3))
        #if i == 3 : 
        #    print(f"Estimated time to run pipeline for {len(BEDFILES)} bedfiles is { round( round( sum(runtime) / len(runtime), 2 ) * len(BEDFILES), 2) / 60 }")
        
        print(f"\nProcessing {os.path.basename(B)} took {round(t2 - t1,3)} s.\n")
        
    mt2 = time.time()
    print(f"\nPipeline took {round(mt2 - mt1,3)} s to complete.\n")
    return res
        
        
        

In [73]:
def plot_phasing(df) : 
    
    p = ( 
        ggplot(df) + 
            geom_bar(aes(x='3p_5p', y = 'zscore'), stat = 'identity', color = 'white') + 
            theme_classic() + 
            theme(figure_size=(8,8), 
                 aspect_ratio = 1) + 
            scale_x_continuous(limits = (-25,25), breaks = np.arange(-25,25,5))
            
        )
    
    return p

# Analysis

### Read in targets file

In [74]:
%%bash

head "/fs/ess/PCON0160/ben/projects/2025_csr1_cleavage/00AA_AGO_cleavage/20250421/transcripts/bed/tor123GFP::3xFLAG::sago1_V_IPArgonaute_2.trimmed.uniq.xartifacts.v2.m10.transcripts.bed.tsv"

gene	start	end	seq	count	strand	feature	count_total_norm
CELE45	50	72	GTGAGGCCGGACTTGAACTCGT	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	51	73	GGTGAGGCCGGACTTGAACTCG	3.0	-	anti_transcriptome	0.11820282533128636
CELE45	53	74	GGGTGAGGCCGGACTTGAACT	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	53	75	GGGGTGAGGCCGGACTTGAACT	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	53	76	GGGGGTGAGGCCGGACTTGAACT	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	54	75	GGGGTGAGGCCGGACTTGAAC	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	56	77	AGGGGGTGAGGCCGGACTTGA	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	62	83	GAACCTAGGGGGTGAGGCCGG	1.0	-	anti_transcriptome	0.03940094177709545
CELE45	71	93	GAGGCTGGGTGAACCTAGGGGG	13.0	-	anti_transcriptome	0.5122122431022409


In [75]:

targets = '/fs/ess/PCON0160/ben/projects/2023_claycomb_argonomics/target_lists.txt'

target_df = pd.read_csv(targets, sep = "\t", names = ['ago', 'target'], header = None)
target_df

my_targets = target_df.query('ago == "CSR-1"').reset_index(drop = True).target.tolist()

In [76]:
bed = "/fs/ess/PCON0160/ben/projects/2025_csr1_cleavage/00AA_AGO_cleavage/20250421/transcripts/bed"

In [77]:
### me_result = run(bed = bed, d = True, targets = my_targets)

['/fs/ess/PCON0160/ben/projects/2025_csr1_cleavage/00AA_AGO_cleavage/20250421/transcripts/bed/csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_IPArgonaute_1.trimmed.uniq.xartifacts.v2.m10.transcripts.bed.tsv', '/fs/ess/PCON0160/ben/projects/2025_csr1_cleavage/00AA_AGO_cleavage/20250421/transcripts/bed/csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_IPArgonaute_2.trimmed.uniq.xartifacts.v2.m10.transcripts.bed.tsv']
Processing csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_IPArgonaute_1.trimmed.uniq.xartifacts.v2.m10.transcripts.bed.tsv...

Measuring 5' to 3' distance of adjacent antisense reads...



Lmod is automatically replacing "python/3.12" with "miniconda3/24.1.2-py310".



     gene  up_3p  up_5p                  up_seq    up_rpm up_strand  down_3p  \
0  CELE45     61     93  GAGGCTGGGTGAACCTAGGGGG  0.087221         -       63   
1  CELE45     61     93  GAGGCTGGGTGAACCTAGGGGG  0.087221         -       61   
2  CELE45     61     93  GAGGCTGGGTGAACCTAGGGGG  0.087221         -       62   
3  CELE45     61     93  GAGGCTGGGTGAACCTAGGGGG  0.087221         -       53   
4  CELE45     61     93  GAGGCTGGGTGAACCTAGGGGG  0.087221         -       54   

   down_5p                 down_seq  down_rpm down_strand  overlap  \
0       85   GTGAACCTAGGGGGTGAGGCCG  0.087221           -       22   
1       83   GAACCTAGGGGGTGAGGCCGGA  0.959432           -       22   
2       83    GAACCTAGGGGGTGAGGCCGG  0.087221           -       21   
3       76  GGGGGTGAGGCCGGACTTGAACT  0.174442           -       15   
4       76   GGGGGTGAGGCCGGACTTGAAC  0.087221           -       15   

   p3p5_distance  
0            -14  
1            -12  
2            -12  
3             -5  
4  


Lmod is automatically replacing "python/3.12" with "miniconda3/24.1.2-py310".



     gene  up_3p  up_5p                   up_seq    up_rpm up_strand  down_3p  \
0  CELE45     60     93  GAGGCTGGGTGAACCTAGGGGGT  0.093513         -       62   
1  CELE45     60     93  GAGGCTGGGTGAACCTAGGGGGT  0.093513         -       63   
2  CELE45     60     93  GAGGCTGGGTGAACCTAGGGGGT  0.093513         -       61   
3  CELE45     60     93  GAGGCTGGGTGAACCTAGGGGGT  0.093513         -       62   
4  CELE45     60     93  GAGGCTGGGTGAACCTAGGGGGT  0.093513         -       55   

   down_5p                 down_seq  down_rpm down_strand  overlap  \
0       85  GTGAACCTAGGGGGTGAGGCCGG  0.046756           -       23   
1       85   GTGAACCTAGGGGGTGAGGCCG  0.093513           -       22   
2       83   GAACCTAGGGGGTGAGGCCGGA  0.420807           -       22   
3       83    GAACCTAGGGGGTGAGGCCGG  0.093513           -       21   
4       77   AGGGGGTGAGGCCGGACTTGAA  0.046756           -       17   

   p3p5_distance  
0            -15  
1            -15  
2            -13  
3            -13

In [65]:
me_result.to_csv("/fs/ess/PCON0160/ben/projects/2025_csr1_cleavage/phasing_analysis.tsv", sep = "\t", header = True, index = False)

In [66]:
me_result

Unnamed: 0,dist,anti_rpm,anti_zscore,id,sample
0,-21,0.523326,0.613454,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
1,-20,19.014194,2.404691,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
2,-19,37.766725,9.610186,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
3,-18,23.811353,3.427484,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
4,-17,19.363079,2.467477,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
...,...,...,...,...,...
26,5,0.794857,0.559840,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
27,6,0.935126,0.566309,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
28,7,1.917009,0.613732,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
29,8,0.794857,0.559840,3pto5p_anti_phasing,csr1tor104csr1_exon1::GFP::Flag_IV:7957568_IV_...
