In [167]:
import os
import sys
import pandas as pd 
import argparse
import numpy as np 
import glob
import time 
import logging
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle
from plotnine import *

In [168]:
def filter_bed(bed, outname, p5nt = ['G', 'A'], min_len = 21, max_len = 23, orientation = '-') : 
    """
    Filter a tab-separated read table and write the surviving reads as a 6-column file.

    Lines starting with "gene_" (treated as a header) and blank lines are skipped.
    From each remaining line the following 0-based columns are read:
    0 = chrom, 1 = start, 2 = end, 3 = sequence, 5 = strand, 7 = count.

    A read is kept when all of the following hold:
      * min_len <= len(sequence) <= max_len
      * the first base of the sequence is in `p5nt`
      * strand == orientation

    Parameters
    ----------
    bed : str
        Path of the input table.
    outname : str
        Path of the output file (overwritten).
    p5nt : list of str
        Allowed first nucleotides. The default list is only read, never mutated.
    min_len, max_len : int
        Inclusive bounds on the sequence length.
    orientation : str
        Strand value a read must have to be kept.

    Returns
    -------
    None. Output rows are "chrom<TAB>start<TAB>end<TAB>seq<TAB>count<TAB>strand".

    Raises
    ------
    IndexError / ValueError
        If a non-blank data line has fewer than 8 columns or non-numeric
        start / end / count fields.
    """
    # Both handles live in the with-statement so the output is flushed and
    # closed before any caller (or external tool) reads it back.
    with open(bed, 'r') as src, open(outname, 'w') as dst : 
        for line in src : 
            if line.startswith("gene_") : 
                continue

            stripped = line.strip()
            if not stripped : 
                # A stray blank line (e.g. at end of file) carries no read.
                continue

            info = stripped.split('\t')

            chrom = info[0]
            start = int(info[1])
            end = int(info[2])
            seq = str(info[3])
            count = float(info[7])
            strand = info[5]

            if min_len <= len(seq) <= max_len and seq[0] in p5nt and strand == orientation : 
                # Stream each kept read straight to disk instead of growing one big string.
                dst.write(f"{chrom}\t{start}\t{end}\t{seq}\t{count}\t{strand}\n")
    
                    

In [169]:
def intersect_bed(bed1, bed2, output) : 
    """
    Run the external `intersect.sh` script on two BED files.

    The script is invoked as `sh intersect.sh <bed1> <bed2> <output>` relative
    to the current working directory. What the script does with its arguments
    is not visible here; presumably it writes the intersection to `output`.

    Parameters
    ----------
    bed1, bed2 : str
        Paths handed to the script as its first and second argument.
    output : str
        Path handed to the script as its third argument.

    Returns
    -------
    str
        `output`, unchanged, for convenient chaining.

    Raises
    ------
    subprocess.CalledProcessError
        If the script exits with a non-zero status. Failing here stops a
        stale `output` from an earlier run being read as if it were fresh.
    """
    import subprocess

    # An argument list (no shell string) keeps paths containing spaces or
    # shell metacharacters intact and avoids command injection.
    subprocess.run(["sh", "intersect.sh", str(bed1), str(bed2), str(output)], check = True)
    
    return output
    

In [170]:
def return_total_lines(file) : 
    """
    Count the lines of a text file.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    int
        Number of lines yielded when iterating over the file (0 if empty).
    """
    with open(file, 'r') as handle : 
        # Iterating the handle yields one item per line; tally them lazily.
        return sum(1 for _ in handle)

In [171]:
def group_and_agg_dist(df, dist_col) : 
    """
    Sum sense / antisense RPM per distance value and attach transformed z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `dist_col`, 'sense_rpm' and 'anti_rpm'.
    dist_col : str
        Name of the column holding the distance to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct distance with columns
        'dist', 'sense_rpm', 'anti_rpm', 'sense_zscore', 'anti_zscore', 'id'.
        The '*_zscore' columns hold 2 ** z, where z is the population
        (ddof = 0) z-score of the summed RPM across distances.
        'id' is set to `dist_col` on every row.
    """
    def _pow2_zscore(values) : 
        # Population z-score, then raised as a power of two.
        return 2 ** ((values - values.mean()) / values.std(ddof = 0))

    summed = (
        df.groupby([dist_col])
          .agg(
              sense_rpm = ('sense_rpm', 'sum'),
              anti_rpm = ('anti_rpm', 'sum'),
          )
          .reset_index()
    )

    summed['sense_zscore'] = _pow2_zscore(summed['sense_rpm'])
    summed['anti_zscore'] = _pow2_zscore(summed['anti_rpm'])

    # Normalise the grouping column name so frames for different distances stack.
    summed = summed.rename(columns = {f'{dist_col}' : 'dist'})
    summed['id'] = dist_col

    return summed
    
    

In [172]:
def calc_p_distance(intersect, targets = None) :
    """
    Compute end-to-end distances between paired sense and antisense reads.

    Reads a header-less, tab-separated intersect table (presumably the file
    produced by intersect_bed -- confirm the column layout against
    intersect.sh) and keeps these 0-based columns:
    0 gene, 1 sense_5p, 2 sense_3p, 4 sense_rpm, 7 anti_3p, 8 anti_5p,
    10 anti_rpm, 12 overlap.

    Two distances are derived per row:
      * '5pto5p' = anti_5p - sense_5p
      * '5pto3p' = anti_5p - sense_3p
    Each is summarised with group_and_agg_dist and the two summaries are
    stacked on top of each other.

    Parameters
    ----------
    intersect : str
        Path of the intersect table.
    targets : list of str, optional
        If non-empty, only rows whose gene is in this list are used.

    Returns
    -------
    pandas.DataFrame
        The concatenated group_and_agg_dist results; the 'id' column tells the
        two distance types apart.
    """
    # Source column index -> name, in file order.
    wanted = {
        0 : 'gene',
        1 : 'sense_5p',
        2 : 'sense_3p',
        4 : 'sense_rpm',
        7 : 'anti_3p',
        8 : 'anti_5p',
        10 : 'anti_rpm',
        12 : 'overlap',
    }

    pairs = pd.read_csv(
        intersect,
        sep = "\t",
        header = None,
        usecols = list(wanted.keys()),
        names = list(wanted.values()),
    )

    if targets : 
        pairs = pairs.query('gene in @targets').reset_index()

    pairs = pairs.assign(**{
        '5pto5p' : pairs['anti_5p'] - pairs['sense_5p'],
        '5pto3p' : pairs['anti_5p'] - pairs['sense_3p'],
    })

    summaries = [group_and_agg_dist(pairs, anchor) for anchor in ('5pto5p', '5pto3p')]

    return pd.concat(summaries)


In [173]:
def measure_dist_adj_reads(bedfile, targets = None) : 
    """
    Measure the distance between adjacent 22-nt reads on each gene.

    The input is a header-less, tab-separated file with columns
    gene, start, end, seq, rpm, strand (the format written by filter_bed).
    Only reads whose sequence is exactly 22 nt long are used.

    Within each gene the reads are ordered by `end`, descending. For every
    read the distance to the next read in that order is
        3p_5p = start(this read) - end(next read)
    and only distances in [-25, 25] are kept. The rpm of the kept reads is
    then summed per distance.

    Parameters
    ----------
    bedfile : str
        Path of the filtered read file.
    targets : list of str, optional
        If non-empty, only genes in this list are used.

    Returns
    -------
    pandas.DataFrame
        Columns '3p_5p' (distance), 'count' (summed rpm) and 'zscore'
        (2 ** population z-score of 'count' across distances), sorted by
        distance. An empty frame with these columns is returned when no read
        pair survives the filters.
    """
    empty_result = pd.DataFrame({
        '3p_5p' : pd.Series(dtype = 'float64'),
        'count' : pd.Series(dtype = 'float64'),
        'zscore' : pd.Series(dtype = 'float64'),
    })

    df = pd.read_csv(bedfile, names = ['gene', 'start', 'end', 'seq', 'rpm', 'strand'], sep = "\t", header = None)
    
    df['seq_len'] = df['seq'].str.len()
        
    df = df[df['seq_len'] == 22].reset_index(drop = True)
    
    if targets : 
        df = df[df['gene'].isin(targets)].reset_index(drop = True)

    if df.empty : 
        # Nothing to pair up; previously this path hit an unbound `res`.
        return empty_result

    tot_gene = len(df['gene'].unique())

    # One stable sort plus a grouped shift replaces the per-gene
    # query/sort/concat loop: within each gene the rows are still visited in
    # descending `end` order, and ties keep their file order (deterministic).
    ordered = df.sort_values(by = ['end'], ascending = False, kind = 'stable')

    ordered['3p'] = ordered['start']

    # `end` of the next read on the same gene; NaN for the last read of a gene.
    ordered['5p_next'] = ordered.groupby('gene', sort = False)['end'].shift(-1)

    ordered['3p_5p'] = ordered['3p'] - ordered['5p_next']

    # between() is inclusive on both sides and False for NaN.
    res = ordered[ordered['3p_5p'].between(-25, 25)]

    # Keep the final progress line the loop-based version ended on.
    print(f'{tot_gene} of {tot_gene} (100.0%) total genes', end='\r')

    if res.empty : 
        return empty_result

    res_group = res.groupby(
        ['3p_5p']
    ).agg(
        count = ('rpm', 'sum')
    ).reset_index() 
    
    res_group['zscore'] = 2**((res_group['count'] - res_group['count'].mean())/res_group['count'].std(ddof=0))
    
    return res_group

In [179]:
def run(bed, targets = None, output = "./results",  d = False) :
    """
    Run the sense/antisense distance pipeline on one BED file or a directory of them.

    For every input file the steps are:
      1. filter_bed -> ./tmp/sense.bed  ('+' strand, 15-25 nt, any 5' nucleotide)
      2. filter_bed -> ./tmp/anti.bed   ('-' strand, 21-23 nt, 5' A or G)
      3. intersect_bed(sense, anti) -> ./tmp/inter.bed
      4. calc_p_distance on the intersect file
      5. measure_dist_adj_reads on the antisense file
    The phasing result from step 5 is reshaped to the step 4 layout
    (id = '3pto5p_anti_phasing', sense columns set to 0) and appended to it.

    Parameters
    ----------
    bed : str
        Path of a single input file, or of a directory when `d` is True.
    targets : list of str, optional
        Gene IDs forwarded to calc_p_distance and measure_dist_adj_reads.
    output : str
        Directory created if missing. Nothing is written to it by this function.
    d : bool
        When True, every `*.bed.tsv` file inside `bed` is processed.

    Returns
    -------
    pandas.DataFrame
        Per-file results stacked in processing order, with columns
        dist, sense_rpm, anti_rpm, sense_zscore, anti_zscore, id.

    Raises
    ------
    FileNotFoundError
        If `d` is True and `bed` contains no `*.bed.tsv` file.

    Notes
    -----
    The ./tmp files use fixed names and are overwritten for each input, so
    concurrent runs in the same working directory would collide.
    """
    mt1 = time.time()
    
    # makedirs also copes with a nested output path, which os.mkdir does not.
    if not os.path.exists(output) : 
        os.makedirs(output)
        
    if not os.path.exists("./tmp") : 
        os.makedirs("./tmp")
    
    if d : 
        # Sorted so the row order of the combined result is reproducible.
        BEDFILES = sorted(glob.glob(f"{bed}/*.bed.tsv"))
    else :
        BEDFILES = [ bed ] 

    if not BEDFILES : 
        # Fail with a clear message instead of an unbound-variable error at return.
        raise FileNotFoundError(f"No *.bed.tsv files found in {bed}")

    sen = './tmp/sense.bed'
    ant = './tmp/anti.bed'
    inter = './tmp/inter.bed'

    per_file = []

    for B in BEDFILES : 
        
        t1 = time.time() 
        
        print(f"Processing {os.path.basename(B)}...\n")
        
        print("Filtering sense reads...")
        filter_bed(B, sen, p5nt = ['A', 'G', 'C', 'T'], min_len = 15, max_len = 25, orientation = '+')
        
        print("Filtering antisense reads...")
        filter_bed(B, ant, p5nt = ['A', 'G'], min_len = 21, max_len = 23, orientation = '-')
        
        print("Intersecting sense to antisense reads...")
        intersect = intersect_bed(sen, ant, inter)
        
        anti_to_sense = calc_p_distance(intersect, targets)
        
        print("Measuring 5' to 3' distance of adjacent antisense reads...")
        phasing = measure_dist_adj_reads(ant, targets)
        
        # Reshape the phasing table so it stacks under the sense/antisense table.
        phasing['id'] = '3pto5p_anti_phasing'
        phasing['sense_rpm'] = 0
        phasing['sense_zscore'] = 0
        phasing = phasing.rename(columns = {'count' : 'anti_rpm', 'zscore' : 'anti_zscore', '3p_5p':'dist'})
        
        per_file.append(pd.concat([anti_to_sense, phasing]))
        
        t2 = time.time() 
        print(f"\nProcessing {os.path.basename(B)} took {round(t2 - t1,3)} s.\n")

    # Single concat at the end instead of re-copying the accumulated frame per file.
    res = pd.concat(per_file)
        
    mt2 = time.time()
    print(f"\nPipeline took {round(mt2 - mt1,3)} s to complete.\n")
    return res
        
        
        

In [180]:
def plot_phasing(df) : 
    """
    Draw a bar chart of phasing z-scores against distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '3p_5p' (x axis) and 'zscore' (bar height),
        i.e. the layout returned by measure_dist_adj_reads.

    Returns
    -------
    plotnine.ggplot
        The assembled, unrendered plot object.
    """
    bars = geom_bar(aes(x='3p_5p', y = 'zscore'), stat = 'identity', color = 'white')

    square_canvas = theme(figure_size=(8,8), aspect_ratio = 1)

    # np.arange is half-open, so the ticks run from -25 to 20 in steps of 5.
    x_axis = scale_x_continuous(limits = (-25,25), breaks = np.arange(-25,25,5))

    return ggplot(df) + bars + theme_classic() + square_canvas + x_axis

# Analysis

### Read in targets file

In [181]:
# Target gene list: read as a header-less, tab-separated file whose first column is the gene ID.
targets = "/fs/ess/PCON0160/ben/genomes/c_elegans/WS279/csr1_targets.txt"

csr_targets = pd.read_csv(targets, header = None, names = ['gene'], sep = "\t")

# Plain Python list of gene IDs, in file order.
csr_targets_list = csr_targets['gene'].tolist()

# Preview the first ten IDs.
csr_targets_list[:10]

['WBGene00022277',
 'WBGene00022278',
 'WBGene00022279',
 'WBGene00021677',
 'WBGene00000812',
 'WBGene00021682',
 'WBGene00021676',
 'WBGene00021681',
 'WBGene00004274',
 'WBGene00004418']

In [182]:
# Path of the single input .bed.tsv file handed to run() in the next cell.
bed = '/fs/ess/PAS1473/znfx1_CSRIP_WAGO9IP/metagene_alignment_03242023/transcripts/csr1_IP_2.trimmed.uniq.xc.v0.m1000.transcripts.bed.tsv'


In [183]:
# Run the pipeline on one file (d = False means `bed` is a file, not a directory),
# restricting the distance calculations to the genes in csr_targets_list.
me_result = run(bed = bed, targets = csr_targets_list, d = False)

Processing csr1_IP_2.trimmed.uniq.xc.v0.m1000.transcripts.bed.tsv...

Filtering sense reads...
Filtering antisense reads...
Intersecting sense to antisense reads...



The following have been reloaded with a version change:
  1) python/3.9-2022.05 => python/3.6-conda5.2



Measuring 5' to 3' distance of adjacent antisense reads...
4007 of 4007 (100.0%) total genes
Processing csr1_IP_2.trimmed.uniq.xc.v0.m1000.transcripts.bed.tsv took 121.3 s.


Pipeline took 121.301 s to complete.



Unnamed: 0,dist,sense_rpm,anti_rpm,sense_zscore,anti_zscore,id
0,1.0,1279.390528,6160.910293,0.765386,0.709430,5pto5p
1,2.0,1677.405916,11697.295965,1.365065,1.893398,5pto5p
2,3.0,1788.482484,8672.593164,1.604271,1.107447,5pto5p
3,4.0,1776.534333,7744.577005,1.576647,0.939422,5pto5p
4,5.0,1582.179523,7213.375533,1.188600,0.854978,5pto5p
...,...,...,...,...,...,...
42,21.0,0.000000,230.256887,0.000000,0.655638,3pto5p_anti_phasing
43,22.0,0.000000,214.562378,0.000000,0.653357,3pto5p_anti_phasing
44,23.0,0.000000,192.564295,0.000000,0.650173,3pto5p_anti_phasing
45,24.0,0.000000,207.530005,0.000000,0.652337,3pto5p_anti_phasing
