In [1]:
import glob
import logging
import multiprocessing
import os
import random
import shutil
import subprocess
import sys
import tempfile
import time
from functools import partial
from functools import reduce
from functools import wraps
from multiprocessing import Pool

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from scipy import stats


In [9]:
# Scratch cell: draws one integer between 0 and 100 (both ends included);
# the value is only displayed and is not assigned to anything.
random.randint(0,100)

80

"""
Workflow: 

1. index genomes 
    - smRNA (bowtie1)
    - mRNA (genes, pseudogenes, lincRNAs, transposons...) (bowtie2)
    - genome (remove non-chimeras) (bowtie2)
    
2. Trim adapters from reads, then collapse the trimmed fastq files.

3. Index collapsed fasta file 

4. Assign which chimeras have smRNAs. 
    - align smRNAs to collapsed fasta index. 
    - convert to bedfile.
    - bowtie1 settings: -m 1 -v0
    
5. Parse smRNA from target RNA. 

6. Align target RNA to targets
    - convert to bedfile
    - join to the smRNA alignment table on the chimeric read ID
    - bowtie2
    
7. Extend coordinates by 10 nt on each side of target, extract transcriptomic sequence. 

8. Output fasta file in the form >hybridN \n TARGET&smRNA. Fold with RNAup.

9. Merge RNAup output with table.

"""

In [10]:
def timing(f):
    """
    Helper function for timing other functions
    Parameters
    ----------
    f : function
    Returns
    -------
    function
        new function that forwards positional and keyword arguments to `f`,
        logs the elapsed time at DEBUG level and returns `f`'s result.
        The wrapper keeps `f`'s name and docstring.
    """
    @wraps(f)
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed = time.perf_counter() - started
        logging.debug('{:s} function took {:.10f} s'.format(f.__name__, elapsed))
        return ret

    return wrap
logging.basicConfig(level=logging.DEBUG)

In [11]:
def parallelize_dataframe(df, func, n_cores = 20):
    """
    Apply `func` to row-wise chunks of `df` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        frame to split by rows
    func : callable
        takes one chunk (a DataFrame) and returns a DataFrame; it must be
        picklable whenever more than one chunk is dispatched to the pool
    n_cores : int
        upper bound on the number of chunks / worker processes

    Returns
    -------
    pandas.DataFrame
        `pd.concat` of the per-chunk results, in chunk order
    """
    # never create more chunks than rows: empty chunks make many pandas
    # operations inside `func` fail
    n_chunks = max(1, min(n_cores, len(df)))

    # split row positions rather than the frame itself (np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes)
    positions = np.array_split(np.arange(len(df)), n_chunks)
    df_split = [df.iloc[rows] for rows in positions]

    if n_chunks == 1:
        # a single chunk gains nothing from a worker process
        return pd.concat([func(df_split[0])])

    # the context manager releases the workers even when `func` raises
    with Pool(n_chunks) as pool:
        results = pool.map(func, df_split)

    return pd.concat(results)

In [12]:
# ex of parallelization of dataframe
#fin = parallelize_dataframe(df, ttest, n_cores = 30)


In [13]:
def btindex(fasta, idx_path) : 
    """
    Run ./bowtie_build.sh on a fasta file.

    Inputs : 
        - fasta : path to the fasta file to index
        - idx_path : index prefix handed to the script

    Outputs : 
        - nothing is returned; a non-zero exit status is logged as an error

    Arguments are passed as separate argv entries (no shell parsing), so
    paths containing spaces or shell metacharacters are handled verbatim.
    """
    status = subprocess.run(["sh", "./bowtie_build.sh", str(fasta), str(idx_path)]).returncode
    if status != 0 :
        logging.error("bowtie_build.sh exited with status %s for %s", status, fasta)

In [14]:
def bt2index(fasta, idx_path) : 
    """
    Run ./bowtie2_build.sh on a fasta file.

    Inputs : 
        - fasta : path to the fasta file to index
        - idx_path : index prefix handed to the script

    Outputs : 
        - nothing is returned; a non-zero exit status is logged as an error

    Arguments are passed as separate argv entries (no shell parsing), so
    paths containing spaces or shell metacharacters are handled verbatim.
    """
    status = subprocess.run(["sh", "./bowtie2_build.sh", str(fasta), str(idx_path)]).returncode
    if status != 0 :
        logging.error("bowtie2_build.sh exited with status %s for %s", status, fasta)

In [15]:
def trim_adapter(fastq, outdir, outname, adapter = "AGATCGGAAGAGC", min_len = 15) : 
    
    """
    Run ./trim_adapter.sh on a fastq file.

    Inputs : 
        - fastq : path to the fastq file to trim
        - outdir : directory handed to the script
        - outname : path the caller expects the trimmed file at (returned unchanged)
        - adapter : adapter sequence handed to the script
        - min_len : length threshold handed to the script
          (presumably the minimum read length kept -- confirm in trim_adapter.sh)
    
    Outputs : 
        - returns outname

    NOTE(review): outname is never passed to the script; the caller relies on
    the script writing exactly that path inside outdir -- confirm.
    A non-zero exit status is logged as an error.
    """

    cmd = ["sh", "./trim_adapter.sh", str(adapter), str(min_len), str(fastq), str(outdir)]
    # argv list instead of a shell string: no quoting problems with odd paths
    status = subprocess.run(cmd).returncode
    if status != 0 :
        logging.error("trim_adapter.sh exited with status %s for %s", status, fastq)
    
    return outname
    
    

In [16]:
def collapse_reads(fastq, collapsed) : 
    
    """
    Run ./collapse.sh to collapse identical reads.

    Inputs : 
        - fastq : gzipped fastq file (from trim_adapter)
        - collapsed : path of the collapsed fasta file to produce
        
    Outputs : 
        - collapsed fasta file in the form
        - >sequence:count
          sequence
        
        returns path to collapsed fasta

    A non-zero exit status of the script is logged as an error.
    """
    
    # argv list instead of a shell string: no quoting problems with odd paths
    status = subprocess.run(["sh", "./collapse.sh", str(fastq), str(collapsed)]).returncode
    if status != 0 :
        logging.error("collapse.sh exited with status %s for %s", status, fastq)
    
    return collapsed

In [17]:
def align_genome(fasta, genome_index, outname) : 
    
    """
    Run ./bowtie_align_genome.sh.

    Inputs : 
        - fasta : trimmed & collapsed fasta file 
        - genome_index : index prefix of the genome
        - outname : path for the fasta file of non-aligned reads
    
    Outputs : 
        - fasta file of non-aligned reads (written by the script)
        - returns outname

    A non-zero exit status of the script is logged as an error.
    """
    
    cmd = ["sh", "./bowtie_align_genome.sh", str(fasta), str(genome_index), str(outname)]
    # argv list instead of a shell string: no quoting problems with odd paths
    status = subprocess.run(cmd).returncode
    if status != 0 :
        logging.error("bowtie_align_genome.sh exited with status %s for %s", status, fasta)
    
    return outname

In [18]:
def align_smRNA_chimera(fasta, index, outname) : 
    
    """
    Run ./bowtie_align_smRNA_chimera.sh.

    Inputs :
        - fasta : fasta file of smRNAs
        - index : index prefix of the chimeras
        - outname : output prefix handed to the script
    
    Outputs :
        - returns outname (a path prefix, not a dataframe)
        - NOTE(review): the caller reads "<outname>.bed" afterwards, so the
          script presumably writes a bed file there -- confirm in the script.

    A non-zero exit status of the script is logged as an error.
    """
        
    cmd = ["sh", "./bowtie_align_smRNA_chimera.sh", str(fasta), str(index), str(outname)]
    # argv list instead of a shell string: no quoting problems with odd paths
    status = subprocess.run(cmd).returncode
    if status != 0 :
        logging.error("bowtie_align_smRNA_chimera.sh exited with status %s for %s", status, fasta)
    
    return outname
    

In [19]:
def align_targets_transcriptome(fasta, index, outname) : 
    
    """
    Run ./bowtie2_align_targets.sh.

    Inputs :
        - fasta : fasta file of target RNAs parsed from chimeras
        - index : index prefix to align against
        - outname : output prefix handed to the script
    
    Outputs :
        - returns outname (a path prefix, not a dataframe)
        - NOTE(review): the caller reads "<outname>.bed" afterwards, so the
          script presumably writes a bed file there -- confirm in the script.

    A non-zero exit status of the script is logged as an error.
    """
        
    cmd = ["sh", "./bowtie2_align_targets.sh", str(fasta), str(index), str(outname)]
    # argv list instead of a shell string: no quoting problems with odd paths
    status = subprocess.run(cmd).returncode
    if status != 0 :
        logging.error("bowtie2_align_targets.sh exited with status %s for %s", status, fasta)
    
    return outname
    

In [20]:
def parse_target(target, smRNA) : 
    """
    Cut `target` at every occurrence of `smRNA` and return the longest
    remaining piece (the first one when several share the maximum length).
    """
    longest = None
    for piece in target.split(smRNA) :
        # strict '>' keeps the earliest piece on ties
        if longest is None or len(piece) > len(longest) :
            longest = piece

    return longest

In [21]:
def process_smRNA_alignment(dat) : 
    """
    Keep sense-strand smRNA hits and derive the target part of each chimera.

    Inputs :
        - dat : dataframe with at least the columns clash_read, smRNA_strand,
          smRNA_seq and score; clash_read has the form "sequence:count"

    Outputs :
        - dataframe restricted to rows with smRNA_strand == "+", with the added
          columns chimera, count (both as strings) and target_seq, and without
          the score column

    Works on chunks that contain no "+" strand rows (an empty frame with the
    same columns is returned).
    """
    
    # make sure smRNA aligns sense to reference
    dat = dat.query('smRNA_strand == "+"').reset_index(drop = True)
        
    # split clash read into count and seq cols; element-wise access (instead
    # of expand=True) also works when the chunk is empty
    parts = dat['clash_read'].str.split(":")
    dat['chimera'] = parts.str[0]
    dat['count'] = parts.str[1]
    
    # remove smRNA sequence from chimera
    dat['target_seq'] = [
        parse_target(chimera, smRNA_seq)
        for chimera, smRNA_seq in zip(dat['chimera'], dat['smRNA_seq'])
    ]
    
    return dat.drop(columns = ['score'])
    
    

In [22]:
def get_seq(records, chrom, start, end) : 
    """
    Return the subsequence [start, end) of record `chrom` as a string.

    Inputs :
        - records : mapping from record name to an object with a `.seq` attribute
        - chrom : key into records
        - start, end : slice bounds (Python slicing rules, so an `end` past the
          sequence end is truncated)
    """
    
    # slice first, then convert: avoids turning the whole record into a
    # string on every call
    return str(records[chrom].seq[start : end])

In [23]:
def get_target_sequence(df, *args, records = None, pad = 10) :
    """
    Add padded target coordinates and the padded target sequence to `df`.

    Inputs :
        - df : dataframe with the columns target, target_start and target_end
        - records : mapping from target name to sequence record; may be given
          as keyword or as the first extra positional argument
        - pad : number of positions added on each side of the target

    Outputs :
        - df, modified in place and also returned, with the added columns
          pad_up (never below 0), pad_down and target_pad

    Raises TypeError when no records are supplied.
    """
    
    if records is None :
        if not args :
            raise TypeError("get_target_sequence() needs `records` (keyword or first extra positional argument)")
        records = args[0]
    
    # clamp at 0 so a target near the start of a record does not produce a
    # negative slice bound
    df['pad_up'] = (df['target_start'] - pad).clip(lower = 0)
    
    df['pad_down'] = df['target_end'] + pad
    
    df['target_pad'] = [
        get_seq(records, target, up, down)
        for target, up, down in zip(df['target'], df['pad_up'].tolist(), df['pad_down'].tolist())
    ]
    
    return df
    

In [24]:
def RNAup(fasta) : 
    """
    Run ./RNAup.sh on one fasta file.

    Inputs :
        - fasta : path to the input file

    Outputs :
        - nothing is returned; the script is handed the output path
          "<fasta without a trailing .fa>.tsv"

    A non-zero exit status of the script is logged as an error.
    """
    
    # strip only a trailing ".fa": str.replace would also rewrite a ".fa"
    # that occurs elsewhere in the path (e.g. inside a directory name)
    stem = fasta[:-len(".fa")] if fasta.endswith(".fa") else fasta
    outname = stem + ".tsv"
    
    # argv list instead of a shell string: no quoting problems with odd paths
    status = subprocess.run(["sh", "./RNAup.sh", fasta, outname]).returncode
    if status != 0 :
        logging.error("RNAup.sh exited with status %s for %s", status, fasta)

def parallel_RNAup(files, n_cores = 5) : 
    """
    Run RNAup() on every path in `files` using a pool of `n_cores` workers.

    Nothing is returned. The call blocks until every file has been processed.
    """
    
    # the context manager releases the workers even if a task raises
    with Pool(n_cores) as pool :
        pool.map(RNAup, files)
    

In [31]:
def run_RNAup(RNAup_input, outdir, ncores = 4) : 
    """
    Fold every record of an RNAup input fasta, in parallel for large inputs.

    Inputs :
        - RNAup_input : fasta file with two lines per record
        - outdir : directory under which the scratch directory "RNAup_tmp" is created
        - ncores : number of worker processes for the chunked run

    Outputs :
        - nothing is returned; results end up in RNAup_input with ".fa"
          replaced by ".tsv". If that file already exists nothing is done.

    Inputs with more than 10000 records are split into files of 10000 lines
    each, folded via parallel_RNAup() and concatenated in chunk order.
    Each call works in its own freshly created scratch sub-directory, so
    chunks left behind by other runs can never be picked up, and the
    sub-directory is removed afterwards.
    """
    RNAup_out = RNAup_input.replace(".fa", ".tsv")
    if os.path.exists(RNAup_out) :
        return

    print("Folding RNA")
    with open(RNAup_input, 'r') as f : 
        n_lines = sum(1 for _ in f)

    # fasta has two lines per entry
    n_records = n_lines / 2
    # NOTE: used both as the record threshold and as the number of *lines*
    # per chunk, so each chunk holds chunksize / 2 records
    chunksize = 10000

    if n_records <= chunksize :
        RNAup(RNAup_input)
        return

    tmpdir = os.path.join(outdir, "RNAup_tmp")
    os.makedirs(tmpdir, exist_ok = True)

    # unique directory per call instead of a random file-name prefix
    workdir = tempfile.mkdtemp(prefix = "chunks_", dir = tmpdir)
    try :
        subprocess.run(
            ["split", "-l", str(chunksize), RNAup_input, os.path.join(workdir, "tmp")],
            check = True,
        )

        files = sorted(glob.glob(os.path.join(workdir, "tmp*")))
        parallel_RNAup(files, n_cores = ncores)

        # join all RNAup files together, in chunk order
        with open(RNAup_out, 'wb') as merged :
            for part in sorted(glob.glob(os.path.join(workdir, "tmp*.tsv"))) :
                with open(part, 'rb') as src :
                    shutil.copyfileobj(src, merged)
    finally :
        shutil.rmtree(workdir, ignore_errors = True)
    

In [32]:
def process_rna_up(RNAup_out, smRNA_len = 21, min_target_len = 15) :
    """
    Parse the tab-separated RNAup result table.

    Inputs :
        - RNAup_out : path to a headerless tsv with the columns ID, info, interaction_seq
        - smRNA_len : length every smRNA pairing string is padded to with "."
        - min_target_len : minimum length of the target part of interaction_seq
          (the text before "&") for a row to be kept

    Outputs :
        - dataframe with the columns ID, info, interaction_seq, info_split,
          pairing, dG, target_pairing_region, smRNA_pairing_region,
          piRNA_pairing, N_piRNA_bp, modified_smRNA_pairing

    The space-separated tokens of `info` are read positionally:
    0 = pairing, 1 = target region, 3 = smRNA region, 4 = "(" + energy.
    An input without any rows yields an empty frame with the same columns.
    """
    out_cols = ['ID', 'info', 'interaction_seq', 'info_split', 'pairing', 'dG',
                'target_pairing_region', 'smRNA_pairing_region', 'piRNA_pairing',
                'N_piRNA_bp', 'modified_smRNA_pairing']
    try :
        rna_up = pd.read_csv(RNAup_out, sep = "\t", names = ['ID', 'info', 'interaction_seq'])
    except pd.errors.EmptyDataError :
        return pd.DataFrame(columns = out_cols)

    rna_up['ID'] = rna_up['ID'].str.replace(">", "", regex = False)
    # column-wise apply (not axis=1) so an empty table does not break
    rna_up['info_split'] = rna_up['info'].apply(lambda text : [tok for tok in text.split(" ") if tok != ""])
    rna_up['pairing'] = rna_up['info_split'].str[0]
    rna_up['dG'] = rna_up['info_split'].str[4].str.replace("(", "", regex = False).astype(float)
    rna_up['target_pairing_region'] = rna_up['info_split'].str[1]
    rna_up['smRNA_pairing_region'] = rna_up['info_split'].str[3]

    # remove target sites that are shorter than min_target_len nucleotides
    target_len = rna_up['interaction_seq'].str.split("&").str[0].str.len()
    rna_up = rna_up[target_len >= min_target_len].reset_index(drop = True)

    # count paired positions on the smRNA side of the structure
    rna_up['piRNA_pairing'] = rna_up['pairing'].str.split("&", expand = False).str[1]
    rna_up['N_piRNA_bp'] = rna_up['piRNA_pairing'].str.count(r"\)")

    # fill in blanks in basepairing with "." (i.e. most of the time base pairing begins at position 2 of piRNA not 5' U)
    # positions are 1-based: a pairing region starting at 2 gets one leading ".",
    # and the tail is padded up to smRNA_len
    region = rna_up['smRNA_pairing_region'].str.split(",")
    first = region.str[0].astype(int).tolist()
    last = region.str[1].astype(int).tolist()
    rna_up['modified_smRNA_pairing'] = [
        "." * (begin - 1) + paired + "." * (smRNA_len - end)
        for begin, end, paired in zip(first, last, rna_up['piRNA_pairing'])
    ]

    return rna_up

In [85]:
def run(smRNA_fasta, target_fasta, genome_fasta, fastq, index_dir = os.getcwd(), outdir = os.getcwd(), ncores = 4) : 
    """
    Run the CLASH chimera analysis for one fastq file.

    Steps: build missing indexes, trim adapters, collapse reads, discard
    reads that align to the genome, locate smRNAs inside the remaining
    reads, align the non-smRNA part to the target transcripts, fold
    target/smRNA pairs with RNAup (with a second, narrower fold for pairs
    without predicted smRNA base pairs) and tabulate the target sites.
    Intermediate files are written to `outdir` and reused when present.

    Inputs :
        - smRNA_fasta : fasta file of smRNAs
        - target_fasta : fasta file of target transcripts
        - genome_fasta : fasta file of the genome
        - fastq : fastq file (name ending in .fq.gz or .fastq.gz)
        - index_dir : directory for the smRNA / target / genome indexes
        - outdir : directory for all other outputs
        - ncores : worker processes for dataframe processing and RNAup folding

    Outputs :
        - list [master_rnaup, master_rnaup_grouped] : the per-read table
          sorted by clash_count (descending), and the table grouped by
          target site with mean dG and summed clash_count

    NOTE(review): both bed files are read with `header = 0`, i.e. their first
    line is treated as a header and dropped -- confirm the alignment scripts
    really write a header line.
    NOTE(review): several smRNAs can match the same read, in which case
    clash_read is not unique and the merge on clash_read pairs every target
    alignment of that read with every smRNA of that read -- confirm intended.
    """
    
    ############################################################
    # timing function
    t1 = time.time()
    
    ############################################################
    # make sure idx dir and out dir exist (parents are created as needed)
    os.makedirs(index_dir, exist_ok = True)
    os.makedirs(outdir, exist_ok = True)
        
    ############################################################
    # build smRNA (bt), target (bt2) and genome (bt2) indexes when missing
    # (the smRNA index prefix is not used further down)
    _ensure_index(smRNA_fasta, index_dir, ".1.ebwt", btindex, "Building smRNA index")
    target_index = _ensure_index(target_fasta, index_dir, ".1.bt2", bt2index, "Building target index")
    genome_index = _ensure_index(genome_fasta, index_dir, ".1.bt2", bt2index, "Building genome index")
    
    ############################################################
    # trim adapter
    t_fq = os.path.join(outdir, os.path.basename(fastq).replace(".fq.gz", "_trimmed.fq.gz").replace(".fastq.gz", "_trimmed.fq.gz"))
    if not os.path.exists(t_fq) : 
        print("Trimming adapters")
        t_fq = trim_adapter(fastq, outdir, t_fq)
    
    ############################################################
    # collapse reads
    c_fa = t_fq.replace(".fq.gz", ".uni.fa")
    print(c_fa)
    if not os.path.exists(c_fa) : 
        print("Collapsing trimmed reads")
        c_fa = collapse_reads(t_fq, c_fa)
    
    ############################################################
    # align collapsed reads to genome
    genome_unmapped = c_fa.replace(".fa", ".genomeUnmapped.fa")
    if not os.path.exists(genome_unmapped) : 
        print("Aligning to genome")
        align_genome(c_fa, genome_index, genome_unmapped)
    
    ############################################################
    # build bt index of unaligned reads
    chimera_index = _ensure_index(genome_unmapped, outdir, ".1.ebwt", btindex, "Build chimera index")
    
    ############################################################
    # align smRNA fasta to chimera index
    smRNA_aligned = genome_unmapped.replace(".fa", ".smRNAaligned")
    smRNA_aligned_bed = genome_unmapped.replace(".fa", ".smRNAaligned.bed")
    if not os.path.exists(smRNA_aligned_bed) : 
        print("Align smRNA to chimera index")
        align_smRNA_chimera(smRNA_fasta, chimera_index, smRNA_aligned)
    
    ############################################################
    # parse smRNA alignment
    print("Process smRNA alignment")
    dat = pd.read_csv(smRNA_aligned_bed, 
                      sep = "\t",
                      names = ['clash_read', 'smRNA_start', 'smRNA_end', 'smRNA', 'score', 'smRNA_strand', 'smRNA_seq'], 
                      header = 0)
    master = parallelize_dataframe(dat, process_smRNA_alignment, n_cores = ncores)
    master = master[master['target_seq'].str.len() >= 15].reset_index(drop = True)
    
    # write fasta file with target sequences to align to the transcripts
    targets_seq_parsed = f"{smRNA_aligned}.targets.fa"
    _write_fasta(targets_seq_parsed, master['clash_read'], master['target_seq'])
    
    ############################################################
    # align target fasta to targets
    targets_aligned = targets_seq_parsed.replace(".fa", ".targetsAligned")
    targets_aligned_bed = targets_seq_parsed.replace(".fa", ".targetsAligned.bed")
    if not os.path.exists(targets_aligned_bed) : 
        print("Align targets to target RNA index")
        align_targets_transcriptome(targets_seq_parsed, target_index, targets_aligned)
    
    ############################################################
    # read aligned targets
    print("Process mRNA alignment")
    dat = pd.read_csv(targets_aligned_bed, 
                      sep = "\t",
                      names = ['target', 'target_start', 'target_end', 'clash_read', 'score', 'target_strand', 'aligned_region'], 
                      header = 0)
    
    dat = dat.drop(columns = ['score'])
    
    ############################################################
    # merge aligned targets to master table
    master = dat.merge(master, on = ['clash_read'], how = 'left')
    
    ############################################################
    # extend target region by `pad` nt on both sides (never below position 0)
    records = SeqIO.to_dict(SeqIO.parse(target_fasta, 'fasta'))
    
    pad = 5
    master['pad_up'] = (master['target_start'] - pad).clip(lower = 0)
    master['pad_down'] = master['target_end'] + pad
    master['target_pad'] = [
        get_seq(records, target, up, down)
        for target, up, down in zip(master['target'], master['pad_up'].tolist(), master['pad_down'].tolist())
    ]
    
    master['ID'] = [ f'id_{i}' for i in range(0,master.shape[0]) ]
    
    ############################################################
    # make RNAup input: one ">ID" / "TARGET&smRNA" record per row
    RNAup_input = f"{targets_aligned}.RNAup.fa"
    _write_fasta(
        RNAup_input,
        master['ID'],
        ( f"{target}&{smRNA}" for target, smRNA in zip(master['target_pad'], master['smRNA_seq']) ),
    )
    
    ############################################################
    # RNAup folding (run_RNAup splits large inputs and skips existing output)
    print("RNAup")
    RNAup_out = RNAup_input.replace(".fa", ".tsv")
    run_RNAup(RNAup_input, outdir, ncores = ncores)
    
    rna_up = process_rna_up(RNAup_out)
    
    rna_cols = ['ID', 'interaction_seq', 'pairing', 'dG', 'modified_smRNA_pairing', 'target_pairing_region', 'smRNA_pairing_region']
    
    no_bp = rna_up.query('N_piRNA_bp == 0').reset_index(drop = True)
    
    rna_up = rna_up.query('N_piRNA_bp >= 4').reset_index(drop = True)
    rna_up = rna_up[rna_cols]

    ####################
    # merge rna_up to master
    master_rnaup = rna_up.merge(master, on = ['ID'], how = 'left')

    ####################
    # Do a refined search , for target RNA interactions with no predicted base-pairs re-do folding with 
    # better sequence prediction of where targeting might occur
    no_bp = no_bp.merge(master, on = ['ID'], how = 'left')
    region = no_bp['target_pairing_region'].str.split(",")
    no_bp['s'] = region.str[0].astype(int) - 5
    no_bp['e'] = region.str[1].astype(int) + 5
    
    # region coordinates are 1-based, hence s - 1; clamp at 0 because a
    # negative slice start would wrap around to the end of the string
    no_bp['refined_target_seq'] = [
        seq[max(s - 1, 0) : e]
        for seq, s, e in zip(no_bp['target_pad'], no_bp['s'].tolist(), no_bp['e'].tolist())
    ]

    long_enough = pd.Series([ len(seq) >= 15 for seq in no_bp['refined_target_seq'] ], index = no_bp.index, dtype = bool)
    no_bp = no_bp.loc[long_enough]
    
    ids = no_bp['ID'].tolist()
    
    # skip the second fold entirely when nothing qualifies for refinement
    if ids : 
        master_sub = master[master['ID'].isin(ids)].reset_index(drop = True)
        refined_input = master_sub.merge(no_bp[['ID', 's', 'e', 'refined_target_seq']], on = ['ID'], how = 'left')
        
        RNAup_input = f"{targets_aligned}.RNAup.refined.fa"
        _write_fasta(
            RNAup_input,
            refined_input['ID'],
            ( f"{target}&{smRNA}" for target, smRNA in zip(refined_input['refined_target_seq'], refined_input['smRNA_seq']) ),
        )
        
        RNAup_out = RNAup_input.replace(".fa", ".tsv")
        run_RNAup(RNAup_input, outdir, ncores = ncores)
        rna_up_refined = process_rna_up(RNAup_out)
        rna_up_refined = rna_up_refined.query('N_piRNA_bp >= 4').reset_index(drop = True)
        rna_up_refined = rna_up_refined[rna_cols]
        
        ####################
        # merge rna_up_refined to master table and append to the main table
        master_sub_rnaup = rna_up_refined.merge(master_sub, on = ['ID'], how = 'left')
        master_rnaup = pd.concat([master_rnaup, master_sub_rnaup], ignore_index = True)
    
    master_rnaup = master_rnaup.rename(columns = {'count':'clash_count'})
    master_rnaup['clash_count'] = master_rnaup['clash_count'].astype(float)
    
    # calculate transcriptomic coordinates of target site
    # NOTE(review): target_pairing_region is relative to the folded sequence
    # (target_pad, or its refined sub-window), which starts at pad_up rather
    # than target_start -- these coordinates look shifted; confirm the
    # intended convention before relying on them.
    site = master_rnaup['target_pairing_region'].str.split(",")
    master_rnaup['target_site_start'] = master_rnaup['target_start'] + site.str[0].astype(int)
    master_rnaup['target_site_end'] = master_rnaup['target_start'] + site.str[1].astype(int)
    master_rnaup['strand'] = '-'
    
    # clean up master table
    master_rnaup = master_rnaup[['ID', 'target', 'target_site_start', 'target_site_end', 'smRNA', 'clash_count', 'strand', 'dG', 'modified_smRNA_pairing', 'interaction_seq', 'pairing', 'clash_read']]
    
    # sort values by count
    master_rnaup = master_rnaup.sort_values(by = ['clash_count'], ascending = False).reset_index(drop = True)
    print(f"total target sites remaining == {master_rnaup.shape[0]}")

    # combine identical target sites, groupby pairing, target, smRNA, target_start, target_end, strand, 
    master_rnaup_grouped = master_rnaup.groupby( by = ['target', 'target_site_start', 'target_site_end', 'smRNA', 'modified_smRNA_pairing', 'strand', 'interaction_seq']).agg({'dG':'mean', 'clash_count':'sum'}).reset_index()
    
    t2 = time.time()
    print(f"CLASH Analysis Pipeline took {round( (t2-t1), 5)} seconds to run!")
    print(f"total target sites remaining == {master_rnaup_grouped.shape[0]}")
    return([ master_rnaup, master_rnaup_grouped ])


def _ensure_index(fasta, index_dir, marker_ext, build, message) : 
    """Return the index prefix for `fasta` inside `index_dir`; print `message` and call `build(fasta, prefix)` when `<prefix><marker_ext>` is missing."""
    prefix = os.path.join(index_dir, os.path.basename(fasta).replace(".fa", ""))
    if not os.path.exists(f"{prefix}{marker_ext}") : 
        print(message)
        build(fasta, prefix)
    return prefix


def _write_fasta(path, headers, bodies) : 
    """Write one ">header" line followed by one body line per (header, body) pair to `path`."""
    with open(path, 'w') as handle : 
        handle.writelines( f">{header}\n{body}\n" for header, body in zip(headers, bodies) )
    

In [86]:
# Run the full pipeline for replicate 1 of the prg-1 CLASH data.
# NOTE(review): all inputs and outputs are hard-coded absolute cluster paths;
# the notebook only re-runs where these exist.
res = run(
    smRNA_fasta = "/fs/ess/PCON0160/ben/genomes/c_elegans/WS279/piRNA.fa",
    target_fasta = "/fs/ess/PCON0160/ben/genomes/c_elegans/WS279/ce_ws279.linc.pseudo.pc.repbase.fa",
    genome_fasta = "/fs/ess/PCON0160/ben/genomes/c_elegans/WS279/c_elegans.PRJNA13758.WS279.genomic.fa",
    fastq = "/fs/ess/PAS1473/deep_sequencing/prg1CLASH/fastq/prg1CLASH_1.fastq.gz",
    index_dir = "/fs/ess/PAS1473/ben_past_projects/clash_index", 
    outdir = "/fs/ess/PAS1473/ben_past_projects/prg1_clash_rep1",
    ncores = 16
)

/fs/ess/PAS1473/ben_past_projects/prg1_clash_rep1/prg1CLASH_1_trimmed.uni.fa
Process smRNA alignment
Process mRNA alignment
RNAup
total target sites remaining == 445507
CLASH Analysis Pipeline took 178.84057 seconds to run!
total target sites remaining == 303535


In [87]:
# run() returns [per-read table, grouped table]
ungrouped, grouped = res

In [88]:
# Spot-check one smRNA / target pair in the grouped table.
grouped.query('smRNA == "21ur-161" and target == "WBGene00306003,W04B5.8"')

Unnamed: 0,target,target_site_start,target_site_end,smRNA,modified_smRNA_pairing,strand,interaction_seq,dG,clash_count
303474,"WBGene00306003,W04B5.8",2115.0,2135.0,21ur-161,.)))))))))).))..))))),-,AUCUCGAGGCCCUGGAUGAGA&UCUCAUCCGGUCCAAGAGGU,-21.81125,23.0


In [89]:
# Grouped target sites ordered from highest to lowest clash_count.
grouped.sort_values(by = 'clash_count', ascending = False)

Unnamed: 0,target,target_site_start,target_site_end,smRNA,modified_smRNA_pairing,strand,interaction_seq,dG,clash_count
248298,"WBGene00019569,K09D9.12",1434.0,1452.0,21ur-4564,)))))))))))))))))....,-,GUGAUCUGCCAUCGUUCCA&UGGAACGUGGCGGUCAC,-31.220000,2524.0
265823,"WBGene00021015,W03G9.3,enu-3.3",810.0,827.0,21ur-2061,.)))))))....))))))...,-,GCCGCGAAUCCGUAUGUG&CACAUACAAGGCGCGGC,-22.947500,2171.0
201848,"WBGene00015405,C03H5.3",382.0,403.0,21ur-2634,)))))))..).)))))))))),-,AAUCGUGGCGCCAUCGCAAGAA&UUCUUGCAAGUUGUCACGAUU,-22.625556,1884.0
117866,"WBGene00008238,C50F4.16",994.0,1015.0,21ur-4204,))))).)))))))))))).)),-,CCUAAAAUUGCAUACACUCUUG&UAAGAAUGUUGCAAUUUUCGG,-22.608500,1881.0
111411,"WBGene00007709,C25A1.8,clec-87",510.0,526.0,21ur-2809,.))))))))))....))))..,-,GGACUUGUCAGAUUCAC&GUGAAUCUGAAAAUGUCC,-25.623333,1561.0
...,...,...,...,...,...,...,...,...,...
148151,"WBGene00010731,K10C3.4",4578.0,4597.0,21ur-1449,.)))))).)))....)))))),-,UUUCCCAUAAUUUGUGUGGC&GCCACAACAGCUCAGGGAAA,-19.270000,1.0
148152,"WBGene00010731,K10C3.4",4880.0,4899.0,21ur-3103,))))))))))))).))))))),-,AUGUUCUUCCAGAGCAUUGA&UCAAUGCUCUGGACAGAACGU,-22.060000,1.0
148153,"WBGene00010731,K10C3.4",5034.0,5055.0,21ur-51,)))))))))))..))).))).,-,UCCCCGUCUCUCUCUCUAAUUA&UAAUUAGAAGGCCCGGUGGA,-20.900000,1.0
148155,"WBGene00010731,K10C3.4",5586.0,5602.0,21ur-1909,...))).)))..)).)))))),-,UUUUUUUGUUUAUCUCU&GGAAAUAUUGCUAAAAAA,-7.330000,1.0


In [90]:
# All grouped target sites of smRNA 21ur-4864, highest clash_count first.
grouped.sort_values(by = 'clash_count', ascending = False).query('smRNA == "21ur-4864"')

Unnamed: 0,target,target_site_start,target_site_end,smRNA,modified_smRNA_pairing,strand,interaction_seq,dG,clash_count
17000,"WBGene00000883,Y75B12B.2,cyn-7",62.0,81.0,21ur-4864,.))))))..)))))))))...,-,GGACGUAUCGUUAUGGAGCU&AGCUCCUGGAUAUGUCU,-20.660000,255.0
168162,"WBGene00012228,W03H9.2",492.0,511.0,21ur-4864,.)))))))))..)).)))))),-,GCUGGAAAUCAUCGGGAGCU&AGCUCCUGGAUAUGUCUAGC,-24.174286,146.0
261844,"WBGene00020696,T22F3.3,pygl-1",1291.0,1307.0,21ur-4864,.))))))).))..))))))).,-,CUAGACAUUGAGGAGCU&AGCUCCUGGAUAUGUCUAG,-26.856667,80.0
17001,"WBGene00000883,Y75B12B.2,cyn-7",63.0,82.0,21ur-4864,.))))))..)))))))))...,-,GGACGUAUCGUUAUGGAGCU&AGCUCCUGGAUAUGUCU,-20.660000,57.0
121088,"WBGene00008514,F02A9.4,mrpl-44",114.0,130.0,21ur-4864,.))))...))).)))))))..,-,UAGCUCCUGGAUAUGUC&GACAGAUUCUUGGAGCUG,-20.920000,48.0
...,...,...,...,...,...,...,...,...,...
158933,"WBGene00011531,T06D10.1,rsbp-1",2335.0,2354.0,21ur-4864,.))))))))))))))))....,-,GACAUUUCCGUUCAGGAGCU&AGCUCCUGGAUAUGUC,-16.090000,1.0
159109,"WBGene00011550,T06G6.11",841.0,857.0,21ur-4864,....))))))).)))).)))),-,GCUAUGCAUCUCCAGGA&UCCUGGAUAUGUCUAGC,-23.880000,1.0
139135,"WBGene00010013,F54B3.1",14860.0,14883.0,21ur-4864,))))).)))))))))))))).,-,CUAGGCAUCAUCAUUCAGCAGCUA&UAGCUCCUGGAUAUGUCUAG,-23.950000,1.0
150846,"WBGene00010964,MTCE.26,ctc-1",1323.0,1338.0,21ur-4864,))).))))).).)))).....,-,ACAUUUUGCAGGACUA&UAGCUCCUGGAUAUGU,-15.520000,1.0


In [91]:
# Number of grouped target sites with dG <= 0 (a row count does not depend
# on row order, so the sort was dropped).
grouped.query('dG <= 0').shape[0]

303532

In [92]:
# Number of grouped target sites with dG <= -15 supported by more than one
# read (a row count does not depend on row order, so the sort was dropped).
grouped.query('dG <= -15 & clash_count > 1').shape[0]

158378

In [93]:
# Number of grouped target sites with dG <= -20 supported by more than one
# read (a row count does not depend on row order, so the sort was dropped).
grouped.query('dG <= -20 & clash_count > 1').shape[0]

90718