## Calculate miRNA hybridization

For RNA targets, James Broughton from the Pasquinelli lab recommends `RNAhybrid`, using just the first 17 nt of each mature miRNA. I wanted to trim the sequences with `fastx_trimmer`, but it only accepts DNA sequences - `U`s are illegal :( - so the trimming below is done with Biopython instead.

In [2]:
from Bio import SeqIO
import sys
import os

# Input: full-length mature miRNAs. Output: the same records cut to 17 nt.
filename = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature.fa'
trimmed_filename = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa'

# Slice every FASTA record down to its first 17 positions.
with open(filename) as infile:
    result_seq = [record[:17] for record in SeqIO.parse(infile, 'fasta')]

# Write the trimmed records back out as FASTA.
with open(trimmed_filename, 'w') as outfile:
    SeqIO.write(result_seq, outfile, 'fasta')

In [3]:
import pybedtools

# Project root; every directory below is derived from it.
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'

# `csv_folder` carries a trailing slash, so paths built on top of it
# contain a doubled '/'.
csv_folder = folder + '/csvs_for_paper/'
bed_folder = folder + '/bed'

# Splicing feature data, with one sub-directory per exon class.
splicing_feature_folder = csv_folder + '/splicing_feature_data'
alternative_feature_folder = splicing_feature_folder + '/alternative'
constitutive_feature_folder = splicing_feature_folder + '/constitutive'

In [4]:
# BED files for the two exon classes, both located in `bed_folder`.
alt_exons_bed = bed_folder + '/alt_exons_stop_plus1.bed'
constitutive_bed = bed_folder + '/constitutive_stop_plus1.bed'

In [10]:
# Wrap each BED file in a BedTool, then key the two sets by exon class.
exon2s, constitutives = map(pybedtools.BedTool, (alt_exons_bed, constitutive_bed))

beds = {
    'alternative': exon2s,
    'constitutive': constitutives,
}

In [8]:
# Genome FASTA that exon sequences are extracted from (passed as `fi=` to `BedTool.sequence`).
hg19_fasta = '/projects/ps-yeolab/genomes/hg19/gencode/v19/GRCh37.p13.genome.fa'

In [11]:
# Try the sequence extraction on a single exon set (the constitutive exons) first.
bed = constitutives

In [12]:
# Extract the sequence of every interval in `bed` from the genome FASTA.
# NOTE(review): `s=True` presumably maps to bedtools getfasta's strand-aware `-s` flag — confirm.
seqs = bed.sequence(fi=hg19_fasta, s=True)

In [13]:
# Show the path of the file that holds the extracted sequences.
seqs.seqfn

'/state/partition1/obotvinnik/4349309.tscc-mgr.local/pybedtools.wkY7tx.tmp'

In [15]:
! wc -l /state/partition1/obotvinnik/4349309.tscc-mgr.local/pybedtools.wkY7tx.tmp

12500 /state/partition1/obotvinnik/4349309.tscc-mgr.local/pybedtools.wkY7tx.tmp


In [17]:
commands = []

for name, bed in beds.items():
    seqs = bed.sequence(fi=hg19_fasta, s=True)
    temp_fasta = seqs.seqfn
    final_fasta = '{}/{}/exons.fasta'.format(splicing_feature_folder, name)
    ! cp $temp_fasta $final_fasta
    mirna_seqs = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa'
    rnahybrid_results = out_fasta.replace('.fasta', '_rnahybrid_mirbase_results.txt')
    command = 'RNAhybrid -c -s 3utr_human -e -28 -q {} -t {} > {}'.format(mirna_seqs, final_fasta, rnahybrid_results)
    commands.append(command)
commands

['RNAhybrid -c -s 3utr_human -e -28 -q /projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa -t /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper//splicing_feature_data/alternative/exons.fasta > /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper//splicing_feature_data/constitutive/exons_rnahybrid_mirbase_results.txt',
 'RNAhybrid -c -s 3utr_human -e -28 -q /projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa -t /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper//splicing_feature_data/constitutive/exons.fasta > /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper//splicing_feature_data/constitutive/exons_rnahybrid_mirbase_results.txt']

In [18]:
from qtools import Submitter

# Hand every RNAhybrid command to the scheduler as one array job.
sub = Submitter(
    commands,
    'RNAhybrid',
    walltime='120:00:00',
    write_and_submit=True,
    array=True,
)

running 2 tasks as an array-job.
job ID: 4356898


Submit a compute job to calculate microRNA hybridization.

In [4]:
# Target FASTA for this run: <folder>/<prefix>.fa
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'
prefix = 'exon2'
transcribed_fasta = '/'.join([folder, prefix]) + '.fa'

In [5]:
from gscripts.qtools import Submitter

# Query (trimmed miRNAs) and the file that receives RNAhybrid's stdout.
mirna_seqs = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa'
rnahybrid_results = '/projects/ps-yeolab/obotvinnik/miso_helpers/hg19/se_exon2_RNAhybrid_mirbase_human_mature_17bp_mfe_cutoff.txt'

# Assemble the shell command piece by piece, then join with single spaces.
command_parts = [
    'RNAhybrid', '-c',
    '-s', '3utr_human',
    '-e', '-28',
    '-q', mirna_seqs,
    '-t', transcribed_fasta,
    '>', rnahybrid_results,
]
command = ' '.join(command_parts)

# Submit the single command as one job.
sub = Submitter([command], 'RNAhybrid', walltime='120:00:00', write_and_submit=True)

job ID: 3320888


Reading the output takes a LONG time

In [None]:
%%time
import pandas as pd
from collections import Counter

# Labels for the ':'-separated fields of the RNAhybrid output.
# The last label used to be a second 'exon_bound'; duplicate column names are
# ambiguous (and rejected by current pandas), so it is now 'exon_unbound'.
# NOTE(review): presumably the target names look like "chrom:start-stop", which
# is why they occupy the first two fields after splitting on ':' — confirm.
# NOTE(review): check against the RNAhybrid documentation which of the last
# four alignment columns belong to the target and which to the miRNA.
rnahybrid_columns = [
    'chrom', 'start-stop', 'exon_length', 'mirna', 'mirna_length',
    'minimum_free_energy', 'p_value', 'target_bound_start',
    'mirna_unbound', 'mirna_bound', 'exon_bound', 'exon_unbound',
]

# NOTE(review): this path lacks the "_mfe_cutoff" suffix of the results file
# written by the RNAhybrid job submitted earlier in this notebook — confirm
# that this is the intended input.
rnahybrid = pd.read_csv('/projects/ps-yeolab/obotvinnik/miso_helpers/hg19/se_exon2_RNAhybrid_mirbase_human_mature_17bp.txt',
                        sep=':',
                        header=None,
                        names=rnahybrid_columns)

# Keep only hybridizations below -28, the same value passed to RNAhybrid's -e.
# `.loc` with a boolean mask replaces the deprecated (and since removed) `.ix`.
rnahybrid = rnahybrid.loc[rnahybrid.minimum_free_energy < -28]

# For each exon, summarize the miRNAs that hit it as "name[count],name[count],...".
grouped = rnahybrid.groupby(['chrom', 'start-stop'])
chrom_startstop_mirna = grouped.mirna.apply(lambda x: ','.join('{}[{}]'.format(k, v) for k,v in Counter(x).items()))
chrom_startstop_mirna.head()