# Transcribe isoforms to get RNA sequence


In [5]:
%load_ext autoreload
%autoreload 2

import itertools
import re

import gffutils
import numpy as np
import pandas as pd
import pybedtools
import six

# Location of the gffutils database file for the GENCODE v19 (hg19) annotation
v19db_filename = '/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db'
# Handle on that annotation database, opened as a gffutils FeatureDB
v19db = gffutils.FeatureDB(v19db_filename)

# Root of the project; every other path in this cell is derived from it
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'
# NOTE: csv_folder deliberately keeps its trailing slash, unchanged from the
# original definition, so any code that concatenates onto it still works
csv_folder = '{}/csvs_for_paper/'.format(folder)
bed_folder = '{}/bed'.format(folder)

# BED files of alternative (exon2) and constitutive exons
alt_exons_bedfile = '{}/exon2.bed'.format(bed_folder)
constitutive_bedfile = '{}/constitutive_exons.bed'.format(bed_folder)


# Strip the trailing separator from csv_folder before appending a sub-folder;
# otherwise the derived paths contain a doubled slash ("csvs_for_paper//...")
splicing_feature_folder = '{}/splicing_feature_data'.format(csv_folder.rstrip('/'))
alternative_feature_folder = '{}/alternative'.format(splicing_feature_folder)
constitutive_feature_folder = '{}/constitutive'.format(splicing_feature_folder)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
figure_folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnms/isoform_rna_properties'
! mkdir $figure_folder

mkdir: cannot create directory `/home/obotvinnik/Dropbox/figures2/singlecell_pnms/isoform_rna_properties': File exists


## Transcribe isoforms

Get the RNA sequence from the DNA sequence

In [7]:
# Base name of the isoform transcription FASTA file
prefix = 'isoform_transcriptions'
# FASTA path under the project folder; passed to RNAhybrid as the target (-t) further down
transcribed_fasta = '{}/{}.fa'.format(folder, prefix)

## Calculate miRNA hybridization

For RNA targets, James Broughton from the Pasquinelli lab recommends `RNAhybrid`, using just the first 17 nt of each mature miRNA. I wanted to use `fastx_trimmer` for the trimming, but it only accepts DNA sequences - `U`s are illegal :(

In [8]:
from Bio import SeqIO
import sys
import os

# Mature human miRNA sequences from miRBase release 21
filename = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature.fa'

# Parse every FASTA record and keep only its first 17 positions
with open(filename) as infile:
    result_seq = [seq[:17] for seq in SeqIO.parse(infile, 'fasta')]

# Write the truncated records next to the original file
trimmed_filename = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa'
with open(trimmed_filename, 'w') as outfile:
    SeqIO.write(result_seq, outfile, 'fasta')

Submit a compute job to calculate microRNA hybridization.

In [14]:
from gscripts.qtools import Submitter

# Query: the 17 nt trimmed mature miRNAs; target: the transcribed isoforms
mirna_seqs = '/projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa'
# Results file sits beside the transcribed FASTA, with a "_mirbase_human.txt" suffix
rnahybrid_results = transcribed_fasta.replace('.fa', '_mirbase_human.txt')

# Shell command: RNAhybrid's stdout is redirected into the results file
rnahybrid_template = 'RNAhybrid -e -27 -c -s 3utr_human -q {} -t {} > {}'
command = rnahybrid_template.format(mirna_seqs, transcribed_fasta, rnahybrid_results)

# Write the job script and submit it to the cluster with a one-week walltime
sub = Submitter(
    [command],
    'RNAhybrid',
    walltime='168:00:00',
    write_and_submit=True,
)

job ID: 4320901


In [12]:
# List the job script and its stdout/stderr files (sizes and timestamps) in the working directory
ls -lha RNAhybrid*

-rw-r--r-- 1 obotvinnik yeo-group 524 Feb  4 15:06 RNAhybrid.sh
-rw------- 1 obotvinnik yeo-group   0 Oct 16 12:17 RNAhybrid.sh.err
-rw------- 1 obotvinnik yeo-group  24 Oct 19 11:47 RNAhybrid.sh.out


In [13]:
# Show the end of the job script and of its .err/.out files to check on the submitted job
!tail RNAhybrid*

==> RNAhybrid.sh <==
#PBS -V
#PBS -l walltime=168:00:00
#PBS -l nodes=1:ppn=1
#PBS -A yeo-group
#PBS -q home

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
RNAhybrid -e -27 -c -s 3utr_human -q /projects/ps-yeolab/genomes/mirbase/release_21/human_mature_17bp.fa -t /projects/ps-yeolab/obotvinnik/singlecell_pnms/isoform_transcriptions.fa > /projects/ps-yeolab/obotvinnik/miso_helpers/hg19/se_exon2_RNAhybrid_mirbase_human_mature_17bp.txt


==> RNAhybrid.sh.err <==

==> RNAhybrid.sh.out <==
Nodes:        tscc-2-51


Reading the output takes a LONG time