# Base-wise conservation of introns


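First, load the memory-mapped phastCons conservation pickle that was created in notebook [0.2.09]. The loading itself happens inside the `basewise_conservation.py` script written below, which is then submitted to the cluster.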

In [2]:
# Project root. NOTE: the value ends with '/', so every path derived below
# contains a doubled '//' right after 'singlecell_pnms'.
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/'

# CSVs produced for the paper, and the conservation results inside them
csv_folder = '/'.join([folder, 'csvs_for_paper'])
conservation_folder = '/'.join([csv_folder, 'conservation'])

# Splicing feature data, with one subfolder per exon class
splicing_feature_folder = '/'.join([csv_folder, 'splicing_feature_data'])
alternative_feature_folder = '/'.join([splicing_feature_folder, 'alternative'])
constitutive_feature_folder = '/'.join([splicing_feature_folder, 'constitutive'])

# BED file of exons for each class
alt_exons_bedfile = '/'.join([alternative_feature_folder, 'exons.bed'])
constitutive_bedfile = '/'.join([constitutive_feature_folder, 'exons.bed'])


In [3]:
# Work from the splicing feature folder; the %%file cell below writes
# basewise_conservation.py relative to the current directory.
cd $splicing_feature_folder

/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data


In [4]:
# List the HTSeq memmap directory that basewise_conservation.py reads its pickle from.
# NOTE(review): the saved output of this cell reports the directory as missing --
# confirm the path before re-submitting the job.
ls -lh /home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap

ls: cannot access /home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap: No such file or directory


In [5]:
%%file basewise_conservation.py
import glob
import os
import numpy as np
import pandas as pd
import pybedtools
import HTSeq
from flotilla.util import timestamp

import cPickle as pickle



# Project root. NOTE: the value ends with '/', so every path derived below
# contains a doubled '//' right after 'singlecell_pnms'.
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/'

# CSVs produced for the paper, and the conservation results inside them
csv_folder = folder + '/csvs_for_paper'
conservation_folder = csv_folder + '/conservation'

# Splicing feature data, with one subfolder per exon class
splicing_feature_folder = csv_folder + '/splicing_feature_data'
alternative_feature_folder = splicing_feature_folder + '/alternative'
constitutive_feature_folder = splicing_feature_folder + '/constitutive'

# BED file of exons for each class
alt_exons_bedfile = alternative_feature_folder + '/exons.bed'
constitutive_bedfile = constitutive_feature_folder + '/exons.bed'


# Location of the pickled conservation object that is indexed by
# HTSeq.GenomicInterval further down in this script.
memmap_dir = '/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap'
filename = '{}/hg19_phastcons_placental_mammal_htseq.pickle'.format(memmap_dir)

# Pickles are binary data, so open the file in binary mode ('rb'); text mode
# only works by accident on platforms that do not translate newlines.
# SECURITY: pickle.load can execute arbitrary code -- only load files you created.
with open(filename, 'rb') as f:
    conservation = pickle.load(f)


bedfiles = {'alternative': alt_exons_bedfile, 'constitutive': constitutive_bedfile}

directions = 'upstream', 'downstream'

nt = 400


for exon_type, bedfile in bedfiles.items():
    bed = pybedtools.BedTool(bedfile)

    basename = os.path.basename(bedfile)
    prefix = basename.split('.bed')[0]
    
    for direction in directions:
        if direction == 'downstream':
            # Get downstream intron
            intron = bed.flank(l=0, r=nt, s=True, g=pybedtools.chromsizes('hg19'))
        else:
            intron = bed.flank(l=nt, r=0, s=True, g=pybedtools.chromsizes('hg19'))
        
        # Get just unique upstream,/downstream
        intron = pybedtools.BedTool(list(set(x for x in intron)))
        nrow = len(intron)
        ncol = nt
        array = np.zeros(shape=(nrow, ncol), dtype=float)
        junction_ids = pd.Series(index=np.arange(nrow))

        print 'Iterating over {} intervals in {} {}nt of {} ...'.format(nrow, direction, nt, basename)
        for i, interval in enumerate(intron):
            if (i+1) % 10000 == 0:
                print '\t{}\t{}/{}'.format(timestamp(), i+1, nrow)
            junction_ids[i] = interval.name
            region = conservation[HTSeq.GenomicInterval(str(interval.chrom), interval.start, interval.stop)]
            count = sum(1 for _ in region.values())
            subset = np.fromiter(region.values(),
                                 dtype=float, count=count)
            if interval.strand == '-':
                subset = subset[::-1]
                j = nt - count
                array[i][j:] = subset
            else:
                j = count
                array[i][:j] = subset
        intron_conservation = pd.DataFrame(array, index=junction_ids.values)
        filename = '{}/{}/{}{}_placental_mammal_conservation.csv'.format(splicing_feature_folder, exon_type, direction, nt)
        print '\t', filename
        intron_conservation.to_csv(filename)


Overwriting basewise_conservation.py


In [6]:
# Confirm the working directory; the submission cell below refers to the script
# by its absolute path inside this folder.
pwd

u'/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data'

In [7]:
from qtools import Submitter

# Shell command that runs the script written by the %%file cell above
command = 'python /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/basewise_conservation.py'
commands = [command]

# Kept as the last expression of the cell so the Submitter object is displayed.
# NOTE(review): the script shows no multiprocessing, so ppn=16 is presumably
# requested for memory rather than CPU -- confirm before changing it.
Submitter(
    commands,
    'basewise_conservation',
    ppn=16,
    walltime='48:00:00',
)

Wrote commands to basewise_conservation.sh.
Submitted script to queue home.
 Job ID: 7387135


<qtools.submitter.Submitter at 0x2af6c4058c90>

In [10]:
# List the script, the generated submission script, and any job logs
! ls -lha basewise_conservation*

-rw-r--r-- 1 obotvinnik yeo-group 2.8K Jan  5 12:37 basewise_conservation.py
-rw-r--r-- 1 obotvinnik yeo-group  395 Jan  5 12:37 basewise_conservation.sh
-rw------- 1 obotvinnik yeo-group  446 Feb 21  2016 basewise_conservation.sh.err
-rw------- 1 obotvinnik yeo-group 1.2K Feb 21  2016 basewise_conservation.sh.out


In [8]:
# Show the end of the script, the submission script, and the job's stderr/stdout logs
! tail basewise_conservation*

==> basewise_conservation.py <==
                subset = subset[::-1]
                j = nt - count
                array[i][j:] = subset
            else:
                j = count
                array[i][:j] = subset
        intron_conservation = pd.DataFrame(array, index=junction_ids.values)
        filename = '{}/{}/{}{}_placental_mammal_conservation.csv'.format(splicing_feature_folder, exon_type, direction, nt)
        print '\t', filename
        intron_conservation.to_csv(filename)
==> basewise_conservation.sh <==
#PBS -V
#PBS -l walltime=48:00:00
#PBS -l nodes=1:ppn=16
#PBS -A yeo-group
#PBS -q home

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
python /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/basewise_conservation.py


==> basewise_conservation.sh.err <==
  _get_xdg_config_dir())

==> basewise_conservation.sh.out <==
	2016-02-21 17:45:25	10000/43903
	2016-02-21 17:45:30	20000/43