# Create memory-mapped HTSeq genomic Array

Get chromosome sizes to use memory mapped mode of HTSeq so it's faster

In [3]:
cd ~/projects/singlecell_pnms/scripts

/home/obotvinnik/processing_scripts/singlecell_pnms


In [4]:
 mkdir /home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap

mkdir: cannot create directory `/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap': File exists


In [6]:
%%file memory_mapped_htseq_genomic_array_conservation.py

import pandas as pd
import HTSeq
import cPickle as pickle

chromsizes = pd.read_table('/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes', header=None, index_col=0, squeeze=True)

# Remove all haplotype chromosomes
chromsizes = chromsizes[chromsizes.index.map(lambda x: '_' not in x)]
chromsizes = chromsizes.to_dict()

wiggle_filename = '/projects/ps-yeolab/genomes/hg19/hg19_phastcons_placental_mammal_space_separated.wig'
wig = HTSeq.WiggleReader(wiggle_filename)

conservation = HTSeq.GenomicArray(chromsizes, stranded=False, typecode='d', storage='memmap', 
                                  memmap_dir='/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap')
for location, score in wig:
    conservation[location] += score
    
memmap_dir = '/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap'
with open('{}/hg19_phastcons_placental_mammal_htseq.pickle'.format(memmap_dir), 'wb') as f:
    pickle.dump(conservation, f)

Overwriting memory_mapped_htseq_genomic_array_conservation.py


Populating `HTSeq.GenomicArray` took overnight so plan accordingly

In [7]:
import qtools

commands = [
    # Need the step of changing to the analysis directory so a bunch of 
    # .nmm files don't get created in your home directory
    'cd /home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap'
    'time python /home/obotvinnik/projects/singlecell_pnms/scripts/memory_mapped_htseq_genomic_array_conservation.py']

qtools.Submitter(commands, 'memory_mapped_htseq_genomic_array_conservation', walltime='36:00:00', queue='home-scrm')

Wrote commands to memory_mapped_htseq_genomic_array_conservation.sh.
Submitted script to queue home-scrm.
 Job ID: 7492452


<qtools.submitter.Submitter at 0x2ad0980ede50>

In [5]:
! tail memory_mapped_htseq_genomic_array_conservation*

==> memory_mapped_htseq_genomic_array_conservation.py <==
wig = HTSeq.WiggleReader(wiggle_filename)

conservation = HTSeq.GenomicArray(chromsizes, stranded=False, typecode='d', storage='memmap', 
                                  memmap_dir='/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap')
for location, score in wig:
    conservation[location] += score
    
memmap_dir = '/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap'
with open('{}/hg19_phastcons_placental_mammal_htseq.pickle'.format(memmap_dir), 'wb') as f:
    pickle.dump(conservation, f)
==> memory_mapped_htseq_genomic_array_conservation.sh <==
#PBS -V
#PBS -l walltime=36:00:00
#PBS -l nodes=1:ppn=1
#PBS -A yeo-group
#PBS -q home-scrm

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
time python /home/obotvinnik/projects/singlecell_pnms/scripts/memory_mapped_htseq_genomic_array_conservation.py


==> memory_mapped_htseq_genomic_array_conservation.sh.err 