# Create a memory-mapped HTSeq `GenomicArray`

Load the chromosome sizes so that HTSeq can use its memory-mapped storage mode, which is faster.

In [1]:
import pandas as pd

# Read the chromosome-sizes table (no header row): the first column becomes
# the index and ``.iloc[:, 0]`` pulls out the first remaining column as a
# Series. The ``squeeze=True`` reader keyword that used to do this was removed
# in pandas 2.0, whereas ``.iloc[:, 0]`` works on old and new pandas alike.
chromsizes_all = pd.read_table('/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes',
                               header=None, index_col=0).iloc[:, 0]

# Remove all haplotype chromosomes: drop every entry whose name contains '_'
is_primary = ['_' not in name for name in chromsizes_all.index]

# HTSeq.GenomicArray is given this plain {name: size} dict in a later cell
chromsizes = chromsizes_all[is_primary].to_dict()
chromsizes

{'chr1': 249250621,
 'chr10': 135534747,
 'chr11': 135006516,
 'chr12': 133851895,
 'chr13': 115169878,
 'chr14': 107349540,
 'chr15': 102531392,
 'chr16': 90354753,
 'chr17': 81195210,
 'chr18': 78077248,
 'chr19': 59128983,
 'chr2': 243199373,
 'chr20': 63025520,
 'chr21': 48129895,
 'chr22': 51304566,
 'chr3': 198022430,
 'chr4': 191154276,
 'chr5': 180915260,
 'chr6': 171115067,
 'chr7': 159138663,
 'chr8': 146364022,
 'chr9': 141213431,
 'chrM': 16571,
 'chrX': 155270560,
 'chrY': 59373566}

Populating the `HTSeq.GenomicArray` took overnight to run, so plan accordingly.

In [None]:
%%time
import HTSeq

wiggle_filename = '/projects/ps-yeolab/genomes/hg19/hg19_phastcons_placental_mammal_space_separated.wig'
wig = HTSeq.WiggleReader(wiggle_filename)

conservation = HTSeq.GenomicArray(chromsizes, stranded=False, typecode='d', storage='memmap', 
                                  memmap_dir='/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap')
for location, score in wig:
    conservation[location] += score

In [None]:
%%time

import cPickle as pickle
memmap_dir = '/home/obotvinnik/projects/singlecell_pnms/analysis/htseq_memmap'
with open('{}/hg19_phastcons_placental_mammal_htseq.pickle'.format(memmap_dir), 'wb') as f:
    pickle.dump(conservation, f)