### Routines to slice and dice a set of 450k probes... 
#### NOTE: Run the last cell in this notebook *first* — it defines the classes and the pickle helpers (`save_obj` / `load_obj`) that the other cells rely on.

In [1]:
# Sample selection: the pickle file name below is assembled from these parts.
minor       = 'BH'
major       = 'A1F5'
tissue      = 'BRCA'

# Locations derived from the selection above.
pickle_base = '../../data/pickle/'
directory   = tissue + '-PAIRED/'

# Sample identifier fragments keyed by tissue code.
TCGA = {
    "LUSC": ['43-6771', '18-5595', '22-5471', '39-5031', '33-4589', '33-4586'],
    "LUAD": ['50-5931', '15-5420', '38-4631', '49-4488', '50-5932', '50-5935'],
    "COAD": ['AA-3697', 'AA-3713', 'AA-3506', 'AZ-6599', 'A6-2686', 'A6-2682'],
    "BRCA": ['BH-A0BZ', 'BH-A1EN', 'BH-A1F5', 'E2-A1IO'],
    "GBM":  ['74-6573', '06-0152'],
}

# Pickle file names for the annotation and for the selected sample.
anno_str = 'Illumina-450k-Anno.{rev}.{ext}'.format(ext='pkl', rev='hg19')
file_str_picl = 'TCGA-{minor}-{major}-{tissue}.{ext}'.format(
    ext='pkl',
    tissue=tissue,
    major=major,
    minor=minor,
)

# Drop the leading 'TCGA-' (5 chars) and the trailing '.pkl' (4 chars) to
# show the bare sample label.
file_str_picl[5:-4]

'BH-A1F5-BRCA'

In [17]:
# anno_str[:-4] drops the '.pkl' suffix because load_obj appends it again
# when it builds the file path.
annotation  = load_obj(anno_str[:-4]) # load the saved annotation file
# Number of entries in annotation.probe.
print (len(annotation.probe))

loading Illumina-450k-Anno.hg19 ...
482421


In [7]:
class Probe:
    """
    Record of the data kept for a single CpG probe.

    Every field is created as None; nothing in this class fills them in.
    """
    def __init__(self):
        # The tuple order is the order in which the fields appear in vars(self).
        for field in ('id', 'seq', 'name', 'chr', 'gene', 'strand', 'refseq',
                      'feature', 'tour', 'hg19_cord', 'hg38_cord',
                      'GDC_gene', 'GDC_cgi_cord', 'GDC_beta_normal',
                      'GDC_beta_tumor', 'GDC_gene_type'):
            setattr(self, field, None)

In [8]:
import _pickle as pickle

def load_obj(name, base_dir='../../data/pickle/'):
    """
    Load and return the object pickled in ``<base_dir>/<name>.pkl``.

    Parameters
    ----------
    name : str
        File name without the '.pkl' extension.
    base_dir : str, optional
        Directory holding the pickle files. Defaults to the directory this
        notebook has always used, so existing calls behave as before.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    import os  # local import: this cell does not import os itself

    print('loading ' + name + ' ...')
    # SECURITY: unpickling can execute code embedded in the file; only load
    # pickles that come from a trusted source.
    with open(os.path.join(base_dir, name + '.pkl'), 'rb') as f:
        return pickle.load(f)

In [13]:
# file_str_picl[:-4] removes the '.pkl' suffix; load_obj adds it back.
methyl  = load_obj(file_str_picl[:-4])
# List the attributes of the loaded object to see what kind of container it is.
print (dir(methyl))

loading TCGA-BH-A1F5-BRCA ...
['__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']


In [12]:
# Look up one entry by its reference id and read its Beta_Normal attribute.
# NOTE(review): neither Probe class defined in this notebook declares
# Beta_Normal, so the pickled objects presumably come from a different class
# definition — confirm.
methyl['cg00000029'].Beta_Normal

0.108354889450899

In [172]:
import pandas as pd

# Convert methyl dictionary to Pandas dataframe: one row per (key, value)
# pair, with the key in 'Ref_Id' and the value object in 'Probe'.
methyl_df = pd.DataFrame(list(methyl.items()), 
                         columns=['Ref_Id', 'Probe'])
# Not the last expression of the cell, so this value is not displayed.
methyl_df.shape
# Label the index with the sample name (file name without 'TCGA-' and '.pkl').
methyl_df.index.name = file_str_picl[5:-4]

In [5]:
import pandas as pd

def _range(probes, start, end):
    '''
    Return an unsorted range of probes. 
    Equivalent to df.head() at any starting location. 
    '''
    if type(probes) is pd.core.frame.DataFrame:
        return probes.iloc[start:end]
    else:
        return dict(list(probes.items())[start:end])

def sort_range_by_refid(probes):
    '''
    Return the probes sorted by reference id.
    Call _range function first to limit scope.

    Parameters
    ----------
    probes : dict or pandas.DataFrame
        A dict keyed by reference id, or a dataframe with a 'Ref_Id' column
        (the layout produced when the probe dict is converted to a frame).

    Returns
    -------
    dict
        For dict input: a new dict whose insertion order is ascending key order.
    pandas.DataFrame
        For dataframe input: a new frame sorted by 'Ref_Id'; the input frame
        is left unchanged.

    Raises
    ------
    KeyError
        If a dataframe without a 'Ref_Id' column is passed.
    '''
    if isinstance(probes, pd.DataFrame):
        # A stable sort keeps the original relative order of equal ids.
        return probes.sort_values('Ref_Id', kind='mergesort')
    # Sort on the key only, so the probe objects never need to be comparable.
    return dict(sorted(probes.items(), key=lambda item: item[0]))

def sort_range_by_coordinate(probes):
    '''
    Return the probes sorted by coordinate, ascending.
    Call _range function first to limit scope.

    The coordinate of a probe is taken from the first of the attributes
    ``cord``, ``hg19_cord``, ``hg38_cord`` that exists and is not None, so
    both objects exposing ``cord`` and the Probe classes of this notebook
    (which store ``hg19_cord`` / ``hg38_cord``) can be sorted.

    Parameters
    ----------
    probes : dict or pandas.DataFrame
        A dict of probe objects keyed by reference id, or a dataframe whose
        'Probe' column holds the probe objects.

    Returns
    -------
    dict
        For dict input: a new dict in ascending coordinate order.
    pandas.DataFrame
        For dataframe input: the rows reordered by ascending coordinate.

    Raises
    ------
    AttributeError
        If a probe carries none of the coordinate attributes.
    KeyError
        If a dataframe without a 'Probe' column is passed.
    '''
    def _coordinate(probe):
        # First populated coordinate attribute wins.
        for attr in ('cord', 'hg19_cord', 'hg38_cord'):
            value = getattr(probe, attr, None)
            if value is not None:
                return value
        raise AttributeError(
            'probe has no coordinate (cord / hg19_cord / hg38_cord)')

    if isinstance(probes, pd.DataFrame):
        coords = [_coordinate(probe) for probe in probes['Probe']]
        # sorted() is stable, so rows with equal coordinates keep their order.
        order = sorted(range(len(coords)), key=coords.__getitem__)
        return probes.iloc[order]
    return dict(sorted(probes.items(),
                       key=lambda item: _coordinate(item[1])))

def get_probes_by_chr(probes, chrom):
    """
    Return the probes located on chromosome ``chrom``.
    Call _range function first to limit scope.

    Parameters
    ----------
    probes : dict or pandas.DataFrame
        A dict of probe objects keyed by reference id, or a dataframe whose
        'Probe' column holds the probe objects.
    chrom
        Value compared with each probe's ``chr`` attribute using ``==``, so
        it must have the same type as the stored value (the annotation
        parser in this notebook stores chromosomes as strings).

    Returns
    -------
    dict
        For dict input: the matching entries, in their original order.
    pandas.DataFrame
        For dataframe input: the matching rows.

    Raises
    ------
    KeyError
        If a dataframe without a 'Probe' column is passed.
    """
    if isinstance(probes, pd.DataFrame):
        # astype(bool) keeps the mask a boolean indexer even for an empty frame.
        mask = probes['Probe'].map(lambda probe: probe.chr == chrom).astype(bool)
        return probes[mask]
    return {ref_id: probe for ref_id, probe in probes.items()
            if probe.chr == chrom}

def get_probes_by_location(probes, interval):
    """
    Return the probes that lie strictly inside a genomic interval.
    Call _range function first to limit scope.

    A probe matches when its ``chr`` equals ``interval.chr`` and its
    coordinate satisfies ``interval.start < coordinate < interval.end``
    (both bounds exclusive). The coordinate is taken from the first of the
    attributes ``cord``, ``hg19_cord``, ``hg38_cord`` that exists and is not
    None.

    Parameters
    ----------
    probes : dict or pandas.DataFrame
        A dict of probe objects keyed by reference id, or a dataframe whose
        'Probe' column holds the probe objects.
    interval
        Object with ``chr``, ``start`` and ``end`` attributes, e.g. an
        Interval instance.

    Returns
    -------
    dict
        For dict input: the matching entries, in their original order.
    pandas.DataFrame
        For dataframe input: the matching rows.

    Raises
    ------
    AttributeError
        If a probe on the requested chromosome has no coordinate attribute.
    """
    chrom = interval.chr
    # Take the bounds from the interval itself; bare `start`/`end` names would
    # resolve to whatever globals happen to exist in the notebook.
    lower, upper = interval.start, interval.end

    def _coordinate(probe):
        # First populated coordinate attribute wins.
        for attr in ('cord', 'hg19_cord', 'hg38_cord'):
            value = getattr(probe, attr, None)
            if value is not None:
                return value
        raise AttributeError(
            'probe has no coordinate (cord / hg19_cord / hg38_cord)')

    def _inside(probe):
        # The chromosome test comes first, so the coordinate is only looked up
        # for probes on the requested chromosome.
        return probe.chr == chrom and lower < _coordinate(probe) < upper

    if isinstance(probes, pd.DataFrame):
        # astype(bool) keeps the mask a boolean indexer even for an empty frame.
        mask = probes['Probe'].map(_inside).astype(bool)
        return probes[mask]
    return {ref_id: probe for ref_id, probe in probes.items()
            if _inside(probe)}

The following methods work when passed either a dictionary or a dataframe.

In [14]:
# Slice bounds handed to _range; Python slicing makes `end` exclusive.
start = 0
end = 10

probe_slice_dict = _range(annotation.probe, start, end) # dict of the first 10 entries.
# NOTE(review): the dataframe variant below is commented out, yet a later cell
# displays probe_slice_df — on a fresh kernel that name will be undefined.
#probe_slice_df = _range(methyl_df, start, end) # df of the first 10 entries.

In [16]:
# Print every probe object held in the sliced dict, one per line.
# NOTE(review): the output saved with this cell is an AttributeError that
# mentions 'hg38_cord' — presumably the pickled objects and the Probe class in
# scope were out of sync; re-run on a fresh kernel to confirm.
for key in list(probe_slice_dict.keys()):
    print (probe_slice_dict[key])

AttributeError: 'Probe' object has no attribute 'hg38_cord'

In [182]:
# Display the dataframe slice.
# NOTE(review): the only visible assignment to probe_slice_df is commented
# out — confirm it is defined before this cell runs.
probe_slice_df

Unnamed: 0_level_0,Ref_Id,Probe
BH-A1F5-BRCA,Unnamed: 1_level_1,Unnamed: 2_level_1
0,cg00000029,<__main__.Probe object at 0x1948d98d0>
1,cg00000165,<__main__.Probe object at 0x1948d9518>
2,cg00000289,<__main__.Probe object at 0x1948d94a8>
3,cg00000363,<__main__.Probe object at 0x1948d92e8>
4,cg00000658,<__main__.Probe object at 0x1948d9240>


In [1]:
# %load '../methylator/annotation/annotate_450k.py'
import os

class Probe:
    """
    Record of the Illumina 450k information kept for a single CpG probe.

    Every field is created as None; the annotation parser fills them in.
    """
    def __init__(self):
        # The tuple order is the order in which the fields appear in vars(self).
        for field in ('id', 'seq', 'name', 'chr', 'hg19_cord', 'strand',
                      'gene', 'refseq', 'tour', 'feature'):
            setattr(self, field, None)
        
class Interval:
    """
    A genomic interval: chromosome, start, end and strand orientation.
    """
    def __init__(self, chromosome, start, end, strand):
        # Note the attribute is named `chr`, not `chromosome`.
        (self.chr,
         self.start,
         self.end,
         self.strand) = chromosome, start, end, strand

class Feature:
    """
    Define a Probe feature.

    String constants naming gene-region categories.

    NOTE(review): nothing in the visible cells reads these constants.
    Presumably they mirror the strings stored in Probe.feature (parsed from
    the UCSC_RefGene_Group column) — confirm the spellings against the
    manifest before comparing with them.
    """
    BODY = "Body"
    TSS200 = "TSS200"
    TSS1500 = "TSS1500"
    UTR5 = "5'UTR"
    UTR3 = "3'UTR"
    EXON = "Exon"

class CpG_location:
    """
    Defines a CpG location.

    String constants naming a position relative to a CpG island.

    NOTE(review): nothing in the visible cells reads these constants.
    Presumably they mirror the strings stored in Probe.tour (parsed from the
    Relation_to_UCSC_CpG_Island column) — confirm against the manifest.
    """
    ISLAND = "Island"
    NSHORE = "N_Shore"
    SSHORE = "S_Shore"
    NSHELF = "N_Shelf"
    SSHELF = "S_Shelf"
    
class SNP:
    """
    Pairs a probe id with the id of a SNP found in that probe.
    Used when filtering probes.
    """

    def __init__(self):
        # Both ids start unset; the code that reads the SNP table fills them in.
        self.probeid = self.snpid = None
    
class Annotate_450k:
    """
    Parse and hold information about Illumina probes.

    Attributes
    ----------
    ann : str
        Path of the manifest file: "../../data/" joined with the path read
        from the configuration file.
    probe : dict
        Maps probe id to its Probe object, one entry per manifest line that
        starts with "cg".
    """

    # Default SNP table read by remove_snp_probes().
    _SNP_TABLE = "../../data/humanmethylation450_dbsnp137.snpupdate.table.v2.sorted.txt"

    def __init__(self, config_path=None):
        """
        Read the manifest location from a configuration file and parse it.

        Parameters
        ----------
        config_path : str, optional
            Text file whose last non-blank line names the manifest, relative
            to "../../data/". Defaults to the module-level ``anno_file``.

        Raises
        ------
        ValueError
            If the configuration file contains no non-blank line.
        """
        if config_path is None:
            config_path = anno_file  # resolved at call time, as before

        manifest_name = None
        with open(config_path, mode="r") as config:
            for line in config:
                line = line.strip("\n").strip("\r")
                # Skip blank lines so a trailing newline in the config file
                # cannot replace the real path with an empty one.
                if line.strip():
                    manifest_name = line
        if manifest_name is None:
            raise ValueError("No manifest path found in %s" % (config_path,))

        self.ann = os.path.join("../../data/", manifest_name)
        self.probe = {}
        self.__run__()

    def __run__(self):
        """
        Fill ``self.probe`` with one Probe per manifest line starting with "cg".

        Lines are split on "," with no CSV quoting support, and columns are
        addressed by fixed position, so the manifest layout must match the
        indexes used below.

        Raises
        ------
        ValueError
            If column 12 of a "cg" line is not an integer.
        IndexError
            If a "cg" line has fewer columns than expected.
        """
        with open(self.ann, mode="r") as manifest:
            for line in manifest:
                if not line.startswith("cg"):
                    continue
                data = line.split(",")
                # Assign probe information.
                new_probe = Probe()
                new_probe.id = data[0]                 # ID
                new_probe.name = data[1]               # name
                new_probe.seq = data[13]               # sequence prior to bi-sulfite conv.
                new_probe.chr = str(data[11])          # chromosome
                new_probe.hg19_cord = int(data[12])    # chromosomal coordinates of the CpG.
                new_probe.strand = data[16]            # strand
                new_probe.gene = data[21].split(";")   # UCSC_RefGene_Name
                new_probe.refseq = data[22]            # UCSC_RefGene_Accession
                # UCSC_RefGene_Group: drop repeated entries while keeping the
                # order in which they first appear.
                new_probe.feature = list(dict.fromkeys(data[23].split(";")))
                new_probe.tour = data[25]              # Relation_to_UCSC_CpG_Island
                self.probe[new_probe.id] = new_probe

    def get_number(self):
        """
        Return total number of probes.
        """
        return len(self.probe)

    def get_probe(self, probe_id):
        """
        Return the probe stored under ``probe_id``.

        Returns
        -------
        The matching probe, or None (after printing a warning) when the id
        is unknown or cannot be used as a dictionary key.
        """
        try:
            return self.probe[probe_id]
        except (KeyError, TypeError):
            # TypeError covers unhashable ids, which were also reported as
            # "not found" before.
            print("WARNING: No probe with ref-id of %s found." % (probe_id,))
            return None

    def remove_snp_probes(self, snp_path=None):
        """
        Remove every probe that is listed in the SNP table.

        Parameters
        ----------
        snp_path : str, optional
            Tab-separated file whose lines starting with "cg" carry the probe
            id in the first column. Defaults to the dbSNP table under
            "../../data/".

        Probe ids that occur several times in the table, or that are not
        present in ``self.probe``, are ignored rather than raising KeyError.
        """
        if snp_path is None:
            snp_path = self._SNP_TABLE

        flagged = set()
        with open(snp_path, "r") as snp_file:
            for line in snp_file:
                if line.startswith("cg"):
                    fields = line.strip("\n").strip("\r").split("\t")
                    flagged.add(fields[0])

        for probe_id in flagged:
            self.probe.pop(probe_id, None)

# Configuration file from which Annotate_450k.__init__ reads the manifest path
# (the path it contains is joined onto "../../data/"). abspath resolves it
# against the working directory at the time this cell runs.
# NOTE(review): the original comment described this file as the Illumina probe
# manifest itself and said the large file is not in the repository —
# presumably that refers to the manifest this config file points to; confirm.
anno_file = os.path.abspath("../../data/config.ini")
# Functions to save/load dictionary objects. 

import _pickle as pickle

def save_obj(obj, name, base_dir='../../data/pickle/'):
    """
    Pickle ``obj`` to ``<base_dir>/<name>.pkl``, replacing any existing file.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        File name without the '.pkl' extension.
    base_dir : str, optional
        Directory that receives the pickle file. Defaults to the directory
        this notebook has always used, so existing calls behave as before.

    Raises
    ------
    FileNotFoundError
        If ``base_dir`` does not exist.
    """
    import os  # local import so the helper also works when copied on its own

    print('saving ' + name + ' ...')
    # Plain 'wb' is enough: the file is only written, never read back here.
    with open(os.path.join(base_dir, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)
        
def load_obj(name, base_dir='../../data/pickle/'):
    """
    Load and return the object pickled in ``<base_dir>/<name>.pkl``.

    Parameters
    ----------
    name : str
        File name without the '.pkl' extension.
    base_dir : str, optional
        Directory holding the pickle files. Defaults to the directory this
        notebook has always used, so existing calls behave as before.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    import os  # local import so the helper also works when copied on its own

    print('loading ' + name + ' ...')
    # SECURITY: unpickling can execute code embedded in the file; only load
    # pickles that come from a trusted source.
    with open(os.path.join(base_dir, name + '.pkl'), 'rb') as f:
        return pickle.load(f)