### Join or Split JHU Methylation Data Sets

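NOTE: Before running, __go to the last cell__ (the one that defines the `Probe` classes and the `save_obj`/`load_obj` helpers) and run it first; the cells below depend on those definitions.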

In [2]:
import os
import pandas as pd

In [3]:
# Notebook-wide settings.
pickle_base = '../../data/pickle/'
tissue = 'BRCA'
directory = '{}-PAIRED/'.format(tissue)
# The two halves of the id 'BH-A1F5', which is one of the BRCA entries below
# (presumably the sample currently being worked on -- confirm).
minor, major = 'BH', 'A1F5'

# Short sample ids ('<part1>-<part2>') grouped by tissue code.
TCGA = dict(
    LUSC=['43-6771', '18-5595', '22-5471', '39-5031', '33-4589', '33-4586'],
    LUAD=['50-5931', '15-5420', '38-4631', '49-4488', '50-5932', '50-5935'],
    COAD=['AA-3697', 'AA-3713', 'AA-3506', 'AZ-6599', 'A6-2686', 'A6-2682'],
    BRCA=['BH-A0BZ', 'BH-A1EN', 'BH-A1F5', 'E2-A1IO'],
    GBM=['74-6573', '06-0152'],
)

In [4]:
def load_samples(k, catalog=None):
    """
    Build the full sample names for one tissue code.

    Each short id ``'<s1>-<s2>'`` listed under ``k`` is expanded to
    ``'TCGA-<s1>-<s2>-<k>'``. Only the first two dash-separated pieces of an
    id are used.

    Parameters
    ----------
    k : str
        Tissue code to look up (e.g. ``'GBM'``).
    catalog : dict, optional
        Mapping of tissue code -> list of short ids. Defaults to the
        notebook-level ``TCGA`` dictionary.

    Returns
    -------
    list of str
        Expanded sample names in catalog order; empty when ``k`` is not a
        key of the catalog.
    """
    if catalog is None:
        catalog = TCGA

    sample_list = []
    # Direct lookup instead of scanning every key for an equality match.
    for sample in catalog.get(k, []):
        parts = sample.split('-')
        sample_list.append('TCGA-{s1}-{s2}-{k}'.format(k=k,
                                                     s1=parts[0],
                                                     s2=parts[1]))
    return sample_list

In [5]:
# Collect {full sample name: unpickled object} for every GBM sample.
# Only the unpickling step can hit a missing file, so only it sits in the try;
# samples loaded before a failure stay in the dictionary.
GBM_sample_dict = {}
sample_list = load_samples('GBM')
try:
    for sample in sample_list:
        GBM_sample_dict[sample] = load_obj(sample)
except FileNotFoundError:
    print('File not found...')

loading TCGA-74-6573-GBM ...
loading TCGA-06-0152-GBM ...


In [6]:
# Peek at the first two (probe id -> Probe) entries loaded for sample TCGA-74-6573-GBM.
dict(list(GBM_sample_dict['TCGA-74-6573-GBM'].items())[0:2])

{'cg00000029': <__main__.Probe at 0x113b68320>,
 'cg00000165': <__main__.Probe at 0x113b68a20>}

In [7]:
# Peek at the first two (probe id -> Probe) entries loaded for sample TCGA-06-0152-GBM.
dict(list(GBM_sample_dict['TCGA-06-0152-GBM'].items())[0:2])

{'cg00000029': <__main__.Probe at 0x116993d30>,
 'cg00000165': <__main__.Probe at 0x126d3fdd8>}

In [1]:
# %load '../methylator/annotation/annotate_450k.py'
import os

class Probe:
    """
    Record describing one Illumina 450k probe (a single CpG site).

    A new instance has every field set to None; the fields are filled in by
    whoever builds the probe.
    """
    def __init__(self):
        # Fields, created in this order and all initially unset:
        #   id, seq, name      -- probe id, probe sequence, probe name
        #   chr, hg19_cord     -- chromosome and hg19 coordinate of the CpG
        #   strand             -- strand
        #   gene, refseq       -- UCSC RefGene name(s) and accession
        #   tour               -- relation to UCSC CpG island
        #   feature            -- UCSC RefGene group(s)
        for field in ("id", "seq", "name", "chr", "hg19_cord",
                      "strand", "gene", "refseq", "tour", "feature"):
            setattr(self, field, None)
        
class Interval:
    """
    A genomic interval: chromosome, start/end positions and strand orientation.
    """
    def __init__(self, chromosome, start, end, strand):
        # Store the four constructor arguments unchanged.
        self.chr, self.start = chromosome, start
        self.end, self.strand = end, strand

class Feature:
    """
    Named string constants for the gene-region labels of a probe.

    Presumably these are the labels found in the manifest's
    UCSC_RefGene_Group column (stored in Probe.feature) -- confirm.
    """
    BODY, TSS200, TSS1500 = "Body", "TSS200", "TSS1500"
    UTR5, UTR3, EXON = "5'UTR", "3'UTR", "Exon"

class CpG_location:
    """
    Named string constants for where a CpG sits relative to a CpG island.

    Presumably these are the values of the manifest's
    Relation_to_UCSC_CpG_Island column (stored in Probe.tour) -- confirm.
    """
    ISLAND = "Island"
    NSHORE, SSHORE = "N_Shore", "S_Shore"
    NSHELF, SSHELF = "N_Shelf", "S_Shelf"
    
class SNP:
    """
    Pairs a probe id with the id of a SNP located in that probe.
    Used when filtering probes.
    """

    def __init__(self):
        # Both ids start unset; the caller fills them in.
        self.probeid = self.snpid = None
    
class Annotate_450k:
    """
    Parse and hold information about Illumina probes.

    On construction, the config file named by the module-level ``anno_file``
    is read to obtain the manifest file name (resolved under ``../../data/``).
    Every manifest row that starts with ``cg`` is turned into a ``Probe`` and
    stored in ``self.probe`` keyed by probe id.
    """

    def __init__(self):
        # The config file lists the manifest file name; when several entries
        # are present the last one wins. Blank lines are skipped so that a
        # stray empty line cannot leave self.ann pointing at the directory.
        manifest_name = None
        with open(anno_file, mode="r") as config:
            for line in config:
                entry = line.strip("\n").strip("\r")
                if entry:
                    manifest_name = entry
        if manifest_name is None:
            raise ValueError("No manifest file name found in %s" % anno_file)
        self.ann = os.path.join("../../data/", manifest_name)

        self.probe = {}
        self.__run__()

    def __run__(self):
        """
        Read the manifest at ``self.ann`` and fill ``self.probe``.

        Only rows beginning with ``cg`` are parsed; each becomes one ``Probe``.
        Raises ValueError if a row's coordinate column is not an integer.
        """
        # NOTE(review): rows are split on every comma, so a quoted field that
        # contains a comma would shift the column indices -- confirm the
        # manifest has no such fields.
        with open(self.ann, mode="r") as manifest:
            for row in manifest:
                if not row.startswith("cg"):
                    continue
                data = row.split(",")
                # Assign probe information.
                new_probe = Probe()
                new_probe.id = data[0]                 # ID
                new_probe.name = data[1]               # name
                new_probe.seq = data[13]               # sequence prior to bi-sulfite conv.
                new_probe.chr = str(data[11])          # chromosome
                new_probe.hg19_cord = int(data[12])    # chromosomal coordinates of the CpG.
                new_probe.strand = data[16]            # strand
                new_probe.gene = data[21].split(";")   # UCSC_RefGene_Name
                new_probe.refseq = data[22]            # UCSC_RefGene_Accession

                # UCSC_RefGene_Group: drop repeats, keep first-seen order.
                list_features = []
                for group in data[23].split(";"):
                    if group not in list_features:
                        list_features.append(group)
                new_probe.feature = list_features

                new_probe.tour = data[25]              # Relation_to_UCSC_CpG_Island
                self.probe[new_probe.id] = new_probe

    def get_number(self):
        """
        Return total number of probes.
        """
        return len(self.probe)

    def get_probe(self, probe_id):
        """
        Return the probe stored under ``probe_id``.

        Prints a warning and returns None when there is no such probe (or the
        id cannot be used as a dictionary key).
        """
        try:
            return self.probe[probe_id]
        except (KeyError, TypeError):
            print("WARNING: No probe with ref-id of %s found." % probe_id)
            return None

    def remove_snp_probes(self, snp_path=None):
        """
        Remove from ``self.probe`` every probe listed in the SNP table.

        Parameters
        ----------
        snp_path : str, optional
            Tab-separated table whose first column is a probe id; only lines
            starting with ``cg`` are used. Defaults to the dbSNP137 table
            under ``../../data/``.
        """
        if snp_path is None:
            snp_path = "../../data/humanmethylation450_dbsnp137.snpupdate.table.v2.sorted.txt"

        with open(snp_path, "r") as snp_file:
            for line in snp_file:
                if line.startswith("cg"):
                    probe_id = line.rstrip("\r\n").split("\t")[0]
                    # A probe may be listed more than once (or be absent from
                    # the manifest); the default keeps pop() from raising
                    # KeyError in those cases.
                    self.probe.pop(probe_id, None)

# Config file read by Annotate_450k.__init__: it names the Illumina probe
# manifest file, which is looked up under ../../data/ (if the config has
# several entries, the last one is used).
# Note: the (large) manifest file is not in the repository.
anno_file = os.path.abspath("../../data/config.ini")
# Functions to save/load dictionary objects. 

import _pickle as pickle

def save_obj(obj, name, base_dir='../../data/pickle/'):
    """
    Pickle ``obj`` to ``<base_dir>/<name>.pkl``, overwriting any existing file.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        File name without the ``.pkl`` extension.
    base_dir : str, optional
        Directory to write into; defaults to the notebook's pickle folder.
    """
    print('saving ' + name + ' ...')
    # Plain 'wb' is enough: the file is only written, never read back here.
    with open(os.path.join(base_dir, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)
        
def load_obj(name, base_dir='../../data/pickle/'):
    """
    Load and return the object pickled at ``<base_dir>/<name>.pkl``.

    Parameters
    ----------
    name : str
        File name without the ``.pkl`` extension.
    base_dir : str, optional
        Directory to read from; defaults to the notebook's pickle folder.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    print('loading ' + name + ' ...')
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # were produced by this project.
    with open(os.path.join(base_dir, name + '.pkl'), 'rb') as f:
        return pickle.load(f)