Apply mecan to the TCGA and arraymap data. PCAWG data is processed separately.

In [1]:
from pymongo import MongoClient
import sys, os
import mecan4cna.algorithms as alg
import operator
from natsort import natsorted 

In [2]:
# Single mecan instance reused by the cells below, which set its
# `peak_thresh` attribute and call `run()` on each sample's segments.
m = alg.mecan()

TCGA data

In [3]:

# Load every TCGA sample whose sample_type does not contain 'Normal' from
# the tcga.masked_v2 collection and reshape it into a flat record.
tcga_data = []
db = MongoClient()['tcga']['masked_v2']

for sample in db.find():
    # Guard clause: skip samples whose sample_type mentions 'Normal'.
    if 'Normal' in sample['sample_type']:
        continue

    record = dict(
        source='TCGA',
        project=sample['project_id'],
        sample_id=sample['file_id'],
        morphology=sample['morphology'],
        topography=sample['tissue_origin'],
        stage=sample['tumor_stage'],
        gender=sample['gender'],
        age=sample['age_at_diagnosis'],
        vital_status=sample['vital_status'],
        file_path=sample['file_path'],
        segments=sample['variants_cnv'],
    )
    tcga_data.append(record)

In [4]:
# Run mecan on every TCGA sample with a fixed peak threshold and attach the
# first two elements of the result as 'base' and 'level_distance'.
m.peak_thresh = 5000
for sample in tcga_data:
    res = m.run(sample['segments'])
    # A result with fewer than two elements is stored as None for both fields.
    base, level_dist = (res[0], res[1]) if len(res) > 1 else (None, None)
    sample.update(base=base, level_distance=level_dist)

In [5]:
# Persist the annotated TCGA samples, replacing the previous contents of
# the Rebased.tcga_masked_v2 collection.
db = MongoClient()['Rebased']['tcga_masked_v2']
# insert_many() rejects an empty document list, so validate before drop():
# otherwise an empty run would wipe the existing collection and then fail.
if not tcga_data:
    raise ValueError(
        'tcga_data is empty; refusing to drop Rebased.tcga_masked_v2')
db.drop()
db.insert_many(tcga_data)

<pymongo.results.InsertManyResult at 0x17fe08168>

arraymap data

In [None]:
# Build one record per arraymap biosample by joining the biosample document
# with its individual, callset and variant documents.
arraymap_data = []

# One client serves all four collections instead of one client per collection.
db_am = MongoClient()['arraymap_ga4gh']
db_am_bs = db_am['biosamples']
db_am_ind = db_am['individuals']
db_am_cs = db_am['callsets']
db_am_var = db_am['variants']

for sample in db_am_bs.find():
    bs_id = sample['id']
    ind_id = sample['individual_id']
    project = sample['project_id']
    morphology = sample['biocharacteristics'][1]['type']['id']

    # Skip these two morphology ids (presumably "normal"/unspecified codes
    # -- TODO confirm against the arraymap schema).
    if morphology in ('icdom-00000', 'icdom-'):
        continue

    topography = sample['biocharacteristics'][0]['type']['id']

    # The TNM stage is optional; a missing or non-dict 'info' yields None.
    try:
        stage = sample['info']['tnm']
    except (KeyError, TypeError):
        stage = None

    # NOTE(review): find_one() returns None when no individual matches, in
    # which case the next line raises TypeError -- unchanged from before.
    ind = db_am_ind.find_one({'id': ind_id})
    gender = ind['biocharacteristics'][0]['description']

    age = sample['age_at_collection']['age']
    vital_status = sample['info']['death']

    cs = db_am_cs.find_one({'biosample_id': bs_id})

    # Skip biosamples with no callset (cs is None) or no segment file path.
    try:
        file_path = cs['info']['paths']['segmentfile']
    except (KeyError, TypeError):
        continue

    platform = cs['description']

    segments = [
        {'chro': var['reference_name'],
         'start': var['start'][0],
         'end': var['end'][0],
         'probes': var['info']['cnv_length'],
         'value': var['info']['cnv_value']}
        for var in db_am_var.find({'biosample_id': bs_id})
    ]
    # Natural sort so that e.g. chromosome '2' orders before '10'.
    segments = natsorted(segments, key=operator.itemgetter('chro', 'start'))

    arraymap_data.append({'source': 'arraymap',
                          'project': project,
                          'sample_id': bs_id,
                          'morphology': morphology,
                          'topography': topography,
                          'stage': stage,
                          'gender': gender,
                          'age': age,
                          'vital_status': vital_status,
                          'file_path': file_path,
                          'platform': platform,
                          'segments': segments})

#     if i >10:
#         break
#     else:
#         i +=1

In [None]:
# Drop every sample that has at least one segment whose value is None,
# printing the id of each dropped sample.
# A filtered list is built instead of calling remove() inside the loop:
# removing from a list while iterating over it makes the loop skip the
# element that follows each removed one, so some bad samples would survive.
complete_samples = []
for sample in arraymap_data:
    if any(seg['value'] is None for seg in sample['segments']):
        print(sample['sample_id'])
    else:
        complete_samples.append(sample)
# Slice assignment updates the existing list object in place.
arraymap_data[:] = complete_samples

In [None]:

# Run mecan on every arraymap sample, choosing the peak threshold from the
# platform description, and attach 'base' and 'level_distance' to the sample.
for sample in arraymap_data:
    # Defaults are kept when mecan fails or returns fewer than two elements.
    base = level_dist = None

    try:
        platform = sample['platform']
        if 'snp 6' in platform:
            thresh = 5000
        elif '250k' in platform:
            thresh = 2000
        else:
            thresh = 1000
        m.peak_thresh = thresh

        res = m.run(sample['segments'])
        if len(res) > 1:
            base, level_dist = res[0], res[1]
    except Exception as e:
        # Best effort: report the problem and continue with the next sample.
        print(e)

    sample.update(base=base, level_distance=level_dist)

In [None]:
# Persist the annotated arraymap samples, replacing the previous contents of
# the Rebased.arraymap collection.
db = MongoClient()['Rebased']['arraymap']
# insert_many() rejects an empty document list, so validate before drop():
# otherwise an empty run would wipe the existing collection and then fail.
if not arraymap_data:
    raise ValueError(
        'arraymap_data is empty; refusing to drop Rebased.arraymap')
db.drop()
db.insert_many(arraymap_data)

In [None]:
# Count the stored documents that carry a 'base' field and print the total.
i = sum(1 for s in db.find() if 'base' in s)
print(i)

In [None]:
# Number of arraymap samples currently held in memory.
len(arraymap_data)

In [None]:
# Scratch cell: show the segments of whichever sample `sample` is currently
# bound to (when run top to bottom, the last one visited by the loop above).
sample['segments']

In [None]:
# Scratch cell: evaluate 2 * 2**value, rounded to four decimals, for each
# segment of the current sample. Only the last result stays bound to `value`.
for t in sample['segments']:
    exponent = t['value']
    value = round(2 * 2 ** exponent, 4)

In [None]:
# Scratch cell: run mecan on the currently bound `sample` and print the
# error message if it raises; the result itself is discarded.
try:
    m.run(sample['segments'])
except Exception as err:
    print(err)