Generate intermediate pickle files from MongoDB for later pipelines

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import pickle


# Import from the public IPython.display module: importing `display` from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses 90% of the browser width.
display(HTML("<style>.container { width:90% !important; }</style>"))

### Load data 

In [2]:
# Connect with the default client settings and keep a handle on the
# 'mecaned' collection of the 'test' database (despite its name, `db` is
# the collection that the loading loop queries with .find()).
client = MongoClient()
db = client['test']['mecaned']

In [3]:
# Two parallel accumulators filled in the same loop, so entry i of `meta`
# and entry i of `data` always describe the same sample.
meta = []
data = []
# Only documents whose 'cytobands' field is not null are read.
for sam in db.find({'cytobands': {'$ne': None}}):
    # Metadata row: five descriptive fields followed by the number of
    # entries in the sample's 'normalized' field.
    meta.append(
        [sam[key] for key in ('sample_id', 'source', 'project', 'morphology', 'topography')]
        + [len(sam['normalized'])]
    )

    # Bands on chromosomes X and Y are left out of the feature vector.
    autosomal = [band for band in sam['cytobands'] if band['chro'] not in ['X', 'Y']]
    amps = [band['ave_dup'] for band in autosomal]
    dels = [band['ave_del'] for band in autosomal]
    # Feature layout: every 'ave_dup' value first, then every 'ave_del' value.
    features = amps + dels
    data.append(features)

#### meta data to work with

In [4]:
# One row per sample; 'num_segs' holds the length of the sample's
# 'normalized' field as collected by the loading loop.
meta_columns = ['id', 'source', 'project', 'morphology', 'topography', 'num_segs']
df_meta = pd.DataFrame(meta, columns=meta_columns)

#### Save meta to disk

In [7]:
# Persist the raw metadata table; `filepath` is reused by the reload cell.
filepath = '../data/all_bands_meta.pkl'
with open(filepath, 'wb') as meta_out:
    pickle.dump(df_meta, meta_out)

In [8]:
# Read the pickle straight back to confirm the round trip works.
with open(filepath, 'rb') as meta_in:
    t = pickle.load(meta_in)

In [9]:
# Display the frame that was just reloaded from the pickle file.
t

Unnamed: 0,id,source,project,morphology,topography,num_segs
0,440e9ec5-8e61-4f75-b1d1-616941d9456d,TCGA,TCGA-OV,84413,Ovary,391
1,6ca6f0fe-8b53-421e-ad4e-dfbf8c8c5b59,TCGA,TCGA-OV,84413,Ovary,571
2,da9af00f-60ea-456b-9f25-0ab560962769,TCGA,TCGA-OV,84413,Ovary,431
3,8e84a028-5e9c-4d01-9791-db0f04b05bce,TCGA,TCGA-OV,84603,Ovary,394
4,d3f8c91f-e4cc-4018-9fbf-4368dd036b01,TCGA,TCGA-OV,84413,Ovary,252
...,...,...,...,...,...,...
160,d554524c-010c-46dc-8c87-5144627de2ba,TCGA,TCGA-OV,84413,Ovary,276
161,26f4b5b8-7159-41c4-a0d0-d842173a8169,TCGA,TCGA-OV,84413,Ovary,420
162,45ef294e-8a96-4ac8-beb8-fd0016181781,TCGA,TCGA-OV,84413,Ovary,720
163,fd14c1a3-7377-4d5c-bcc6-d7013223e155,TCGA,TCGA-OV,84413,Ovary,747


#### Save feature matrix to disk

In [10]:
# Stack the per-sample feature lists into one array (one row per sample)
# and show its dimensions as the cell output.
feat_mat = np.asarray(data)
feat_mat.shape

(165, 1622)

In [11]:
# Write the feature matrix to disk (note that `filepath` is rebound here).
filepath = '../data/all_bands.pkl'
with open(filepath, 'wb') as feat_out:
    pickle.dump(feat_mat, feat_out)

In [12]:
# Display the metadata table before any label clean-up is applied.
df_meta

Unnamed: 0,id,source,project,morphology,topography,num_segs
0,440e9ec5-8e61-4f75-b1d1-616941d9456d,TCGA,TCGA-OV,84413,Ovary,391
1,6ca6f0fe-8b53-421e-ad4e-dfbf8c8c5b59,TCGA,TCGA-OV,84413,Ovary,571
2,da9af00f-60ea-456b-9f25-0ab560962769,TCGA,TCGA-OV,84413,Ovary,431
3,8e84a028-5e9c-4d01-9791-db0f04b05bce,TCGA,TCGA-OV,84603,Ovary,394
4,d3f8c91f-e4cc-4018-9fbf-4368dd036b01,TCGA,TCGA-OV,84413,Ovary,252
...,...,...,...,...,...,...
160,d554524c-010c-46dc-8c87-5144627de2ba,TCGA,TCGA-OV,84413,Ovary,276
161,26f4b5b8-7159-41c4-a0d0-d842173a8169,TCGA,TCGA-OV,84413,Ovary,420
162,45ef294e-8a96-4ac8-beb8-fd0016181781,TCGA,TCGA-OV,84413,Ovary,720
163,fd14c1a3-7377-4d5c-bcc6-d7013223e155,TCGA,TCGA-OV,84413,Ovary,747


### Create labels

The current labels are not consistent. The goals are:  
- all samples use the same terms
- this holds for both morphology and topography

A shortlist of considered diseases:  
- Breast: TCGA-BRCA, icdot-c50.9, BRCA-EU, BRCA-UK
- Brain: TCGA-GBM, TCGA-LGG, icdot-c71.9
- Cerebellum: icdot-c71.6
- Ovary: TCGA-OV, icdot-c56.9, OV-AU
- Lung: TCGA-LUAD, icdot-c34.9
- Kidney: TCGA-KIRC, icdot-c64.9, RECA-EU
- Colon: TCGA-COAD, icdot-c18.9
- Stomach: TCGA-STAD, icdot-c16.9, GACA-CN
- Skin: TCGA-SKCM, icdot-c44.9, MELA-AU
- Liver: TCGA-LIHC, icdot-c22.0, LICA-FR, LINC-JP, LIRI-JP
- Prostate: TCGA-PRAD, icdot-c61.9, EOPC-DE, PRAD-CA, PRAD-UK



Modify arraymap morphology and topography format

In [13]:
# For arraymap rows: drop the first six characters of the morphology value
# and lower-case the remainder.
# NOTE(review): presumably this strips a 6-character prefix such as 'icdom-' - confirm against the raw data.
# Not idempotent: re-running this cell removes six more characters.
is_arraymap = df_meta['source'] == 'arraymap'
df_meta.loc[is_arraymap, 'morphology'] = df_meta.loc[is_arraymap, 'morphology'].str[6:].str.lower()

In [14]:
# For arraymap rows: drop the first six characters of the topography value
# and lower-case the remainder.
# NOTE(review): presumably this strips the 'icdot-' prefix seen in the shortlist above - confirm against the raw data.
# Not idempotent: re-running this cell removes six more characters.
is_arraymap = df_meta['source'] == 'arraymap'
df_meta.loc[is_arraymap, 'topography'] = df_meta.loc[is_arraymap, 'topography'].str[6:].str.lower()

TCGA morphology

In [15]:
# For TCGA rows: remove every '/' from the morphology value.
tcga_rows = df_meta['source'] == 'TCGA'
df_meta.loc[tcga_rows, 'morphology'] = df_meta.loc[tcga_rows, 'morphology'].replace(['/'], '', regex=True)

For PCAWG samples, use the lower-cased morphology value as topography (the morphology column itself is left unchanged)

In [16]:
# For PCAWG rows: overwrite topography with the lower-cased morphology value.
# NOTE(review): this is a one-way copy; morphology keeps its old value and the
# previous topography is discarded. Confirm that a full swap is not intended.
pcawg_rows = df_meta['source'] == 'PCAWG'
df_meta.loc[pcawg_rows, 'topography'] = df_meta.loc[pcawg_rows, 'morphology'].str.lower()

#### Column of disease site

In [17]:
# Start every sample at the catch-all site 'Others'; the per-site cells
# below overwrite it where a project or topography code matches.
df_meta = df_meta.copy()
df_meta['site'] = 'Others'

#### Breast

In [18]:
# Each (column, value) pair is one matching rule for breast samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-BRCA'),
                      ('topography', 'c50.9'),
                      ('project', 'BRCA-EU'),
                      ('project', 'BRCA-UK')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Breast'

(0, 7)
(0, 7)
(0, 7)
(0, 7)


#### Brain

In [19]:
# Each (column, value) pair is one matching rule for brain samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-GBM'),
                      ('project', 'TCGA-LGG'),
                      ('topography', 'c71.9')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Brain'


(81, 7)
(0, 7)
(0, 7)


#### Cerebellum

In [20]:
# Cerebellum is matched by topography code only.
cerebellum_rows = df_meta['topography'] == 'c71.6'
print(df_meta.loc[cerebellum_rows].shape)
df_meta.loc[cerebellum_rows, 'site'] = 'Cerebellum'

(0, 7)


#### Ovary

In [21]:
# Each (column, value) pair is one matching rule for ovary samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-OV'),
                      ('topography', 'c56.9'),
                      ('project', 'OV-AU')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Ovary'

(84, 7)
(0, 7)
(0, 7)


#### Lung

In [22]:
# Each (column, value) pair is one matching rule for lung samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-LUAD'),
                      ('topography', 'c34.9')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Lung'

(0, 7)
(0, 7)


#### Kidney

In [23]:
# Each (column, value) pair is one matching rule for kidney samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-KIRC'),
                      ('topography', 'c64.9'),
                      ('project', 'RECA-EU')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Kidney'

(0, 7)
(0, 7)
(0, 7)


#### Colon

In [24]:
# Each (column, value) pair is one matching rule for colon samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-COAD'),
                      ('topography', 'c18.9')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Colon'

(0, 7)
(0, 7)


#### Stomach

In [25]:
# Each (column, value) pair is one matching rule for stomach samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-STAD'),
                      ('topography', 'c16.9'),
                      ('project', 'GACA-CN')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Stomach'

(0, 7)
(0, 7)
(0, 7)


#### Skin

In [26]:
# Each (column, value) pair is one matching rule for skin samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-SKCM'),
                      ('topography', 'c44.9'),
                      ('project', 'MELA-AU')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Skin'

(0, 7)
(0, 7)
(0, 7)


#### Liver

In [27]:
# Each (column, value) pair is one matching rule for liver samples. For
# every rule, print the shape of the matched rows, then tag them.
# The topography rule uses 'c22.0', the liver code given in this notebook's
# disease shortlist (icdot-c22.0). The previous filter value 'c22.9' did not
# agree with that shortlist, so topography-coded liver samples would have
# stayed in 'Others'.
for column, value in [('project', 'TCGA-LIHC'),
                      ('topography', 'c22.0'),
                      ('project', 'LICA-FR'),
                      ('project', 'LINC-JP'),
                      ('project', 'LIRI-JP')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Liver'

(0, 7)
(0, 7)
(0, 7)
(0, 7)
(0, 7)


#### Prostate

In [28]:
# Each (column, value) pair is one matching rule for prostate samples. For
# every rule, print the shape of the matched rows, then tag them.
for column, value in [('project', 'TCGA-PRAD'),
                      ('topography', 'c61.9'),
                      ('project', 'EOPC-DE'),
                      ('project', 'PRAD-CA'),
                      ('project', 'PRAD-UK')]:
    selected = df_meta[column] == value
    print(df_meta.loc[selected].shape)
    df_meta.loc[selected, 'site'] = 'Prostate'

(0, 7)
(0, 7)
(0, 7)
(0, 7)
(0, 7)


Check

In [29]:
# Shape of the rows still carrying the default 'Others' site, i.e. the
# samples that none of the per-site rules matched.
df_meta[df_meta['site'].eq('Others')].shape

(0, 7)

In [30]:
# List the distinct site labels that were assigned.
df_meta['site'].unique()

array(['Ovary', 'Brain'], dtype=object)

#### Save meta to disk

In [31]:
# Overwrite the metadata pickle written earlier so that it carries the
# updated morphology and topography labels and the 'site' column.
filepath = '../data/all_bands_meta.pkl'
with open(filepath, 'wb') as meta_out:
    pickle.dump(df_meta, meta_out)

In [32]:
# Display the metadata table, now including the 'site' column.
df_meta

Unnamed: 0,id,source,project,morphology,topography,num_segs,site
0,440e9ec5-8e61-4f75-b1d1-616941d9456d,TCGA,TCGA-OV,84413,Ovary,391,Ovary
1,6ca6f0fe-8b53-421e-ad4e-dfbf8c8c5b59,TCGA,TCGA-OV,84413,Ovary,571,Ovary
2,da9af00f-60ea-456b-9f25-0ab560962769,TCGA,TCGA-OV,84413,Ovary,431,Ovary
3,8e84a028-5e9c-4d01-9791-db0f04b05bce,TCGA,TCGA-OV,84603,Ovary,394,Ovary
4,d3f8c91f-e4cc-4018-9fbf-4368dd036b01,TCGA,TCGA-OV,84413,Ovary,252,Ovary
...,...,...,...,...,...,...,...
160,d554524c-010c-46dc-8c87-5144627de2ba,TCGA,TCGA-OV,84413,Ovary,276,Ovary
161,26f4b5b8-7159-41c4-a0d0-d842173a8169,TCGA,TCGA-OV,84413,Ovary,420,Ovary
162,45ef294e-8a96-4ac8-beb8-fd0016181781,TCGA,TCGA-OV,84413,Ovary,720,Ovary
163,fd14c1a3-7377-4d5c-bcc6-d7013223e155,TCGA,TCGA-OV,84413,Ovary,747,Ovary


### Feature labels

In [33]:
# Expose the row index as an ordinary column named 'index'.
# NOTE(review): this runs after the metadata pickle was written, so the
# saved file does not contain this column - confirm that is intended.
df_meta = df_meta.copy()
df_meta['index'] = df_meta.index

In [34]:
# Site label per sample, in the same row order as the metadata table.
# to_numpy() is what pandas recommends over .values and always yields a
# NumPy array, so the pickled labels do not depend on the column's dtype.
labels = df_meta['site'].to_numpy()

In [35]:
# Distinct site labels together with the number of samples carrying each.
np.unique(labels, return_counts=True)

(array(['Brain', 'Ovary'], dtype=object), array([81, 84]))

Save to disk

In [36]:
# Persist the site label array.
labelpath = '../data/all_bands_label.pkl'
with open(labelpath, 'wb') as label_out:
    pickle.dump(labels, label_out)

### morphology as label

In [37]:
# Reload the labelled metadata written by this notebook into `df`.
filepath = '../data/all_bands_meta.pkl'
with open(filepath, 'rb') as meta_in:
    df = pickle.load(meta_in)

In [38]:
# The disease label starts out as a copy of the morphology value.
df = df.copy()
df['disease'] = df['morphology']

In [39]:
# Samples whose site is 'Others' get the disease label 'Others' as well.
unlabelled = df['site'] == 'Others'
df.loc[unlabelled, 'disease'] = 'Others'

In [40]:
# Shape of the rows whose disease label is missing. Comparing a Series
# with `== None` is evaluated element-wise and is False for every row, so
# that form can never find missing values; isna() detects None/NaN.
df.loc[df['disease'].isna()].shape

(0, 8)

In [41]:
# Overall shape of the frame, for comparison with the missing-label check.
df.shape

(165, 8)

In [42]:
# Work on an explicit, writable object-dtype copy. Mutating the array
# returned by `.values` can write through to `df`, and fails outright when
# pandas hands back a read-only view (copy-on-write).
disease_label = df['disease'].to_numpy(dtype=object, copy=True)
# Replace missing entries with the literal string 'None' so every label is a string.
disease_label[pd.isnull(disease_label)] = 'None'

In [43]:
# Persist the disease (morphology-based) label array.
dlabelpath = '../data/all_bands_disease_label.pkl'
with open(dlabelpath, 'wb') as disease_out:
    pickle.dump(disease_label, disease_out)

In [44]:
# List the distinct disease labels present in the frame.
df['disease'].unique()

array(['84413', '84603', '94403'], dtype=object)

In [45]:
# Repeats the null replacement applied when disease_label was created. The
# array has already been pickled by this point, so this does not change the saved file.
disease_label[pd.isnull(disease_label)] = 'None'

In [46]:
# Distinct values in the disease label array.
np.unique(disease_label)

array(['84413', '84603', '94403'], dtype=object)

### Source label

In [47]:
# Data source per sample, in the same row order as the metadata table.
# to_numpy() is what pandas recommends over .values and always yields a
# NumPy array, so the pickled labels do not depend on the column's dtype.
source_label = df['source'].to_numpy()

In [48]:
# Persist the source label array.
slabelpath = '../data/all_bands_source_label.pkl'
with open(slabelpath, 'wb') as source_out:
    pickle.dump(source_label, source_out)