Generate intermediate pickle files from mongodb for later pipelines

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import pickle


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

### Load data 

In [2]:
db = MongoClient()['Rebased']['mecaned']

In [3]:
meta = []
data = []
for sam in db.find({'cytobands':{'$ne':None}}):
    meta.append([sam['sample_id'],
                 sam['source'],
                 sam['project'],
                 sam['morphology'],
                 sam['topography'],
                 len(sam['normalized'])
                ])
    
    amps = []
    dels = []
    features = []
    for band in sam['cytobands']:
        if band['chro'] not in ['X','Y']:
            amps.append(band['ave_dup'])
            dels.append(band['ave_del'])
    features = amps + dels
    data.append(features)

#### meta data to work with

In [4]:
df_meta = pd.DataFrame(meta, columns=['id','source','project','morphology','topography','num_segs'])

#### Save meta to disk

In [11]:
filepath = '../data/all_bands_meta.pkl'
with open(filepath, 'wb') as fo:
    pickle.dump(df_meta, fo)

#### Save feature matrix to disk

In [5]:
feat_mat = np.array(data)
feat_mat.shape

(42820, 1622)

In [129]:
filepath = '../data/all_bands.pkl'
with open(filepath, 'wb') as fo:
    pickle.dump(feat_mat, fo)

In [6]:
df_meta

Unnamed: 0,id,source,project,morphology,topography,num_segs
0,440e9ec5-8e61-4f75-b1d1-616941d9456d,TCGA,TCGA-OV,8441/3,Ovary,391
1,6ca6f0fe-8b53-421e-ad4e-dfbf8c8c5b59,TCGA,TCGA-OV,8441/3,Ovary,571
2,da9af00f-60ea-456b-9f25-0ab560962769,TCGA,TCGA-OV,8441/3,Ovary,431
3,8e84a028-5e9c-4d01-9791-db0f04b05bce,TCGA,TCGA-OV,8460/3,Ovary,394
4,d3f8c91f-e4cc-4018-9fbf-4368dd036b01,TCGA,TCGA-OV,8441/3,Ovary,252
...,...,...,...,...,...,...
42815,PGX_AM_BS_GSM1414445,arraymap,GSE58579,icdom-84421,icdot-C56.9,104
42816,PGX_AM_BS_GSM1414446,arraymap,GSE58579,icdom-84421,icdot-C56.9,240
42817,PGX_AM_BS_GSM1414451,arraymap,GSE58579,icdom-84421,icdot-C56.9,79
42818,PGX_AM_BS_GSM1431034,arraymap,GSE58579,icdom-84421,icdot-C56.9,133


### Create lablels

The current labels are not consistent, the goal is:  
- samples use same terms
- both morphology and topography

A shortlist of condiered diseases:  
- Breast: TCGA-BRCA, icdot-c50.9, BRCA-EU, BRCA-UK
- Brain: TCGA-GBM, TCGA-LGG, icdot-c71.9
- Cerebellum: icdot-c71.6
- Ovary: TCGA-OV, icdot-c56.9, OV-AU
- Lung: TCGA-LUAD, icdot-c34.9, 
- Kidney: TCGA-KIRC, icdot-c64.9, RECA-EU
- Colon: TCGA-COAD, icdot-c18.9
- Stomach: TCGA-STAD, icdot-c16.9, GACA-CN
- Skin: TCGA-SKCM, icdot-c44.9, MELA-AU
- Liver: TCGA-LIHC, icdot-c22.0, LICA-FR, LINC-JP, LIRI-JP
- Prostate: TCGA-PRAD, icdot-c61.9, EOPC-DE, PRAD-CA, PRAD-UK



Modify arraymap morpholgy and topography format

In [7]:
df_meta.loc[df_meta['source'] == 'arraymap', 'morphology'] = df_meta.loc[df_meta['source'] == 'arraymap', 'morphology'].str[6:].str.lower()

In [8]:
df_meta.loc[df_meta['source'] == 'arraymap', 'topography'] = df_meta.loc[df_meta['source'] == 'arraymap', 'topography'].str[6:].str.lower()

TCGA morphology

In [9]:
df_meta.loc[df_meta['source'] == 'TCGA', 'morphology'] = df_meta.loc[df_meta['source'] == 'TCGA', 'morphology'].replace(['/'],'', regex=True)

Switch PCAWG morphology and topography

In [10]:
df_meta.loc[df_meta['source'] == 'PCAWG', 'topography'] = df_meta.loc[df_meta['source'] == 'PCAWG', 'morphology'].str.lower()

#### Column of disease site

In [11]:
df_meta = df_meta.assign(site='Others')

#### Breast

In [12]:
print(df_meta.loc[df_meta['project'] == 'TCGA-BRCA'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-BRCA', 'site'] = 'Breast'

print(df_meta.loc[df_meta['topography'] == 'c50.9'].shape)
df_meta.loc[df_meta['topography'] == 'c50.9', 'site'] = 'Breast'

print(df_meta.loc[df_meta['project'] == 'BRCA-EU'].shape)
df_meta.loc[df_meta['project'] == 'BRCA-EU', 'site'] = 'Breast'

print(df_meta.loc[df_meta['project'] == 'BRCA-UK'].shape)
df_meta.loc[df_meta['project'] == 'BRCA-UK', 'site'] = 'Breast'

(1043, 7)
(5101, 7)
(73, 7)
(38, 7)


#### Brain

In [24]:
print(df_meta.loc[df_meta['project'] == 'TCGA-GBM'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-GBM', 'site'] = 'Brain'

print(df_meta.loc[df_meta['project'] == 'TCGA-LGG'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-LGG', 'site'] = 'Brain'

print(df_meta.loc[df_meta['topography'] == 'c71.9'].shape)
df_meta.loc[df_meta['topography'] == 'c71.9', 'site'] = 'Brain'


(615, 7)
(511, 7)
(1682, 7)


#### Cerebellum

In [25]:
print(df_meta.loc[df_meta['topography'] == 'c71.6'].shape)
df_meta.loc[df_meta['topography'] == 'c71.6', 'site'] = 'Cerebellum'

(1708, 7)


#### Ovary

In [26]:
print(df_meta.loc[df_meta['project'] == 'TCGA-OV'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-OV', 'site'] = 'Ovary'

print(df_meta.loc[df_meta['topography'] == 'c56.9'].shape)
df_meta.loc[df_meta['topography'] == 'c56.9', 'site'] = 'Ovary'

print(df_meta.loc[df_meta['project'] == 'OV-AU'].shape)
df_meta.loc[df_meta['project'] == 'OV-AU', 'site'] = 'Ovary'

(511, 7)
(1564, 7)
(53, 7)


#### Lung

In [27]:
print(df_meta.loc[df_meta['project'] == 'TCGA-LUAD'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-LUAD', 'site'] = 'Lung'

print(df_meta.loc[df_meta['topography'] == 'c34.9'].shape)
df_meta.loc[df_meta['topography'] == 'c34.9', 'site'] = 'Lung'

(537, 7)
(3431, 7)


#### Kidney

In [28]:
print(df_meta.loc[df_meta['project'] == 'TCGA-KIRC'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-KIRC', 'site'] = 'Kidney'

print(df_meta.loc[df_meta['topography'] == 'c64.9'].shape)
df_meta.loc[df_meta['topography'] == 'c64.9', 'site'] = 'Kidney'

print(df_meta.loc[df_meta['project'] == 'RECA-EU'].shape)
df_meta.loc[df_meta['project'] == 'RECA-EU', 'site'] = 'Kidney'

(565, 7)
(747, 7)
(73, 7)


#### Colon

In [29]:
print(df_meta.loc[df_meta['project'] == 'TCGA-COAD'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-COAD', 'site'] = 'Colon'

print(df_meta.loc[df_meta['topography'] == 'c18.9'].shape)
df_meta.loc[df_meta['topography'] == 'c18.9', 'site'] = 'Colon'

(463, 7)
(1369, 7)


#### Stomach

In [30]:
print(df_meta.loc[df_meta['project'] == 'TCGA-STAD'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-STAD', 'site'] = 'Stomach'

print(df_meta.loc[df_meta['topography'] == 'c16.9'].shape)
df_meta.loc[df_meta['topography'] == 'c16.9', 'site'] = 'Stomach'

print(df_meta.loc[df_meta['project'] == 'GACA-CN'].shape)
df_meta.loc[df_meta['project'] == 'GACA-CN', 'site'] = 'Stomach'

(411, 7)
(863, 7)
(29, 7)


#### Skin

In [31]:
print(df_meta.loc[df_meta['project'] == 'TCGA-SKCM'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-SKCM', 'site'] = 'Skin'

print(df_meta.loc[df_meta['topography'] == 'c44.9'].shape)
df_meta.loc[df_meta['topography'] == 'c44.9', 'site'] = 'Skin'

print(df_meta.loc[df_meta['project'] == 'MELA-AU'].shape)
df_meta.loc[df_meta['project'] == 'MELA-AU', 'site'] = 'Skin'

(437, 7)
(739, 7)
(62, 7)


#### Liver

In [32]:
print(df_meta.loc[df_meta['project'] == 'TCGA-LIHC'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-LIHC', 'site'] = 'Liver'

print(df_meta.loc[df_meta['topography'] == 'c22.9'].shape)
df_meta.loc[df_meta['topography'] == 'c22.9', 'site'] = 'Liver'

print(df_meta.loc[df_meta['project'] == 'LICA-FR'].shape)
df_meta.loc[df_meta['project'] == 'LICA-FR', 'site'] = 'Liver'

print(df_meta.loc[df_meta['project'] == 'LINC-JP'].shape)
df_meta.loc[df_meta['project'] == 'LINC-JP', 'site'] = 'Liver'

print(df_meta.loc[df_meta['project'] == 'LIRI-JP'].shape)
df_meta.loc[df_meta['project'] == 'LIRI-JP', 'site'] = 'Liver'

(363, 7)
(25, 7)
(5, 7)
(27, 7)
(253, 7)


#### Prostate

In [22]:
print(df_meta.loc[df_meta['project'] == 'TCGA-PRAD'].shape)
df_meta.loc[df_meta['project'] == 'TCGA-PRAD', 'site'] = 'Prostate'

print(df_meta.loc[df_meta['topography'] == 'c61.9'].shape)
df_meta.loc[df_meta['topography'] == 'c61.9', 'site'] = 'Prostate'

print(df_meta.loc[df_meta['project'] == 'EOPC-DE'].shape)
df_meta.loc[df_meta['project'] == 'EOPC-DE', 'site'] = 'Prostate'

print(df_meta.loc[df_meta['project'] == 'PRAD-CA'].shape)
df_meta.loc[df_meta['project'] == 'PRAD-CA', 'site'] = 'Prostate'

print(df_meta.loc[df_meta['project'] == 'PRAD-UK'].shape)
df_meta.loc[df_meta['project'] == 'PRAD-UK', 'site'] = 'Prostate'

(453, 7)
(492, 7)
(68, 7)
(110, 7)
(75, 7)


Check

In [33]:
df_meta.loc[df_meta['site'] == 'Others'].shape

(18335, 7)

In [34]:
df_meta['site'].unique()

array(['Ovary', 'Brain', 'Breast', 'Kidney', 'Others', 'Prostate', 'Skin',
       'Colon', 'Liver', 'Lung', 'Stomach', 'Cerebellum'], dtype=object)

#### Save meta to disk

In [35]:
## updated morphology and topography labels
filepath = '../data/all_bands_meta.pkl'
with open(filepath, 'wb') as fo:
    pickle.dump(df_meta, fo)

In [23]:
df_meta

Unnamed: 0,id,source,project,morphology,topography,num_segs,site
0,440e9ec5-8e61-4f75-b1d1-616941d9456d,TCGA,TCGA-OV,84413,Ovary,391,Ovary
1,6ca6f0fe-8b53-421e-ad4e-dfbf8c8c5b59,TCGA,TCGA-OV,84413,Ovary,571,Ovary
2,da9af00f-60ea-456b-9f25-0ab560962769,TCGA,TCGA-OV,84413,Ovary,431,Ovary
3,8e84a028-5e9c-4d01-9791-db0f04b05bce,TCGA,TCGA-OV,84603,Ovary,394,Ovary
4,d3f8c91f-e4cc-4018-9fbf-4368dd036b01,TCGA,TCGA-OV,84413,Ovary,252,Ovary
...,...,...,...,...,...,...,...
42815,PGX_AM_BS_GSM1414445,arraymap,GSE58579,84421,c56.9,104,Ovary
42816,PGX_AM_BS_GSM1414446,arraymap,GSE58579,84421,c56.9,240,Ovary
42817,PGX_AM_BS_GSM1414451,arraymap,GSE58579,84421,c56.9,79,Ovary
42818,PGX_AM_BS_GSM1431034,arraymap,GSE58579,84421,c56.9,133,Ovary


### Feature labels

In [36]:
df_meta = df_meta.assign(index = df_meta.index)

In [37]:
labels = df_meta['site'].values

In [42]:
np.unique(labels, return_counts=True)

(array(['Brain', 'Breast', 'Cerebellum', 'Colon', 'Kidney', 'Liver',
        'Lung', 'Others', 'Ovary', 'Prostate', 'Skin', 'Stomach'],
       dtype=object),
 array([ 2808,  6244,  1708,  1832,  1385,   673,  3968, 18335,  2128,
         1198,  1238,  1303]))

Save to disk

In [39]:
labelpath = '../data/all_bands_label.pkl'
with open(labelpath, 'wb') as fo:
    pickle.dump(labels, fo)

### morphology as label

In [43]:
filepath = '../data/all_bands_meta.pkl'
with open(filepath, 'rb') as fi:
    df = pickle.load(fi)

In [44]:
df = df.assign(disease=df['morphology'])

In [45]:
df.loc[df['site'] == 'Others', 'disease'] = 'Others'

In [46]:
df.loc[df['disease'] == None].shape

(0, 8)

In [47]:
df.shape

(42820, 8)

In [48]:
disease_label = df['disease'].values
disease_label[pd.isnull(disease_label)] = 'None'

In [49]:
dlabelpath = '../data/all_bands_disease_label.pkl'
with open(dlabelpath, 'wb') as fo:
    pickle.dump(disease_label, fo)

In [50]:
df['disease'].unique()

array(['84413', '84603', '94403', 'None', '85003', '85203', '85223',
       '85753', '90203', '84803', '80503', '85243', '80103', '85103',
       '80223', '85073', '85233', '85413', '83103', '83123', 'Others',
       '93823', '94513', '94013', '94503', '94003', '81403', '87203',
       '87443', '87433', '87303', '87423', '87213', '82553', '87703',
       '85743', '81703', '81713', '81803', '82503', '85503', '82533',
       '82523', '82653', '82603', '82113', '81443', '81453', '84903',
       '82303', '82013', '87713', '84613', '80903', '85033', '81743',
       '85603', '87723', '87453', '84013', '85023', '81733', 'C61',
       'C26.8', 'C64', 'C50.1', 'C71.6', 'C22', 'C56', 'C50.9', 'C50.4',
       'C48.2', 'C43.5', 'C43.6', 'C50.5', 'C43.7', 'C50.2', 'C43.3',
       'C50.6', 'C57', 'C43.9', 'C43.4', 'C22.0', 'C50.412', 'C50',
       'C50.3', 'C50.2/C50.4', 'C30.0', 'C50.8', 'C50.912', 'C43.2',
       '94713', '94703', '94743', '82903', '83173', '93803', '85303',
       '80203', '90643

In [51]:
disease_label[pd.isnull(disease_label)] = 'None'

In [52]:
np.unique(disease_label)

array(['80053', '80103', '80123', '80203', '80223', '80313', '80413',
       '80463', '80500', '80503', '80701', '80703', '80710', '80903',
       '81400', '81402', '81403', '81443', '81453', '81703', '81713',
       '81733', '81743', '81803', '82003', '82013', '82113', '82303',
       '82403', '82503', '82513', '82523', '82533', '82543', '82553',
       '82603', '82653', '82900', '82903', '83103', '83123', '83173',
       '83183', '83803', '84013', '84303', '84403', '84413', '84421',
       '84603', '84613', '84700', '84703', '84800', '84803', '84903',
       '85002', '85003', '85004', '85023', '85032', '85033', '85073',
       '85103', '85202', '85203', '85223', '85233', '85243', '85303',
       '85413', '85503', '85603', '85743', '85753', '86203', '87203',
       '87213', '87303', '87423', '87433', '87443', '87453', '87700',
       '87703', '87713', '87723', '88333', '88903', '88910', '89363',
       '89403', '89633', '90203', '90503', '90504', '90643', '90713',
       '90800', '908

### Source label

In [53]:
source_label = df['source'].values

In [54]:
slabelpath = '../data/all_bands_source_label.pkl'
with open(slabelpath, 'wb') as fo:
    pickle.dump(source_label, fo)