Generate intermediate pickle files from MongoDB for later pipelines

In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import pickle


# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that widens the notebook's `.container` element to 90 %.
display(HTML("<style>.container { width:90% !important; }</style>"))

### Load data 

In [None]:
# Connect with pymongo's default client settings and select the
# 'mecaned' collection of the 'Rebased' database.
client = MongoClient()
db = client['Rebased']['mecaned']

In [None]:
# Walk every sample whose 'cytobands' field is not null and collect, in
# parallel, one metadata row (`meta`) and one feature vector (`data`).
meta = []
data = []
cytoband_query = {'cytobands': {'$ne': None}}
for sam in db.find(cytoband_query):
    meta.append([
        sam['sample_id'],
        sam['source'],
        sam['project'],
        sam['morphology'],
        sam['topography'],
        len(sam['normalized']),   # stored later as the 'num_segs' column
    ])

    # Bands on chromosomes X and Y are skipped.
    kept_bands = [band for band in sam['cytobands'] if band['chro'] not in ['X', 'Y']]
    amps = [band['ave_dup'] for band in kept_bands]
    dels = [band['ave_del'] for band in kept_bands]

    # Feature layout: all `ave_dup` values first, followed by all `ave_del` values.
    data.append(amps + dels)

#### meta data to work with

In [None]:
# Column names follow the order in which the fields were appended to `meta`.
meta_columns = ['id', 'source', 'project', 'morphology', 'topography', 'num_segs']
df_meta = pd.DataFrame(data=meta, columns=meta_columns)

#### Save meta to disk

In [None]:
# Persist the metadata frame as a pickle.
filepath = '../data/all_bands_meta.pkl'
with open(filepath, mode='wb') as meta_file:
    pickle.dump(df_meta, meta_file)

#### Save feature matrix to disk

In [None]:
# Stack the per-sample feature lists into one array and show its shape.
feat_mat = np.asarray(data)
feat_mat.shape

In [None]:
# Persist the feature matrix as a pickle.
filepath = '../data/all_bands.pkl'
with open(filepath, mode='wb') as feat_file:
    pickle.dump(feat_mat, feat_file)

### Create labels

The current labels are not consistent. The goal is:  
- samples use the same terms
- both morphology and topography

A shortlist of considered diseases:  
- Breast: TCGA-BRCA, icdot-c50.9, BRCA-EU, BRCA-UK
- Brain: TCGA-GBM, TCGA-LGG, icdot-c71.9
- Cerebellum: icdot-c71.6
- Ovary: TCGA-OV, icdot-c56.9, OV-AU
- Lung: TCGA-LUAD, icdot-c34.9
- Kidney: TCGA-KIRC, icdot-c64.9, RECA-EU
- Colon: TCGA-COAD, icdot-c18.9
- Stomach: TCGA-STAD, icdot-c16.9, GACA-CN
- Skin: TCGA-SKCM, icdot-c44.9, MELA-AU
- Liver: TCGA-LIHC, icdot-c22.0, LICA-FR, LINC-JP, LIRI-JP
- Prostate: TCGA-PRAD, icdot-c61.9, EOPC-DE, PRAD-CA, PRAD-UK



Modify arraymap morphology and topography format

In [None]:
# For arraymap rows, drop the first six characters of the morphology code
# (presumably an 'icdom-'-style prefix — confirm) and lower-case the rest.
is_arraymap = df_meta['source'] == 'arraymap'
df_meta.loc[is_arraymap, 'morphology'] = (
    df_meta.loc[is_arraymap, 'morphology'].str[6:].str.lower()
)

In [None]:
# For arraymap rows, drop the first six characters of the topography code
# (e.g. the 'icdot-' prefix seen in the shortlist) and lower-case the rest.
is_arraymap = df_meta['source'] == 'arraymap'
df_meta.loc[is_arraymap, 'topography'] = (
    df_meta.loc[is_arraymap, 'topography'].str[6:].str.lower()
)

TCGA morphology

In [None]:
# Strip every '/' from the morphology codes of TCGA rows.
is_tcga = df_meta['source'] == 'TCGA'
df_meta.loc[is_tcga, 'morphology'] = (
    df_meta.loc[is_tcga, 'morphology'].replace(['/'], '', regex=True)
)

Switch PCAWG morphology and topography

In [None]:
# For PCAWG rows, overwrite topography with the lower-cased morphology value.
# NOTE(review): the heading says "switch", but only topography is written here;
# morphology keeps its original value — confirm this is intended.
is_pcawg = df_meta['source'] == 'PCAWG'
df_meta.loc[is_pcawg, 'topography'] = df_meta.loc[is_pcawg, 'morphology'].str.lower()

#### Column of disease site

In [None]:
# Add a 'site' column defaulting to 'Others'; the cells below overwrite it
# for the samples that match one of the shortlisted disease sites.
df_meta = df_meta.assign(site='Others')

#### Breast

In [None]:
# Label breast samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
breast_selectors = [
    ('project', 'TCGA-BRCA'),
    ('topography', 'c50.9'),
    ('project', 'BRCA-EU'),
    ('project', 'BRCA-UK'),
]
for column, value in breast_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Breast'

#### Brain

In [None]:
# Label brain samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
brain_selectors = [
    ('project', 'TCGA-GBM'),
    ('project', 'TCGA-LGG'),
    ('topography', 'c71.9'),
]
for column, value in brain_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Brain'


#### Cerebellum

In [None]:
# Label cerebellum samples (topography c71.6): print the shape of the matching
# rows, then set their site.
is_cerebellum = df_meta['topography'] == 'c71.6'
print(df_meta.loc[is_cerebellum].shape)
df_meta.loc[is_cerebellum, 'site'] = 'Cerebellum'

#### Ovary

In [None]:
# Label ovary samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
ovary_selectors = [
    ('project', 'TCGA-OV'),
    ('topography', 'c56.9'),
    ('project', 'OV-AU'),
]
for column, value in ovary_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Ovary'

#### Lung

In [None]:
# Label lung samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
lung_selectors = [
    ('project', 'TCGA-LUAD'),
    ('topography', 'c34.9'),
]
for column, value in lung_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Lung'

#### Kidney

In [None]:
# Label kidney samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
kidney_selectors = [
    ('project', 'TCGA-KIRC'),
    ('topography', 'c64.9'),
    ('project', 'RECA-EU'),
]
for column, value in kidney_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Kidney'

#### Colon

In [None]:
# Label colon samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
colon_selectors = [
    ('project', 'TCGA-COAD'),
    ('topography', 'c18.9'),
]
for column, value in colon_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Colon'

#### Stomach

In [None]:
# Label stomach samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
stomach_selectors = [
    ('project', 'TCGA-STAD'),
    ('topography', 'c16.9'),
    ('project', 'GACA-CN'),
]
for column, value in stomach_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Stomach'

#### Skin

In [None]:
# Label skin samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
skin_selectors = [
    ('project', 'TCGA-SKCM'),
    ('topography', 'c44.9'),
    ('project', 'MELA-AU'),
]
for column, value in skin_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Skin'

#### Liver

In [None]:
# Label liver samples: for each (column, value) selector, print the shape of
# the matching rows, then set their site.
# The topography selector is 'c22.0', the liver code listed in this notebook's
# disease shortlist (icdot-c22.0); the previous value 'c22.9' did not match
# that code, so topography-labelled liver samples stayed in 'Others'.
liver_selectors = [
    ('project', 'TCGA-LIHC'),
    ('topography', 'c22.0'),
    ('project', 'LICA-FR'),
    ('project', 'LINC-JP'),
    ('project', 'LIRI-JP'),
]
for column, value in liver_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Liver'

#### Prostate

In [None]:
# Label prostate samples: for each (column, value) selector, print the shape
# of the matching rows, then set their site.
prostate_selectors = [
    ('project', 'TCGA-PRAD'),
    ('topography', 'c61.9'),
    ('project', 'EOPC-DE'),
    ('project', 'PRAD-CA'),
    ('project', 'PRAD-UK'),
]
for column, value in prostate_selectors:
    matches = df_meta[column] == value
    print(df_meta.loc[matches].shape)
    df_meta.loc[matches, 'site'] = 'Prostate'

Check

In [None]:
# Shape of the rows that were not assigned to any shortlisted site.
still_others = df_meta['site'] == 'Others'
df_meta.loc[still_others].shape

In [None]:
# Distinct site labels present after the assignments above.
df_meta.loc[:, 'site'].unique()

#### Save meta to disk

In [None]:
# Overwrite the metadata pickle, now carrying the normalised morphology /
# topography values and the new 'site' column.
filepath = '../data/all_bands_meta.pkl'
with open(filepath, mode='wb') as meta_file:
    pickle.dump(df_meta, meta_file)

### Feature labels

In [None]:
# Copy the frame's index into a regular column named 'index'.
# Note: this runs after the metadata pickle was written, so the saved file
# does not contain this column.
df_meta = df_meta.assign(index = df_meta.index)

In [None]:
# Site label of every sample, in row order.
labels = df_meta.loc[:, 'site'].values

In [None]:
# Show each distinct site label together with its number of samples.
np.unique(labels, return_counts=True)

Save to disk

In [None]:
# Persist the site label vector as a pickle.
labelpath = '../data/all_bands_label.pkl'
with open(labelpath, mode='wb') as label_file:
    pickle.dump(labels, label_file)

### morphology as label

In [None]:
# Reload the metadata pickle written earlier in this notebook into `df`.
# (pickle.load must only be used on trusted files such as this one.)
filepath = '../data/all_bands_meta.pkl'
with open(filepath, mode='rb') as meta_file:
    df = pickle.load(meta_file)

In [None]:
# Start the 'disease' column as a copy of the morphology codes.
df = df.assign(disease=lambda frame: frame['morphology'])

In [None]:
# Samples outside the shortlisted sites get the catch-all disease 'Others'.
outside_shortlist = df['site'] == 'Others'
df.loc[outside_shortlist, 'disease'] = 'Others'

In [None]:
# Build the disease label vector with missing morphologies spelled 'None'.
# fillna() returns a new Series, so nothing is written into the array backing
# `df`: an in-place write through `.values` silently mutates `df` on older
# pandas and fails on a read-only array when Copy-on-Write is enabled.
disease_label = df['disease'].fillna('None').values

In [None]:
# Persist the disease label vector as a pickle.
dlabelpath = '../data/all_bands_disease_label.pkl'
with open(dlabelpath, mode='wb') as disease_file:
    pickle.dump(disease_label, disease_file)

In [None]:
# Sanity check: the label vector must not contain missing values.
# (This cell used to repeat the in-place null fill after the pickle had
# already been written, which could not affect the saved file.)
assert not pd.isnull(disease_label).any(), 'disease_label still contains missing values'

### Source label

In [None]:
# Data source of every sample, in row order.
source_label = df.loc[:, 'source'].values

In [None]:
# Persist the source label vector as a pickle.
slabelpath = '../data/all_bands_source_label.pkl'
with open(slabelpath, mode='wb') as source_file:
    pickle.dump(source_label, source_file)