# Data Preprocessing for predicting Transcription Rate (TS)

In [10]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
# The project root is taken to be two directory levels above the current
# working directory (os.path.abspath('') resolves to the cwd).
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''),'../..'))
if not os.path.exists(BASE_DIR):
    # Fail fast: every path below is derived from BASE_DIR, so continuing with
    # BASE_DIR = None would only crash later inside os.path.join with an
    # unhelpful TypeError.
    raise FileNotFoundError('ERROR!, base path {} does not exist!'.format(BASE_DIR))
print('BASE_DIR: {}'.format(BASE_DIR))

DATA_DIR = os.path.join(BASE_DIR, 'datasets', 'raw')
if not os.path.isdir(DATA_DIR):
    # DATA_DIR is listed with os.listdir below, so it has to be a directory.
    raise FileNotFoundError('ERROR!, data path {} does not exist!'.format(DATA_DIR))
print('DATA_DIR: {}\n'.format(DATA_DIR))

# Add <BASE_DIR>/workspace to sys.path (for loading libraries)
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData as MPPData

# List available local wells: every sub-directory of DATA_DIR counts as one well
wells = [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]
print('Available local wells: \n', wells)


BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw

Available local wells: 
 ['I11', 'I09', 'J10']


Load raw data:

Set parameters for data transformation:

In [2]:
# Override 'wells' here to restrict loading to a chosen subset of wells
wells = ['J10']

# All settings for the data transformation, collected in one dictionary
# (it is written to params.json when the processed data are saved).
data_params = dict(
    # directories the raw data are read from (one per selected well)
    data_dirs=[os.path.join(DATA_DIR, w) for w in wells],
    dir_type='hannah',
    # fixed seed to make results reproducible
    seed=42,
    # input/output definition
    input_channels=[
        '00_DAPI',
        '07_H2B',
        '01_CDK9_pT186',
        '03_CDK9',
        '05_GTF2B',
        '07_SETD1A',
        '08_H3K4me3',
        '09_SRRM2',
        '10_H3K27ac',
        '11_KPNA2_MAX',
        '12_RB1_pS807_S811',
        '13_PABPN1',
        '14_PCNA',
        '15_SON',
        '16_H3',
        '17_HDAC3',
        '19_KPNA1_MAX',
        '20_SP100',
        '21_NCL',
        '01_PABPC1',
        '02_CDK7',
        '03_RPS6',
        '05_Sm',
        '07_POLR2A',
        '09_CCNT1',
        '10_POL2RA_pS2',
        '11_PML',
        '12_YAP1',
        '13_POL2RA_pS5',
        '15_U2SNRNPB',
        '18_NONO',
        '20_ALYREF',
        '21_COIL',
    ],
    output_channels=['00_EU'],
    # None results in output images; 'max' and 'avg' aggregate the output
    # channels and output a single number
    aggregate_output='avg',
    # train/val/test split
    train_frac=0.5,
    val_frac=0.4,
    img_size=224,
    # normalisation
    background_value=os.path.join(DATA_DIR, 'secondary_only_relative_normalisation.csv'),
    normalise=True,
    percentile=98.0,
    # condition
    cell_cycle_file=os.path.join(DATA_DIR, 'cell_cycle_classification.csv'),
    # condition=['G1', 'S', 'G2'],
    condition=['cell_cycle'],
    subset_to_cell_cycle=True,
)

Process data:

In [3]:
# `p` is an alias of `data_params` (the same dict object), so any key stored
# through `p` below is also present in `data_params`.
p = data_params

# One list of per-well MPPData objects for each split
mpp_datas = {'train': [], 'val': [], 'test': []}
for data_dir in p['data_dirs']:
    # Load data as an MPPData object
    mpp_data = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])

    # Subtract background values for each channel
    if p['normalise']:
        mpp_data.subtract_background(p['background_value'])

    # Split data into train, validation and test
    train, val, test = mpp_data.train_val_test_split(p['train_frac'], p['val_frac'])

    # Save well data in dict
    mpp_datas['train'].append(train)
    mpp_datas['test'].append(test)
    mpp_datas['val'].append(val)
# NOTE: after the loop `mpp_data` still refers to the last well that was loaded.

# Merge data from all the loaded wells
train = MPPData.concat(mpp_datas['train'])
val = MPPData.concat(mpp_datas['val'])
test = MPPData.concat(mpp_datas['test'])

# Normalize train, val and test using the normalization parameters
# obtained from the train data (inner percentile% of train data)
if p['normalise']:
    rescale_values = train.rescale_intensities_per_channel(percentile=p['percentile'])
    _ = val.rescale_intensities_per_channel(rescale_values=rescale_values)
    _ = test.rescale_intensities_per_channel(rescale_values=rescale_values)
    # Stored in the params dict so the values are saved together with the parameters
    p['normalise_rescale_values'] = list(rescale_values)

# Filter cells depending on their condition (cell state (G1, S or G2)).
# Currently DISABLED. Kept as `#` comments rather than inside a bare
# triple-quoted string: a string expression on the last line of a cell is
# evaluated and echoed as the cell's output.
#
# if p.get('condition', None) is not None:
#     train.add_conditions(p['condition'],
#                          cell_cycle_file=p.get('cell_cycle_file', None),
#                          subset=p.get('subset_to_cell_cycle', False))
#     val.add_conditions(p['condition'],
#                        cell_cycle_file=p.get('cell_cycle_file', None),
#                        subset=p.get('subset_to_cell_cycle', False))
#     test.add_conditions(p['condition'],
#                         cell_cycle_file=p.get('cell_cycle_file', None),
#                         subset=p.get('subset_to_cell_cycle', False))
#
# # subset to channels
# if p.get('channels', None) is not None:
#     train.subset_channels(p['channels'])
#     val.subset_channels(p['channels'])
#     test.subset_channels(p['channels'])

missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']


"if p.get('condition', None) is not None:\n    train.add_conditions(p['condition'], \n                         cell_cycle_file=p.get('cell_cycle_file', None), \n                         subset=p.get('subset_to_cell_cycle', False))\n    val.add_conditions(p['condition'], \n                       cell_cycle_file=p.get('cell_cycle_file', None), \n                       subset=p.get('subset_to_cell_cycle', False))\n    test.add_conditions(p['condition'], \n                        cell_cycle_file=p.get('cell_cycle_file', None), \n                        subset=p.get('subset_to_cell_cycle', False))\n\n# subset to channels\n        if p.get('channels', None) is not None:\n            train.subset_channels(p['channels'])\n            val.subset_channels(p['channels'])\n            test.subset_channels(p['channels'])\n"

In [5]:
# Names of the instance attributes stored on the MPPData object
vars(mpp_data).keys()

dict_keys(['log', 'mapobject_ids', 'metadata', 'channels', 'labels', 'x', 'y', 'seed', 'mpp', 'mcu_ids', 'conditions', 'data_dir'])

In [11]:
# Show the first rows (up to 20) of the metadata of `mpp_data`, i.e. of the
# last well loaded in the processing loop.
mpp_data.metadata.head(20)

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,279425,plate01,J10,0,0,0,0,1,0,279369,plate01,J10,0,0,0,0,1,1,0.0,,0.0,,0.0,
1,279426,plate01,J10,0,0,0,0,2,0,279370,plate01,J10,0,0,0,0,2,1,0.0,,0.0,,0.0,
2,279427,plate01,J10,0,0,0,0,3,0,279371,plate01,J10,0,0,0,0,3,1,1.0,,1.0,,0.0,
3,279428,plate01,J10,0,0,0,0,4,0,279372,plate01,J10,0,0,0,0,4,1,0.0,,0.0,,0.0,
4,279429,plate01,J10,0,0,0,0,5,0,279373,plate01,J10,0,0,0,0,5,1,0.0,,0.0,,0.0,
5,279430,plate01,J10,0,0,0,0,6,0,279374,plate01,J10,0,0,0,0,6,0,0.0,,0.0,,0.0,
7,279432,plate01,J10,0,0,0,0,8,0,279376,plate01,J10,0,0,0,0,8,0,1.0,,1.0,,0.0,
8,279433,plate01,J10,0,0,0,0,9,0,279377,plate01,J10,0,0,0,0,9,0,0.0,,0.0,,0.0,
9,279434,plate01,J10,0,0,0,0,10,0,279378,plate01,J10,0,0,0,0,10,0,1.0,,1.0,,0.0,
10,279435,plate01,J10,0,0,0,0,11,0,279379,plate01,J10,0,0,0,0,11,0,0.0,,0.0,,0.0,


In [7]:
# Names of the instance attributes stored on the train split
vars(train).keys()

dict_keys(['log', 'mapobject_ids', 'metadata', 'channels', 'labels', 'x', 'y', 'seed', 'mpp', 'mcu_ids', 'conditions'])

In [12]:
train.metadata.head(20)
# The mapobject_id values match those of the original mpp_data object,
# so what is the seed used for? Check whether the observations get shuffled.


Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,279425,plate01,J10,0,0,0,0,1,0,279369,plate01,J10,0,0,0,0,1,1,0.0,,0.0,,0.0,
1,279426,plate01,J10,0,0,0,0,2,0,279370,plate01,J10,0,0,0,0,2,1,0.0,,0.0,,0.0,
2,279427,plate01,J10,0,0,0,0,3,0,279371,plate01,J10,0,0,0,0,3,1,1.0,,1.0,,0.0,
3,279428,plate01,J10,0,0,0,0,4,0,279372,plate01,J10,0,0,0,0,4,1,0.0,,0.0,,0.0,
4,279429,plate01,J10,0,0,0,0,5,0,279373,plate01,J10,0,0,0,0,5,1,0.0,,0.0,,0.0,
5,279432,plate01,J10,0,0,0,0,8,0,279376,plate01,J10,0,0,0,0,8,0,1.0,,1.0,,0.0,
6,279433,plate01,J10,0,0,0,0,9,0,279377,plate01,J10,0,0,0,0,9,0,0.0,,0.0,,0.0,
7,279436,plate01,J10,0,0,0,0,12,0,279380,plate01,J10,0,0,0,0,12,0,1.0,,1.0,,0.0,
8,279438,plate01,J10,0,0,0,0,14,0,279382,plate01,J10,0,0,0,0,14,0,0.0,,0.0,,0.0,
9,279442,plate01,J10,0,0,0,0,18,0,279386,plate01,J10,0,0,0,0,18,1,0.0,,0.0,,0.0,


In [19]:
# Sorted unique object ids found in the data array of the train split
train_ids = np.unique(train.mapobject_ids)
print(train_ids)

[193561 193563 193565 193567 193569 193570 193572 193573 193577 193584
 208535 208537 208538 208541 208542 208543 208544 208546 208553 208555
 208557 208558 208563 208565 208568 208569 208570 212782 212783 212785
 212788 212790 212791 212792 212795 212796 212797 212798 212799 212802
 212803 212805 212806 212808 212809 212810 212811 212814 224165 224167
 224169 224171 224173 224174 224177 224178 224181 224183 224184 246827
 246828 246829 246832 246838 246840 246846 246852 246854 246855 246857
 246858 246860 246862 246863 246864 246867 246868 246871 249425 249428
 249433 249434 249435 249438 249439 249442 249443 249444 249446 249447
 249448 249451 249453 249455 249456 249463 249464 249467 249469 249471
 258566 258572 258573 258574 258575 258576 258577 258578 258579 258582
 258584 258585 258587 258591 258592 258595 258599 258600 258602 258603
 258604 258609 258610 258613 258615 258616 259948 259953 259955 259958
 259959 259960 259964 259965 259966 259973 259974 259975 259977 259980
 25998

In [25]:
# Sorted unique object ids listed in the metadata table of the train split
train_meta_ids = np.unique(train.metadata.mapobject_id)
train_meta_ids

array([193561, 193563, 193565, 193567, 193569, 193570, 193572, 193573,
       193577, 193584, 208535, 208537, 208538, 208541, 208542, 208543,
       208544, 208546, 208553, 208555, 208557, 208558, 208563, 208565,
       208568, 208569, 208570, 212782, 212783, 212785, 212788, 212790,
       212791, 212792, 212795, 212796, 212797, 212798, 212799, 212802,
       212803, 212805, 212806, 212808, 212809, 212810, 212811, 212814,
       224165, 224167, 224169, 224171, 224173, 224174, 224177, 224178,
       224181, 224183, 224184, 246827, 246828, 246829, 246832, 246838,
       246840, 246846, 246852, 246854, 246855, 246857, 246858, 246860,
       246862, 246863, 246864, 246867, 246868, 246871, 249425, 249428,
       249433, 249434, 249435, 249438, 249439, 249442, 249443, 249444,
       249446, 249447, 249448, 249451, 249453, 249455, 249456, 249463,
       249464, 249467, 249469, 249471, 258566, 258572, 258573, 258574,
       258575, 258576, 258577, 258578, 258579, 258582, 258584, 258585,
      

In [38]:
# Sorted unique object ids listed in the metadata table of `mpp_data`
# (the object loaded in the processing loop, before the train/val/test split)
meta_ids = np.unique(mpp_data.metadata.mapobject_id)

In [29]:
# Is every id of the train metadata also present in the train data ids?
set(train_meta_ids) <= set(train_ids)

True

In [30]:
# Is every id of the train data also present in the train metadata ids?
set(train_ids) <= set(train_meta_ids)

True

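The last two checks both return `True`, i.e. each id set is a subset of the other, so the ids in the train data (`train.mapobject_ids`) are exactly the same as the ones in the train metadata (`train.metadata.mapobject_id`).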

In [40]:
# Are all ids of the mpp_data metadata contained in the train ids?
set(meta_ids) <= set(train_ids)

False

In [9]:
# Build one image array per split (train, val, test - in that order) from the
# 'MPP' data, using the image size configured in the parameters.
train_dataset, val_dataset, test_dataset = (
    np.array(split.get_object_imgs(data='MPP', img_size=p['img_size']))
    for split in (train, val, test)
)

In [5]:
# Channel table indexed by channel name (built once, used for both lookups)
channels_by_name = train.channels.set_index('name')
# Get channel ids (proteins) which will be used to predict transcription rate
input_ids = list(channels_by_name.loc[p['input_channels']]['channel_id'])
# Get id of the channel(s) that measure transcription rate
output_ids = list(channels_by_name.loc[p['output_channels']]['channel_id'])


def _aggregate_output(dataset, channel_ids, how):
    """Build the regression target (y) from the output channels of `dataset`.

    Parameters
    ----------
    dataset : array indexed as dataset[:, :, :, channel_ids]; the first axis
        is iterated one image at a time.
    channel_ids : list of channel indices to use as output.
    how : 'avg', 'max' or None, following the description of the
        'aggregate_output' parameter:
        - 'avg': mean of the non-zero values of each image (one number per image)
        - 'max': maximum of the non-zero values of each image (one number per
          image); the non-zero mask mirrors the 'avg' case
        - None: no aggregation, the output-channel images are returned

    Raises
    ------
    ValueError
        If `how` is not one of the supported values. Raising here prevents the
        targets from silently staying undefined (or stale from an earlier run).
    """
    outputs = dataset[:, :, :, channel_ids]
    if how is None:
        return outputs
    if how == 'avg':
        # zeros are excluded so that they do not lower the average
        return np.array([img[img != 0].mean() for img in outputs])
    if how == 'max':
        return np.array([img[img != 0].max() for img in outputs])
    raise ValueError(
        "Unsupported aggregate_output {!r}; expected 'avg', 'max' or None".format(how))


# Create regressor (y)
train_dataset_y = _aggregate_output(train_dataset, output_ids, p['aggregate_output'])
val_dataset_y = _aggregate_output(val_dataset, output_ids, p['aggregate_output'])
test_dataset_y = _aggregate_output(test_dataset, output_ids, p['aggregate_output'])

Save Processed data

In [6]:
# Create output directory <BASE_DIR>/datasets/<dataset_name> (no error if it exists)
dataset_name = '184A1_hannah_EU_regression'
outdir = os.path.join(BASE_DIR, 'datasets', dataset_name)
os.makedirs(outdir, exist_ok=True)

# Save datasets: x holds only the input channels, y the regression target
np.savez(os.path.join(outdir, 'train_dataset.npz'), x=train_dataset[:,:,:,input_ids], y=train_dataset_y)
np.savez(os.path.join(outdir, 'val_dataset.npz'), x=val_dataset[:,:,:,input_ids], y=val_dataset_y)
np.savez(os.path.join(outdir, 'test_dataset.npz'), x=test_dataset[:,:,:,input_ids], y=test_dataset_y)

# Save params; the context manager guarantees the file is flushed and closed
with open(os.path.join(outdir, 'params.json'), 'w') as params_file:
    json.dump(data_params, params_file, indent=4)

# Save metadata: one file per split, one with all splits stacked, plus the channel table
train.metadata.to_csv(os.path.join(outdir, 'train_metadata.csv'))
val.metadata.to_csv(os.path.join(outdir, 'val_metadata.csv'))
test.metadata.to_csv(os.path.join(outdir, 'test_metadata.csv'))
pd.concat([train.metadata, val.metadata, test.metadata]).to_csv(os.path.join(outdir, 'metadata.csv'))
train.channels.to_csv(os.path.join(outdir, 'channels.csv'))

In [6]:
# Remove the limit on displayed columns, then show the train metadata table
pd.set_option('display.max_columns', None)
train.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,279425,plate01,J10,0,0,0,0,1,0,279369,plate01,J10,0,0,0,0,1,1,0.0,,0.0,,0.0,
1,279426,plate01,J10,0,0,0,0,2,0,279370,plate01,J10,0,0,0,0,2,1,0.0,,0.0,,0.0,
2,279427,plate01,J10,0,0,0,0,3,0,279371,plate01,J10,0,0,0,0,3,1,1.0,,1.0,,0.0,
3,279428,plate01,J10,0,0,0,0,4,0,279372,plate01,J10,0,0,0,0,4,1,0.0,,0.0,,0.0,
4,279429,plate01,J10,0,0,0,0,5,0,279373,plate01,J10,0,0,0,0,5,1,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,361986,plate01,J10,5,4,0,0,62,0,361852,plate01,J10,5,4,0,0,62,1,0.0,,0.0,,0.0,
860,361987,plate01,J10,5,4,0,0,63,0,361853,plate01,J10,5,4,0,0,63,0,0.0,,0.0,,0.0,
861,361991,plate01,J10,5,4,0,0,67,0,361857,plate01,J10,5,4,0,0,67,0,1.0,,0.0,,0.0,
862,361992,plate01,J10,5,4,0,0,68,0,361858,plate01,J10,5,4,0,0,68,1,0.0,,0.0,,0.0,


In [14]:
# Names of the instance attributes stored on the train split
vars(train).keys()

dict_keys(['log', 'mapobject_ids', 'metadata', 'channels', 'labels', 'x', 'y', 'seed', 'mpp', 'mcu_ids', 'conditions'])

In [29]:
# Sorted unique object ids of the train split; print how many there are,
# then display the ids themselves.
train_ids = np.unique(train.mapobject_ids)
print(train_ids.shape)
train_ids

(864,)


array([193561, 193562, 193563, 193565, 193567, 193569, 193570, 193571,
       193572, 193573, 193575, 193577, 193581, 193582, 193583, 193584,
       193585, 208534, 208535, 208537, 208538, 208539, 208541, 208542,
       208543, 208544, 208546, 208549, 208550, 208552, 208553, 208555,
       208556, 208557, 208558, 208559, 208560, 208562, 208563, 208564,
       208565, 208567, 208568, 208569, 208570, 208571, 212782, 212783,
       212785, 212786, 212788, 212790, 212791, 212792, 212794, 212795,
       212796, 212797, 212798, 212799, 212800, 212801, 212802, 212803,
       212805, 212806, 212807, 212808, 212809, 212810, 212811, 212814,
       224165, 224166, 224167, 224169, 224170, 224171, 224173, 224174,
       224176, 224177, 224178, 224179, 224180, 224181, 224182, 224183,
       224184, 246827, 246828, 246829, 246830, 246832, 246834, 246835,
       246836, 246837, 246838, 246839, 246840, 246842, 246844, 246845,
       246846, 246847, 246848, 246850, 246852, 246854, 246855, 246857,
      

In [28]:
# Object id column of the train metadata table
train.metadata.mapobject_id

0      279425
1      279426
2      279427
3      279428
4      279429
        ...  
859    361986
860    361987
861    361991
862    361992
863    361993
Name: mapobject_id, Length: 864, dtype: int64

In [13]:
# Raw object id array of `mpp_data`; the displayed output shows repeated ids,
# presumably one entry per measured pixel - TODO confirm.
mpp_data.mapobject_ids

array([279425, 279425, 279425, ..., 361993, 361993, 361993], dtype=uint32)

In [12]:
# Manual split of the object ids into train/val/test, to inspect how many
# data entries fall into each split.
# NOTE(review): presumably this mirrors what MPPData.train_val_test_split
# does internally - confirm against the library.
ids = np.unique(mpp_data.mapobject_ids)
# Use the configured seed (the same one passed to MPPData) instead of a hard-coded value
np.random.seed(p['seed'])
np.random.shuffle(ids)  # shuffles `ids` in place
num_train = int(len(ids)*p['train_frac'])
num_val = int(len(ids)*p['val_frac'])
train_ids = ids[:num_train]
val_ids = ids[num_train:num_train+num_val]
test_ids = ids[num_train+num_val:]  # whatever remains after train and val

for count, split_ids in enumerate((train_ids, val_ids, test_ids)):
    print(count)
    print(split_ids.shape)
    # Boolean mask over all data entries: True where the entry belongs to an
    # object of this split (np.isin replaces the deprecated np.in1d)
    ind = np.isin(mpp_data.mapobject_ids, split_ids)
    print(ind)
    print(ind.shape)
    print(np.sum(ind))
    print("")

0
(864,)
[ True  True  True ...  True  True  True]
(14265789,)
11476949

1
(108,)
[False False False ... False False False]
(14265789,)
1402814

2
(108,)
[False False False ... False False False]
(14265789,)
1386026



In [30]:
# Sorted unique object ids of `mpp_data` (np.unique returns them in sorted order)
ids = np.unique(mpp_data.mapobject_ids)
print(ids)

[193561 193562 193563 ... 380693 380695 380696]


In [33]:
# Shuffle the ids with a fixed seed and show the resulting order.
# NOTE: np.random.shuffle works in place, so `ids` is overwritten; running
# this cell again shuffles the already shuffled array and gives a different
# order unless `ids` is recomputed first.
np.random.seed(42)
np.random.shuffle(ids)
print(ids)

[361965 340105 277082 ... 321068 346276 275891]


In [44]:
# NOTE(review): `train_ids` is assigned in several cells of this notebook
# (sorted unique ids of `train`, and a slice of the shuffled `ids`); the value
# displayed here depends on which of those cells was executed last.
train_ids

array([278732, 287433, 337013, 315268, 380665, 315297, 363266, 363268,
       380687, 361941, 380661, 298404, 321057, 249428, 271149, 279426,
       271113, 279478, 361927, 357548, 208570, 212796, 279467, 275894,
       284796, 363262, 351347, 271155, 379464, 340093, 321080, 323197,
       208565, 259977, 249444, 315298, 340094, 212814, 263010, 380692,
       275895, 323227, 323195, 361975, 363267, 246828, 224165, 208544,
       346271, 287426, 249463, 279475, 275887, 249471, 315302, 351354,
       361952, 321085, 279432, 357535, 278751, 380678, 323212, 259989,
       353337, 351329, 357529, 279458, 284799, 351335, 287447, 379448,
       315277, 275882, 246832, 298401, 380690, 361969, 357534, 361965,
       259955, 315292, 224184, 212783, 340071, 271134, 323240, 277078,
       380685, 208535, 357561, 246864, 284765, 278750, 271138, 379474,
       357541, 279444, 224178, 249448, 287448, 346273, 361982, 363275,
       271118, 315263, 284794, 208563, 351320, 323215, 321056, 363254,
      