# Data Preprocessing for predicting Transcription Rate (TS)

In [3]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''),'../..'))
if not os.path.exists(BASE_DIR):
    print('ERROR!, base path {} does not exist! Setting to None'.format(BASE_DIR))
    BASE_DIR = None
else:
    print('BASE_DIR: {}'.format(BASE_DIR))

DATA_DIR = os.path.join(BASE_DIR, 'datasets', 'raw')
if not os.path.exists(DATA_DIR):
    print('ERROR!, data path {} does not exist! Setting to None'.format(DATA_DIR))
    DATA_DIR = None
else:
    print('DATA_DIR: {}\n'.format(DATA_DIR))
    
# Add BASE_DIR to sys paths (for loading libraries)
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData as MPPData

# List available local Wells
wells = [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]
print('Available local wells: \n', wells)


BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw

Available local wells: 
 ['I11', 'I09', 'J10']


Load raw data:

Set parameters for data transformation:

In [4]:
# In case you only want to load some specific wells, rename 'wells'
wells = ['J10']

data_params = {
    # where to read data from
    'data_dirs': [os.path.join(DATA_DIR, well) for well in wells],
    'dir_type': 'hannah',
    # make results reproducible
    'seed': 42,
    # input/output definition
    'input_channels': [
        '00_DAPI',
        '07_H2B',
        '01_CDK9_pT186',
        '03_CDK9',
        '05_GTF2B',
        '07_SETD1A',
        '08_H3K4me3',
        '09_SRRM2',
        '10_H3K27ac',
        '11_KPNA2_MAX',
        '12_RB1_pS807_S811',
        '13_PABPN1',
        '14_PCNA',
        '15_SON',
        '16_H3',
        '17_HDAC3',
        '19_KPNA1_MAX',
        '20_SP100',
        '21_NCL',
        '01_PABPC1',
        '02_CDK7',
        '03_RPS6',
        '05_Sm',
        '07_POLR2A',
        '09_CCNT1',
        '10_POL2RA_pS2',
        '11_PML',
        '12_YAP1',
        '13_POL2RA_pS5',
        '15_U2SNRNPB',
        '18_NONO',
        '20_ALYREF',
        '21_COIL',
    ],
    'output_channels': ['00_EU'],
    'aggregate_output': 'avg', # None results in output images, 'max', 'avg' aggregate output channels and output a single number
    # train/val/test split
    'train_frac': 0.8,
    'val_frac': 0.1,
    'img_size': 224,
    # normalisation
    'background_value': os.path.join(DATA_DIR, 'secondary_only_relative_normalisation.csv'),
    'normalise': True,
    'percentile': 98.0,
    # Condition
    'cell_cycle_file': os.path.join(DATA_DIR, 'cell_cycle_classification.csv'),
    #'condition': ['G1', 'S', 'G2'],
    'condition': ['cell_cycle'],
    'subset_to_cell_cycle': True,
}

Process data:

In [5]:
p = data_params

mpp_datas = {'train': [], 'val': [], 'test': []}
for data_dir in p['data_dirs']:
    # Load data as an MPPData object
    mpp_data = MPPData.from_data_dir(data_dir,
                                 dir_type=p['dir_type'],
                                 seed=p['seed'])

    # Subtract background  values for each channel
    if p['normalise']:
        mpp_data.subtract_background(p['background_value'])
    
    # Split data into train, validation and test
    train, val, test = mpp_data.train_val_test_split(p['train_frac'], p['val_frac'])
    
    # Save well data in dic
    mpp_datas['train'].append(train)
    mpp_datas['test'].append(test)
    mpp_datas['val'].append(val)
    
# merge data from all the loaded wells
train = MPPData.concat(mpp_datas['train'])
val = MPPData.concat(mpp_datas['val'])
test = MPPData.concat(mpp_datas['test'])

# Normalize train, val and test using the normalization parameters
# got from the train data (inner percentile% of train data)
if p['normalise']:
    rescale_values = train.rescale_intensities_per_channel(percentile=p['percentile'])
    _ = val.rescale_intensities_per_channel(rescale_values=rescale_values)
    _ = test.rescale_intensities_per_channel(rescale_values=rescale_values)
    p['normalise_rescale_values'] = list(rescale_values)

# Filter cells depending on its condition (cell state (G1, S or G2))
'''if p.get('condition', None) is not None:
    train.add_conditions(p['condition'], 
                         cell_cycle_file=p.get('cell_cycle_file', None), 
                         subset=p.get('subset_to_cell_cycle', False))
    val.add_conditions(p['condition'], 
                       cell_cycle_file=p.get('cell_cycle_file', None), 
                       subset=p.get('subset_to_cell_cycle', False))
    test.add_conditions(p['condition'], 
                        cell_cycle_file=p.get('cell_cycle_file', None), 
                        subset=p.get('subset_to_cell_cycle', False))

# subset to channels
        if p.get('channels', None) is not None:
            train.subset_channels(p['channels'])
            val.subset_channels(p['channels'])
            test.subset_channels(p['channels'])
'''

missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']


"if p.get('condition', None) is not None:\n    train.add_conditions(p['condition'], \n                         cell_cycle_file=p.get('cell_cycle_file', None), \n                         subset=p.get('subset_to_cell_cycle', False))\n    val.add_conditions(p['condition'], \n                       cell_cycle_file=p.get('cell_cycle_file', None), \n                       subset=p.get('subset_to_cell_cycle', False))\n    test.add_conditions(p['condition'], \n                        cell_cycle_file=p.get('cell_cycle_file', None), \n                        subset=p.get('subset_to_cell_cycle', False))\n\n# subset to channels\n        if p.get('channels', None) is not None:\n            train.subset_channels(p['channels'])\n            val.subset_channels(p['channels'])\n            test.subset_channels(p['channels'])\n"

In [7]:
dir(mpp_data)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_per_mpp_value',
 'add_conditions',
 'add_mcu',
 'add_neighborhood',
 'center_mpp',
 'channels',
 'concat',
 'conditions',
 'data_dir',
 'from_data_dir',
 'get_condition_img',
 'get_img_from_data',
 'get_mcu_img',
 'get_mpp_img',
 'get_neighborhood',
 'get_object_imgs',
 'has_neighbor_data',
 'labels',
 'log',
 'mapobject_ids',
 'mcu_ids',
 'metadata',
 'mpp',
 'rescale_intensities_per_channel',
 'seed',
 'subsample',
 'subset',
 'subset_channels',
 'subtract_background',
 'train_val_test_split',
 'x',
 'y']

In [11]:
(mpp_data.__dict__).keys()

dict_keys(['log', 'mapobject_ids', 'metadata', 'channels', 'labels', 'x', 'y', 'seed', 'mpp', 'mcu_ids', 'conditions', 'data_dir'])

In [38]:
mpp_data.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,...,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,279425,plate01,J10,0,0,0,0,1,0,279369,...,0,0,1,1,0.0,,0.0,,0.0,
1,279426,plate01,J10,0,0,0,0,2,0,279370,...,0,0,2,1,0.0,,0.0,,0.0,
2,279427,plate01,J10,0,0,0,0,3,0,279371,...,0,0,3,1,1.0,,1.0,,0.0,
3,279428,plate01,J10,0,0,0,0,4,0,279372,...,0,0,4,1,0.0,,0.0,,0.0,
4,279429,plate01,J10,0,0,0,0,5,0,279373,...,0,0,5,1,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1314,361989,plate01,J10,5,4,0,0,65,0,361855,...,0,0,65,1,0.0,,0.0,,0.0,
1315,361990,plate01,J10,5,4,0,0,66,0,361856,...,0,0,66,1,0.0,,0.0,,0.0,
1316,361991,plate01,J10,5,4,0,0,67,0,361857,...,0,0,67,0,1.0,,0.0,,0.0,
1317,361992,plate01,J10,5,4,0,0,68,0,361858,...,0,0,68,1,0.0,,0.0,,0.0,


In [36]:
train.__dict__.keys()

dict_keys(['log', 'mapobject_ids', 'metadata', 'channels', 'labels', 'x', 'y', 'seed', 'mpp', 'mcu_ids', 'conditions'])

In [37]:
train.metadata
# Los mapobject_id coinciden con los del objeto original mpp_data
# entonces para que la semilla? ver si se barajean las observaciones 


Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,...,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,279425,plate01,J10,0,0,0,0,1,0,279369,...,0,0,1,1,0.0,,0.0,,0.0,
1,279426,plate01,J10,0,0,0,0,2,0,279370,...,0,0,2,1,0.0,,0.0,,0.0,
2,279427,plate01,J10,0,0,0,0,3,0,279371,...,0,0,3,1,1.0,,1.0,,0.0,
3,279428,plate01,J10,0,0,0,0,4,0,279372,...,0,0,4,1,0.0,,0.0,,0.0,
4,279429,plate01,J10,0,0,0,0,5,0,279373,...,0,0,5,1,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,361986,plate01,J10,5,4,0,0,62,0,361852,...,0,0,62,1,0.0,,0.0,,0.0,
860,361987,plate01,J10,5,4,0,0,63,0,361853,...,0,0,63,0,0.0,,0.0,,0.0,
861,361991,plate01,J10,5,4,0,0,67,0,361857,...,0,0,67,0,1.0,,0.0,,0.0,
862,361992,plate01,J10,5,4,0,0,68,0,361858,...,0,0,68,1,0.0,,0.0,,0.0,


In [None]:
# get images
train_dataset = np.array(train.get_object_imgs(data='MPP', img_size=p['img_size']))
val_dataset = np.array(val.get_object_imgs(data='MPP', img_size=p['img_size']))
test_dataset = np.array(test.get_object_imgs(data='MPP', img_size=p['img_size']))

In [5]:
# Get channels ids (proteins) which will be used to predict transcripcion rate
input_ids = list(train.channels.set_index('name').loc[p['input_channels']]['channel_id'])
# Get id of the channel that measure trancripcion rate
output_ids = list(train.channels.set_index('name').loc[p['output_channels']]['channel_id'])

# Create regressor (y)
if p['aggregate_output'] == 'avg':
    train_dataset_y = np.array([img[img!=0].mean() for img in train_dataset[:,:,:,output_ids]])
    val_dataset_y = np.array([img[img!=0].mean() for img in val_dataset[:,:,:,output_ids]])
    test_dataset_y = np.array([img[img!=0].mean() for img in test_dataset[:,:,:,output_ids]])

Save Processed data

In [6]:
# create dir
dataset_name = '184A1_hannah_EU_regression'
outdir = os.path.join(BASE_DIR, 'datasets', dataset_name)
os.makedirs(outdir, exist_ok=True)

# Save datasets
np.savez(os.path.join(outdir, 'train_dataset.npz'), x=train_dataset[:,:,:,input_ids], y=train_dataset_y)
np.savez(os.path.join(outdir, 'val_dataset.npz'), x=val_dataset[:,:,:,input_ids], y=val_dataset_y)
np.savez(os.path.join(outdir, 'test_dataset.npz'), x=test_dataset[:,:,:,input_ids], y=test_dataset_y)

# save params
json.dump(data_params, open(os.path.join(outdir, 'params.json'), 'w'), indent=4)

# save metadata
train.metadata.to_csv(os.path.join(outdir, 'train_metadata.csv'))
val.metadata.to_csv(os.path.join(outdir, 'val_metadata.csv'))
test.metadata.to_csv(os.path.join(outdir, 'test_metadata.csv'))
pd.concat([train.metadata, val.metadata, test.metadata]).to_csv(os.path.join(outdir, 'metadata.csv'))
train.channels.to_csv(os.path.join(outdir, 'channels.csv'))

In [6]:
pd.options.display.max_columns=None
train.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,279425,plate01,J10,0,0,0,0,1,0,279369,plate01,J10,0,0,0,0,1,1,0.0,,0.0,,0.0,
1,279426,plate01,J10,0,0,0,0,2,0,279370,plate01,J10,0,0,0,0,2,1,0.0,,0.0,,0.0,
2,279427,plate01,J10,0,0,0,0,3,0,279371,plate01,J10,0,0,0,0,3,1,1.0,,1.0,,0.0,
3,279428,plate01,J10,0,0,0,0,4,0,279372,plate01,J10,0,0,0,0,4,1,0.0,,0.0,,0.0,
4,279429,plate01,J10,0,0,0,0,5,0,279373,plate01,J10,0,0,0,0,5,1,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,361986,plate01,J10,5,4,0,0,62,0,361852,plate01,J10,5,4,0,0,62,1,0.0,,0.0,,0.0,
860,361987,plate01,J10,5,4,0,0,63,0,361853,plate01,J10,5,4,0,0,63,0,0.0,,0.0,,0.0,
861,361991,plate01,J10,5,4,0,0,67,0,361857,plate01,J10,5,4,0,0,67,0,1.0,,0.0,,0.0,
862,361992,plate01,J10,5,4,0,0,68,0,361858,plate01,J10,5,4,0,0,68,1,0.0,,0.0,,0.0,


In [7]:
dir(train)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_per_mpp_value',
 'add_conditions',
 'add_mcu',
 'add_neighborhood',
 'center_mpp',
 'channels',
 'concat',
 'conditions',
 'from_data_dir',
 'get_condition_img',
 'get_img_from_data',
 'get_mcu_img',
 'get_mpp_img',
 'get_neighborhood',
 'get_object_imgs',
 'has_neighbor_data',
 'labels',
 'log',
 'mapobject_ids',
 'mcu_ids',
 'metadata',
 'mpp',
 'rescale_intensities_per_channel',
 'seed',
 'subsample',
 'subset_channels',
 'subtract_background',
 'train_val_test_split',
 'x',
 'y']

In [None]:
train