# Data Preprocessing for predicting Transcription Rate (TS)

This notebook is meant to convert raw cell data from several wells into multichannel images (along with their corresponding metadata).

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
# The project root is taken to be two levels above the current working directory.
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''), '../..'))
if not os.path.isdir(BASE_DIR):
    # Fail fast: continuing with BASE_DIR = None would only crash later in
    # os.path.join with an unrelated TypeError.
    raise FileNotFoundError('ERROR!, base path {} does not exist!'.format(BASE_DIR))
print('BASE_DIR: {}'.format(BASE_DIR))

# Raw well data is expected under <BASE_DIR>/datasets/raw
DATA_DIR = os.path.join(BASE_DIR, 'datasets', 'raw')
if not os.path.isdir(DATA_DIR):
    # Same reasoning as above: every later cell builds its paths from DATA_DIR.
    raise FileNotFoundError('ERROR!, data path {} does not exist!'.format(DATA_DIR))
print('DATA_DIR: {}\n'.format(DATA_DIR))

# Add <BASE_DIR>/workspace to sys.path (for loading the project libraries)
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData as MPPData

# List available local wells (every sub-directory of DATA_DIR counts as a well)
wells = [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]
print('Available local wells: \n', wells)


BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw

Setting BASE_DIR to /home/hhughes/Documents/Master_Thesis/Project
Available local wells: 
 ['I11', 'I09', 'J10']


Set parameters for data transformation:

In [2]:
# To process only a subset of the available wells, override 'wells' here
wells = ['I09']

# One raw-data directory per selected well
well_dirs = [os.path.join(DATA_DIR, well) for well in wells]

# Names of the channels used as model input
input_channel_names = [
    '00_DAPI', '07_H2B', '01_CDK9_pT186', '03_CDK9', '05_GTF2B',
    '07_SETD1A', '08_H3K4me3', '09_SRRM2', '10_H3K27ac', '11_KPNA2_MAX',
    '12_RB1_pS807_S811', '13_PABPN1', '14_PCNA', '15_SON', '16_H3',
    '17_HDAC3', '19_KPNA1_MAX', '20_SP100', '21_NCL', '01_PABPC1',
    '02_CDK7', '03_RPS6', '05_Sm', '07_POLR2A', '09_CCNT1',
    '10_POL2RA_pS2', '11_PML', '12_YAP1', '13_POL2RA_pS5', '15_U2SNRNPB',
    '18_NONO', '20_ALYREF', '21_COIL',
]

data_params = {
    # --- data source ---
    'data_dirs': well_dirs,
    'dir_type': 'hannah',
    # --- reproducibility ---
    'seed': 42,
    # --- input/output definition ---
    'input_channels': input_channel_names,
    'output_channels': ['00_EU'],
    # None -> output images; 'max' / 'avg' -> aggregate the output channels
    # into a single number
    'aggregate_output': 'avg',
    # --- train/val/test split ---
    'train_frac': 0.8,
    'val_frac': 0.1,
    'img_size': 224,
    # --- normalisation ---
    'background_value': os.path.join(DATA_DIR, 'secondary_only_relative_normalisation.csv'),
    'normalise': True,
    'percentile': 98.0,
    # --- metadata: cell cycle ---
    'add_cell_cycle_to_metadata': True,
    'cell_cycle_file': os.path.join(DATA_DIR, 'cell_cycle_classification.csv'),
    # --- metadata: well info (cell_type, perturbation and duration) ---
    'add_well_info_to_metadata': True,
    'well_info_file': os.path.join(DATA_DIR, 'wells_metadata.csv'),
    # --- filtering (criteria and values are passed together to filter_cells) ---
    'filter_criteria': ['is_border_cell', 'is_mitotic'],
    'filter_values': [0, 0],
}

Process data:

In [3]:
# Short alias; note that writes to `p` below also modify `data_params`
p = data_params

# Per-split lists holding one MPPData object per processed well
mpp_datas = {'train': [], 'val': [], 'test': []}
for data_dir in p['data_dirs']:
    print('\nProcessing Well {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_data = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])

    # Remove unwanted cells
    if p.get('filter_criteria', None) is not None:
        print('removing unwanted cells...')
        mpp_data.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background values for each channel
    if p['normalise']:
        mpp_data.subtract_background(p['background_value'])

    # Split data into train, validation and test
    train, val, test = mpp_data.train_val_test_split(p['train_frac'], p['val_frac'])

    # Save well data in dict
    mpp_datas['train'].append(train)
    mpp_datas['test'].append(test)
    mpp_datas['val'].append(val)

    # Drop the per-well names so only mpp_datas keeps the splits referenced
    del train, val, test, mpp_data

# Merge data from all the loaded wells
train = MPPData.concat(mpp_datas['train'])
val = MPPData.concat(mpp_datas['val'])
test = MPPData.concat(mpp_datas['test'])
del mpp_datas

# Normalise train, val and test using the normalisation parameters
# obtained from the train data only (controlled by p['percentile'])
if p['normalise']:
    rescale_values = train.rescale_intensities_per_channel(percentile=p['percentile'])
    _ = val.rescale_intensities_per_channel(rescale_values=rescale_values)
    _ = test.rescale_intensities_per_channel(rescale_values=rescale_values)
    # Store the values as built-in floats: the parameters are dumped to JSON
    # later on, and NumPy scalars such as float32 are not JSON serialisable.
    p['normalise_rescale_values'] = [float(value) for value in rescale_values]

# Add cell cycle to metadata (G1, S, G2)
if p['add_cell_cycle_to_metadata']:
    train.add_cell_cycle_to_metadata(p['cell_cycle_file'])
    val.add_cell_cycle_to_metadata(p['cell_cycle_file'])
    test.add_cell_cycle_to_metadata(p['cell_cycle_file'])

# Add well info to metadata (cell_type, perturbation and duration)
if p['add_well_info_to_metadata']:
    train.add_well_info_to_metadata(p['well_info_file'])
    val.add_well_info_to_metadata(p['well_info_file'])
    test.add_well_info_to_metadata(p['well_info_file'])


Processing Well /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/I09...
removing unwanted cells...
Total number of cells: 886

269 cells cutted by filter: is_border_cell != 0
48 cells cutted by filter: is_mitotic != 0

Final number of cells cutted: 304


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']


In [4]:
# Display the per-cell metadata table of the training split
train.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration
0,373535,plate01,I09,0,0,0,0,6,0,373506,plate01,I09,0,0,0,0,6,0,0.0,,0.0,,0.0,,S,184A1,normal,
1,373536,plate01,I09,0,0,0,0,7,0,373507,plate01,I09,0,0,0,0,7,0,0.0,,0.0,,0.0,,G1,184A1,normal,
2,373537,plate01,I09,0,0,0,0,8,0,373508,plate01,I09,0,0,0,0,8,0,0.0,,0.0,,0.0,,G1,184A1,normal,
3,373538,plate01,I09,0,0,0,0,9,0,373509,plate01,I09,0,0,0,0,9,0,0.0,,0.0,,0.0,,S,184A1,normal,
4,373539,plate01,I09,0,0,0,0,10,0,373510,plate01,I09,0,0,0,0,10,0,0.0,,0.0,,0.0,,S,184A1,normal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,248097,plate01,I09,5,4,0,0,32,0,248052,plate01,I09,5,4,0,0,32,0,0.0,,0.0,,0.0,,G1,184A1,normal,
461,248098,plate01,I09,5,4,0,0,33,0,248053,plate01,I09,5,4,0,0,33,0,0.0,,0.0,,0.0,,S,184A1,normal,
462,248100,plate01,I09,5,4,0,0,35,0,248055,plate01,I09,5,4,0,0,35,0,0.0,,0.0,,1.0,,,184A1,normal,
463,248102,plate01,I09,5,4,0,0,37,0,248057,plate01,I09,5,4,0,0,37,0,0.0,,0.0,,0.0,,S,184A1,normal,


In [5]:
# Display the per-cell metadata table of the validation split
val.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration
0,373548,plate01,I09,0,0,0,0,19,0,373519,plate01,I09,0,0,0,0,19,0,0.0,,0.0,,0.0,,G1,184A1,normal,
1,373550,plate01,I09,0,0,0,0,21,0,373521,plate01,I09,0,0,0,0,21,0,0.0,,0.0,,0.0,,S,184A1,normal,
2,228068,plate01,I09,0,1,0,0,1,0,228047,plate01,I09,0,1,0,0,1,0,0.0,,0.0,,0.0,,G1,184A1,normal,
3,324142,plate01,I09,0,2,0,0,7,0,324083,plate01,I09,0,2,0,0,7,0,0.0,,0.0,,0.0,,G2,184A1,normal,
4,291040,plate01,I09,0,3,0,0,11,0,290974,plate01,I09,0,3,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,
5,291044,plate01,I09,0,3,0,0,15,0,290978,plate01,I09,0,3,0,0,15,0,0.0,,0.0,,0.0,,S,184A1,normal,
6,291048,plate01,I09,0,3,0,0,19,0,290982,plate01,I09,0,3,0,0,19,0,0.0,,0.0,,0.0,,G2,184A1,normal,
7,291056,plate01,I09,0,3,0,0,27,0,290990,plate01,I09,0,3,0,0,27,0,0.0,,0.0,,0.0,,G1,184A1,normal,
8,383784,plate01,I09,0,4,0,0,10,0,383748,plate01,I09,0,4,0,0,10,0,0.0,,0.0,,0.0,,G2,184A1,normal,
9,383785,plate01,I09,0,4,0,0,11,0,383749,plate01,I09,0,4,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,


In [6]:
# Display the per-cell metadata table of the test split
test.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration
0,228079,plate01,I09,0,1,0,0,12,0,228058,plate01,I09,0,1,0,0,12,0,0.0,,0.0,,0.0,,S,184A1,normal,
1,324146,plate01,I09,0,2,0,0,11,0,324087,plate01,I09,0,2,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,
2,291066,plate01,I09,0,3,0,0,37,0,291000,plate01,I09,0,3,0,0,37,0,0.0,,0.0,,0.0,,G2,184A1,normal,
3,291073,plate01,I09,0,3,0,0,44,0,291007,plate01,I09,0,3,0,0,44,0,0.0,,0.0,,0.0,,S,184A1,normal,
4,291075,plate01,I09,0,3,0,0,46,0,291009,plate01,I09,0,3,0,0,46,0,0.0,,0.0,,0.0,,G1,184A1,normal,
5,383787,plate01,I09,0,4,0,0,13,0,383751,plate01,I09,0,4,0,0,13,0,0.0,,0.0,,0.0,,S,184A1,normal,
6,383800,plate01,I09,0,4,0,0,26,0,383764,plate01,I09,0,4,0,0,26,0,0.0,,0.0,,0.0,,S,184A1,normal,
7,383312,plate01,I09,1,3,0,0,20,0,383258,plate01,I09,1,3,0,0,20,0,0.0,,0.0,,0.0,,S,184A1,normal,
8,345912,plate01,I09,1,4,0,0,19,0,345875,plate01,I09,1,4,0,0,19,0,0.0,,0.0,,0.0,,G1,184A1,normal,
9,271270,plate01,I09,2,0,0,0,18,0,271178,plate01,I09,2,0,0,0,18,0,0.0,,0.0,,0.0,,G1,184A1,normal,


## Save data

Prepare to save data:

In [7]:
import shutil

# Output directory of the dataset: <BASE_DIR>/datasets/<dataset_name>
dataset_name = '184A1_hannah_EU_regression'
outdir = os.path.join(BASE_DIR, 'datasets', dataset_name)

# WARNING: an existing output directory is deleted together with its content
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Re-raise: if the old directory is still there, os.makedirs below
        # would fail with a less informative FileExistsError, hiding the
        # real cause.
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False guarantees that we start from a fresh, empty directory
os.makedirs(outdir, exist_ok=False)
    


Creating dir: /home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_regression


In [8]:
# Index the channel table by channel name once, so ids can be looked up by name
channels_by_name = train.channels.set_index('name')

# Ids of the channels (proteins) that will be used to predict the transcription rate
input_ids = list(channels_by_name.loc[p['input_channels']]['channel_id'])
# Id(s) of the channel that measures the transcription rate
output_ids = list(channels_by_name.loc[p['output_channels']]['channel_id'])

# Output channel id(s) are placed after the input channel ids
ids = input_ids + output_ids

Save metadata and used parameters

In [9]:
# Save the parameters used to build this dataset.
# The context manager makes sure the file is flushed and closed.
with open(os.path.join(outdir, 'params.json'), 'w') as params_file:
    json.dump(data_params, params_file, indent=4)

# Save the metadata of each split, plus one file with all splits together
train.metadata.to_csv(os.path.join(outdir, 'train_metadata.csv'))
val.metadata.to_csv(os.path.join(outdir, 'val_metadata.csv'))
test.metadata.to_csv(os.path.join(outdir, 'test_metadata.csv'))
pd.concat([train.metadata, val.metadata, test.metadata]).to_csv(os.path.join(outdir, 'metadata.csv'))

# Save only the channels that are used (rows ordered as in `ids`)
train.channels.set_index('channel_id').loc[ids].to_csv(os.path.join(outdir, 'channels.csv'))

Save Images

In [10]:
# Note: the response value (y) is not computed and saved separately here;
# that is left to the modelling part.


def save_split_images(mpp_split, split_name):
    """Build the images of one split and save the selected channels to disk.

    The images are obtained from ``mpp_split.get_object_imgs`` with the
    configured image size, stacked into a NumPy array, restricted on the last
    axis to ``ids`` and written to ``<outdir>/<split_name>_dataset.npy``.

    Parameters
    ----------
    mpp_split : object exposing ``get_object_imgs`` (here: train, val or test)
    split_name : str
        Prefix of the output file name, e.g. ``'train'``.
    """
    # The full image array only lives inside this function, so it is released
    # as soon as the split has been written.
    images = np.array(mpp_split.get_object_imgs(data='MPP', img_size=p['img_size']))
    np.save(os.path.join(outdir, '{}_dataset.npy'.format(split_name)), images[:, :, :, ids])


# Save one split at a time and drop it right afterwards to limit memory use
save_split_images(train, 'train')
del train

save_split_images(val, 'val')
del val

save_split_images(test, 'test')
del test