# Data Preprocessing for predicting Transcription Rate (TS)

This notebook is meant to convert raw cell data from several wells into multichannel images (along with their corresponding metadata).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set directory paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Project root: two levels above the directory the notebook is running from
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''), '../..'))
if os.path.exists(BASE_DIR):
    print('BASE_DIR: {}'.format(BASE_DIR))
else:
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))

# Raw data lives in <BASE_DIR>/datasets/raw
DATA_DIR = os.path.join(BASE_DIR, 'datasets', 'raw')
if os.path.exists(DATA_DIR):
    print('DATA_DIR: {}'.format(DATA_DIR))
else:
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))

# Make the project's 'workspace' folder importable so that the local
# pelkmans package (raw data -> images conversion) can be loaded
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
from pelkmans.mpp_data import MPPData as MPPData

BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw
Setting BASE_DIR to /home/hhughes/Documents/Master_Thesis/Project


Check available data (Perturbations and Wells):

In [2]:
def _list_subdirs(parent_dir):
    """Return the names of the entries of `parent_dir` that are directories."""
    return [entry for entry in os.listdir(parent_dir)
            if os.path.isdir(os.path.join(parent_dir, entry))]


# Every sub-directory of DATA_DIR is a perturbation; every sub-directory
# of a perturbation is one of its wells.
perturbations = _list_subdirs(DATA_DIR)
# Mapping: perturbation name -> list of locally available wells
local_data = {
    perturbation: _list_subdirs(os.path.join(DATA_DIR, perturbation))
    for perturbation in perturbations
}

Select the perturbations and their wells to process:

In [3]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# To process everything that is available locally use instead:
#selected_data = local_data

# Restrict processing to specific perturbations and/or wells
selected_data = {
    '184A1_hannah_unperturbed': ['I11'],
    '184A1_hannah_TSA': ['J20'],
}
print('\nSelected perturbations-wells:\n{}'.format(selected_data))

# Build the list of well directories (<DATA_DIR>/<perturbation>/<well>),
# failing on the first one that is not present on disk
data_dirs = []
for perturbation, selected_wells in selected_data.items():
    for well in selected_wells:
        well_dir = os.path.join(DATA_DIR, perturbation, well)
        data_dirs.append(well_dir)
        if not os.path.exists(well_dir):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(well_dir))

Local available perturbations-wells:
{'184A1_hannah_unperturbed': ['I11', 'I09', 'J10'], '184A1_hannah_TSA': ['J20', 'I16', 'J13']}

Selected perturbations-wells:
{'184A1_hannah_unperturbed': ['I11'], '184A1_hannah_TSA': ['J20']}


Set parameters for data transformation:

In [4]:
# Parameters that drive the whole preprocessing. The dict is also written
# to params.json next to the generated dataset (see the "Save metadata and
# used parameters" cell), so every value should be JSON-serialisable.
data_params = {
    # where to read data from (one directory per selected well)
    'data_dirs': data_dirs,
    # forwarded to MPPData.from_data_dir together with each data dir
    'dir_type': 'hannah',
    # make results reproducible (forwarded to MPPData.from_data_dir)
    'seed': 42,
    # input/output definition
    # names of the channels used as model input; they are translated into
    # channel ids before the images are saved
    'input_channels': [
        '00_DAPI',
        '07_H2B',
        '01_CDK9_pT186',
        '03_CDK9',
        '05_GTF2B',
        '07_SETD1A',
        '08_H3K4me3',
        '09_SRRM2',
        '10_H3K27ac',
        '11_KPNA2_MAX',
        '12_RB1_pS807_S811',
        '13_PABPN1',
        '14_PCNA',
        '15_SON',
        '16_H3',
        '17_HDAC3',
        '19_KPNA1_MAX',
        '20_SP100',
        '21_NCL',
        '01_PABPC1',
        '02_CDK7',
        '03_RPS6',
        '05_Sm',
        '07_POLR2A',
        '09_CCNT1',
        '10_POL2RA_pS2',
        '11_PML',
        '12_YAP1',
        '13_POL2RA_pS5',
        '15_U2SNRNPB',
        '18_NONO',
        '20_ALYREF',
        '21_COIL',
    ],
    # name of the channel holding the value to predict; it is saved as the
    # last channel of the images (after all the input channels)
    'output_channels': ['00_EU'],
    # None results in output images; 'max' / 'avg' aggregate the output
    # channels into a single number.
    # NOTE: in this notebook the value is only referenced by the disabled
    # legacy code of the "Save Images" cell.
    'aggregate_output': 'avg',
    # train/val/test split (both fractions are passed to
    # MPPData.train_val_test_split; presumably the remainder is the test
    # fraction - TODO confirm in mpp_data)
    'train_frac': 0.8,
    'val_frac': 0.1,
    # forwarded as img_size to get_object_imgs when the images are generated
    'img_size': 224,
    # normalisation
    # CSV file passed to MPPData.subtract_background
    'background_value': os.path.join(DATA_DIR, 'secondary_only_relative_normalisation.csv'),
    # enables both the background subtraction and the per-channel rescaling
    'normalise': True,
    # percentile used to compute the rescale values on the train split
    'percentile': 98.0,
    # Add Cell cycle to metadata
    'add_cell_cycle_to_metadata': True,
    'cell_cycle_file': os.path.join(DATA_DIR, 'cell_cycle_classification.csv'),
    # Add well info to metadata (cell_type, perturbation and duration)
    'add_well_info_to_metadata': True,
    'well_info_file': os.path.join(DATA_DIR, 'wells_metadata.csv'),
    # Filtering: both lists are passed to MPPData.filter_cells; judging by
    # its log output, cells whose criterion equals the value at the same
    # position are removed.
    # Alternative (disabled) filter set:
    #'filter_criteria': ['is_border_cell', 'is_mitotic', 'is_polynuclei_184A1'],
    #'filter_values': [1, 1, 'NaN'],
    'filter_criteria': ['is_border_cell', 'is_polynuclei_184A1', 'cell_cycle'],
    'filter_values': [1, 1, 'NaN'],
}
# Short alias used by the following cells. It refers to the same dict
# object, so entries added through `p` also show up in `data_params`.
p = data_params

Process data:

In [5]:
# Per-split accumulators: each list receives one MPPData object per well
mpp_datas = {'train': [], 'val': [], 'test': []}

for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))

    # Read the raw well data into an MPPData object
    mpp_data = MPPData.from_data_dir(
        data_dir,
        dir_type=p['dir_type'],
        seed=p['seed'],
    )

    # Optionally annotate every cell with its cell cycle phase (G1, S, G2).
    # Important! A cell whose mapobject_id_cell is missing from
    # cell_cycle_file is in Mitosis phase.
    if p['add_cell_cycle_to_metadata']:
        mpp_data.add_cell_cycle_to_metadata(p['cell_cycle_file'])

    # Optionally annotate every cell with the information of its well
    if p['add_well_info_to_metadata']:
        mpp_data.add_well_info_to_metadata(p['well_info_file'])

    # Drop the cells matching the configured filters (if any were given)
    if p.get('filter_criteria') is not None:
        print('Removing unwanted cells...')
        mpp_data.filter_cells(p['filter_criteria'], p['filter_values'])

    # Per-channel background subtraction
    if p['normalise']:
        mpp_data.subtract_background(p['background_value'])

    # Split this well and file each part under its split name
    train, val, test = mpp_data.train_val_test_split(p['train_frac'], p['val_frac'])
    mpp_datas['train'].append(train)
    mpp_datas['val'].append(val)
    mpp_datas['test'].append(test)

    # Drop the per-well references so the memory can be released
    del train, val, test, mpp_data

# Merge the per-well parts of every split into a single object per split
train, val, test = (
    MPPData.concat(mpp_datas[split_name]) for split_name in ('train', 'val', 'test')
)
del mpp_datas

# Rescale the intensities of every split with the values computed on the
# train split only (inner percentile% of the train data), and keep those
# values in the parameters so they get saved with the dataset
if p['normalise']:
    rescale_values = train.rescale_intensities_per_channel(percentile=p['percentile'])
    _ = val.rescale_intensities_per_channel(rescale_values=rescale_values)
    _ = test.rescale_intensities_per_channel(rescale_values=rescale_values)
    p['normalise_rescale_values'] = list(rescale_values)


Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/I11...
Removing unwanted cells...
Total number of cells: 1025
333 cells cutted by filter: is_border_cell == 1
33 cells cutted by filter: is_polynuclei_184A1 == 1
375 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 375


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_TSA/J20...
Removing unwanted cells...
Total number of cells: 944
285 cells cutted by filter: is_border_cell == 1
24 cells cutted by filter: is_polynuclei_184A1 == 1
341 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 341


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']


In [18]:
# Show the metadata table of the train split (a bare last expression is
# rendered as the cell output)
train.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration
0,384925,plate01,I11,0,0,0,0,8,0,384878,plate01,I11,0,0,0,0,8,0,0.0,,0.0,,0.0,,G2,184A1,normal,
1,384928,plate01,I11,0,0,0,0,11,0,384881,plate01,I11,0,0,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,
2,384929,plate01,I11,0,0,0,0,12,0,384882,plate01,I11,0,0,0,0,12,0,0.0,,0.0,,0.0,,G2,184A1,normal,
3,384930,plate01,I11,0,0,0,0,13,0,384883,plate01,I11,0,0,0,0,13,0,0.0,,0.0,,0.0,,G1,184A1,normal,
4,384931,plate01,I11,0,0,0,0,14,0,384884,plate01,I11,0,0,0,0,14,0,0.0,,0.0,,0.0,,G1,184A1,normal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,289489,plate01,J20,5,4,0,0,24,0,289454,plate01,J20,5,4,0,0,24,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0
998,289490,plate01,J20,5,4,0,0,25,0,289455,plate01,J20,5,4,0,0,25,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0
999,289493,plate01,J20,5,4,0,0,28,0,289458,plate01,J20,5,4,0,0,28,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0
1000,289494,plate01,J20,5,4,0,0,29,0,289459,plate01,J20,5,4,0,0,29,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0


In [19]:
# Show the metadata table of the validation split
val.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration
0,384932,plate01,I11,0,0,0,0,15,0,384885,plate01,I11,0,0,0,0,15,0,0.0,,0.0,,0.0,,S,184A1,normal,
1,384956,plate01,I11,0,0,0,0,39,0,384909,plate01,I11,0,0,0,0,39,0,0.0,,0.0,,0.0,,S,184A1,normal,
2,383889,plate01,I11,0,1,0,0,15,0,383825,plate01,I11,0,1,0,0,15,0,0.0,,0.0,,0.0,,G1,184A1,normal,
3,383902,plate01,I11,0,1,0,0,28,0,383838,plate01,I11,0,1,0,0,28,0,0.0,,0.0,,0.0,,G1,184A1,normal,
4,337400,plate01,I11,0,2,0,0,39,0,337277,plate01,I11,0,2,0,0,39,0,0.0,,0.0,,0.0,,G1,184A1,normal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,242758,plate01,J20,5,0,0,0,18,0,242678,plate01,J20,5,0,0,0,18,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0
121,367236,plate01,J20,5,2,0,0,11,0,367205,plate01,J20,5,2,0,0,11,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0
122,367248,plate01,J20,5,2,0,0,23,0,367217,plate01,J20,5,2,0,0,23,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0
123,321469,plate01,J20,5,3,0,0,12,0,321436,plate01,J20,5,3,0,0,12,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0


In [20]:
# Show the metadata table of the test split
test.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration
0,384935,plate01,I11,0,0,0,0,18,0,384888,plate01,I11,0,0,0,0,18,0,0.0,,0.0,,0.0,,G1,184A1,normal,
1,384951,plate01,I11,0,0,0,0,34,0,384904,plate01,I11,0,0,0,0,34,0,0.0,,0.0,,0.0,,G1,184A1,normal,
2,383890,plate01,I11,0,1,0,0,16,0,383826,plate01,I11,0,1,0,0,16,0,0.0,,0.0,,0.0,,G2,184A1,normal,
3,383898,plate01,I11,0,1,0,0,24,0,383834,plate01,I11,0,1,0,0,24,0,0.0,,0.0,,0.0,,G2,184A1,normal,
4,383909,plate01,I11,0,1,0,0,35,0,383845,plate01,I11,0,1,0,0,35,0,0.0,,0.0,,0.0,,G1,184A1,normal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,367246,plate01,J20,5,2,0,0,21,0,367215,plate01,J20,5,2,0,0,21,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0
122,367252,plate01,J20,5,2,0,0,27,0,367221,plate01,J20,5,2,0,0,27,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0
123,321465,plate01,J20,5,3,0,0,8,0,321432,plate01,J20,5,3,0,0,8,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0
124,321478,plate01,J20,5,3,0,0,21,0,321445,plate01,J20,5,3,0,0,21,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0


## Save data

Prepare to save data:

In [21]:
import shutil

# Name of the dataset and the directory it is written to
dataset_name = '184A1_hannah_EU_regression'
outdir = os.path.join(BASE_DIR, 'datasets', dataset_name)

# An existing output directory is removed so the dataset is always rebuilt
# from scratch. WARNING: this deletes any previously generated dataset
# stored under the same name.
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Stop here with the real cause. Continuing would only fail a few
        # lines below with a misleading FileExistsError from os.makedirs,
        # possibly on top of a half-deleted directory.
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False: the directory must not exist at this point
os.makedirs(outdir, exist_ok=False)
    


Creating dir: /home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_regression


In [23]:
# Lookup table: channel name -> channel_id
channel_id_by_name = train.channels.set_index('name')['channel_id']

# Ids of the channels (proteins) used to predict the transcription rate
input_ids = list(channel_id_by_name.loc[p['input_channels']])
# Id(s) of the channel that measures the transcription rate
output_ids = list(channel_id_by_name.loc[p['output_channels']])
# Input channel ids first, output channel id(s) at the end
ids = input_ids + output_ids

Save metadata and used parameters

In [24]:
# Save the parameters used to build the dataset. The file is opened in a
# `with` block so that it is closed (and therefore completely flushed to
# disk) as soon as the dump finishes, instead of leaking the file handle.
with open(os.path.join(outdir, 'params.json'), 'w') as params_file:
    json.dump(data_params, params_file, indent=4)

# Save the metadata of every split, plus one file with all of them together
train.metadata.to_csv(os.path.join(outdir, 'train_metadata.csv'))
val.metadata.to_csv(os.path.join(outdir, 'val_metadata.csv'))
test.metadata.to_csv(os.path.join(outdir, 'test_metadata.csv'))
pd.concat([train.metadata, val.metadata, test.metadata]).to_csv(os.path.join(outdir, 'metadata.csv'))

# Save only the channels that are actually used (input channels followed by
# the output channel), indexed by channel_id
#train.channels.to_csv(os.path.join(outdir, 'channels.csv'))
train.channels.set_index('channel_id').loc[ids].to_csv(os.path.join(outdir, 'channels.csv'))

Save Images

In [25]:
# Note! The response value (y) is NOT computed and saved separately here;
# it is computed in the modeling part instead.

# Disabled legacy version (computed y here and saved x / y into .npz
# files); kept for reference only, the string below is never executed.
"""
# get images
train_dataset = np.array(train.get_object_imgs(data='MPP', img_size=p['img_size']))
del(train)
val_dataset = np.array(val.get_object_imgs(data='MPP', img_size=p['img_size']))
del(val)
test_dataset = np.array(test.get_object_imgs(data='MPP', img_size=p['img_size']))
del(test)

# Create responce variable (y)
if p['aggregate_output'] == 'avg':
    train_dataset_y = np.array([img[img!=0].mean() for img in train_dataset[:,:,:,output_ids]])
    val_dataset_y = np.array([img[img!=0].mean() for img in val_dataset[:,:,:,output_ids]])
    test_dataset_y = np.array([img[img!=0].mean() for img in test_dataset[:,:,:,output_ids]])

# Save datasets
np.savez(os.path.join(outdir, 'train_dataset.npz'), x=train_dataset[:,:,:,input_ids], y=train_dataset_y)
del(train_dataset)
np.savez(os.path.join(outdir, 'val_dataset.npz'), x=val_dataset[:,:,:,input_ids], y=val_dataset_y)
del(val_dataset)
np.savez(os.path.join(outdir, 'test_dataset.npz'), x=test_dataset[:,:,:,input_ids], y=test_dataset_y)
del(test_dataset)
"""


def _save_split_images(split, filename):
    """Build the images of `split` and save the selected channels in `outdir`.

    Only the channels listed in `ids` (input channels followed by the output
    channel) are written. The full image array is local to this function,
    so it is released as soon as the file has been saved.
    """
    images = np.array(split.get_object_imgs(data='MPP', img_size=p['img_size']))
    np.save(os.path.join(outdir, filename), images[:, :, :, ids])


# Export one split at a time and drop it right afterwards to keep the
# memory usage low
_save_split_images(train, 'train_dataset.npy')
del train

_save_split_images(val, 'val_dataset.npy')
del val

_save_split_images(test, 'test_dataset.npy')
del test