# Data Preprocessing for predicting Transcription Rate (TS)

This notebook is ment to convert raw cell data from several wells into multichannel images (along with its corresponding metadata).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set Directories paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''),'../..'))
if not os.path.exists(BASE_DIR):
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))
else:
    print('BASE_DIR: {}'.format(BASE_DIR))
    
DATA_DIR = os.path.join(BASE_DIR, 'datasets', 'raw')
if not os.path.exists(DATA_DIR):
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))
else:
    print('DATA_DIR: {}'.format(DATA_DIR))
    
# Add BASE_DIR to sys paths (for loading libraries)
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData as MPPData

BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw
Setting BASE_DIR to /home/hhughes/Documents/Master_Thesis/Project


Check available data (Perturbations and Wells):

In [2]:
# Save available local Perturbations and Wells
perturbations = [p for p in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, p))]
local_data = {}
#print('Local available perturbations-wells:\n')
for p in perturbations:
    pertur_dir = os.path.join(DATA_DIR, p)
    wells = [w for w in os.listdir(pertur_dir) if os.path.isdir(os.path.join(pertur_dir, w))]
    #print('{}\n\t{}\n'.format(p, wells))
    local_data[p] = wells

Select Perturbations and its wells to process: 

In [5]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# In case you only want to load some specific perturbations and/or wells:
selected_data = {
    '184A1_hannah_unperturbed': ['I11', 'I09'],
    '184A1_hannah_TSA': ['J20', 'I16'],
}

selected_data = {
    '184A1_hannah_unperturbed': ['I11']
}

# Process all available data:
selected_data = local_data

print('\nSelected perturbations-wells:\n{}'.format(selected_data))

#Generate and save data dirs
data_dirs = []
for p in selected_data.keys():
    for w in selected_data[p]:
        d = os.path.join(DATA_DIR, p, w)
        data_dirs.append(d)
        if not os.path.exists(d):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(d))

Local available perturbations-wells:
{'184A1_hannah_unperturbed': ['I11', 'I09', 'J10'], '184A1_hannah_TSA': ['J20', 'I16', 'J13']}

Selected perturbations-wells:
{'184A1_hannah_unperturbed': ['I11', 'I09', 'J10'], '184A1_hannah_TSA': ['J20', 'I16', 'J13']}


Set parameters for data transformation:

In [6]:
data_params = {
    # where to read data from
    'data_dirs': data_dirs,
    'dir_type': 'hannah',
    # make results reproducible
    'seed': 42,
    # input/output definition
    'input_channels': [
        '00_DAPI',
        '07_H2B',
        '01_CDK9_pT186',
        '03_CDK9',
        '05_GTF2B',
        '07_SETD1A',
        '08_H3K4me3',
        '09_SRRM2',
        '10_H3K27ac',
        '11_KPNA2_MAX',
        '12_RB1_pS807_S811',
        '13_PABPN1',
        '14_PCNA',
        '15_SON',
        '16_H3',
        '17_HDAC3',
        '19_KPNA1_MAX',
        '20_SP100',
        '21_NCL',
        '01_PABPC1',
        '02_CDK7',
        '03_RPS6',
        '05_Sm',
        '07_POLR2A',
        '09_CCNT1',
        '10_POL2RA_pS2',
        '11_PML',
        '12_YAP1',
        '13_POL2RA_pS5',
        '15_U2SNRNPB',
        '18_NONO',
        '20_ALYREF',
        '21_COIL',
    ],
    'output_channels': ['00_EU'],
    'aggregate_output': 'avg', # None results in output images, 'max', 'avg' aggregate output channels and output a single number
    # train/val/test split
    'train_frac': 0.8,
    'val_frac': 0.1,
    'img_size': 224,
    # Subtract background
    'subtract_background': True,
    'background_value': os.path.join(DATA_DIR, 'secondary_only_relative_normalisation.csv'),
    # normalisation
    'normalise': True,
    'percentile': 98.0,
    # Add Cell cycle to metadata
    'add_cell_cycle_to_metadata': True,
    'cell_cycle_file': os.path.join(DATA_DIR, 'cell_cycle_classification.csv'),
    # Add well info to metadata (cell_type, perturbation and duration)
    'add_well_info_to_metadata': True,
    'well_info_file': os.path.join(DATA_DIR, 'wells_metadata.csv'),
    # Fitering
    #'filter_criteria': ['is_border_cell', 'is_mitotic', 'is_polynuclei_184A1'],
    #'filter_values': [1, 1, 'NaN'],
    'filter_criteria': ['is_border_cell', 'is_polynuclei_184A1', 'cell_cycle'],
    'filter_values': [1, 1, 'NaN'],
    # Convert into image
    'convert_into_image': False,
    'remove_original_data': True,
    # Project each cell channel into a scalar
    # methods: 'avg' and 'median'
    'project_into_scalar': True,
    'method': 'avg',
}
p = data_params

Process data:

In [7]:
for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_temp = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])
    
    # Add cell cycle to metadata (G1, S, G2)
    # Important! If mapobject_id_cell is not in cell_cycle_file =>
    # its corresponding cell is in Mitosis phase!
    if p['add_cell_cycle_to_metadata']:
        mpp_temp.add_cell_cycle_to_metadata(p['cell_cycle_file'])
    
    # Add well info to metadata
    if p['add_well_info_to_metadata']:
        mpp_temp.add_well_info_to_metadata(p['well_info_file'])
    
    # Remove unwanted cells
    if p.get('filter_criteria', None) is not None:
        print('Removing unwanted cells...')
        mpp_temp.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background  values for each channel
    if p['subtract_background']:
        mpp_temp.subtract_background(p['background_value'])
    
    # Project every uni-channel images into a scalar for further analysis
    if p['project_into_scalar']:
        print('\nProjecting data...')
        mpp_temp.add_scalar_projection(p['method'])
        
        instance_vars = {'labels', 'x', 'y', 'mpp', 'mapobject_ids', 'mcu_ids','conditions'}
        for var in set(vars(mpp_temp).keys()).intersection(instance_vars):
            delattr(mpp_temp, var)
    
    # Concatenate wells
    # Check first if data sets are already defined
    if 'mpp_data' not in globals().keys():
        mpp_data = mpp_temp
    else:
        mpp_data.merge_instances([mpp_temp])
        del(mpp_temp)



Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/I11...
Removing unwanted cells...
Total number of cells: 1025
333 cells cutted by filter: is_border_cell == 1
33 cells cutted by filter: is_polynuclei_184A1 == 1
375 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 375


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/I09...
Removing unwanted cells...
Total number of cells: 886
269 cells cutted by filter: is_border_cell == 1
30 cells cutted by filter: is_polynuclei_184A1 == 1
323 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 323


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/J10...
Removing unwanted cells...
Total number of cells: 1080
333 cells cutted by filter: is_border_cell == 1
31 cells cutted by filter: is_polynuclei_184A1 == 1
400 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 400


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_TSA/J20...
Removing unwanted cells...
Total number of cells: 944
285 cells cutted by filter: is_border_cell == 1
24 cells cutted by filter: is_polynuclei_184A1 == 1
341 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 341


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_TSA/I16...
Removing unwanted cells...
Total number of cells: 994
306 cells cutted by filter: is_border_cell == 1
35 cells cutted by filter: is_polynuclei_184A1 == 1
372 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 372


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_TSA/J13...
Removing unwanted cells...
Total number of cells: 819
272 cells cutted by filter: is_border_cell == 1
23 cells cutted by filter: is_polynuclei_184A1 == 1
320 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 320


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...


In [8]:
mpp_data.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
0,384925,plate01,I11,0,0,0,0,8,0,384878,plate01,I11,0,0,0,0,8,0,0.0,,0.0,,0.0,,G2,184A1,normal,,49.580919,202.820664,18.948131,40.829000,117.064156,37.114938,57.157965,53.252705,96.106062,88.486502,109.575919,157.057707,82.718242,47.307534,274.299171,102.145356,85.217198,19.401196,57.496097,20.530550,124.133512,24.785745,48.391676,190.700551,113.529029,324.867045,10.849931,111.732052,229.702287,47.331092,379.498006,347.910186,10.121987,6.791126,1.707966,495.497004,7043.172991,10227.344984
1,384928,plate01,I11,0,0,0,0,11,0,384881,plate01,I11,0,0,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,,30.164021,182.196790,9.705733,21.723158,80.366882,17.517103,34.247029,45.296948,54.844183,28.424197,76.827739,133.000519,32.565534,35.066510,199.746845,53.335645,46.448794,9.864785,39.274740,11.105025,92.830594,14.205652,31.476830,84.119378,57.919687,255.926885,6.019770,52.357283,188.020420,28.492050,263.426842,254.811570,6.093975,4.350661,1.586379,359.699839,6315.950161,8270.801364
2,384929,plate01,I11,0,0,0,0,12,0,384882,plate01,I11,0,0,0,0,12,0,0.0,,0.0,,0.0,,G2,184A1,normal,,50.551918,102.011438,14.855990,37.375930,80.897943,35.959874,61.106586,43.479062,98.110620,46.911233,110.400647,141.185170,49.535250,36.820278,123.243392,70.658524,58.768373,19.428695,37.602250,12.419216,113.725790,17.868525,36.993121,163.472033,112.520560,223.700999,9.371938,155.857527,136.674591,48.235297,229.832606,187.211588,6.947348,5.473350,1.624962,381.879589,5611.754178,8009.514178
3,384930,plate01,I11,0,0,0,0,13,0,384883,plate01,I11,0,0,0,0,13,0,0.0,,0.0,,0.0,,G1,184A1,normal,,36.229395,215.829031,12.517807,29.875591,102.446467,30.596964,45.135234,50.819852,58.881205,47.103537,106.866420,167.980486,55.065943,45.481732,266.386174,71.393406,64.611638,19.871854,60.299853,12.357054,119.148002,17.248061,39.763981,137.131355,81.777671,294.531620,10.640693,106.956343,212.578367,36.870416,367.738222,358.291881,9.133031,5.054470,1.489578,418.298199,7537.571819,11181.232349
4,384931,plate01,I11,0,0,0,0,14,0,384884,plate01,I11,0,0,0,0,14,0,0.0,,0.0,,0.0,,G1,184A1,normal,,41.856395,251.145820,7.348860,23.377857,61.773567,5.226182,25.000256,5.750725,81.997321,39.261571,83.900904,119.411416,18.633530,6.530840,304.286646,56.059566,59.092754,2.530481,48.826359,8.468889,96.642851,13.612243,27.191182,63.549832,57.583291,228.511565,5.053079,74.234135,231.847623,28.325956,342.079412,315.073009,7.378918,4.307978,1.638408,367.575124,516.542219,2242.101304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3612,284257,plate01,J13,5,4,0,0,16,0,284230,plate01,J13,5,4,0,0,16,0,0.0,,0.0,,0.0,,S,184A1,TSA,30.0,36.228779,209.716216,7.942593,17.984600,60.868849,17.328932,29.106292,37.189649,278.824554,44.406394,49.409101,126.817310,73.869078,38.481424,262.339198,72.327595,61.510589,16.015946,52.443471,9.528996,60.228627,13.913276,31.929463,88.000252,40.163512,239.036010,8.032978,64.306855,242.191424,30.561041,267.758279,308.714466,7.168197,4.255319,1.499613,306.581961,4499.556777,10026.592331
3613,284258,plate01,J13,5,4,0,0,17,0,284231,plate01,J13,5,4,0,0,17,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0,32.458674,209.488659,11.205831,24.285097,78.980794,24.463916,43.873929,50.263050,183.135711,43.408463,49.551876,169.859174,43.273022,46.224629,252.235002,85.564977,68.374515,30.362304,64.411121,16.675623,74.908022,18.043900,42.845318,102.592567,63.714476,242.367395,9.221728,111.617335,239.930371,40.757160,293.558403,354.000245,8.567912,4.791024,1.651020,305.251321,6096.387987,10171.199519
3614,284260,plate01,J13,5,4,0,0,19,0,284233,plate01,J13,5,4,0,0,19,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0,29.343924,195.001162,6.641188,16.955150,50.935482,11.629758,24.423626,39.322417,192.451286,35.528139,48.128282,131.560420,30.702149,30.394663,248.003017,58.867249,44.932914,13.793796,50.021936,9.437159,59.199136,12.058069,31.186000,65.928638,37.005856,216.726063,6.958052,48.490317,227.708960,25.061305,255.906819,335.438800,6.492982,4.117301,1.544936,283.376949,5882.252904,8534.171736
3615,284261,plate01,J13,5,4,0,0,20,0,284234,plate01,J13,5,4,0,0,20,0,0.0,,0.0,,0.0,,G1,184A1,TSA,30.0,33.870777,234.665645,10.683163,21.774896,75.347900,19.759536,37.449078,44.315569,218.585294,56.257521,73.823027,132.948999,43.068630,43.762826,273.025224,83.658497,66.278915,20.182075,66.902140,14.483246,93.263207,19.926318,40.136706,108.875594,55.197546,271.813168,9.005791,97.250617,256.702377,35.592933,260.657575,335.056422,8.605475,5.258699,1.545509,363.587973,5530.696576,9958.918240


## Save data

Prepare to save data:

In [9]:
import shutil

# create dir
dataset_name = '184A1_hannah_scalars'
outdir = os.path.join(BASE_DIR, 'datasets', dataset_name)
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))

print('Creating dir: {}'.format(outdir))
os.makedirs(outdir, exist_ok=False)
    

Creating dir: /home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_scalars


In [11]:
# Get channels ids (proteins) which will be used to predict transcripcion rate
input_ids = list(mpp_data.channels.set_index('name').loc[p['input_channels']]['channel_id'])
# Get id of the channel that measure trancripcion rate
output_ids = list(mpp_data.channels.set_index('name').loc[p['output_channels']]['channel_id'])
# add output channel id after the input channels ids
channels_ids = input_ids + output_ids

Save metadata and used parameters

In [12]:
# save params
json.dump(data_params, open(os.path.join(outdir, 'params.json'), 'w'), indent=4)

# save metadata
mpp_data.metadata.to_csv(os.path.join(outdir, 'metadata.csv'))

# Save used channels
mpp_data.channels.to_csv(os.path.join(outdir, 'channels.csv'))
#mpp_data.channels.set_index('channel_id').loc[channels_ids].to_csv(os.path.join(outdir, 'channels.csv'))