# Data Preprocessing for predicting Transcription Rate (TS)

This notebook is meant to convert raw cell data from several wells into multichannel images (along with their corresponding metadata).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set Directories paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Resolve the project root: two directory levels above the current working directory
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''), '../..'))
if os.path.exists(BASE_DIR):
    print('BASE_DIR: {}'.format(BASE_DIR))
else:
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))

# Make the project's 'workspace' folder importable, then load the
# MPPData class used to convert raw data into images
workspace_dir = os.path.join(BASE_DIR, 'workspace')
sys.path.insert(1, workspace_dir)
from pelkmans.mpp_data import MPPData as MPPData

# JSON file holding every setting of this preprocessing run; stop early if it is missing
PARAMETERS_FILE = os.path.join(BASE_DIR, 'workspace/scripts/Parameters/temp_parameters.json')
parameters_file_found = os.path.exists(PARAMETERS_FILE)
if not parameters_file_found:
    raise Exception('Parameter file {} does not exist!'.format(PARAMETERS_FILE))


BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
Setting BASE_DIR to /home/hhughes/Documents/Master_Thesis/Project


Open parameters file

In [2]:
# Read the run configuration (a JSON document) into the parameter dict `p`
with open(PARAMETERS_FILE) as params_file:
    p = json.loads(params_file.read())

In [3]:
# List the names of all parameters read from the parameters file
p.keys()

dict_keys(['_comment', 'raw_data_dir', 'dir_type', 'perturbations_and_wells', 'output_data_dir', 'seed', 'input_channels', 'output_channels', 'aggregate_output', 'train_frac', 'val_frac', 'img_size', 'subtract_background', 'background_value', 'normalise', 'percentile', 'add_cell_cycle_to_metadata', 'cell_cycle_file', 'add_well_info_to_metadata', 'well_info_file', 'filter_criteria', 'filter_values', 'convert_into_image', 'remove_original_data', 'project_into_scalar', 'method', 'parameter_file_name'])

In [4]:
# Display the whole parameter dict to review the configuration of this run
p

{'_comment': 'Save file name as reference ---------------------------------',
 'raw_data_dir': '/home/hhughes/Documents/Master_Thesis/Project/datasets/raw',
 'dir_type': 'hannah',
 'perturbations_and_wells': {'184A1_hannah_unperturbed': ['I09']},
 'output_data_dir': '/home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_regression_TEST_2',
 'seed': 42,
 'input_channels': ['00_DAPI',
  '07_H2B',
  '01_CDK9_pT186',
  '03_CDK9',
  '05_GTF2B',
  '07_SETD1A',
  '08_H3K4me3',
  '09_SRRM2',
  '10_H3K27ac',
  '11_KPNA2_MAX',
  '12_RB1_pS807_S811',
  '13_PABPN1',
  '14_PCNA',
  '15_SON',
  '16_H3',
  '17_HDAC3',
  '19_KPNA1_MAX',
  '20_SP100',
  '21_NCL',
  '01_PABPC1',
  '02_CDK7',
  '03_RPS6',
  '05_Sm',
  '07_POLR2A',
  '09_CCNT1',
  '10_POL2RA_pS2',
  '11_PML',
  '12_YAP1',
  '13_POL2RA_pS5',
  '15_U2SNRNPB',
  '18_NONO',
  '20_ALYREF',
  '21_COIL'],
 'output_channels': ['00_EU'],
 'aggregate_output': 'avg',
 'train_frac': 0.8,
 'val_frac': 0.1,
 'img_size': 224,
 'subtract

Set raw data directory

In [5]:
# Folder with the raw data, as configured in the parameters file.
# Abort right away when it is not available on this machine.
DATA_DIR = p['raw_data_dir']
if os.path.exists(DATA_DIR):
    print('DATA_DIR: {}'.format(DATA_DIR))
else:
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))

DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw


Check available data (Perturbations and Wells):

In [6]:
def _list_subdirs(parent_dir):
    """Return the names of the entries of `parent_dir` that are directories."""
    return [name for name in os.listdir(parent_dir)
            if os.path.isdir(os.path.join(parent_dir, name))]

# Map every locally available perturbation (sub-folder of DATA_DIR)
# to the list of its wells (sub-folders of the perturbation folder)
perturbations = _list_subdirs(DATA_DIR)
local_data = {per: _list_subdirs(os.path.join(DATA_DIR, per)) for per in perturbations}

Select Perturbations and their wells to process:

In [7]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# Perturbations and wells to process, as configured in the parameters file.
# Expected format: {'<perturbation>': ['<well>', ...]}, for instance
#   {'184A1_hannah_unperturbed': ['I11', 'I09'], '184A1_hannah_TSA': ['J20', 'I16']}
# To process all locally available data instead, use: selected_data = local_data
selected_data = p['perturbations_and_wells']

print('\nSelected perturbations-wells:\n{}'.format(selected_data))

# Generate and save data dirs (one per selected well).
# Every directory is validated before it is stored, so the list never
# contains a path that does not exist.
data_dirs = []
for per, wells_of_per in selected_data.items():
    for w in wells_of_per:
        d = os.path.join(DATA_DIR, per, w)
        if not os.path.exists(d):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(d))
        data_dirs.append(d)
# Stored in the parameter dict so the processing cell (and the saved params) can use it
p['data_dirs'] = data_dirs

Local available perturbations-wells:
{'184A1_hannah_unperturbed': ['I11', 'I09', 'J10'], '184A1_hannah_TSA': ['J20', 'I16', 'J13']}

Selected perturbations-wells:
{'184A1_hannah_unperturbed': ['I09']}


Process data:

In [8]:
# Start from empty data sets on every execution of this cell. Checking
# globals() for an existing `train` instead would make a re-run merge the
# wells a second time into the data sets left over from the previous run.
train, val, test = None, None, None

for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_temp = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])

    # Add cell cycle to metadata (G1, S, G2)
    # Important! If mapobject_id_cell is not in cell_cycle_file =>
    # its corresponding cell is in Mitosis phase!
    if p['add_cell_cycle_to_metadata']:
        print('Adding cell cycle to metadata...')
        mpp_temp.add_cell_cycle_to_metadata(os.path.join(DATA_DIR, p['cell_cycle_file']))

    # Add well info to metadata
    if p['add_well_info_to_metadata']:
        print('Adding well info to metadata...')
        mpp_temp.add_well_info_to_metadata(os.path.join(DATA_DIR, p['well_info_file']))

    # Remove unwanted cells (only when filter criteria are configured)
    if p.get('filter_criteria', None) is not None:
        print('Removing unwanted cells...')
        mpp_temp.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background values for each channel
    if p['subtract_background']:
        print('Subtracting background...')
        mpp_temp.subtract_background(os.path.join(DATA_DIR, p['background_value']))

    # Project every uni-channel image into a scalar for further analysis
    if p['project_into_scalar']:
        print('\nProjecting data...')
        mpp_temp.add_scalar_projection(p['method'])

    # Split data into train, validation and test
    train_temp, val_temp, test_temp = mpp_temp.train_val_test_split(p['train_frac'], p['val_frac'])
    del mpp_temp

    if p['convert_into_image']:
        print('Converting data into images...')
        for split_temp in (train_temp, val_temp, test_temp):
            split_temp.add_image_and_mask(data='MPP',
                                          remove_original_data=p['remove_original_data'],
                                          img_size=p['img_size'])
        # Do not keep an extra reference to the last split alive
        del split_temp

    # Concatenate wells: the first well initialises the data sets,
    # every following well is merged into them
    if train is None:
        train, val, test = train_temp, val_temp, test_temp
    else:
        print('Merging data with loaded well...')
        val.merge_instances([val_temp])
        test.merge_instances([test_temp])
        train.merge_instances([train_temp])
    # Drop the per-well names (the first well stays reachable through train/val/test)
    del train_temp, val_temp, test_temp

# Fail with a clear message instead of an error in a later cell
if train is None:
    raise Exception('No data was processed! Check that p[\'data_dirs\'] is not empty.')

# Normalize train, val and test using the normalization parameters
# got from the train data (inner percentile% of train data)
if p['normalise']:
    print('\nNormalizing data...')
    rescale_values = train.rescale_intensities_per_channel(percentile=p['percentile'])
    val.rescale_intensities_per_channel(rescale_values=rescale_values)
    test.rescale_intensities_per_channel(rescale_values=rescale_values)
    # Keep the values used, so they are saved together with the other parameters
    p['normalise_rescale_values'] = list(rescale_values)


Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/I09...
Adding cell cycle to metadata...
Adding well info to metadata...
Removing unwanted cells...
Total number of cells: 886
269 cells cutted by filter: is_border_cell == 1
30 cells cutted by filter: is_polynuclei_184A1 == 1
323 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 323


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']


Subtracting background...

Projecting data...

Normalizing data...


In [9]:
# Inspect the metadata table of the training split
train.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
0,373535,plate01,I09,0,0,0,0,6,0,373506,plate01,I09,0,0,0,0,6,0,0.0,,0.0,,0.0,,S,184A1,normal,,43.893030,175.995428,27.577226,54.206169,128.697294,41.469521,60.416069,64.842372,125.169772,65.557846,105.751293,192.479292,78.460671,49.046642,187.326717,99.378451,117.227410,26.282266,60.407161,33.006360,131.946547,28.173045,54.011811,187.774103,131.526724,385.672558,11.862217,81.837107,291.005551,51.128612,368.260323,285.676524,11.584543,8.256946,1.803026,599.449225,8305.377207,10121.675842
1,373536,plate01,I09,0,0,0,0,7,0,373507,plate01,I09,0,0,0,0,7,0,0.0,,0.0,,0.0,,G1,184A1,normal,,39.837376,273.655677,12.569294,22.001503,81.730022,20.032913,39.631146,44.481310,73.860256,54.789958,67.896671,162.914590,35.864765,37.245010,355.807515,67.943580,76.462250,12.614654,55.775873,14.612852,120.283067,19.574581,41.063924,101.116543,55.055405,297.648758,11.671538,91.374197,329.135675,33.598199,335.386680,410.320858,9.813957,5.888977,1.561608,398.334857,6055.847451,10278.970546
2,373537,plate01,I09,0,0,0,0,8,0,373508,plate01,I09,0,0,0,0,8,0,0.0,,0.0,,0.0,,G1,184A1,normal,,44.735922,258.942987,23.616477,47.406444,140.684383,44.831516,68.078372,67.289663,85.795525,90.090190,87.339633,249.505730,88.193348,51.833120,332.372822,124.072929,130.945283,38.285344,76.223063,43.566143,151.189717,39.663339,71.273700,205.703836,124.654347,332.255405,17.381051,196.612601,361.317387,55.593623,457.713302,469.516855,14.441616,9.757942,1.859657,464.431683,8280.577392,11757.616541
4,373539,plate01,I09,0,0,0,0,10,0,373510,plate01,I09,0,0,0,0,10,0,0.0,,0.0,,0.0,,S,184A1,normal,,30.892690,231.842236,19.046228,37.117579,100.918987,25.272207,43.387094,43.176593,78.249253,76.013457,112.938592,178.444723,81.792631,41.639270,292.071767,90.931247,99.625565,27.330404,66.549839,20.411372,107.398853,23.828493,55.354041,134.139140,99.027838,301.044591,15.211128,122.575573,334.431894,42.139486,362.906348,438.364101,10.736532,6.967087,1.697280,410.540764,5541.760160,10350.776433
5,373540,plate01,I09,0,0,0,0,11,0,373511,plate01,I09,0,0,0,0,11,0,0.0,,0.0,,0.0,,S,184A1,normal,,37.753066,238.882044,13.789982,27.224439,84.975838,23.843158,40.274607,44.294862,88.214824,56.990949,104.852156,156.820501,104.210400,37.244133,299.586078,84.814062,103.998514,19.334907,53.953275,18.832787,109.794168,22.626681,42.797852,119.819474,68.142357,291.043989,7.073450,92.228603,300.013785,35.563778,306.392659,381.804707,8.969512,6.654304,1.741013,405.846715,6569.049927,8857.680949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,248096,plate01,I09,5,4,0,0,31,0,248051,plate01,I09,5,4,0,0,31,0,0.0,,0.0,,0.0,,G1,184A1,normal,,34.088505,155.096587,13.024716,28.936674,95.528086,23.275727,43.408660,39.239153,52.576061,40.188708,5.078687,175.720682,51.470518,35.296696,174.323644,56.949624,56.759946,18.220416,41.090229,17.958328,122.376457,20.033855,43.918364,119.742814,86.827567,218.802525,10.731663,134.708834,183.026731,37.249065,260.121881,235.003543,7.600701,6.105711,1.609373,325.157342,5004.892771,9991.788145
559,248097,plate01,I09,5,4,0,0,32,0,248052,plate01,I09,5,4,0,0,32,0,0.0,,0.0,,0.0,,G1,184A1,normal,,29.103252,157.851762,9.229010,18.525031,76.213924,19.376481,31.879794,39.433825,48.630776,37.535381,79.274388,165.326735,42.447831,37.616003,190.478952,54.423588,51.708736,21.694139,44.480561,17.644941,95.801883,17.100752,37.933908,81.901958,60.711370,268.427825,9.421172,47.902741,201.229701,32.069356,284.005220,313.050669,7.123799,5.655949,1.528000,382.174679,6101.246866,9692.065772
560,248098,plate01,I09,5,4,0,0,33,0,248053,plate01,I09,5,4,0,0,33,0,0.0,,0.0,,0.0,,S,184A1,normal,,41.039714,180.212368,17.454660,31.442297,84.303652,28.616915,44.196115,49.574424,103.050050,39.699195,69.999549,180.591155,86.435982,45.644611,242.215620,63.927868,64.317648,17.133886,51.754897,19.469017,133.090863,22.530236,39.120315,125.629150,85.664930,338.872580,7.997690,113.691155,251.432130,38.683705,336.799904,329.825839,8.433395,6.436342,1.709882,451.010388,6905.080085,9703.626920
561,248102,plate01,I09,5,4,0,0,37,0,248057,plate01,I09,5,4,0,0,37,0,0.0,,0.0,,0.0,,S,184A1,normal,,41.121487,153.873601,15.314953,28.338914,82.752061,22.497689,40.477599,45.760907,118.973075,45.119237,100.598724,150.048637,83.140813,33.998817,199.528098,58.714794,62.800221,25.627111,44.674296,19.702687,98.382955,20.627130,39.620163,109.974209,74.349470,267.687032,10.510172,68.689059,215.263432,35.644982,311.052738,289.755046,7.408682,6.552806,1.713042,515.735421,7191.907034,9512.493248


In [10]:
# Inspect the metadata table of the validation split
val.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
15,373552,plate01,I09,0,0,0,0,23,0,373523,plate01,I09,0,0,0,0,23,0,0.0,,0.0,,0.0,,S,184A1,normal,,30.574293,249.56957,7.583841,12.275772,68.128334,10.41495,17.431491,26.870564,56.54682,49.877291,48.190328,131.746804,65.050555,31.259364,313.478391,64.566879,95.895774,13.1302,44.428449,13.922616,57.981341,17.937401,40.632685,47.944492,34.554776,235.647705,7.176806,84.357927,310.685763,29.629622,240.284502,347.773641,7.038823,4.764324,1.504008,348.36151,3933.958062,9260.937295
17,228069,plate01,I09,0,1,0,0,2,0,228048,plate01,I09,0,1,0,0,2,0,0.0,,0.0,,0.0,,S,184A1,normal,,37.439699,245.27434,17.889401,28.128698,77.139874,18.420196,29.043601,50.53031,66.000257,91.896682,79.542296,141.386941,99.579891,37.320648,280.587424,92.903227,111.034924,14.532252,53.764492,22.844649,112.060962,27.840653,55.110393,108.570324,85.880647,311.581509,13.488006,88.534204,349.458715,38.682724,302.884503,379.881793,10.14415,7.706533,1.765122,435.851395,6589.405269,9630.423705
41,291043,plate01,I09,0,3,0,0,14,0,290977,plate01,I09,0,3,0,0,14,0,0.0,,0.0,,0.0,,S,184A1,normal,,40.567,163.835686,19.704372,35.336253,107.740268,36.216175,45.366358,57.97297,73.875981,61.487103,145.352718,155.916906,69.674045,42.145291,177.018449,65.471572,54.560807,18.421989,46.049133,16.959562,147.944396,19.15487,40.642219,144.79391,90.021095,293.602179,12.477208,94.965482,213.413462,38.985342,313.432754,273.643137,9.176985,6.443995,1.671419,412.074239,8498.392042,11110.759694
50,291056,plate01,I09,0,3,0,0,27,0,290990,plate01,I09,0,3,0,0,27,0,0.0,,0.0,,0.0,,G1,184A1,normal,,29.099963,162.728813,11.833903,21.418495,101.4884,26.886086,30.653287,47.203884,37.454291,33.303338,3.97491,176.756954,50.318569,38.093269,188.648134,54.823196,42.768541,10.717219,28.983672,13.58476,103.585063,13.901486,35.749202,121.093297,57.866152,270.13745,10.363075,96.477406,239.218251,29.967979,331.050846,305.319775,9.088626,5.134492,1.673023,367.283281,7519.626184,10119.844923
57,291065,plate01,I09,0,3,0,0,36,0,290999,plate01,I09,0,3,0,0,36,0,0.0,,0.0,,0.0,,G2,184A1,normal,,41.026765,148.126392,18.295315,35.514461,100.458513,33.749825,53.693365,57.645552,86.948187,65.771916,99.316916,176.841702,60.806458,36.446176,163.000022,64.56306,55.952194,24.514409,51.765313,19.439434,134.295217,19.637452,43.949357,162.218134,89.970926,275.154056,12.679415,110.516304,220.748026,42.265815,313.468876,274.557672,8.19951,6.213666,1.58785,437.427554,6760.496931,9142.113985
62,291072,plate01,I09,0,3,0,0,43,0,291006,plate01,I09,0,3,0,0,43,0,0.0,,0.0,,0.0,,G1,184A1,normal,,30.452216,166.752829,12.606276,21.299407,98.027614,30.724684,35.064603,46.082949,49.667228,36.128957,2.462416,184.807542,45.589469,40.229341,187.956096,60.462727,48.111862,11.873443,30.82599,14.291511,109.049414,16.139468,38.152107,115.967548,60.687516,258.03637,8.661483,86.111249,240.142465,33.724408,332.029395,287.157795,9.585442,5.725089,1.67183,390.576079,6154.798476,9207.17974
65,291075,plate01,I09,0,3,0,0,46,0,291009,plate01,I09,0,3,0,0,46,0,0.0,,0.0,,0.0,,G1,184A1,normal,,30.728273,144.009993,9.95187,20.723751,69.423386,22.300876,35.177623,36.450105,66.263339,40.515913,66.49903,149.422429,41.144596,27.938598,163.142341,51.501097,44.904457,18.738599,43.117029,11.673967,89.939246,14.531271,32.96375,87.976217,55.652909,231.078503,9.90905,73.505128,198.747466,32.208961,280.881381,272.938843,8.119686,5.38355,1.572494,348.092903,4967.066371,8447.841982
66,383784,plate01,I09,0,4,0,0,10,0,383748,plate01,I09,0,4,0,0,10,0,0.0,,0.0,,0.0,,G2,184A1,normal,,54.674814,217.443394,23.389013,42.552265,114.645918,50.442251,69.688654,66.876066,100.002514,93.312801,158.770971,191.79304,96.929013,47.553747,294.988487,99.866884,79.958769,30.012261,53.52473,32.892007,129.06623,29.114333,49.551348,192.975614,116.13261,321.929789,15.294121,189.01709,273.559587,49.053812,392.636731,357.615618,12.40522,7.50019,1.709835,401.616103,8665.9235,10978.492795
67,383785,plate01,I09,0,4,0,0,11,0,383749,plate01,I09,0,4,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,,29.20945,155.898565,19.549355,45.5199,103.702083,22.841979,43.607188,62.610438,53.386844,57.382795,89.066996,163.711226,32.729325,29.911985,134.821598,69.790862,53.407548,13.888347,32.095461,21.971006,122.260017,25.397683,49.307263,151.464585,122.276296,253.602342,11.926216,176.885493,199.014032,41.932655,273.487956,207.440367,9.200699,10.73126,1.952378,230.066247,8869.253935,8345.3062
75,383794,plate01,I09,0,4,0,0,20,0,383758,plate01,I09,0,4,0,0,20,0,0.0,,0.0,,0.0,,S,184A1,normal,,38.415213,195.671616,21.755131,40.734558,106.328229,28.856932,48.142851,47.379795,111.919987,69.859724,112.493472,175.998229,85.018144,37.154003,226.613171,76.112933,66.028695,16.27689,54.221751,19.077028,123.649186,24.855993,48.227579,152.871314,116.375982,306.432862,10.334504,87.105631,239.149141,40.117879,280.379686,250.233257,9.602171,6.792205,1.727285,417.852885,7195.168444,9890.594841


In [11]:
# Inspect the metadata table of the test split
test.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
3,373538,plate01,I09,0,0,0,0,9,0,373509,plate01,I09,0,0,0,0,9,0,0.0,,0.0,,0.0,,S,184A1,normal,,43.931378,194.444293,18.353704,32.399117,104.564078,29.361877,46.617407,51.506499,127.183208,55.535536,94.269486,171.03829,77.086018,32.756867,212.862917,74.396402,81.701189,15.256486,49.931726,19.315464,136.79735,21.712636,45.183902,148.422635,82.323469,284.648928,8.077076,50.190543,271.552491,37.614959,285.61235,263.922204,7.98894,6.47702,1.717521,425.533338,7441.930589,8530.44624
7,373543,plate01,I09,0,0,0,0,14,0,373514,plate01,I09,0,0,0,0,14,0,0.0,,0.0,,0.0,,G1,184A1,normal,,31.796757,167.42106,8.088238,16.159192,54.151197,14.970476,34.413437,34.801339,45.406018,42.710192,2.001603,133.262292,15.262286,28.887519,165.452414,56.010599,75.01336,15.859371,36.431576,17.90058,65.072087,19.961104,36.62707,48.509038,41.970375,245.626208,13.282367,65.685411,237.447556,31.389509,291.177621,247.545313,7.859336,6.561661,1.630393,372.463443,5621.383479,8108.554629
9,373546,plate01,I09,0,0,0,0,17,0,373517,plate01,I09,0,0,0,0,17,0,0.0,,0.0,,0.0,,G1,184A1,normal,,23.068771,167.207771,14.759187,19.036149,126.775173,31.242133,33.646919,49.393888,35.833057,77.340375,2.787389,167.636919,56.760069,34.70028,179.047911,92.821322,97.650617,24.061671,55.531707,23.159593,142.841748,24.4826,53.103061,127.423264,69.50183,246.93037,13.879289,144.387664,232.434658,44.611408,351.037382,327.51341,8.695239,7.581144,1.64554,372.811467,6717.641716,8216.274439
24,228080,plate01,I09,0,1,0,0,13,0,228059,plate01,I09,0,1,0,0,13,0,0.0,,0.0,,0.0,,G2,184A1,normal,,32.078867,178.269895,18.796561,33.388943,107.821185,41.319326,48.081123,55.600895,72.199884,74.079468,129.651423,203.698225,63.982787,45.901579,233.402337,93.077872,106.523702,21.240588,64.664777,34.699882,157.854332,30.853312,53.062213,160.139889,104.704105,288.536858,11.130135,130.600651,280.562619,50.338534,357.159856,377.947482,10.856259,7.269352,1.851786,479.620915,7868.4949,10684.152555
28,324142,plate01,I09,0,2,0,0,7,0,324083,plate01,I09,0,2,0,0,7,0,0.0,,0.0,,0.0,,G2,184A1,normal,,44.098693,169.027253,13.698369,25.617715,84.53471,25.511925,57.734848,33.060869,124.012409,54.870709,99.773236,142.218122,50.510787,29.267737,219.390641,60.383234,53.727743,16.02016,43.992535,18.459753,113.135804,18.36744,36.470982,116.697646,49.731282,253.758183,7.708204,66.395213,228.104237,34.602493,236.486539,291.986443,7.262463,5.789343,1.5507,361.682005,5776.833068,8166.957938
38,291040,plate01,I09,0,3,0,0,11,0,290974,plate01,I09,0,3,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,,26.740385,128.608741,15.4777,35.398259,93.286975,30.849339,40.518885,47.55259,38.435648,39.69179,2.440736,172.72796,43.155923,30.697981,129.359626,60.7881,50.260814,31.341237,50.148643,18.47538,79.77465,20.211609,48.55242,129.459433,95.528248,221.968939,12.843103,114.028953,179.354214,43.021642,276.596443,236.118496,7.76429,6.579578,1.685376,375.378452,6069.066786,8316.465715
69,383787,plate01,I09,0,4,0,0,13,0,383751,plate01,I09,0,4,0,0,13,0,0.0,,0.0,,0.0,,S,184A1,normal,,31.818634,161.938927,12.876318,21.587907,72.640642,20.413344,37.140296,51.690901,71.206961,54.639399,81.602091,143.364067,67.849103,32.049577,161.977275,63.961518,54.125404,21.726243,36.329224,15.294929,107.317485,18.936269,42.505164,98.650272,67.145025,257.874256,14.114544,90.145453,207.27096,37.05312,267.614096,227.716056,7.362123,5.877451,1.673182,405.264145,6301.429439,8157.930612
78,383800,plate01,I09,0,4,0,0,26,0,383764,plate01,I09,0,4,0,0,26,0,0.0,,0.0,,0.0,,S,184A1,normal,,47.675517,241.697744,20.642819,35.935752,115.948052,39.675949,57.005964,55.310825,131.154587,72.359837,121.005922,182.482643,103.909817,40.791441,328.848196,90.48215,81.111558,20.916552,64.064763,29.033937,141.790402,30.818141,46.642472,163.903121,114.71598,292.445886,13.917577,150.128375,270.051739,45.407403,327.840656,345.027969,10.330737,7.787731,1.559648,414.556345,8246.398544,9709.433265
88,332648,plate01,I09,1,0,0,0,10,0,332620,plate01,I09,1,0,0,0,10,0,0.0,,0.0,,0.0,,G1,184A1,normal,,30.980529,255.841909,15.822376,26.445325,110.250709,26.358096,44.787066,50.111705,51.966189,47.3228,2.138715,182.237653,38.019312,36.751283,327.190776,82.460819,81.607111,19.297541,57.186429,18.635874,134.631747,21.267887,58.62553,109.684895,66.53173,257.022545,9.770465,108.681996,297.779587,37.23567,325.837794,382.027718,8.527148,6.060062,1.667211,334.715923,7567.174106,10585.647168
100,383309,plate01,I09,1,3,0,0,17,0,383255,plate01,I09,1,3,0,0,17,0,0.0,,0.0,,0.0,,G2,184A1,normal,,43.623709,144.10937,18.726394,34.851099,91.931878,37.21735,54.167862,45.586895,98.531045,88.683365,111.651767,183.442014,67.46521,38.400941,176.896386,74.025455,61.575909,16.84793,50.042051,18.178955,125.077146,18.149966,39.723959,153.00664,93.50658,271.826261,11.290655,96.005264,229.114127,39.133645,312.305442,277.410413,9.729896,6.090064,1.603254,460.544177,7129.288329,10266.119997


## Save data

Prepare to save data:

In [12]:
# List the parameter names again: by now the dict also holds the entries
# added by this notebook ('data_dirs' and, when normalising, 'normalise_rescale_values')
p.keys()

dict_keys(['_comment', 'raw_data_dir', 'dir_type', 'perturbations_and_wells', 'output_data_dir', 'seed', 'input_channels', 'output_channels', 'aggregate_output', 'train_frac', 'val_frac', 'img_size', 'subtract_background', 'background_value', 'normalise', 'percentile', 'add_cell_cycle_to_metadata', 'cell_cycle_file', 'add_well_info_to_metadata', 'well_info_file', 'filter_criteria', 'filter_values', 'convert_into_image', 'remove_original_data', 'project_into_scalar', 'method', 'parameter_file_name', 'data_dirs', 'normalise_rescale_values'])

In [13]:
import shutil

# Create a fresh output directory. An already existing directory is deleted
# first (with all its content!), so files of an earlier run cannot mix with
# the new data set.
outdir = p['output_data_dir']
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Stop here with the real cause: going on would only make
        # os.makedirs below fail with a less informative FileExistsError.
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False: the directory must not exist at this point
os.makedirs(outdir, exist_ok=False)
    


Creating dir: /home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_regression_TEST_2


In [14]:
# Channel table indexed by channel name, used for both lookups below
channels_by_name = train.channels.set_index('name')
# Ids of the channels (proteins) which will be used to predict transcription rate
input_ids = list(channels_by_name.loc[p['input_channels']]['channel_id'])
# Id(s) of the channel that measures transcription rate
output_ids = list(channels_by_name.loc[p['output_channels']]['channel_id'])
# Input channel ids first, output channel id(s) at the end
channels_ids = input_ids + output_ids

Save metadata and used parameters

In [15]:
# save params
json.dump(p, open(os.path.join(outdir, 'params.json'), 'w'), indent=4)

# save metadata
train.metadata.to_csv(os.path.join(outdir, 'train_metadata.csv'))
val.metadata.to_csv(os.path.join(outdir, 'val_metadata.csv'))
test.metadata.to_csv(os.path.join(outdir, 'test_metadata.csv'))
pd.concat([train.metadata, val.metadata, test.metadata]).to_csv(os.path.join(outdir, 'metadata.csv'))

# Save used channels
#train.channels.to_csv(os.path.join(outdir, 'channels.csv'))
train.channels.set_index('channel_id').loc[channels_ids].to_csv(os.path.join(outdir, 'channels.csv'))

Save Images

In [16]:
# Note! The response value (y) is not calculated and saved separately here;
# that is done in the modeling part instead.
# The string literal below is disabled code (an earlier variant that computed y
# and saved .npz datasets). It is a bare string, so it is never executed.

"""
# get images
train_dataset = np.array(train.get_object_imgs(data='MPP', img_size=p['img_size']))
del(train)
val_dataset = np.array(val.get_object_imgs(data='MPP', img_size=p['img_size']))
del(val)
test_dataset = np.array(test.get_object_imgs(data='MPP', img_size=p['img_size']))
del(test)

# Create responce variable (y)
if p['aggregate_output'] == 'avg':
    train_dataset_y = np.array([img[img!=0].mean() for img in train_dataset[:,:,:,output_ids]])
    val_dataset_y = np.array([img[img!=0].mean() for img in val_dataset[:,:,:,output_ids]])
    test_dataset_y = np.array([img[img!=0].mean() for img in test_dataset[:,:,:,output_ids]])

# Save datasets
np.savez(os.path.join(outdir, 'train_dataset.npz'), x=train_dataset[:,:,:,input_ids], y=train_dataset_y)
del(train_dataset)
np.savez(os.path.join(outdir, 'val_dataset.npz'), x=val_dataset[:,:,:,input_ids], y=val_dataset_y)
del(val_dataset)
np.savez(os.path.join(outdir, 'test_dataset.npz'), x=test_dataset[:,:,:,input_ids], y=test_dataset_y)
del(test_dataset)
"""

def _save_images_and_masks(split, file_prefix, split_label):
    """Save the selected image channels and the masks of one data split.

    Writes '<file_prefix>_images.npy' (only the channels in channels_ids) and
    '<file_prefix>_mask.npy' into outdir. Each attribute is removed from
    `split` right after it has been written.
    """
    print('Saving {} images and masks...'.format(split_label))
    np.save(os.path.join(outdir, '{}_images.npy'.format(file_prefix)), split.images[:,:,:,channels_ids])
    del split.images
    np.save(os.path.join(outdir, '{}_mask.npy'.format(file_prefix)), split.masks)
    del split.masks

if p['convert_into_image']:
    # One split at a time; each object is dropped as soon as it is saved
    _save_images_and_masks(train, 'train', 'train')
    del train

    _save_images_and_masks(val, 'val', 'validation')
    del val

    _save_images_and_masks(test, 'test', 'test')
    del test