# Data Preprocessing for predicting Transcription Rate (TS)

This notebook is meant to convert raw cell data from several wells into multichannel images (along with their corresponding metadata).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set directory paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Resolve the project layout.
# BASE_DIR is the directory two levels above the current working directory.
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''), os.pardir, os.pardir))
if not os.path.exists(BASE_DIR):
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))
print('BASE_DIR: {}'.format(BASE_DIR))

# DATA_DIR holds the raw data: one sub-directory per perturbation.
DATA_DIR = os.path.join(BASE_DIR, 'datasets', 'raw')
if not os.path.exists(DATA_DIR):
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))
print('DATA_DIR: {}'.format(DATA_DIR))

# Make the project's `workspace` folder importable (position 1 keeps the
# interpreter's own first entry in front of it).
workspace_dir = os.path.join(BASE_DIR, 'workspace')
sys.path.insert(1, workspace_dir)
# MPPData is the project class used below to turn raw data into images
from pelkmans.mpp_data import MPPData

BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw
Setting BASE_DIR to /home/hhughes/Documents/Master_Thesis/Project


Check available data (Perturbations and Wells):

In [2]:
# Discover the locally available data: every sub-directory of DATA_DIR is a
# perturbation, and every sub-directory of a perturbation is a well.
def _list_subdirs(parent_dir):
    """Return the names of the sub-directories of `parent_dir`, sorted.

    os.listdir returns entries in arbitrary order, so the names are sorted to
    make the discovered perturbations/wells (and therefore the processing order
    when all local data is selected) identical on every machine.
    """
    return sorted(
        entry for entry in os.listdir(parent_dir)
        if os.path.isdir(os.path.join(parent_dir, entry))
    )


perturbations = _list_subdirs(DATA_DIR)
# Maps perturbation name -> list of well names.
# A comprehension is used so no loop variable leaks into the notebook
# namespace; in particular `p` (later the alias of data_params) is not touched
# when this cell is re-run.
local_data = {
    perturbation: _list_subdirs(os.path.join(DATA_DIR, perturbation))
    for perturbation in perturbations
}

Select perturbations and their wells to process:

In [3]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# Choose which perturbations (keys) and wells (values) to process.
# Exactly one `selected_data` assignment should be active.
#
# Example: several wells from two perturbations
#selected_data = {
#    '184A1_hannah_unperturbed': ['I11', 'I09'],
#    '184A1_hannah_TSA': ['J20', 'I16'],
#}

selected_data = {
    '184A1_hannah_unperturbed': ['I11']
}

# Process all available data:
#selected_data = local_data

print('\nSelected perturbations-wells:\n{}'.format(selected_data))

# Build the list of well directories to process.
# The loop variables are deliberately not called `p`: `p` is the alias of
# data_params used by the processing cell, and re-running this cell must not
# overwrite it.
data_dirs = []
for perturbation, wells in selected_data.items():
    for well in wells:
        well_dir = os.path.join(DATA_DIR, perturbation, well)
        # Fail before registering a directory that is not available locally
        if not os.path.exists(well_dir):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(well_dir))
        data_dirs.append(well_dir)

Local available perturbations-wells:
{'184A1_hannah_unperturbed': ['I11', 'I09', 'J10'], '184A1_hannah_TSA': ['J20', 'I16', 'J13']}

Selected perturbations-wells:
{'184A1_hannah_unperturbed': ['I11']}


Set parameters for data transformation:

In [4]:
# Channels used as model input
_input_channels = [
    '00_DAPI',
    '07_H2B',
    '01_CDK9_pT186',
    '03_CDK9',
    '05_GTF2B',
    '07_SETD1A',
    '08_H3K4me3',
    '09_SRRM2',
    '10_H3K27ac',
    '11_KPNA2_MAX',
    '12_RB1_pS807_S811',
    '13_PABPN1',
    '14_PCNA',
    '15_SON',
    '16_H3',
    '17_HDAC3',
    '19_KPNA1_MAX',
    '20_SP100',
    '21_NCL',
    '01_PABPC1',
    '02_CDK7',
    '03_RPS6',
    '05_Sm',
    '07_POLR2A',
    '09_CCNT1',
    '10_POL2RA_pS2',
    '11_PML',
    '12_YAP1',
    '13_POL2RA_pS5',
    '15_U2SNRNPB',
    '18_NONO',
    '20_ALYREF',
    '21_COIL',
]
# Channel(s) used as model output
_output_channels = ['00_EU']

# Auxiliary files stored directly inside DATA_DIR
_background_file = os.path.join(DATA_DIR, 'secondary_only_relative_normalisation.csv')
_cell_cycle_file = os.path.join(DATA_DIR, 'cell_cycle_classification.csv')
_well_info_file = os.path.join(DATA_DIR, 'wells_metadata.csv')

# All settings of the preprocessing run. The dict is also written to
# params.json next to the generated dataset, so key order is kept stable.
data_params = {
    # --- where to read data from ---
    'data_dirs': data_dirs,
    'dir_type': 'hannah',
    # --- make results reproducible ---
    'seed': 42,
    # --- input/output definition ---
    'input_channels': _input_channels,
    'output_channels': _output_channels,
    # None results in output images; 'max' / 'avg' aggregate the output
    # channels and output a single number
    'aggregate_output': 'avg',
    # --- train/val/test split ---
    'train_frac': 0.8,
    'val_frac': 0.1,
    'img_size': 224,
    # --- background subtraction ---
    'subtract_background': True,
    'background_value': _background_file,
    # --- normalisation ---
    'normalise': True,
    'percentile': 98.0,
    # --- add cell cycle to metadata ---
    'add_cell_cycle_to_metadata': True,
    'cell_cycle_file': _cell_cycle_file,
    # --- add well info to metadata (cell_type, perturbation and duration) ---
    'add_well_info_to_metadata': True,
    'well_info_file': _well_info_file,
    # --- filtering ---
    # Alternative filter set (disabled):
    #'filter_criteria': ['is_border_cell', 'is_mitotic', 'is_polynuclei_184A1'],
    #'filter_values': [1, 1, 'NaN'],
    'filter_criteria': ['is_border_cell', 'is_polynuclei_184A1', 'cell_cycle'],
    'filter_values': [1, 1, 'NaN'],
    # --- conversion into images ---
    'convert_into_image': True,
    'remove_original_data': True,
    # --- projection of each cell channel into a scalar ---
    # methods: 'avg' and 'median'
    'project_into_scalar': True,
    'method': 'avg',
}
# Short alias used by the cells below
p = data_params

Process data:

In [5]:
# Accumulators for the splits merged over all wells.
# They are reset here (instead of probing globals() for 'train') so that
# re-running this cell starts from scratch; otherwise the same wells would be
# merged a second time into the splits left over from the previous run.
train, val, test = None, None, None

for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_temp = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])

    # Add cell cycle to metadata (G1, S, G2)
    # Important! If mapobject_id_cell is not in cell_cycle_file =>
    # its corresponding cell is in Mitosis phase!
    if p['add_cell_cycle_to_metadata']:
        mpp_temp.add_cell_cycle_to_metadata(p['cell_cycle_file'])

    # Add well info to metadata
    if p['add_well_info_to_metadata']:
        mpp_temp.add_well_info_to_metadata(p['well_info_file'])

    # Remove unwanted cells
    if p.get('filter_criteria', None) is not None:
        print('Removing unwanted cells...')
        mpp_temp.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background values for each channel
    if p['subtract_background']:
        mpp_temp.subtract_background(p['background_value'])

    # Project every single-channel image into a scalar for further analysis
    if p['project_into_scalar']:
        print('\nProjecting data...')
        mpp_temp.add_scalar_projection(p['method'])

    # Split data into train, validation and test
    train_temp, val_temp, test_temp = mpp_temp.train_val_test_split(p['train_frac'], p['val_frac'])
    del mpp_temp

    if p['convert_into_image']:
        for split_temp in (train_temp, val_temp, test_temp):
            split_temp.add_image_and_mask(data='MPP',
                                          remove_original_data=p['remove_original_data'],
                                          img_size=p['img_size'])
        # Drop the loop reference so it does not keep a split alive
        del split_temp
        print('data converted into image\n')

    # Concatenate wells: the first well initialises the splits, every
    # following well is merged into them.
    if train is None:
        train, val, test = train_temp, val_temp, test_temp
    else:
        val.merge_instances([val_temp])
        test.merge_instances([test_temp])
        train.merge_instances([train_temp])
    del train_temp, val_temp, test_temp

if train is None:
    # Nothing was loaded; stop here with a clear message instead of failing
    # later with an unrelated error.
    raise ValueError("No data was processed: data_params['data_dirs'] is empty.")

# Normalize train, val and test using the normalization parameters
# got from the train data (inner percentile% of train data)
if p['normalise']:
    rescale_values = train.rescale_intensities_per_channel(percentile=p['percentile'])
    # Return values are discarded on purpose. They are not bound to `_`,
    # which IPython uses for the last cell output.
    val.rescale_intensities_per_channel(rescale_values=rescale_values)
    test.rescale_intensities_per_channel(rescale_values=rescale_values)
    # Keep the values in the parameters dict so they are saved with the dataset
    p['normalise_rescale_values'] = list(rescale_values)


Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/I11...
Removing unwanted cells...
Total number of cells: 1025
333 cells cutted by filter: is_border_cell == 1
33 cells cutted by filter: is_polynuclei_184A1 == 1
375 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 375


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']


data converted into image



In [6]:
# Display the metadata table of the train split
train.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_median,07_H2B_median,01_CDK9_pT186_median,03_CDK9_median,05_GTF2B_median,07_SETD1A_median,08_H3K4me3_median,09_SRRM2_median,10_H3K27ac_median,11_KPNA2_MAX_median,12_RB1_pS807_S811_median,13_PABPN1_median,14_PCNA_median,15_SON_median,16_H3_median,17_HDAC3_median,19_KPNA1_MAX_median,20_SP100_median,21_NCL_median,01_PABPC1_median,02_CDK7_median,03_RPS6_median,05_Sm_median,07_POLR2A_median,09_CCNT1_median,10_POL2RA_pS2_median,11_PML_median,12_YAP1_median,13_POL2RA_pS5_median,15_U2SNRNPB_median,18_NONO_median,20_ALYREF_median,21_COIL_median,00_BG488_median,00_BG568_median,00_EU_median,09_SRRM2_ILASTIK_median,15_SON_ILASTIK_median
0,384925,plate01,I11,0,0,0,0,8,0,384878,plate01,I11,0,0,0,0,8,0,0.0,,0.0,,0.0,,G2,184A1,normal,,49.3,203.3,15.889078,37.797795,127.166105,30.98985,55.921075,30.374678,97.10488,86.093431,116.208524,156.026212,87.451839,19.39003,265.498049,100.043113,84.208441,5.365936,28.973475,17.292485,126.580565,23.969013,47.192703,187.886362,113.454321,330.873113,7.143769,111.926705,243.722106,44.901764,397.345141,345.734123,9.30633,6.733834,0.009804,519.0,0.0,393.0
1,384928,plate01,I11,0,0,0,0,11,0,384881,plate01,I11,0,0,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,,29.3,177.3,6.889078,17.797795,79.166105,11.98985,31.921075,14.374678,54.10488,27.093431,79.208524,119.026212,32.451839,9.39003,193.498049,52.043113,45.208441,2.365936,12.973475,9.292485,87.580565,12.969013,29.192703,72.886362,53.454321,261.873113,4.143769,50.926705,197.722106,26.901764,267.345141,231.734123,5.30633,3.733834,0.009804,355.0,0.0,361.0
2,384929,plate01,I11,0,0,0,0,12,0,384882,plate01,I11,0,0,0,0,12,0,0.0,,0.0,,0.0,,G2,184A1,normal,,50.3,99.3,11.889078,34.297795,81.166105,30.98985,59.921075,17.374678,100.10488,45.093431,118.208524,139.026212,50.451839,11.39003,117.498049,68.043113,57.208441,5.365936,21.973475,11.292485,110.580565,16.969013,34.192703,161.886362,108.454321,218.873113,5.143769,159.926705,146.722106,43.901764,239.345141,184.734123,6.30633,4.733834,0.009804,387.0,0.0,207.0
3,384930,plate01,I11,0,0,0,0,13,0,384883,plate01,I11,0,0,0,0,13,0,0.0,,0.0,,0.0,,G1,184A1,normal,,36.3,216.3,9.889078,26.797795,107.166105,24.98985,42.921075,23.374678,58.10488,46.093431,112.208524,164.026212,56.451839,16.39003,263.498049,70.043113,63.208441,4.365936,21.973475,10.292485,116.580565,15.969013,37.192703,132.886362,74.454321,305.873113,5.143769,106.926705,228.722106,34.901764,380.345141,344.734123,8.30633,4.733834,0.009804,426.0,0.0,474.0
4,384931,plate01,I11,0,0,0,0,14,0,384884,plate01,I11,0,0,0,0,14,0,0.0,,0.0,,0.0,,G1,184A1,normal,,41.8,257.3,5.889078,21.797795,61.166105,3.98985,22.921075,4.374678,79.10488,38.093431,87.208524,118.026212,17.451839,4.39003,314.498049,52.043113,57.208441,1.365936,22.973475,7.292485,91.580565,12.969013,25.192703,53.886362,54.454321,225.873113,3.143769,71.926705,236.722106,26.901764,348.345141,323.734123,6.30633,3.733834,0.009804,370.0,0.0,400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642,345310,plate01,I11,5,4,0,0,42,0,345252,plate01,I11,5,4,0,0,42,0,0.0,,0.0,,0.0,,S,184A1,normal,,35.3,156.3,14.889078,35.797795,113.166105,27.98985,38.921075,24.374678,94.10488,39.093431,121.208524,184.026212,68.451839,13.39003,160.498049,62.043113,64.208441,4.365936,19.973475,15.292485,159.580565,19.969013,43.192703,163.886362,122.454321,319.873113,4.143769,88.926705,243.722106,40.901764,339.345141,197.734123,7.30633,6.733834,0.009804,465.0,0.0,446.0
644,345313,plate01,I11,5,4,0,0,45,0,345255,plate01,I11,5,4,0,0,45,0,0.0,,0.0,,0.0,,G1,184A1,normal,,35.3,206.3,13.889078,35.797795,112.166105,27.98985,44.921075,34.374678,78.10488,50.093431,129.208524,188.026212,53.451839,17.39003,231.498049,78.043113,84.208441,4.365936,33.973475,22.292485,144.580565,26.969013,51.192703,174.886362,106.454321,328.873113,5.143769,127.926705,292.722106,44.901764,426.345141,305.734123,10.30633,6.733834,0.009804,430.0,0.0,720.0
645,345314,plate01,I11,5,4,0,0,46,0,345256,plate01,I11,5,4,0,0,46,0,0.0,,0.0,,0.0,,S,184A1,normal,,35.3,175.3,11.889078,26.797795,91.166105,19.98985,39.921075,18.374678,62.10488,34.093431,97.208524,187.026212,43.451839,14.39003,177.498049,63.043113,66.208441,3.365936,21.973475,18.292485,102.580565,22.969013,39.192703,106.886362,76.454321,282.873113,5.143769,77.926705,227.722106,38.901764,309.345141,207.734123,8.30633,6.733834,0.009804,419.0,0.0,0.0
646,345315,plate01,I11,5,4,0,0,47,0,345257,plate01,I11,5,4,0,0,47,0,0.0,,0.0,,0.0,,G1,184A1,normal,,39.3,188.3,7.889078,26.797795,63.166105,5.98985,29.921075,13.374678,74.10488,46.093431,104.208524,171.026212,24.451839,9.39003,201.498049,53.043113,57.208441,2.365936,28.973475,14.292485,118.580565,19.969013,33.192703,69.886362,59.454321,234.873113,4.143769,74.926705,239.722106,32.901764,361.345141,255.734123,8.30633,5.733834,0.009804,371.0,0.0,400.0


In [7]:
# Display the metadata table of the validation split
val.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_median,07_H2B_median,01_CDK9_pT186_median,03_CDK9_median,05_GTF2B_median,07_SETD1A_median,08_H3K4me3_median,09_SRRM2_median,10_H3K27ac_median,11_KPNA2_MAX_median,12_RB1_pS807_S811_median,13_PABPN1_median,14_PCNA_median,15_SON_median,16_H3_median,17_HDAC3_median,19_KPNA1_MAX_median,20_SP100_median,21_NCL_median,01_PABPC1_median,02_CDK7_median,03_RPS6_median,05_Sm_median,07_POLR2A_median,09_CCNT1_median,10_POL2RA_pS2_median,11_PML_median,12_YAP1_median,13_POL2RA_pS5_median,15_U2SNRNPB_median,18_NONO_median,20_ALYREF_median,21_COIL_median,00_BG488_median,00_BG568_median,00_EU_median,09_SRRM2_ILASTIK_median,15_SON_ILASTIK_median
5,384932,plate01,I11,0,0,0,0,15,0,384885,plate01,I11,0,0,0,0,15,0,0.0,,0.0,,0.0,,S,184A1,normal,,37.3,229.3,11.889078,28.797795,91.166105,15.98985,36.921075,20.374678,77.10488,54.093431,64.208524,142.026212,90.451839,15.39003,275.498049,75.043113,79.208441,3.365936,21.973475,13.292485,124.580565,19.969013,42.192703,119.886362,89.454321,329.873113,5.143769,99.926705,253.722106,35.901764,312.345141,355.734123,8.30633,5.733834,0.009804,436.0,0.0,0.0
20,384956,plate01,I11,0,0,0,0,39,0,384909,plate01,I11,0,0,0,0,39,0,0.0,,0.0,,0.0,,S,184A1,normal,,40.3,158.3,8.889078,21.797795,81.166105,17.98985,37.921075,12.374678,142.10488,41.093431,85.208524,119.026212,55.451839,9.39003,169.498049,61.043113,44.208441,2.365936,15.973475,12.292485,90.580565,17.969013,29.192703,91.886362,62.454321,234.873113,4.143769,95.926705,158.722106,30.901764,265.345141,185.734123,5.30633,4.733834,0.009804,388.0,0.0,552.0
29,383889,plate01,I11,0,1,0,0,15,0,383825,plate01,I11,0,1,0,0,15,0,0.0,,0.0,,0.0,,G1,184A1,normal,,33.3,147.3,8.889078,23.797795,87.166105,19.98985,41.921075,22.374678,74.10488,36.093431,83.208524,170.026212,36.451839,15.39003,185.498049,53.043113,45.208441,2.365936,24.973475,9.292485,118.580565,13.969013,32.192703,105.886362,73.454321,323.873113,5.143769,80.926705,210.722106,35.901764,339.345141,280.734123,8.30633,4.733834,0.009804,477.0,0.0,605.0
40,383902,plate01,I11,0,1,0,0,28,0,383838,plate01,I11,0,1,0,0,28,0,0.0,,0.0,,0.0,,G1,184A1,normal,,34.3,111.3,8.889078,22.797795,74.166105,21.98985,43.921075,18.374678,73.10488,29.093431,1.208524,115.026212,32.451839,11.39003,121.498049,39.043113,26.208441,3.365936,17.973475,8.292485,130.580565,13.969013,28.192703,92.886362,67.454321,269.873113,4.143769,71.926705,152.722106,33.901764,243.345141,188.734123,6.30633,4.733834,0.009804,309.0,0.0,0.0
81,337400,plate01,I11,0,2,0,0,39,0,337277,plate01,I11,0,2,0,0,39,0,0.0,,0.0,,0.0,,G1,184A1,normal,,49.3,177.3,7.889078,24.797795,71.166105,14.98985,66.921075,13.374678,165.10488,37.093431,65.208524,157.026212,30.451839,9.39003,233.498049,46.043113,46.208441,4.365936,19.973475,9.292485,100.580565,14.969013,31.192703,75.886362,54.454321,218.873113,4.143769,82.926705,179.722106,31.901764,281.345141,276.734123,6.30633,3.733834,0.009804,297.0,0.0,491.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,367424,plate01,I11,5,3,0,0,11,0,367331,plate01,I11,5,3,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,normal,,36.3,225.3,5.889078,15.797795,58.166105,5.48985,23.921075,15.374678,65.10488,41.093431,61.208524,146.026212,29.451839,9.39003,294.498049,55.043113,67.208441,2.365936,18.973475,9.292485,74.580565,13.969013,34.192703,67.886362,44.454321,223.873113,3.143769,74.926705,269.722106,28.901764,339.345141,291.734123,7.30633,4.733834,0.009804,262.0,0.0,143.5
628,345290,plate01,I11,5,4,0,0,22,0,345232,plate01,I11,5,4,0,0,22,0,0.0,,0.0,,0.0,,S,184A1,normal,,28.3,108.3,8.889078,17.797795,71.166105,23.98985,35.921075,26.374678,66.10488,29.093431,92.208524,196.026212,38.451839,15.39003,91.498049,42.043113,35.208441,3.365936,19.973475,14.292485,116.580565,17.969013,36.192703,111.886362,65.454321,310.873113,4.143769,76.926705,176.722106,36.901764,227.345141,160.734123,6.30633,6.733834,0.009804,463.0,0.0,0.0
631,345294,plate01,I11,5,4,0,0,26,0,345236,plate01,I11,5,4,0,0,26,0,0.0,,0.0,,0.0,,S,184A1,normal,,36.3,184.8,15.889078,31.797795,121.166105,28.98985,41.921075,25.374678,95.10488,49.093431,139.208524,214.026212,71.451839,16.39003,190.498049,78.043113,82.208441,4.365936,26.973475,17.292485,148.580565,23.969013,47.192703,155.886362,107.454321,297.873113,5.143769,134.926705,252.722106,45.901764,345.345141,237.734123,8.30633,6.733834,0.009804,477.0,0.0,0.0
637,345303,plate01,I11,5,4,0,0,35,0,345245,plate01,I11,5,4,0,0,35,0,0.0,,0.0,,0.0,,G2,184A1,normal,,42.3,176.3,11.889078,26.797795,82.166105,23.98985,39.921075,20.374678,87.10488,47.093431,109.208524,166.026212,56.451839,13.39003,197.498049,61.043113,63.208441,3.365936,23.973475,16.292485,139.580565,20.969013,34.192703,115.886362,74.454321,289.873113,5.143769,88.926705,240.722106,36.901764,338.345141,222.734123,8.30633,5.733834,0.009804,389.0,0.0,0.0


In [8]:
# Display the metadata table of the test split
test.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_median,07_H2B_median,01_CDK9_pT186_median,03_CDK9_median,05_GTF2B_median,07_SETD1A_median,08_H3K4me3_median,09_SRRM2_median,10_H3K27ac_median,11_KPNA2_MAX_median,12_RB1_pS807_S811_median,13_PABPN1_median,14_PCNA_median,15_SON_median,16_H3_median,17_HDAC3_median,19_KPNA1_MAX_median,20_SP100_median,21_NCL_median,01_PABPC1_median,02_CDK7_median,03_RPS6_median,05_Sm_median,07_POLR2A_median,09_CCNT1_median,10_POL2RA_pS2_median,11_PML_median,12_YAP1_median,13_POL2RA_pS5_median,15_U2SNRNPB_median,18_NONO_median,20_ALYREF_median,21_COIL_median,00_BG488_median,00_BG568_median,00_EU_median,09_SRRM2_ILASTIK_median,15_SON_ILASTIK_median
7,384935,plate01,I11,0,0,0,0,18,0,384888,plate01,I11,0,0,0,0,18,0,0.0,,0.0,,0.0,,G1,184A1,normal,,39.3,229.3,11.889078,26.797795,96.166105,24.98985,44.921075,26.374678,75.10488,54.093431,122.208524,155.026212,51.451839,17.39003,278.498049,75.043113,69.208441,4.365936,23.973475,13.292485,135.080565,19.969013,39.192703,145.886362,73.454321,304.873113,5.143769,109.926705,221.722106,36.901764,366.345141,309.734123,8.30633,4.733834,0.009804,363.0,0.0,181.5
17,384951,plate01,I11,0,0,0,0,34,0,384904,plate01,I11,0,0,0,0,34,0,0.0,,0.0,,0.0,,G1,184A1,normal,,30.3,140.3,5.889078,13.797795,59.166105,11.98985,30.921075,12.374678,50.10488,25.093431,1.208524,128.026212,28.451839,9.39003,155.498049,49.043113,33.208441,2.365936,11.973475,7.292485,70.580565,11.969013,27.192703,60.886362,40.454321,213.873113,3.143769,59.926705,150.722106,26.901764,277.345141,208.734123,5.30633,3.733834,0.009804,312.0,0.0,0.0
30,383890,plate01,I11,0,1,0,0,16,0,383826,plate01,I11,0,1,0,0,16,0,0.0,,0.0,,0.0,,G2,184A1,normal,,44.3,184.3,10.889078,24.797795,70.166105,18.98985,47.921075,20.374678,75.10488,53.093431,98.208524,140.026212,53.451839,11.39003,205.498049,65.043113,55.208441,4.365936,18.973475,13.292485,94.580565,16.969013,37.192703,114.886362,57.454321,285.873113,5.143769,77.926705,198.722106,34.901764,317.345141,257.734123,7.30633,4.733834,0.009804,394.0,0.0,0.0
36,383898,plate01,I11,0,1,0,0,24,0,383834,plate01,I11,0,1,0,0,24,0,0.0,,0.0,,0.0,,G2,184A1,normal,,46.3,143.3,13.889078,36.797795,103.166105,23.98985,49.921075,28.374678,106.10488,46.093431,107.208524,166.026212,52.451839,16.39003,169.498049,63.043113,48.208441,6.365936,24.973475,18.292485,148.580565,19.969013,36.192703,136.886362,89.454321,312.873113,6.143769,85.926705,192.722106,36.901764,300.345141,216.734123,8.30633,5.733834,0.009804,428.0,0.0,0.0
44,383909,plate01,I11,0,1,0,0,35,0,383845,plate01,I11,0,1,0,0,35,0,0.0,,0.0,,0.0,,G1,184A1,normal,,31.3,224.3,5.889078,12.797795,60.166105,8.98985,21.921075,22.374678,38.10488,31.093431,2.208524,133.026212,35.451839,15.39003,243.498049,56.043113,61.208441,2.365936,17.973475,11.292485,84.580565,15.969013,34.192703,45.886362,41.454321,255.873113,4.143769,106.926705,216.722106,31.901764,298.345141,301.734123,8.30633,4.733834,0.009804,346.0,0.0,1104.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,345283,plate01,I11,5,4,0,0,15,0,345225,plate01,I11,5,4,0,0,15,0,0.0,,0.0,,0.0,,G1,184A1,normal,,32.3,151.3,8.889078,21.797795,73.166105,19.98985,27.921075,16.374678,57.10488,22.093431,71.208524,156.026212,31.451839,11.39003,159.498049,44.043113,46.208441,2.365936,15.973475,12.292485,95.580565,17.969013,34.192703,81.886362,58.454321,286.873113,4.143769,51.926705,226.722106,35.901764,313.345141,214.734123,6.30633,5.733834,0.009804,387.0,0.0,0.0
630,345292,plate01,I11,5,4,0,0,24,0,345234,plate01,I11,5,4,0,0,24,0,0.0,,0.0,,0.0,,G1,184A1,normal,,33.3,128.3,7.889078,19.797795,64.166105,13.98985,43.921075,16.374678,79.10488,25.093431,83.208524,154.026212,24.451839,9.39003,101.498049,46.043113,40.208441,3.365936,19.973475,13.292485,96.580565,17.469013,34.192703,81.886362,60.454321,242.873113,4.143769,68.926705,169.722106,33.901764,208.345141,145.734123,5.30633,5.733834,0.009804,377.0,0.0,348.0
643,345311,plate01,I11,5,4,0,0,43,0,345253,plate01,I11,5,4,0,0,43,0,0.0,,0.0,,0.0,,G1,184A1,normal,,31.3,173.3,10.889078,28.797795,95.166105,19.98985,32.921075,20.374678,53.10488,30.093431,67.208524,160.026212,46.451839,13.39003,198.498049,60.043113,63.208441,3.365936,28.973475,12.292485,111.580565,18.969013,40.192703,130.886362,84.454321,290.873113,4.143769,77.926705,242.722106,37.901764,337.345141,232.734123,7.30633,5.733834,0.009804,403.0,0.0,423.0
647,345316,plate01,I11,5,4,0,0,48,0,345258,plate01,I11,5,4,0,0,48,0,0.0,,0.0,,0.0,,G1,184A1,normal,,33.3,191.3,7.889078,21.797795,67.166105,13.98985,34.921075,20.374678,69.10488,32.093431,79.208524,151.026212,40.451839,11.39003,215.498049,59.043113,65.208441,3.365936,18.973475,13.292485,89.580565,18.969013,37.192703,87.886362,57.454321,269.873113,5.143769,68.926705,255.722106,33.901764,372.345141,245.734123,7.30633,5.733834,0.009804,378.0,0.0,696.0


## Save data

Prepare to save data:

In [11]:
import shutil

# Output directory of the generated dataset
dataset_name = '184A1_hannah_EU_regression_TEST'
outdir = os.path.join(BASE_DIR, 'datasets', dataset_name)

# An existing directory with the same name is deleted first, so files from a
# previous run are never mixed with the new dataset.
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Re-raise the real cause. Continuing would only fail a few lines
        # below in os.makedirs with a FileExistsError that hides it.
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False: the directory must be new at this point
os.makedirs(outdir, exist_ok=False)
    


Creating dir: /home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_regression_TEST


In [12]:
# Look-up from channel name to channel id, built once from the channels table
channel_id_by_name = train.channels.set_index('name')['channel_id']

# Ids of the channels (proteins) used to predict the transcription rate
input_ids = list(channel_id_by_name.loc[p['input_channels']])
# Id of the channel that measures the transcription rate
output_ids = list(channel_id_by_name.loc[p['output_channels']])
# Input channel ids first, output channel id(s) last
channels_ids = input_ids + output_ids

Save metadata and used parameters

In [13]:
# Save the parameters used for this run.
# The file is opened in a `with` block so the handle is flushed and closed
# as soon as the JSON is written.
with open(os.path.join(outdir, 'params.json'), 'w') as params_file:
    json.dump(data_params, params_file, indent=4)

# Save the metadata of every split, plus all splits concatenated
# (train, val, test order)
split_metadata = {
    'train': train.metadata,
    'val': val.metadata,
    'test': test.metadata,
}
for split_name, metadata in split_metadata.items():
    metadata.to_csv(os.path.join(outdir, '{}_metadata.csv'.format(split_name)))
pd.concat(list(split_metadata.values())).to_csv(os.path.join(outdir, 'metadata.csv'))

# Save only the channels that are used, in the order given by channels_ids
# (input channels first, output channel(s) last)
train.channels.set_index('channel_id').loc[channels_ids].to_csv(os.path.join(outdir, 'channels.csv'))

Save Images

In [14]:
# Note: the response value (y) is not computed and saved separately here;
# that is done in the modeling part. The previous approach is kept below
# as an unused string for reference.

"""
# get images
train_dataset = np.array(train.get_object_imgs(data='MPP', img_size=p['img_size']))
del(train)
val_dataset = np.array(val.get_object_imgs(data='MPP', img_size=p['img_size']))
del(val)
test_dataset = np.array(test.get_object_imgs(data='MPP', img_size=p['img_size']))
del(test)

# Create responce variable (y)
if p['aggregate_output'] == 'avg':
    train_dataset_y = np.array([img[img!=0].mean() for img in train_dataset[:,:,:,output_ids]])
    val_dataset_y = np.array([img[img!=0].mean() for img in val_dataset[:,:,:,output_ids]])
    test_dataset_y = np.array([img[img!=0].mean() for img in test_dataset[:,:,:,output_ids]])

# Save datasets
np.savez(os.path.join(outdir, 'train_dataset.npz'), x=train_dataset[:,:,:,input_ids], y=train_dataset_y)
del(train_dataset)
np.savez(os.path.join(outdir, 'val_dataset.npz'), x=val_dataset[:,:,:,input_ids], y=val_dataset_y)
del(val_dataset)
np.savez(os.path.join(outdir, 'test_dataset.npz'), x=test_dataset[:,:,:,input_ids], y=test_dataset_y)
del(test_dataset)
"""


def _save_images_and_masks(split, file_prefix, label):
    """Write the images and masks of one split into `outdir`.

    Only the channels listed in `channels_ids` (selected on the last axis of
    `split.images`) are written. Each attribute is deleted from `split` right
    after it has been saved.

    split: object exposing `images` and `masks` attributes.
    file_prefix: prefix of the output files (`<prefix>_images.npy`, `<prefix>_mask.npy`).
    label: split name shown in the progress message.
    """
    print('Saving {} images and masks...'.format(label))
    np.save(os.path.join(outdir, file_prefix + '_images.npy'), split.images[:, :, :, channels_ids])
    del split.images
    np.save(os.path.join(outdir, file_prefix + '_mask.npy'), split.masks)
    del split.masks


# Save each split, then remove it from the notebook namespace
_save_images_and_masks(train, 'train', 'train')
del train

_save_images_and_masks(val, 'val', 'validation')
del val

_save_images_and_masks(test, 'test', 'test')
del test

Saving train images and masks...
Saving validation images and masks...
Saving test images and masks...
