# Data Preprocessing: Projection into scalars

This notebook is meant to convert the raw cell data from several wells into scalars, one per channel (i.e. each cell becomes a vector in $\mathbb{R}^{n\_channels}$).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set directory paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
# BASE_DIR is resolved two levels above the current working directory
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''),'../..'))
if not os.path.exists(BASE_DIR):
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))
else:
    print('BASE_DIR: {}'.format(BASE_DIR))

# Add the workspace dir to sys paths (for loading project libraries).
# This is done exactly once; the previously duplicated insert/import was redundant.
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData

# JSON file holding all the parameters used by this notebook
PARAMETERS_FILE = os.path.join(BASE_DIR, 'workspace/scripts_vicb/Parameters/temp_parameters.json')
if not os.path.exists(PARAMETERS_FILE):
    raise Exception('Parameter file {} does not exist!'.format(PARAMETERS_FILE))


BASE_DIR: /storage/groups/ml01/workspace/andres.becker/master_thesis
Setting BASE_DIR to /storage/groups/ml01/workspace/andres.becker/master_thesis


Open parameters file

In [2]:
# Read the JSON parameters file into the dict `p`
with open(PARAMETERS_FILE) as json_handle:
    p = json.loads(json_handle.read())
# Optional: drop the free-text '_comment' entry
#del(p['_comment'])

In [3]:
# Display the loaded parameters dict for inspection
p

{'_comment': 'Save file name as reference ---------------------------------',
 'raw_data_dir': '/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer',
 'dir_type': 'hannah',
 'perturbations_and_wells': {'184A1_hannah_CX5461': ['I18', 'J22', 'J09'],
  '184A1_hannah_AZD4573': ['I13', 'J21', 'J14', 'I17', 'J18'],
  '184A1_hannah_meayamycin': ['I12', 'I20'],
  '184A1_hannah_DMSO': ['J16', 'I14'],
  '184A1_hannah_triptolide': ['I10', 'J15'],
  '184A1_hannah_TSA': ['J20', 'I16', 'J13'],
  '184A1_hannah_unperturbed': ['J10', 'I13', 'I09', 'I11', 'J18', 'J12']},
 'output_data_dir': '/storage/groups/ml01/workspace/andres.becker/master_thesis/datasets/184A1_hannah_avg_projection_all_wells',
 'seed': 42,
 'input_channels': ['00_DAPI',
  '07_H2B',
  '01_CDK9_pT186',
  '03_CDK9',
  '05_GTF2B',
  '07_SETD1A',
  '08_H3K4me3',
  '09_SRRM2',
  '10_H3K27ac',
  '11_KPNA2_MAX',
  '12_RB1_pS807_S811',
  '13_PABPN1',
  '14_PCNA',
  '15_SON',
  '16_H3',
  '17_HDAC3',
  '19_KPNA1_MAX'

Set raw data directory

In [4]:
# Raw data location comes from the parameters; abort early if it is not reachable
DATA_DIR = p['raw_data_dir']
if os.path.exists(DATA_DIR):
    print('DATA_DIR: {}'.format(DATA_DIR))
else:
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))

DATA_DIR: /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer


Check available data (Perturbations and Wells):

In [5]:
def _list_subdirs(parent_dir):
    """Return the names of the immediate sub-directories of `parent_dir`."""
    return [entry for entry in os.listdir(parent_dir)
            if os.path.isdir(os.path.join(parent_dir, entry))]

# Every sub-directory of DATA_DIR is treated as a perturbation
perturbations = _list_subdirs(DATA_DIR)
# Map each perturbation to the wells (sub-directories) found locally for it
local_data = {per: _list_subdirs(os.path.join(DATA_DIR, per)) for per in perturbations}

Select the perturbations and their wells to process:

In [6]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# Default: process the perturbations and wells listed in the parameters file
selected_data = p['perturbations_and_wells']

# In case you only want to load some specific perturbations and/or wells,
# uncomment and edit the following (it was previously a dead assignment that
# was immediately overwritten by the parameters-file selection):
#selected_data = {
#    '184A1_hannah_unperturbed': ['I11', 'I09'],
#    '184A1_hannah_TSA': ['J20', 'I16'],
#}

# Process all available data:
#selected_data = local_data

print('\nSelected perturbations-wells:\n{}'.format(selected_data))

# Generate and save data dirs (one directory per selected perturbation/well pair)
data_dirs = []
for per, wells in selected_data.items():
    for w in wells:
        d = os.path.join(DATA_DIR, per, w)
        # Validate the directory before recording it
        if not os.path.exists(d):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(d))
        data_dirs.append(d)
# Keep the resolved directories with the parameters so they are saved alongside them
p['data_dirs'] = data_dirs

Local available perturbations-wells:
{'184A1_hannah_CX5461': ['I18', 'J22', 'J09'], '184A1_hannah_AZD4573': ['I13', 'J21', 'J14', 'I17', 'J18'], '184A1_hannah_meayamycin': ['I12', 'I20', 'J17'], '184A1_hannah_DMSO': ['J16', 'I14'], '184A1_hannah_triptolide': ['I10', 'J15'], '184A1_hannah_TSA': ['J20', 'I16', 'J13'], '184A1_hannah_unperturbed': ['J10', 'I13', 'I09', 'I11', 'J18', 'J12']}

Selected perturbations-wells:
{'184A1_hannah_CX5461': ['I18', 'J22', 'J09'], '184A1_hannah_AZD4573': ['I13', 'J21', 'J14', 'I17', 'J18'], '184A1_hannah_meayamycin': ['I12', 'I20'], '184A1_hannah_DMSO': ['J16', 'I14'], '184A1_hannah_triptolide': ['I10', 'J15'], '184A1_hannah_TSA': ['J20', 'I16', 'J13'], '184A1_hannah_unperturbed': ['J10', 'I13', 'I09', 'I11', 'J18', 'J12']}


Process data:

In [7]:
# Start from a clean state: without this reset, re-running the cell would merge
# the wells again into an `mpp_data` left over from a previous execution
mpp_data = None

for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_temp = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])
    
    # Add cell cycle to metadata (G1, S, G2)
    # Important! If mapobject_id_cell is not in cell_cycle_file =>
    # its corresponding cell is in Mitosis phase!
    if p['add_cell_cycle_to_metadata']:
        mpp_temp.add_cell_cycle_to_metadata(os.path.join(DATA_DIR, p['cell_cycle_file']))
    
    # Add well info to metadata
    if p['add_well_info_to_metadata']:
        mpp_temp.add_well_info_to_metadata(os.path.join(DATA_DIR, p['well_info_file']))
    
    # Remove unwanted cells (only when filter criteria are given in the parameters)
    if p.get('filter_criteria', None) is not None:
        print('Removing unwanted cells...')
        mpp_temp.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background values for each channel
    if p['subtract_background']:
        mpp_temp.subtract_background(os.path.join(DATA_DIR, p['background_value']))
    
    # Project every uni-channel images into a scalar for further analysis
    if p['project_into_scalar']:
        print('\nProjecting data...')
        mpp_temp.add_scalar_projection(p['method'])
        
        # Remove raw data now that is processed.
        # Only the attributes that actually exist on the instance are deleted.
        instance_vars = {'labels', 'x', 'y', 'mpp', 'mapobject_ids', 'mcu_ids','conditions'}
        for var in set(vars(mpp_temp).keys()).intersection(instance_vars):
            delattr(mpp_temp, var)
    
    # Concatenate wells: the first well initialises mpp_data,
    # every following well is merged into it
    if mpp_data is None:
        mpp_data = mpp_temp
    else:
        mpp_data.merge_instances([mpp_temp])
        del(mpp_temp)

# Normalize projected values
# Every projected channel column of the metadata is divided by a percentile of
# that same column, computed over all loaded cells.
if p['normalise']:
    print('\nNormalizing data...')
    # Projected columns are named '<channel name>_<projection method>'
    norm_columns = [c+'_'+p['method'] for c in mpp_data.channels.name.values]
    # Fix: the percentile must be read from the parameters; the former literal
    # list ['percentile'] (a string) is not a valid percentile for np.percentile.
    # NOTE(review): assumes p['percentile'] is a single number, so that
    # rescale_values holds one entry per column - confirm against the parameters file
    rescale_values = np.percentile(mpp_data.metadata[norm_columns].values,
                                   p['percentile'],
                                   axis=0)
    for i, col in enumerate(norm_columns):
        mpp_data.metadata[col] /= rescale_values[i]
    # Store plain Python floats so the parameters stay JSON serializable
    p['normalise_rescale_values'] = rescale_values.tolist()


Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_CX5461/I18...
Removing unwanted cells...
Total number of cells: 926
291 cells cutted by filter: is_border_cell == 1
37 cells cutted by filter: is_polynuclei_184A1 == 1
356 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 356


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_CX5461/J22...
Removing unwanted cells...
Total number of cells: 978
296 cells cutted by filter: is_border_cell == 1
42 cells cutted by filter: is_polynuclei_184A1 == 1
408 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 408


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_CX5461/J09...
Removing unwanted cells...
Total number of cells: 733
267 cells cutted by filter: is_border_cell == 1
19 cells cutted by filter: is_polynuclei_184A1 == 1
311 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 311


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/I13...
Removing unwanted cells...
Total number of cells: 860
293 cells cutted by filter: is_border_cell == 1
24 cells cutted by filter: is_polynuclei_184A1 == 1
322 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 322


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/J21...


MemoryError: Unable to allocate 1.98 GiB for an array with shape (532148504,) and data type int32

In [8]:
# Inspect the metadata table of the processed data
mpp_data.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
0,369581,plate01,I18,0,0,0,0,9,0,369487,plate01,I18,0,0,0,0,9,0,0.0,,0.0,,0.0,,S,184A1,CX5461,120.0,44.790928,208.676435,15.962250,29.289384,84.024075,24.177410,46.880668,50.661738,107.367649,59.570214,95.348947,162.164103,75.340100,41.983402,233.016897,75.355192,83.016410,19.006747,47.585081,21.462703,119.898365,22.528784,45.912931,104.033265,80.562506,303.177564,11.478352,93.097852,221.812826,39.005296,287.133040,266.622592,7.860114,6.162881,1.671627,342.910464,7830.081611,10304.112271
1,369582,plate01,I18,0,0,0,0,10,0,369488,plate01,I18,0,0,0,0,10,0,0.0,,0.0,,0.0,,G1,184A1,CX5461,120.0,44.693453,192.564041,14.927710,32.759411,92.919336,27.445029,50.696422,53.129896,67.922149,41.104070,101.088256,191.014345,45.255686,49.039914,196.708995,66.276975,63.790027,21.981683,40.383979,14.368812,146.185885,19.856793,46.780805,132.973481,85.893447,288.406003,14.120230,102.825017,205.574996,42.433129,312.214706,231.791821,8.029727,6.515615,1.657841,182.502609,7826.491151,10242.813811
2,369583,plate01,I18,0,0,0,0,11,0,369489,plate01,I18,0,0,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,CX5461,120.0,40.628654,224.238141,11.883508,25.218904,88.832127,29.588301,46.845729,54.031310,61.636522,47.324893,120.000485,186.245955,58.354520,43.540287,263.274235,72.690354,76.520639,26.116184,47.600271,18.067183,125.463465,20.618981,52.433856,121.306955,80.992489,292.695766,13.591581,111.875495,221.776828,39.558433,333.519357,310.301499,8.908267,6.080175,1.548610,195.049564,6211.161568,10094.878316
3,369584,plate01,I18,0,0,0,0,12,0,369490,plate01,I18,0,0,0,0,12,0,0.0,,0.0,,0.0,,G1,184A1,CX5461,120.0,47.190139,255.008827,8.761640,22.483270,86.429829,7.113698,28.268655,50.675528,54.239803,39.124333,88.022382,157.279270,31.450823,28.895709,260.202417,59.677551,66.606941,10.139325,47.945185,16.539284,105.200716,20.212364,47.114630,55.256595,58.227016,254.808453,10.881472,98.594081,247.179793,37.353835,326.267001,289.691050,7.332875,5.868882,1.554583,192.862674,6784.361001,7852.095976
4,369585,plate01,I18,0,0,0,0,13,0,369491,plate01,I18,0,0,0,0,13,0,0.0,,0.0,,0.0,,S,184A1,CX5461,120.0,44.595874,155.674791,22.491115,34.184902,118.482454,43.344288,51.013306,58.614662,52.146755,78.335567,127.013187,171.134241,75.371029,45.063884,164.659560,78.160541,86.018186,18.520590,35.027228,25.870875,225.888079,24.161104,54.150928,151.001520,112.756650,267.606364,18.161617,154.100137,183.721177,47.454413,289.719015,209.839964,9.956070,7.354605,1.695586,269.511643,7898.951239,9751.622238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,312038,plate01,I13,5,4,0,0,10,0,312018,plate01,I13,5,4,0,0,10,0,0.0,,0.0,,0.0,,G2,184A1,AZD4573,120.0,50.171597,203.079870,12.294496,22.002306,108.692295,29.245423,60.991982,61.580411,88.741077,112.502233,132.514915,190.236195,92.747940,53.177610,291.607698,88.840073,102.516518,16.436603,47.910283,24.704059,175.416843,29.071327,57.157738,111.731255,103.681614,98.654193,8.749086,115.419219,137.184296,51.061171,228.029856,308.023445,10.027534,9.217393,1.778263,196.667877,9561.491304,11298.371673
2096,312040,plate01,I13,5,4,0,0,12,0,312020,plate01,I13,5,4,0,0,12,0,0.0,,0.0,,0.0,,S,184A1,AZD4573,120.0,47.112931,221.512177,7.724703,15.356291,72.237641,17.287674,40.321914,47.961561,80.478696,80.023728,105.505874,161.286796,95.994218,34.346854,339.519516,79.912203,97.734061,20.724889,40.168812,13.537411,123.677224,22.690146,50.085843,83.140589,78.983222,85.031049,7.677815,152.712483,147.091725,45.360427,213.976995,331.673747,7.941405,6.771050,1.614569,181.809034,6542.658574,9379.312420
2097,312042,plate01,I13,5,4,0,0,14,0,312022,plate01,I13,5,4,0,0,14,0,0.0,,0.0,,0.0,,S,184A1,AZD4573,120.0,26.056533,167.477532,9.088328,13.688656,106.758598,20.083653,49.065546,70.005819,65.281665,43.520425,55.924847,156.864170,53.032340,59.247138,222.153895,80.214920,85.173352,33.251583,43.449660,14.853871,125.961222,24.859379,71.965400,108.812233,71.486826,79.742961,9.744324,139.239205,115.629029,60.155940,194.169561,245.113029,8.074733,8.107991,1.759757,173.245959,9902.470973,14742.915881
2098,312044,plate01,I13,5,4,0,0,16,0,312024,plate01,I13,5,4,0,0,16,0,0.0,,0.0,,0.0,,G1,184A1,AZD4573,120.0,45.602241,239.402712,5.147188,9.312947,66.656417,16.053429,46.705648,45.041349,71.126418,66.524799,80.206460,143.199555,44.358867,44.819274,351.735431,75.851485,86.034974,15.067244,39.120749,18.459648,123.118586,27.021043,43.038603,69.313208,51.416500,60.912004,9.196590,148.687201,141.042860,39.509665,193.329485,295.824453,7.902105,7.895647,1.786310,179.385849,6925.429363,12019.470637


## Save data

Prepare to save data:

In [9]:
import shutil

# Create a fresh output directory; an already existing one is deleted first
outdir = p['output_data_dir']
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Re-raise the original error instead of swallowing it: continuing
        # would typically only fail below in os.makedirs with a less
        # informative FileExistsError
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False: fail loudly if the directory is (still) there
os.makedirs(outdir, exist_ok=False)
    

Creating dir: /storage/groups/ml01/workspace/andres.becker/master_thesis/datasets/184A1_hannah_avg_projection_all_wells


In [10]:
# Index the channels table by channel name so ids can be looked up by name
channels_by_name = mpp_data.channels.set_index('name')
# Ids of the channels (proteins) which will be used to predict transcription rate
input_ids = list(channels_by_name.loc[p['input_channels'], 'channel_id'])
# Ids of the channel(s) that measure transcription rate
output_ids = list(channels_by_name.loc[p['output_channels'], 'channel_id'])
# Input channel ids first, followed by the output channel ids
channels_ids = input_ids + output_ids

Save metadata and used parameters

In [11]:
# save params
# A context manager guarantees the file is flushed and closed after writing
with open(os.path.join(outdir, 'params.json'), 'w') as params_out:
    json.dump(p, params_out, indent=4)

# save metadata
mpp_data.metadata.to_csv(os.path.join(outdir, 'metadata.csv'))

# Save used channels (currently the complete channels table is written)
mpp_data.channels.to_csv(os.path.join(outdir, 'channels.csv'))
# Alternative: save only the selected input/output channels
#mpp_data.channels.set_index('channel_id').loc[channels_ids].to_csv(os.path.join(outdir, 'channels.csv'))