# Data Preprocessing: Projection into scalars

This notebook is meant to convert the raw data of each cell, from several wells, into one scalar per channel (i.e. a vector in $\mathbb{R}^{n\_channels}$).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set directory paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
# BASE_DIR is resolved two directory levels above the current working directory
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''),'../..'))
if not os.path.exists(BASE_DIR):
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))
else:
    print('BASE_DIR: {}'.format(BASE_DIR))

# Add the workspace dir to sys.path (for loading project libraries).
# This was previously done twice in a row; one insert + one import is enough.
sys.path.insert(1, os.path.join(BASE_DIR, 'workspace'))
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData

# JSON file holding all the parameters used by the rest of the notebook
PARAMETERS_FILE = os.path.join(BASE_DIR, 'workspace/scripts_vicb/Parameters/temp_parameters.json')
if not os.path.exists(PARAMETERS_FILE):
    raise Exception('Parameter file {} does not exist!'.format(PARAMETERS_FILE))


BASE_DIR: /home/hhughes/Documents/Master_Thesis/Project
Setting BASE_DIR to /home/hhughes/Documents/Master_Thesis/Project


Open parameters file

In [2]:
# Parse the JSON parameters file into the dict `p`
with open(PARAMETERS_FILE) as json_fh:
    p = json.load(json_fh)
# Optional (disabled): drop the '_comment' entry from the parameters
# del p['_comment']

In [3]:
# Display the loaded parameters dict for inspection
p

{'_comment': 'Save file name as reference ---------------------------------',
 'raw_data_dir': '/home/hhughes/Documents/Master_Thesis/Project/datasets/raw',
 'dir_type': 'hannah',
 'perturbations_and_wells': {'184A1_hannah_unperturbed': ['I09']},
 'output_data_dir': '/home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_scalar_projection',
 'seed': 42,
 'input_channels': ['00_DAPI',
  '07_H2B',
  '01_CDK9_pT186',
  '03_CDK9',
  '05_GTF2B',
  '07_SETD1A',
  '08_H3K4me3',
  '09_SRRM2',
  '10_H3K27ac',
  '11_KPNA2_MAX',
  '12_RB1_pS807_S811',
  '13_PABPN1',
  '14_PCNA',
  '15_SON',
  '16_H3',
  '17_HDAC3',
  '19_KPNA1_MAX',
  '20_SP100',
  '21_NCL',
  '01_PABPC1',
  '02_CDK7',
  '03_RPS6',
  '05_Sm',
  '07_POLR2A',
  '09_CCNT1',
  '10_POL2RA_pS2',
  '11_PML',
  '12_YAP1',
  '13_POL2RA_pS5',
  '15_U2SNRNPB',
  '18_NONO',
  '20_ALYREF',
  '21_COIL'],
 'output_channels': ['00_EU'],
 'aggregate_output': 'avg',
 'train_frac': 0.8,
 'val_frac': 0.1,
 'img_size': 224,
 'subtract

Set raw data directory

In [4]:
# Root directory of the raw data, as configured in the parameters file
DATA_DIR = p['raw_data_dir']
if os.path.exists(DATA_DIR):
    print('DATA_DIR: {}'.format(DATA_DIR))
else:
    # Stop early: nothing below can work without the raw data
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))

DATA_DIR: /home/hhughes/Documents/Master_Thesis/Project/datasets/raw


Check available data (Perturbations and Wells):

In [5]:
def _list_subdirs(parent_dir):
    """Return the names of the entries of `parent_dir` that are directories."""
    return [name for name in os.listdir(parent_dir)
            if os.path.isdir(os.path.join(parent_dir, name))]

# Perturbations are the sub-directories of DATA_DIR ...
perturbations = _list_subdirs(DATA_DIR)
# ... and the wells of a perturbation are the sub-directories inside it.
# local_data maps: perturbation name -> list of well names found locally
local_data = {per: _list_subdirs(os.path.join(DATA_DIR, per)) for per in perturbations}

Select the perturbations and their wells to process:

In [6]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# The perturbations and wells to process are read from the parameters file.
# (A hard-coded example dict used to be assigned here and was immediately
# overwritten by the line below, so it never had any effect; it was removed.)
# To process a specific subset, edit 'perturbations_and_wells' in the
# parameters file, e.g.:
#   {'184A1_hannah_unperturbed': ['I11', 'I09'], '184A1_hannah_TSA': ['J20', 'I16']}
selected_data = p['perturbations_and_wells']

# Process all available data:
#selected_data = local_data

print('\nSelected perturbations-wells:\n{}'.format(selected_data))

# Generate and save data dirs (one directory per selected well).
# Each directory is validated before being accepted, so the first missing
# one aborts the cell.
data_dirs = []
for per, well_names in selected_data.items():
    for w in well_names:
        d = os.path.join(DATA_DIR, per, w)
        if not os.path.exists(d):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(d))
        data_dirs.append(d)
# Keep the resolved directories inside the parameters dict
p['data_dirs'] = data_dirs

Local available perturbations-wells:
{'184A1_hannah_unperturbed': ['I11', 'I09', 'J10'], '184A1_hannah_TSA': ['J20', 'I16', 'J13']}

Selected perturbations-wells:
{'184A1_hannah_unperturbed': ['I09']}


Process data:

In [7]:
# Process every selected well directory and accumulate the result in `mpp_data`.
# `mpp_data` is reset on every run of this cell. The previous check
# ('mpp_data' not in globals()) depended on kernel state: re-running the cell
# merged the same wells into the already existing object a second time.
mpp_data = None
for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_temp = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])

    # Add cell cycle to metadata (G1, S, G2)
    # Important! If mapobject_id_cell is not in cell_cycle_file =>
    # its corresponding cell is in Mitosis phase!
    if p['add_cell_cycle_to_metadata']:
        mpp_temp.add_cell_cycle_to_metadata(os.path.join(DATA_DIR, p['cell_cycle_file']))

    # Add well info to metadata
    if p['add_well_info_to_metadata']:
        mpp_temp.add_well_info_to_metadata(os.path.join(DATA_DIR, p['well_info_file']))

    # Remove unwanted cells (only when filter criteria are configured)
    if p.get('filter_criteria', None) is not None:
        print('Removing unwanted cells...')
        mpp_temp.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background values for each channel
    # (p['background_value'] is a file path relative to DATA_DIR)
    if p['subtract_background']:
        mpp_temp.subtract_background(os.path.join(DATA_DIR, p['background_value']))

    # Project every single-channel image into a scalar for further analysis
    if p['project_into_scalar']:
        print('\nProjecting data...')
        mpp_temp.add_scalar_projection(p['method'])

        # Drop the listed instance attributes when present.
        # NOTE(review): presumably these hold the pixel-level data, which is no
        # longer needed once the scalar projection exists - confirm.
        instance_vars = {'labels', 'x', 'y', 'mpp', 'mapobject_ids', 'mcu_ids','conditions'}
        for var in set(vars(mpp_temp).keys()).intersection(instance_vars):
            delattr(mpp_temp, var)

    # Concatenate wells: the first well becomes the accumulator,
    # every following well is merged into it
    if mpp_data is None:
        mpp_data = mpp_temp
    else:
        mpp_data.merge_instances([mpp_temp])
        del(mpp_temp)

# Fail with a clear message instead of a confusing error in a later cell
if mpp_data is None:
    raise Exception('No data was processed: p[\'data_dirs\'] is empty!')



Processing dir /home/hhughes/Documents/Master_Thesis/Project/datasets/raw/184A1_hannah_unperturbed/I09...
Removing unwanted cells...
Total number of cells: 886
269 cells cutted by filter: is_border_cell == 1
30 cells cutted by filter: is_polynuclei_184A1 == 1
323 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 323


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...


In [8]:
# Display the metadata table of the processed data
mpp_data.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
0,373535,plate01,I09,0,0,0,0,6,0,373506,plate01,I09,0,0,0,0,6,0,0.0,,0.0,,0.0,,S,184A1,normal,,43.893030,175.995428,27.577226,54.206169,128.697294,41.469521,60.416069,64.842372,125.169772,65.557846,105.751293,192.479292,78.460671,49.046642,187.326717,99.378451,117.227410,26.282266,60.407161,33.006360,131.946547,28.173045,54.011811,187.774103,131.526724,385.672558,11.862217,81.837107,291.005551,51.128612,368.260323,285.676524,11.584543,8.256946,1.803026,599.449225,8305.377207,10121.675842
1,373536,plate01,I09,0,0,0,0,7,0,373507,plate01,I09,0,0,0,0,7,0,0.0,,0.0,,0.0,,G1,184A1,normal,,39.837376,273.655677,12.569294,22.001503,81.730022,20.032913,39.631146,44.481310,73.860256,54.789958,67.896671,162.914590,35.864765,37.245010,355.807515,67.943580,76.462250,12.614654,55.775873,14.612852,120.283067,19.574581,41.063924,101.116543,55.055405,297.648758,11.671538,91.374197,329.135675,33.598199,335.386680,410.320858,9.813957,5.888977,1.561608,398.334857,6055.847451,10278.970546
2,373537,plate01,I09,0,0,0,0,8,0,373508,plate01,I09,0,0,0,0,8,0,0.0,,0.0,,0.0,,G1,184A1,normal,,44.735922,258.942987,23.616477,47.406444,140.684383,44.831516,68.078372,67.289663,85.795525,90.090190,87.339633,249.505730,88.193348,51.833120,332.372822,124.072929,130.945283,38.285344,76.223063,43.566143,151.189717,39.663339,71.273700,205.703836,124.654347,332.255405,17.381051,196.612601,361.317387,55.593623,457.713302,469.516855,14.441616,9.757942,1.859657,464.431683,8280.577392,11757.616541
3,373538,plate01,I09,0,0,0,0,9,0,373509,plate01,I09,0,0,0,0,9,0,0.0,,0.0,,0.0,,S,184A1,normal,,43.931378,194.444293,18.353704,32.399117,104.564078,29.361877,46.617407,51.506499,127.183208,55.535536,94.269486,171.038290,77.086018,32.756867,212.862917,74.396402,81.701189,15.256486,49.931726,19.315464,136.797350,21.712636,45.183902,148.422635,82.323469,284.648928,8.077076,50.190543,271.552491,37.614959,285.612350,263.922204,7.988940,6.477020,1.717521,425.533338,7441.930589,8530.446240
4,373539,plate01,I09,0,0,0,0,10,0,373510,plate01,I09,0,0,0,0,10,0,0.0,,0.0,,0.0,,S,184A1,normal,,30.892690,231.842236,19.046228,37.117579,100.918987,25.272207,43.387094,43.176593,78.249253,76.013457,112.938592,178.444723,81.792631,41.639270,292.071767,90.931247,99.625565,27.330404,66.549839,20.411372,107.398853,23.828493,55.354041,134.139140,99.027838,301.044591,15.211128,122.575573,334.431894,42.139486,362.906348,438.364101,10.736532,6.967087,1.697280,410.540764,5541.760160,10350.776433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,248096,plate01,I09,5,4,0,0,31,0,248051,plate01,I09,5,4,0,0,31,0,0.0,,0.0,,0.0,,G1,184A1,normal,,34.088505,155.096587,13.024716,28.936674,95.528086,23.275727,43.408660,39.239153,52.576061,40.188708,5.078687,175.720682,51.470518,35.296696,174.323644,56.949624,56.759946,18.220416,41.090229,17.958328,122.376457,20.033855,43.918364,119.742814,86.827567,218.802525,10.731663,134.708834,183.026731,37.249065,260.121881,235.003543,7.600701,6.105711,1.609373,325.157342,5004.892771,9991.788145
559,248097,plate01,I09,5,4,0,0,32,0,248052,plate01,I09,5,4,0,0,32,0,0.0,,0.0,,0.0,,G1,184A1,normal,,29.103252,157.851762,9.229010,18.525031,76.213924,19.376481,31.879794,39.433825,48.630776,37.535381,79.274388,165.326735,42.447831,37.616003,190.478952,54.423588,51.708736,21.694139,44.480561,17.644941,95.801883,17.100752,37.933908,81.901958,60.711370,268.427825,9.421172,47.902741,201.229701,32.069356,284.005220,313.050669,7.123799,5.655949,1.528000,382.174679,6101.246866,9692.065772
560,248098,plate01,I09,5,4,0,0,33,0,248053,plate01,I09,5,4,0,0,33,0,0.0,,0.0,,0.0,,S,184A1,normal,,41.039714,180.212368,17.454660,31.442297,84.303652,28.616915,44.196115,49.574424,103.050050,39.699195,69.999549,180.591155,86.435982,45.644611,242.215620,63.927868,64.317648,17.133886,51.754897,19.469017,133.090863,22.530236,39.120315,125.629150,85.664930,338.872580,7.997690,113.691155,251.432130,38.683705,336.799904,329.825839,8.433395,6.436342,1.709882,451.010388,6905.080085,9703.626920
561,248102,plate01,I09,5,4,0,0,37,0,248057,plate01,I09,5,4,0,0,37,0,0.0,,0.0,,0.0,,S,184A1,normal,,41.121487,153.873601,15.314953,28.338914,82.752061,22.497689,40.477599,45.760907,118.973075,45.119237,100.598724,150.048637,83.140813,33.998817,199.528098,58.714794,62.800221,25.627111,44.674296,19.702687,98.382955,20.627130,39.620163,109.974209,74.349470,267.687032,10.510172,68.689059,215.263432,35.644982,311.052738,289.755046,7.408682,6.552806,1.713042,515.735421,7191.907034,9512.493248


## Save data

Prepare to save data:

In [9]:
import shutil

# (Re)create the output directory from scratch.
# WARNING: an already existing output directory is deleted with all its content.
outdir = p['output_data_dir']
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Stop here with the real cause. Continuing would only fail below
        # with a less informative FileExistsError from os.makedirs.
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False: never write into a directory that still holds old content
os.makedirs(outdir, exist_ok=False)
    


Creating dir: /home/hhughes/Documents/Master_Thesis/Project/datasets/184A1_hannah_EU_scalar_projection


In [10]:
# Channels table indexed by channel name, built once and used for both lookups
channels_by_name = mpp_data.channels.set_index('name')
# Ids of the channels (proteins) which will be used to predict the transcription rate
input_ids = list(channels_by_name.loc[p['input_channels'], 'channel_id'])
# Id(s) of the channel that measures the transcription rate
output_ids = list(channels_by_name.loc[p['output_channels'], 'channel_id'])
# Input channel ids first, output channel id(s) last
channels_ids = [*input_ids, *output_ids]

Save metadata and used parameters

In [11]:
# save params (the context manager guarantees the file is flushed and closed;
# the previous bare open() call left the handle open)
with open(os.path.join(outdir, 'params.json'), 'w') as params_out:
    json.dump(p, params_out, indent=4)

# save metadata
mpp_data.metadata.to_csv(os.path.join(outdir, 'metadata.csv'))

# Save channels: the complete channels table is written.
# The disabled line below would instead save only the rows in channels_ids.
mpp_data.channels.to_csv(os.path.join(outdir, 'channels.csv'))
#mpp_data.channels.set_index('channel_id').loc[channels_ids].to_csv(os.path.join(outdir, 'channels.csv'))