# Data Preprocessing: Projection into scalars

This notebook is meant to convert the raw cell data from several wells into scalars (or a vector in $\mathbb{R}^{n\_channels}$).

Data was taken from:
`/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/` and server `vicb-submit-01`. 

Load libraries and set directory paths:

In [1]:
import numpy as np
import pandas as pd
# To display all the columns
pd.options.display.max_columns = None
import os
import sys
import matplotlib.pyplot as plt
import json

# Set paths
# BASE_DIR is taken to be two levels above the current working directory
# (os.path.abspath('') resolves to the directory the kernel runs in).
BASE_DIR = os.path.realpath(os.path.join(os.path.abspath(''), '../..'))
if not os.path.exists(BASE_DIR):
    raise Exception('Base path {} does not exist!'.format(BASE_DIR))
print('BASE_DIR: {}'.format(BASE_DIR))

# Make <BASE_DIR>/workspace importable (for loading the project libraries).
# The membership test keeps sys.path from growing when the cell is re-run.
WORKSPACE_DIR = os.path.join(BASE_DIR, 'workspace')
if WORKSPACE_DIR not in sys.path:
    sys.path.insert(1, WORKSPACE_DIR)
# Load mpp_data library to convert raw data into images
from pelkmans.mpp_data import MPPData as MPPData

# JSON file with the parameters that drive every later cell of this notebook
PARAMETERS_FILE = os.path.join(BASE_DIR, 'workspace/scripts_vicb/Parameters/temp_parameters.json')
if not os.path.exists(PARAMETERS_FILE):
    raise Exception('Parameter file {} does not exist!'.format(PARAMETERS_FILE))


BASE_DIR: /storage/groups/ml01/workspace/andres.becker/master_thesis
Setting BASE_DIR to /storage/groups/ml01/workspace/andres.becker/master_thesis


Open parameters file

In [2]:
# Read the JSON parameters file into the dict `p`.
# Every key is kept as loaded, including the free-text '_comment' entry.
with open(PARAMETERS_FILE) as handle:
    p = json.loads(handle.read())

In [3]:
# Show the loaded parameters dict as the cell output
p

{'_comment': 'Save file name as reference ---------------------------------',
 'raw_data_dir': '/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer',
 'dir_type': 'hannah',
 'perturbations_and_wells': {'184A1_hannah_CX5461': ['I18', 'J22', 'J09'],
  '184A1_hannah_AZD4573': ['I13', 'J21', 'J14', 'I17', 'J18'],
  '184A1_hannah_meayamycin': ['I12', 'I20'],
  '184A1_hannah_DMSO': ['J16', 'I14'],
  '184A1_hannah_triptolide': ['I10', 'J15'],
  '184A1_hannah_TSA': ['J20', 'I16', 'J13'],
  '184A1_hannah_unperturbed': ['J10', 'I13', 'I09', 'I11', 'J18', 'J12']},
 'output_data_dir': '/storage/groups/ml01/workspace/andres.becker/master_thesis/datasets/184A1_hannah_EU_scalar_projection_vicb',
 'seed': 42,
 'input_channels': ['00_DAPI',
  '07_H2B',
  '01_CDK9_pT186',
  '03_CDK9',
  '05_GTF2B',
  '07_SETD1A',
  '08_H3K4me3',
  '09_SRRM2',
  '10_H3K27ac',
  '11_KPNA2_MAX',
  '12_RB1_pS807_S811',
  '13_PABPN1',
  '14_PCNA',
  '15_SON',
  '16_H3',
  '17_HDAC3',
  '19_KPNA1_MAX

Set raw data directory

In [4]:
# Root directory of the raw data, as configured in the parameters file.
# Fail right away if it is not reachable from this machine.
DATA_DIR = p['raw_data_dir']
if os.path.exists(DATA_DIR):
    print('DATA_DIR: {}'.format(DATA_DIR))
else:
    raise Exception('Data path {} does not exist!'.format(DATA_DIR))

DATA_DIR: /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer


Check available data (Perturbations and Wells):

In [5]:
def _subdirs(parent):
    """Return the names of the entries of `parent` that are directories."""
    return [name for name in os.listdir(parent) if os.path.isdir(os.path.join(parent, name))]


# Perturbations are the sub-directories of DATA_DIR; the wells of a
# perturbation are the sub-directories of that perturbation's directory.
perturbations = _subdirs(DATA_DIR)
# Maps perturbation name -> list of well names found on disk
local_data = {per: _subdirs(os.path.join(DATA_DIR, per)) for per in perturbations}

Select the perturbations and their wells to process:

In [6]:
print('Local available perturbations-wells:\n{}'.format(local_data))

# Perturbations and wells to process, exactly as listed in the parameters file.
# To load only some of them, assign a smaller dict instead, for example
#     {'184A1_hannah_unperturbed': ['I11', 'I09'], '184A1_hannah_TSA': ['J20', 'I16']}
# and to process everything found on disk assign `local_data`.
selected_data = p['perturbations_and_wells']

# NOTE(review): in the recorded parameters the wells 'I13' and 'J18' appear
# under both '184A1_hannah_AZD4573' and '184A1_hannah_unperturbed' - confirm
# these are distinct data and not the same wells loaded twice.
print('\nSelected perturbations-wells:\n{}'.format(selected_data))

# Generate and save data dirs: one <DATA_DIR>/<perturbation>/<well> per selected well.
# Each path is checked before it is recorded, so data_dirs only ever holds existing dirs.
data_dirs = []
for per, per_wells in selected_data.items():
    for w in per_wells:
        d = os.path.join(DATA_DIR, per, w)
        if not os.path.exists(d):
            raise Exception('{} does not exist!\nCheck if selected_data contain elements only from local_data dict.'.format(d))
        data_dirs.append(d)
# Stored in the parameters dict so the list is available wherever `p` is used
p['data_dirs'] = data_dirs

Local available perturbations-wells:
{'184A1_hannah_CX5461': ['I18', 'J22', 'J09'], '184A1_hannah_AZD4573': ['I13', 'J21', 'J14', 'I17', 'J18'], '184A1_hannah_meayamycin': ['I12', 'I20', 'J17'], '184A1_hannah_DMSO': ['J16', 'I14'], '184A1_hannah_triptolide': ['I10', 'J15'], '184A1_hannah_TSA': ['J20', 'I16', 'J13'], '184A1_hannah_unperturbed': ['J10', 'I13', 'I09', 'I11', 'J18', 'J12']}

Selected perturbations-wells:
{'184A1_hannah_CX5461': ['I18', 'J22', 'J09'], '184A1_hannah_AZD4573': ['I13', 'J21', 'J14', 'I17', 'J18'], '184A1_hannah_meayamycin': ['I12', 'I20'], '184A1_hannah_DMSO': ['J16', 'I14'], '184A1_hannah_triptolide': ['I10', 'J15'], '184A1_hannah_TSA': ['J20', 'I16', 'J13'], '184A1_hannah_unperturbed': ['J10', 'I13', 'I09', 'I11', 'J18', 'J12']}


Process data:

In [7]:
# Accumulator for the merged wells. It is reset on every run of this cell so
# that re-running rebuilds the dataset from scratch instead of merging every
# well a second time into an object left over in the kernel.
mpp_data = None

for data_dir in p['data_dirs']:
    print('\nProcessing dir {}...'.format(data_dir))
    # Load data as an MPPData object
    mpp_temp = MPPData.from_data_dir(data_dir,
                                     dir_type=p['dir_type'],
                                     seed=p['seed'])

    # Add cell cycle to metadata (G1, S, G2)
    # Important! If mapobject_id_cell is not in cell_cycle_file =>
    # its corresponding cell is in Mitosis phase!
    if p['add_cell_cycle_to_metadata']:
        mpp_temp.add_cell_cycle_to_metadata(os.path.join(DATA_DIR, p['cell_cycle_file']))

    # Add well info to metadata
    if p['add_well_info_to_metadata']:
        mpp_temp.add_well_info_to_metadata(os.path.join(DATA_DIR, p['well_info_file']))

    # Remove unwanted cells (only when the parameters define filter criteria)
    if p.get('filter_criteria', None) is not None:
        print('Removing unwanted cells...')
        mpp_temp.filter_cells(p['filter_criteria'], p['filter_values'])

    # Subtract background values for each channel
    if p['subtract_background']:
        mpp_temp.subtract_background(os.path.join(DATA_DIR, p['background_value']))

    # Project every uni-channel image into a scalar for further analysis
    if p['project_into_scalar']:
        print('\nProjecting data...')
        mpp_temp.add_scalar_projection(p['method'])

        # Drop these attributes from the instance, when present, once the
        # projection is done.
        # NOTE(review): presumably they hold the per-pixel data, which is no
        # longer needed and would be expensive to keep and merge - confirm.
        instance_vars = {'labels', 'x', 'y', 'mpp', 'mapobject_ids', 'mcu_ids','conditions'}
        for var in set(vars(mpp_temp).keys()).intersection(instance_vars):
            delattr(mpp_temp, var)

    # Concatenate wells: the first well becomes the accumulator,
    # every later well is merged into it.
    if mpp_data is None:
        mpp_data = mpp_temp
    else:
        mpp_data.merge_instances([mpp_temp])
        del(mpp_temp)

# Without this check an empty p['data_dirs'] would go unnoticed until a later
# cell tried to use mpp_data.
if mpp_data is None:
    raise ValueError("No data was processed: p['data_dirs'] is empty.")



Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_CX5461/I18...
Removing unwanted cells...
Total number of cells: 926
291 cells cutted by filter: is_border_cell == 1
37 cells cutted by filter: is_polynuclei_184A1 == 1
356 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 356


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_CX5461/J22...
Removing unwanted cells...
Total number of cells: 978
296 cells cutted by filter: is_border_cell == 1
42 cells cutted by filter: is_polynuclei_184A1 == 1
408 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 408


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_CX5461/J09...
Removing unwanted cells...
Total number of cells: 733
267 cells cutted by filter: is_border_cell == 1
19 cells cutted by filter: is_polynuclei_184A1 == 1
311 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 311


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/I13...
Removing unwanted cells...
Total number of cells: 860
293 cells cutted by filter: is_border_cell == 1
24 cells cutted by filter: is_polynuclei_184A1 == 1
322 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 322


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/J21...
Removing unwanted cells...
Total number of cells: 1017
300 cells cutted by filter: is_border_cell == 1
36 cells cutted by filter: is_polynuclei_184A1 == 1
362 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 362


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/J14...
Removing unwanted cells...
Total number of cells: 880
298 cells cutted by filter: is_border_cell == 1
16 cells cutted by filter: is_polynuclei_184A1 == 1
324 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 324


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/I17...
Removing unwanted cells...
Total number of cells: 1121
337 cells cutted by filter: is_border_cell == 1
33 cells cutted by filter: is_polynuclei_184A1 == 1
380 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 380


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_AZD4573/J18...
Removing unwanted cells...
Total number of cells: 1000
307 cells cutted by filter: is_border_cell == 1
29 cells cutted by filter: is_polynuclei_184A1 == 1
347 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 347


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_meayamycin/I12...
Removing unwanted cells...
Total number of cells: 556
203 cells cutted by filter: is_border_cell == 1
42 cells cutted by filter: is_polynuclei_184A1 == 1
236 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 236


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_meayamycin/I20...
Removing unwanted cells...
Total number of cells: 692
206 cells cutted by filter: is_border_cell == 1
46 cells cutted by filter: is_polynuclei_184A1 == 1
251 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 251


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_DMSO/J16...
Removing unwanted cells...
Total number of cells: 962
294 cells cutted by filter: is_border_cell == 1
35 cells cutted by filter: is_polynuclei_184A1 == 1
347 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 347


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_DMSO/I14...
Removing unwanted cells...
Total number of cells: 844
272 cells cutted by filter: is_border_cell == 1
25 cells cutted by filter: is_polynuclei_184A1 == 1
311 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 311


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_triptolide/I10...
Removing unwanted cells...
Total number of cells: 842
275 cells cutted by filter: is_border_cell == 1
21 cells cutted by filter: is_polynuclei_184A1 == 1
332 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 332


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_triptolide/J15...
Removing unwanted cells...
Total number of cells: 853
279 cells cutted by filter: is_border_cell == 1
20 cells cutted by filter: is_polynuclei_184A1 == 1
338 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 338


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_TSA/J20...
Removing unwanted cells...
Total number of cells: 944
285 cells cutted by filter: is_border_cell == 1
24 cells cutted by filter: is_polynuclei_184A1 == 1
341 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 341


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_TSA/I16...
Removing unwanted cells...
Total number of cells: 994
306 cells cutted by filter: is_border_cell == 1
35 cells cutted by filter: is_polynuclei_184A1 == 1
372 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 372


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_TSA/J13...
Removing unwanted cells...
Total number of cells: 819
272 cells cutted by filter: is_border_cell == 1
23 cells cutted by filter: is_polynuclei_184A1 == 1
320 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 320


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/J10...
Removing unwanted cells...
Total number of cells: 1080
333 cells cutted by filter: is_border_cell == 1
31 cells cutted by filter: is_polynuclei_184A1 == 1
400 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 400


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/I13...
Removing unwanted cells...
Total number of cells: 860
293 cells cutted by filter: is_border_cell == 1
24 cells cutted by filter: is_polynuclei_184A1 == 1
322 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 322


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/I09...
Removing unwanted cells...
Total number of cells: 886
269 cells cutted by filter: is_border_cell == 1
30 cells cutted by filter: is_polynuclei_184A1 == 1
323 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 323


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/I11...
Removing unwanted cells...
Total number of cells: 1025
333 cells cutted by filter: is_border_cell == 1
33 cells cutted by filter: is_polynuclei_184A1 == 1
375 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 375


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/J18...
Removing unwanted cells...
Total number of cells: 1000
307 cells cutted by filter: is_border_cell == 1
29 cells cutted by filter: is_polynuclei_184A1 == 1
347 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 347


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...

Processing dir /storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/J12...
Removing unwanted cells...
Total number of cells: 1049
292 cells cutted by filter: is_border_cell == 1
41 cells cutted by filter: is_polynuclei_184A1 == 1
376 cells cutted by filter: cell_cycle == NaN
Number of cutted cells: 376


missing background value for channels ['00_EU', '09_SRRM2_ILASTIK', '15_SON_ILASTIK']



Projecting data...


In [8]:
# Show the metadata table of the merged wells as the cell output
mpp_data.metadata

Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels,cell_cycle,cell_type,perturbation,duration,00_DAPI_avg,07_H2B_avg,01_CDK9_pT186_avg,03_CDK9_avg,05_GTF2B_avg,07_SETD1A_avg,08_H3K4me3_avg,09_SRRM2_avg,10_H3K27ac_avg,11_KPNA2_MAX_avg,12_RB1_pS807_S811_avg,13_PABPN1_avg,14_PCNA_avg,15_SON_avg,16_H3_avg,17_HDAC3_avg,19_KPNA1_MAX_avg,20_SP100_avg,21_NCL_avg,01_PABPC1_avg,02_CDK7_avg,03_RPS6_avg,05_Sm_avg,07_POLR2A_avg,09_CCNT1_avg,10_POL2RA_pS2_avg,11_PML_avg,12_YAP1_avg,13_POL2RA_pS5_avg,15_U2SNRNPB_avg,18_NONO_avg,20_ALYREF_avg,21_COIL_avg,00_BG488_avg,00_BG568_avg,00_EU_avg,09_SRRM2_ILASTIK_avg,15_SON_ILASTIK_avg
0,369581,plate01,I18,0,0,0,0,9,0,369487,plate01,I18,0,0,0,0,9,0,0.0,,0.0,,0.0,,S,184A1,CX5461,120.0,44.790928,208.676435,15.962250,29.289384,84.024075,24.177410,46.880668,50.661738,107.367649,59.570214,95.348947,162.164103,75.340100,41.983402,233.016897,75.355192,83.016410,19.006747,47.585081,21.462703,119.898365,22.528784,45.912931,104.033265,80.562506,303.177564,11.478352,93.097852,221.812826,39.005296,287.133040,266.622592,7.860114,6.162881,1.671627,342.910464,7830.081611,10304.112271
1,369582,plate01,I18,0,0,0,0,10,0,369488,plate01,I18,0,0,0,0,10,0,0.0,,0.0,,0.0,,G1,184A1,CX5461,120.0,44.693453,192.564041,14.927710,32.759411,92.919336,27.445029,50.696422,53.129896,67.922149,41.104070,101.088256,191.014345,45.255686,49.039914,196.708995,66.276975,63.790027,21.981683,40.383979,14.368812,146.185885,19.856793,46.780805,132.973481,85.893447,288.406003,14.120230,102.825017,205.574996,42.433129,312.214706,231.791821,8.029727,6.515615,1.657841,182.502609,7826.491151,10242.813811
2,369583,plate01,I18,0,0,0,0,11,0,369489,plate01,I18,0,0,0,0,11,0,0.0,,0.0,,0.0,,G1,184A1,CX5461,120.0,40.628654,224.238141,11.883508,25.218904,88.832127,29.588301,46.845729,54.031310,61.636522,47.324893,120.000485,186.245955,58.354520,43.540287,263.274235,72.690354,76.520639,26.116184,47.600271,18.067183,125.463465,20.618981,52.433856,121.306955,80.992489,292.695766,13.591581,111.875495,221.776828,39.558433,333.519357,310.301499,8.908267,6.080175,1.548610,195.049564,6211.161568,10094.878316
3,369584,plate01,I18,0,0,0,0,12,0,369490,plate01,I18,0,0,0,0,12,0,0.0,,0.0,,0.0,,G1,184A1,CX5461,120.0,47.190139,255.008827,8.761640,22.483270,86.429829,7.113698,28.268655,50.675528,54.239803,39.124333,88.022382,157.279270,31.450823,28.895709,260.202417,59.677551,66.606941,10.139325,47.945185,16.539284,105.200716,20.212364,47.114630,55.256595,58.227016,254.808453,10.881472,98.594081,247.179793,37.353835,326.267001,289.691050,7.332875,5.868882,1.554583,192.862674,6784.361001,7852.095976
4,369585,plate01,I18,0,0,0,0,13,0,369491,plate01,I18,0,0,0,0,13,0,0.0,,0.0,,0.0,,S,184A1,CX5461,120.0,44.595874,155.674791,22.491115,34.184902,118.482454,43.344288,51.013306,58.614662,52.146755,78.335567,127.013187,171.134241,75.371029,45.063884,164.659560,78.160541,86.018186,18.520590,35.027228,25.870875,225.888079,24.161104,54.150928,151.001520,112.756650,267.606364,18.161617,154.100137,183.721177,47.454413,289.719015,209.839964,9.956070,7.354605,1.695586,269.511643,7898.951239,9751.622238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13115,266565,plate01,J12,5,4,0,0,44,0,266516,plate01,J12,5,4,0,0,44,0,0.0,,0.0,,0.0,,G2,184A1,normal,,38.974759,76.047797,17.563898,36.184834,104.156500,36.770887,50.232165,39.952018,108.585726,60.649794,121.661078,190.923176,43.494781,39.594944,104.188014,81.363560,95.575175,23.937036,46.848424,22.532612,106.693459,20.003956,38.476473,155.247328,99.968160,220.554683,19.768186,100.464770,149.299705,48.944906,177.062955,176.829221,8.079843,5.841951,1.774400,454.006752,6096.764857,9030.592238
13116,266567,plate01,J12,5,4,0,0,46,0,266518,plate01,J12,5,4,0,0,46,0,0.0,,0.0,,0.0,,G1,184A1,normal,,24.476809,114.236520,9.333659,17.067157,84.052574,20.107556,29.089046,40.066597,35.008262,54.100903,101.773654,138.471705,49.261588,34.179771,140.279671,56.732578,78.094136,14.535272,35.866044,19.288567,94.852197,17.413186,33.232495,75.095330,45.150487,237.378196,12.801110,78.007890,180.029544,27.640450,229.530331,219.995247,7.077227,5.008458,1.655675,539.162437,7589.989633,10868.888994
13117,266568,plate01,J12,5,4,0,0,47,0,266519,plate01,J12,5,4,0,0,47,0,0.0,,0.0,,0.0,,G2,184A1,normal,,42.480616,100.187324,15.959638,33.498925,107.580525,34.908351,57.304635,43.267011,128.689051,67.229615,129.272916,189.395093,55.927412,40.912666,143.235915,85.090585,101.301561,23.145633,51.677571,19.898499,114.990447,20.685950,46.968741,161.135374,104.073947,239.133683,22.447838,130.532238,175.388739,48.492558,219.345445,213.692021,7.781674,5.808005,1.695866,441.322981,7067.131928,10451.550664
13118,266569,plate01,J12,5,4,0,0,48,0,266520,plate01,J12,5,4,0,0,48,0,0.0,,0.0,,0.0,,G1,184A1,normal,,38.905315,201.388706,11.547569,24.211109,116.649598,23.047248,56.393690,42.628522,66.590856,80.956005,135.325230,146.662401,46.230914,42.600095,265.396794,95.698502,110.890282,17.727125,47.791309,20.166960,120.956057,25.565485,50.193811,105.725014,60.766474,240.183645,17.510868,82.874540,235.066348,33.932558,250.880820,253.379054,7.639894,6.397234,1.752594,387.570989,6728.237451,10566.501599


## Save data

Prepare to save data:

In [9]:
import shutil

# The output directory is always created fresh: if it already exists it is
# deleted first, together with everything inside it.
outdir = p['output_data_dir']
if os.path.exists(outdir):
    print('Warning! Directory {} already exist! Deleting...\n'.format(outdir))
    try:
        shutil.rmtree(outdir)
    except OSError as e:
        print('Dir {} could not be deleted!\n\nOSError: {}'.format(outdir, e))
        # Stop with the real cause. Carrying on would only fail a few lines
        # later in os.makedirs with a less helpful FileExistsError.
        raise

print('Creating dir: {}'.format(outdir))
# exist_ok=False: the directory must not exist at this point
os.makedirs(outdir, exist_ok=False)
    

Creating dir: /storage/groups/ml01/workspace/andres.becker/master_thesis/datasets/184A1_hannah_EU_scalar_projection_vicb


In [10]:
# Channel table indexed by channel name, so ids can be looked up by name
channels_by_name = mpp_data.channels.set_index('name')

# Ids of the channels (proteins) which will be used to predict the transcription rate
input_ids = list(channels_by_name.loc[p['input_channels']]['channel_id'])
# Id(s) of the channel that measures the transcription rate
output_ids = list(channels_by_name.loc[p['output_channels']]['channel_id'])

# Input channel ids first, output channel id(s) after them
channels_ids = input_ids + output_ids

Save metadata and used parameters

In [11]:
# Save params. The `with` block closes the file, so the JSON is fully
# flushed to disk before the cell finishes.
with open(os.path.join(outdir, 'params.json'), 'w') as params_out:
    json.dump(p, params_out, indent=4)

# Save metadata
mpp_data.metadata.to_csv(os.path.join(outdir, 'metadata.csv'))

# Save the channel table. The whole table is written, not only the rows of
# the selected input/output channels.
mpp_data.channels.to_csv(os.path.join(outdir, 'channels.csv'))