# Exploratory analysis for multiplex data set

The data analyzed here was taken from the directory `/storage/groups/ml01/datasets/raw/20201020_Pelkmans_NascentRNA_hannah.spitzer/184A1_hannah_unperturbed/` on the server `vicb-submit-01`.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Files that must be present in a well directory for the well to be loadable
WELL_FILES = ['labels.npy',
              'mpp.npy',
              'x.npy',
              'y.npy',
              'mapobject_ids.npy',
              'channels.csv',
              'metadata.csv']


def _child_path(parent, name):
    """Join `name` onto `parent`, propagating None when the parent is unavailable."""
    return None if parent is None else os.path.join(parent, name)


def _existing_path_or_none(path, label):
    """Return `path` if it exists on disk; otherwise print an error and return None.

    `path` may itself be None (its parent could not be resolved), in which
    case None is returned instead of failing inside os.path.exists.
    """
    if path is None:
        print('ERROR!, {} path cannot be built because its parent path is None! '
              'Setting to None'.format(label))
        return None
    if not os.path.exists(path):
        print('ERROR!, {} path {} does not exist! Setting to None'.format(label, path))
        return None
    return path


# Set paths.
# Each path depends on the previous one, so a missing parent yields None for
# every dependent path instead of raising TypeError in os.path.join(None, ...).
# NOTE: os.listdir(None) lists the current working directory, so check these
# constants for None before using them.
BASE_DIR = _existing_path_or_none(
    os.path.realpath(os.path.join(os.path.abspath(''), '../')), 'base')
DATA_DIR = _existing_path_or_none(_child_path(BASE_DIR, 'datasets'), 'data')
RAW_DATA_DIR = _existing_path_or_none(_child_path(DATA_DIR, 'raw'), 'raw data')

In [3]:
def load_well_data(RAW_DATA_DIR, well, well_files=None):
    """Load the raw files of one well into a dictionary keyed by file name.

    Parameters
    ----------
    RAW_DATA_DIR : str
        Directory that contains one sub-directory per well.
    well : str
        Name of the well sub-directory to read.
    well_files : iterable of str, optional
        File names that must be present and will be loaded. Defaults to the
        module-level ``WELL_FILES``.

    Returns
    -------
    dict or int
        A dictionary mapping each file name to its content (``np.load`` result
        for names ending in 'npy', ``pandas.DataFrame`` for names ending in
        'csv'). If any required file is missing, a warning is printed and the
        integer ``1`` is returned instead (sentinel kept for existing callers).
    """
    required_files = list(WELL_FILES if well_files is None else well_files)

    well_dir = os.path.join(RAW_DATA_DIR, well)
    print('Reading well {} raw data...'.format(well))

    # List the directory once, and verify that every required file exists
    # BEFORE loading anything, so no large array is read only to be discarded.
    available = set(os.listdir(well_dir))
    for file in required_files:
        if file not in available:
            print('WARNING! File {} is not in well {}.\nWell skipped.'.format(file, well))
            return 1

    well_dic = {}
    for file in required_files:
        print('Reading file {}'.format(file))
        file_path = os.path.join(well_dir, file)
        if file.endswith('npy'):
            well_dic[file] = np.load(file_path)
        elif file.endswith('csv'):
            if file == 'channels.csv':
                # channels.csv has no header row; column names are supplied here
                well_dic[file] = pd.read_csv(file_path,
                                             sep=',', header=None,
                                             names=['id', 'channel_name'])
            else:
                well_dic[file] = pd.read_csv(file_path, sep=',')

    return well_dic

In [4]:
# List the wells available in the raw data directory.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# listing stable between runs and machines.
for well in sorted(os.listdir(RAW_DATA_DIR)):
    print(well, RAW_DATA_DIR)

I11 /home/hhughes/Documents/Master_Thesis/Project/datasets/raw
J10 /home/hhughes/Documents/Master_Thesis/Project/datasets/raw


In [5]:
# Well to analyze in this notebook ('I11' is the other well listed in RAW_DATA_DIR)
well = 'J10'
well_dic = load_well_data(RAW_DATA_DIR, well)

Reading well J10 raw data...
Reading file labels.npy
Reading file mpp.npy
Reading file x.npy
Reading file y.npy
Reading file mapobject_ids.npy
Reading file channels.csv
Reading file metadata.csv


In [6]:
# The loaded well is a dictionary keyed by the file names it was read from
well_dic.keys()

dict_keys(['labels.npy', 'mpp.npy', 'x.npy', 'y.npy', 'mapobject_ids.npy', 'channels.csv', 'metadata.csv'])

# 1. Analysis for .csv files

## 1.a) metadata.csv file

In [8]:
# Lift the column display limit so every metadata column is rendered
pd.set_option('display.max_columns', None)
well_dic['metadata.csv']

Unnamed: 0.1,Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,0,279425,plate01,J10,0,0,0,0,1,0,279369,plate01,J10,0,0,0,0,1,1,0.0,,0.0,,0.0,
1,1,279426,plate01,J10,0,0,0,0,2,0,279370,plate01,J10,0,0,0,0,2,1,0.0,,0.0,,0.0,
2,2,279427,plate01,J10,0,0,0,0,3,0,279371,plate01,J10,0,0,0,0,3,1,1.0,,1.0,,0.0,
3,3,279428,plate01,J10,0,0,0,0,4,0,279372,plate01,J10,0,0,0,0,4,1,0.0,,0.0,,0.0,
4,4,279429,plate01,J10,0,0,0,0,5,0,279373,plate01,J10,0,0,0,0,5,1,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,1314,361989,plate01,J10,5,4,0,0,65,0,361855,plate01,J10,5,4,0,0,65,1,0.0,,0.0,,0.0,
1076,1315,361990,plate01,J10,5,4,0,0,66,0,361856,plate01,J10,5,4,0,0,66,1,0.0,,0.0,,0.0,
1077,1316,361991,plate01,J10,5,4,0,0,67,0,361857,plate01,J10,5,4,0,0,67,0,1.0,,0.0,,0.0,
1078,1317,361992,plate01,J10,5,4,0,0,68,0,361858,plate01,J10,5,4,0,0,68,1,0.0,,0.0,,0.0,


In [9]:
# Rows of the metadata table whose cell label equals 1
mask = well_dic['metadata.csv']['label_cell'].eq(1)
well_dic['metadata.csv'].loc[mask]

Unnamed: 0.1,Unnamed: 0,mapobject_id,plate_name,well_name,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,plate_name_cell,well_name_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
0,0,279425,plate01,J10,0,0,0,0,1,0,279369,plate01,J10,0,0,0,0,1,1,0.0,,0.0,,0.0,
134,162,298387,plate01,J10,0,3,0,0,1,0,298369,plate01,J10,0,3,0,0,1,1,0.0,,0.0,,0.0,
151,180,287423,plate01,J10,0,4,0,0,1,0,287391,plate01,J10,0,4,0,0,1,1,0.0,,0.0,,0.0,
282,349,337008,plate01,J10,1,3,0,0,1,0,336985,plate01,J10,1,3,0,0,1,1,0.0,,0.0,,0.0,
519,636,346253,plate01,J10,2,4,0,0,1,0,346227,plate01,J10,2,4,0,0,1,1,0.0,,0.0,,1.0,
711,875,284758,plate01,J10,4,0,0,0,1,0,284463,plate01,J10,4,0,0,0,1,1,0.0,,0.0,,0.0,
757,932,259948,plate01,J10,4,1,0,0,1,0,259903,plate01,J10,4,1,0,0,1,1,0.0,,0.0,,0.0,
798,977,193561,plate01,J10,4,2,0,0,1,0,193515,plate01,J10,4,2,0,0,1,1,0.0,,0.0,,0.0,
818,1002,353330,plate01,J10,4,3,0,0,1,0,353299,plate01,J10,4,3,0,0,1,1,0.0,,0.0,,0.0,
882,1087,315259,plate01,J10,5,0,0,0,1,0,315209,plate01,J10,5,0,0,0,1,1,0.0,,0.0,,0.0,


In [10]:
# Summary statistics (count, mean, std, quartiles) of the numeric metadata columns
well_dic['metadata.csv'].describe()

Unnamed: 0.1,Unnamed: 0,mapobject_id,well_pos_y,well_pos_x,tpoint,zplane,label,is_border,mapobject_id_cell,well_pos_y_cell,well_pos_x_cell,tpoint_cell,zplane_cell,label_cell,is_border_cell,is_mitotic,is_mitotic_labels,is_polynuclei_HeLa,is_polynuclei_HeLa_labels,is_polynuclei_184A1,is_polynuclei_184A1_labels
count,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,1080.0,0.0,1080.0,1.0,1080.0,3.0
mean,662.358333,299171.776852,2.559259,1.915741,0.0,0.0,25.098148,0.0,299101.661111,2.559259,1.915741,0.0,0.0,25.098148,0.308333,0.062963,,0.061111,1.0,0.028704,0.666667
std,383.046696,50908.301383,1.709757,1.466756,0.0,0.0,15.133438,0.0,50902.187868,1.709757,1.466756,0.0,0.0,15.133438,0.462019,0.243009,,0.239645,,0.16705,0.57735
min,0.0,193561.0,0.0,0.0,0.0,0.0,1.0,0.0,193515.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,0.0,1.0,0.0,0.0
25%,332.75,259969.75,1.0,1.0,0.0,0.0,13.0,0.0,259924.75,1.0,1.0,0.0,0.0,13.0,0.0,0.0,,0.0,1.0,0.0,0.5
50%,666.5,284808.5,3.0,2.0,0.0,0.0,23.0,0.0,284513.5,3.0,2.0,0.0,0.0,23.0,0.0,0.0,,0.0,1.0,0.0,1.0
75%,991.5,351318.25,4.0,3.0,0.0,0.0,36.0,0.0,351193.25,4.0,3.0,0.0,0.0,36.0,1.0,0.0,,0.0,1.0,0.0,1.0
max,1318.0,380696.0,5.0,4.0,0.0,0.0,69.0,0.0,380646.0,5.0,4.0,0.0,0.0,69.0,1.0,1.0,,1.0,1.0,1.0,1.0


In [11]:
# Distinct values of the `label_cell` column and how many of them there are
unique_cell_labels = well_dic['metadata.csv']['label_cell'].unique()
print(unique_cell_labels)
print(unique_cell_labels.shape)

[ 1  2  3  4  5  6  8  9 10 11 12 14 15 16 17 18 19 20 21 22 23 24 25 26
 27 28 29 30 31 32 34 35 36 38 39 40 41 42 43 44 45 46 47 48 49 50 51 53
 54  7 13 33 37 55 52 57 58 59 61 62 63 65 66 56 60 64 67 68 69]
(69,)


From the two tables above we can observe that:
- we have 69 cells in this well J10 dataset
- we can confirm that in the analysis of the file `labels.npy`

## 1.b) channels.csv file

In [12]:
# Channel table, read with the column names 'id' and 'channel_name'
well_dic['channels.csv']

Unnamed: 0,id,channel_name
0,0,00_DAPI
1,1,07_H2B
2,2,01_CDK9_pT186
3,3,03_CDK9
4,4,05_GTF2B
5,5,07_SETD1A
6,6,08_H3K4me3
7,7,09_SRRM2
8,8,10_H3K27ac
9,9,11_KPNA2_MAX


The file `channels.csv` contains the protein name of each channel. From this file we can conclude that we have 38 channels in well J10 dataset

# 2. Analysis for .npy files

First let's see the shape of each .npy file:

In [13]:
# Report the shape of every array file loaded for the well
npy_file_names = [file_name for file_name in WELL_FILES if file_name.endswith('npy')]
for file_name in npy_file_names:
    print('File {} shape {}'.format(file_name, well_dic[file_name].shape))

File labels.npy shape (14265789,)
File mpp.npy shape (14265789, 38)
File x.npy shape (14265789,)
File y.npy shape (14265789,)
File mapobject_ids.npy shape (14265789,)


Here we can see that **all** files have the same number of elements in the first dimension, and that only `mpp.npy` has a second dimension, of size 38, the same as the number of channels! <br>
Therefore, we can infer that:
- `mpp.npy` contains the values measured by the microscope

## 2.a) labels.npy

In [14]:
# Compute the distinct labels once; np.unique sorts the whole array, so
# repeating it for every print is needlessly expensive.
unique_labels = np.unique(well_dic['labels.npy'])
print(well_dic['labels.npy'])
print(unique_labels)
print(unique_labels.shape)

[ 1  1  1 ... 69 69 69]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69]
(69,)


From the last cell we see that `labels.npy` has only 69 unique values among its 14,265,789 elements, the same as the number of cells given in the `metadata.csv` file! Therefore, we can infer that:
- `labels.npy` contains a label for each pixel, which denotes which cell (among the 69) the pixel belongs to.

Now, let's see if every cell has the same number of pixels:

In [15]:
# Count the pixels of every label in a single pass with return_counts,
# instead of scanning the whole label array once per label.
label_values, label_counts = np.unique(well_dic['labels.npy'], return_counts=True)
print(label_values)
for label_value, label_count in zip(label_values, label_counts):
    print('Val/Count: {}/{}'.format(label_value, label_count))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69]
Val/Count: 1/140075
Val/Count: 2/223792
Val/Count: 3/260736
Val/Count: 4/278511
Val/Count: 5/249588
Val/Count: 6/355970
Val/Count: 7/230960
Val/Count: 8/415525
Val/Count: 9/292884
Val/Count: 10/396774
Val/Count: 11/386896
Val/Count: 12/408974
Val/Count: 13/399290
Val/Count: 14/341176
Val/Count: 15/320078
Val/Count: 16/343816
Val/Count: 17/360778
Val/Count: 18/291176
Val/Count: 19/356674
Val/Count: 20/313926
Val/Count: 21/360463
Val/Count: 22/333139
Val/Count: 23/280250
Val/Count: 24/353348
Val/Count: 25/317961
Val/Count: 26/359927
Val/Count: 27/267236
Val/Count: 28/249186
Val/Count: 29/245927
Val/Count: 30/242582
Val/Count: 31/267527
Val/Count: 32/228334
Val/Count: 33/210601
Val/Count: 34/231987
Val/Count: 35/241067
Val/Count: 36/224714
Val/Count: 37/229339
Val/Count: 38/186439


From the last cell we can see that the number of pixels varies from cell to cell. For instance, cell 1 is represented by 140,075 pixels, while cell 2 is represented by 223,792 pixels.
<br>
Now let's see if the pixels are ordered, i.e. whether the first 140,075 pixels correspond only to cell 1:

In [16]:
# Labels at positions 140070-140079, i.e. around the pixel count of cell 1 (140,075)
well_dic['labels.npy'][140070:140080]

array([17, 17, 17, 17, 17, 17, 17, 17, 17, 17], dtype=uint16)

Since we see that the labels around pixel 140,075 correspond to cell 17, we conclude that the pixels are not ordered w.r.t. the cell label.

## 2.b) mpp.npy

`mpp.npy` is the only file that has 2 dimensions, where the size of the first dimension coincides with the number of pixels and the size of the second dimension with the number of channels (or proteins):

In [17]:
# Shape of the mpp array, followed by a preview of its content
mpp_array = well_dic['mpp.npy']
print(mpp_array.shape)
mpp_array

(14265789, 38)


array([[ 120,  153,  105, ...,  134,    0, 1134],
       [ 122,  150,  110, ...,  135,    0, 2238],
       [ 128,  138,  108, ...,  142,    0, 1821],
       ...,
       [ 118,  230,  141, ...,  138,    0,  429],
       [ 131,  274,  122, ...,  136,    0,    0],
       [ 118,  262,  126, ...,  124,    0,    0]], dtype=int32)

In [18]:
# Distinct values stored in mpp.npy, how many there are, and their range
mpp_unique_vals = np.unique(well_dic['mpp.npy'])
for mpp_summary in (mpp_unique_vals,
                    mpp_unique_vals.shape,
                    mpp_unique_vals.min(),
                    mpp_unique_vals.max()):
    print(mpp_summary)

[    0     1     2 ... 65533 65534 65535]
(65536,)
0
65535


Therefore, we conclude that this file contains the observed/measured values of each pixel for each protein. Now, note that the values in `mpp.npy` vary from 0 to 65535, i.e. $2^{16}-1$, which corresponds to 2 bytes or 16 bits, as mentioned in the paper **Multiplexed protein maps link subcellular organization to cellular states** (page 4, *Multiplexed single-pixel analysis framework* section).

## 2.c) x.npy and y.npy

We infer that these two files just contain the x and y pixel coordinates of each active/measured pixel.

Let's see the content of these arrays:
- For x:

In [19]:
# Compute the distinct x values once instead of sorting the array twice
unique_x = np.unique(well_dic['x.npy'])
print(well_dic['x.npy'])
print(unique_x)
print(unique_x.shape)

[ 145  137  140 ... 1731 1728 1729]
[   4    5    6 ... 2403 2404 2405]
(2402,)


In [20]:
# min/max do not need the sorted unique values; reduce the raw array directly
print(well_dic['x.npy'].min())
print(well_dic['x.npy'].max())

4
2405


- For y

In [21]:
# Compute the distinct y values once instead of sorting the array twice
unique_y = np.unique(well_dic['y.npy'])
print(well_dic['y.npy'])
print(unique_y)
print(unique_y.shape)

[  30   31   31 ... 1842 1843 1843]
[   2    3    4 ... 1898 1899 1900]
(1899,)


In [22]:
# min/max do not need the sorted unique values; reduce the raw array directly
print(well_dic['y.npy'].min())
print(well_dic['y.npy'].max())

2
1900


In the last cells we can see that the pixel coordinates never exceed 2405 in x and 1900 in y. The paper **Multiplexed protein maps link subcellular organization to cellular states** mentions on page 10 (Microscopy section) that the size of the images is 2,560x2,160 pixels. Since the observed coordinates fit within those bounds, we can assume that the image size of 2,560x2,160 pixels is also correct for this data.

## 2.d) mapobject_ids.npy

In [23]:
# Compute the distinct map-object ids once instead of sorting the array twice
unique_mapobject_ids = np.unique(well_dic['mapobject_ids.npy'])
print(well_dic['mapobject_ids.npy'])
print(unique_mapobject_ids)
print(unique_mapobject_ids.shape)

[279425 279425 279425 ... 361993 361993 361993]
[193561 193562 193563 ... 380693 380695 380696]
(1080,)


We see that `mapobject_ids.npy` has 1080 unique elements, the same as the number of rows in the metadata file (`metadata.csv`)! Therefore, this file maps each pixel in the well to the corresponding row of information in the metadata file.