#### The algorithm walks through the dataset of each plate, then:
#### 1. Extracts Cell ID, Well, Field, Channel, Cycle, Area, Integrated Density for each cell
#### 2. Extract the number of cells in each field and in each well

##### First, import the pandas library to read the tab-separated data files from the local drive into DataFrames
##### Second, import the re library for text extraction and processing

In [1]:
import pandas as pd
import re

# Common prefix (directory plus file-name stem) of every Results-Cyto- data
# file on the local drive.
path = './052018_Cyto/Results-Cyto-'

# Well IDs B02-B04. The well number is zero-padded to two digits so that
# widening the range past 9 yields 'B10' rather than 'B010'.
WELLS = ['B{:02d}'.format(k) for k in range(2, 5)]

# Field IDs fld1-fld9.
FIELD = ['fld{}'.format(k) for k in range(1, 10)]

# One file name per (well, field) pair, in well-major order:
# <path>B02fld1.txt, <path>B02fld2.txt, ..., <path>B04fld9.txt
files_Cyto = [path + '{}{}.txt'.format(w, f)
              for w in WELLS for f in FIELD]

# Load every Cyto data file into its own dataframe, keeping only the
# 'Label', 'Area' and 'IntDen' columns. The list order follows files_Cyto.
_READ_OPTIONS = dict(sep='\t',
                     engine='python',
                     usecols=['Label', 'Area', 'IntDen'])
df_Cyto = [pd.read_csv(file, **_READ_OPTIONS) for file in files_Cyto]

# Sanity check: echo the well and field IDs that were processed and
# report how many Cyto files were read.
# n is kept as a module-level name because later cells slice df_Cyto[:n].
n = len(df_Cyto)
print(WELLS, FIELD, sep='\n')
print('number of Cyto files that have been read: ', n)

['B02', 'B03', 'B04']
['fld1', 'fld2', 'fld3', 'fld4', 'fld5', 'fld6', 'fld7', 'fld8', 'fld9']
number of Cyto files that have been read:  27


#### Print the first 10 entries in all dataframes of the dataset

In [2]:
# Preview the first ten rows of each raw dataframe, each preceded by a
# starred separator line.
for df in df_Cyto:
    print('\n ' + '* ' * 26 + ' \n', df.head(10), sep='\n')


 * * * * * * * * * * * * * * * * * * * * * * * * * *  

                          Label     Area      IntDen
0  B02_fld1:0001-0031:DAPI-0001  164.712  518987.253
1  B02_fld1:0002-0033:DAPI-0001  134.147  312344.627
2  B02_fld1:0003-0032:DAPI-0001  151.977  385528.682
3  B02_fld1:0004-0029:DAPI-0001  122.261  304401.927
4  B02_fld1:0005-0033:DAPI-0001  213.956  668529.047
5  B02_fld1:0006-0030:DAPI-0001  118.865  320494.490
6  B02_fld1:0007-0031:DAPI-0001  134.147  414814.360
7  B02_fld1:0008-0033:DAPI-0001  211.409  852663.821
8  B02_fld1:0009-0034:DAPI-0001  250.465  551172.377
9  B02_fld1:0010-0031:DAPI-0001  159.618  601225.391

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

                          Label     Area       IntDen
0  B02_fld2:0001-0027:DAPI-0001  164.712  1321761.924
1  B02_fld2:0002-0028:DAPI-0001  108.676   404588.613
2  B02_fld2:0003-0031:DAPI-0001  129.053   360008.463
3  B02_fld2:0004-0034:DAPI-0001  202.070   623556.644
4  B02_fld2:0005-0031:DAPI-0001  

#### Print information of each column in every dataframe

In [3]:
# Show the column summary (non-null counts, dtypes, memory usage) of each
# raw dataframe, each preceded by a starred separator line.
for df in df_Cyto:
    print('\n', '* ' * 26, '\n')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only adds a stray "None" line to the output.
    df.info()


 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22776 entries, 0 to 22775
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Label   22776 non-null  object 
 1   Area    22776 non-null  float64
 2   IntDen  22776 non-null  float64
dtypes: float64(2), object(1)
memory usage: 533.9+ KB
None

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8424 entries, 0 to 8423
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Label   8424 non-null   object 
 1   Area    8424 non-null   float64
 2   IntDen  8424 non-null   float64
dtypes: float64(2), object(1)
memory usage: 197.6+ KB
None

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20664 entries, 0 to 20663
Data columns (total 3 columns):
 #   Column  Non-Nu

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10128 entries, 0 to 10127
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Label   10128 non-null  object 
 1   Area    10128 non-null  float64
 2   IntDen  10128 non-null  float64
dtypes: float64(2), object(1)
memory usage: 237.5+ KB
None

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16704 entries, 0 to 16703
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Label   16704 non-null  object 
 1   Area    16704 non-null  float64
 2   IntDen  16704 non-null  float64
dtypes: float64(2), object(1)
memory usage: 391.6+ KB
None

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19176 entries, 0 to 19175
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  


#### Extract well, field, cell, channel IDs in every entry of all dataframes 
#### Save each of these IDs in Well, Field, Cell ID, Channel columns in the dataframe, respectively
#### Extract Cycle, Area, Integrated Density columns as well

In [4]:
# Regular expressions for the parts of a label such as
# 'B02_fld1:0001-0031:DAPI-0001'.
well = r'[A-Z]+.*?(?=_)'           # text before the first '_'        -> 'B02'
field = r'(?<=d)\d+(?=:)'          # digits between 'fld' and ':'     -> '1'
cell_id = r'(\d+)-(\d+)'           # first 'digits-digits' run        -> '0001-0031'
channel = r'(?<=:)[A-Z]+.*?(?=-)'  # text after ':' up to next '-'    -> 'DAPI'


def _first_match(pattern, label):
    """Return the first substring of `label` matched by `pattern`.

    Raises:
        ValueError: if `pattern` does not occur in `label`. This names the
            offending label instead of failing with an AttributeError on None.
    """
    found = re.search(pattern, label)
    if found is None:
        raise ValueError('Label {!r} does not match pattern {!r}'
                         .format(label, pattern))
    return found.group()


# Replace each raw dataframe with one whose Label column has been split into
# Cell ID / Well / Field / Channel / Cycle, followed by Area and IntDen.
# Field stays a string; Cycle is converted to int.
for k, frame in enumerate(df_Cyto):
    labels = frame['Label']
    df_Cyto[k] = pd.DataFrame({
        'Cell ID': labels.map(lambda x: _first_match(cell_id, x)),
        'Well': labels.map(lambda x: _first_match(well, x)),
        'Field': labels.map(lambda x: _first_match(field, x)),
        'Channel': labels.map(lambda x: _first_match(channel, x)),
        # Use the whole trailing digit run, not just the last character, so
        # a suffix like '-0010' yields cycle 10 instead of 0.
        'Cycle': labels.map(lambda x: int(_first_match(r'\d+$', x))),
        'Area': frame['Area'],
        'IntDen': frame['IntDen'],
    })

#### Double-check the dataframes again

In [5]:
# Preview the first ten rows of each parsed dataframe, each preceded by a
# starred separator line.
for df in df_Cyto:
    print('\n ' + '* ' * 32 + ' \n', df.head(10), sep='\n')


 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  

     Cell ID Well Field Channel  Cycle     Area      IntDen
0  0001-0031  B02     1    DAPI      1  164.712  518987.253
1  0002-0033  B02     1    DAPI      1  134.147  312344.627
2  0003-0032  B02     1    DAPI      1  151.977  385528.682
3  0004-0029  B02     1    DAPI      1  122.261  304401.927
4  0005-0033  B02     1    DAPI      1  213.956  668529.047
5  0006-0030  B02     1    DAPI      1  118.865  320494.490
6  0007-0031  B02     1    DAPI      1  134.147  414814.360
7  0008-0033  B02     1    DAPI      1  211.409  852663.821
8  0009-0034  B02     1    DAPI      1  250.465  551172.377
9  0010-0031  B02     1    DAPI      1  159.618  601225.391

 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  

     Cell ID Well Field Channel  Cycle     Area       IntDen
0  0001-0027  B02     2    DAPI      1  164.712  1321761.924
1  0002-0028  B02     2    DAPI      1  108.676   404588.613
2  0003-0031  B02  

In [6]:
# Show the column summary of the first n parsed dataframes (n is set to
# len(df_Cyto) when the files are read, so by default this is all of them).
for df in df_Cyto[:n]:
    print('\n', '* ' * 26, '\n')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only adds a stray "None" line to the output.
    df.info()


 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22776 entries, 0 to 22775
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cell ID  22776 non-null  object 
 1   Well     22776 non-null  object 
 2   Field    22776 non-null  object 
 3   Channel  22776 non-null  object 
 4   Cycle    22776 non-null  int64  
 5   Area     22776 non-null  float64
 6   IntDen   22776 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 1.2+ MB
None

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8424 entries, 0 to 8423
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cell ID  8424 non-null   object 
 1   Well     8424 non-null   object 
 2   Field    8424 non-null   object 
 3   Channel  8424 non-null   object 
 4   Cycle    8424 non-null   int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11592 entries, 0 to 11591
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cell ID  11592 non-null  object 
 1   Well     11592 non-null  object 
 2   Field    11592 non-null  object 
 3   Channel  11592 non-null  object 
 4   Cycle    11592 non-null  int64  
 5   Area     11592 non-null  float64
 6   IntDen   11592 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 634.1+ KB
None

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5448 entries, 0 to 5447
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cell ID  5448 non-null   object 
 1   Well     5448 non-null   object 
 2   Field    5448 non-null   object 
 3   Channel  5448 non-null   object 
 4   Cycle    5448 non-null   int64  
 5   Area     5448 non-null   float64
 6   IntDen   

#### Count the number of entries per channel in each of the dataframes

In [7]:
# For each dataframe, print how many rows belong to each channel,
# preceded by a starred separator line.
for df in df_Cyto:
    print('\n ' + '* ' * 26 + ' \n', df['Channel'].value_counts(), sep='\n')



 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     5694
DAPI    5694
Cy5     5694
FITC    5694
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     2106
Cy5     2106
DAPI    2106
FITC    2106
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     5166
DAPI    5166
Cy5     5166
FITC    5166
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     4608
DAPI    4608
Cy5     4608
FITC    4608
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     2100
Cy5     2100
DAPI    2100
FITC    2100
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     3060
DAPI    3060
Cy5     3060
FITC    3060
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Cy3     5010
DAPI    5010
Cy5     5010
FITC    5010
Name: Channel, dtype: int64

 * * * * * * * * * * * * * * * * * * * *

#### Count the number of entries per cycle in each of the dataframes

In [8]:
# For each dataframe, print how many rows belong to each cycle,
# preceded by a starred separator line.
for df in df_Cyto:
    print('\n ' + '* ' * 26 + ' \n', df['Cycle'].value_counts(), sep='\n')


 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    3796
5    3796
4    3796
3    3796
2    3796
1    3796
Name: Cycle, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    1404
5    1404
4    1404
3    1404
2    1404
1    1404
Name: Cycle, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    3444
5    3444
4    3444
3    3444
2    3444
1    3444
Name: Cycle, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    3072
5    3072
4    3072
3    3072
2    3072
1    3072
Name: Cycle, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    1400
5    1400
4    1400
3    1400
2    1400
1    1400
Name: Cycle, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    2040
5    2040
4    2040
3    2040
2    2040
1    2040
Name: Cycle, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

6    3340
5    3340
4    3340
3    3340
2    3340
1    3340
Name: Cycle, dtype: int64

#### Count the number of entries per Cell ID in each of the dataframes

In [9]:
# For each of the first n dataframes, print how many rows share each
# Cell ID, preceded by a starred separator line.
for df in df_Cyto[:n]:
    print('\n ' + '* ' * 26 + ' \n', df['Cell ID'].value_counts(), sep='\n')


 * * * * * * * * * * * * * * * * * * * * * * * * * *  

0125-0137    24
0506-0514    24
0114-0133    24
0568-0569    24
0322-0296    24
             ..
0535-0534    24
0006-0030    24
0278-0267    24
0608-0631    24
0307-0280    24
Name: Cell ID, Length: 949, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

0238-0526    24
0340-1001    24
0037-0093    24
0182-0366    24
0024-0071    24
             ..
0301-0870    24
0026-0077    24
0138-0261    24
0177-0359    24
0009-0039    24
Name: Cell ID, Length: 351, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

0555-0542    24
0242-0258    24
0624-0643    24
0088-0108    24
0190-0208    24
             ..
0588-0577    24
0638-0672    24
0304-0301    24
0054-0078    24
0422-0395    24
Name: Cell ID, Length: 861, dtype: int64

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

0172-0188    24
0429-0546    24
0410-0494    24
0277-0337    24
0625-0844    24
             ..
0098-0094    24
0250-0310

#### Compute the number of cells in every field of each well 

In [10]:
# Report the number of distinct cells in every field of each well.
# Each dataframe holds one (well, field) pair, so its well and field are
# read from the first row.
for df in df_Cyto[:n]:
    print('\n', '* ' * 26, '\n')
    # Count distinct Cell IDs directly. Dividing the row count by the top
    # value_counts() entry is only right when every cell has the same number
    # of rows, and value_counts()[0] depends on positional integer indexing
    # of a string-labelled Series.
    print('Total cells in well {} field {}: {}'
          .format(df['Well'].iloc[0], df['Field'].iloc[0],
                  df['Cell ID'].nunique()))


 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 1: 949

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 2: 351

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 3: 861

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 4: 768

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 5: 350

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 6: 510

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 7: 835

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 8: 732

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B02 field 9: 680

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Total cells in well B03 field 1: 655

 * * * * * * * * * * * * * * * * * * * * * * * * * *  

Tot