In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
# Show floats in DataFrame output with one decimal place. The full option key
# is spelled out because the bare 'precision' shorthand is resolved by pattern
# matching and becomes ambiguous once pandas also registers
# 'styler.format.precision'.
pd.set_option('display.precision', 1)

''' Construct the universal Results-Cyto- data file path on the local drive, named path.
    Enter the rows and a list of column indices whose data you want to extract
    (e.g. row E, column indices = [8, 9, 10]).
    Construct a list of well IDs (e.g. B02-B10), named WELLS.
    Construct a list of field IDs (fld1-fld9), named FIELD.
'''

def read_df(path, rows, cols):
    """Read every per-well, per-field results file whose name starts with ``path``.

    File names are built as ``path + <well> + <field> + '.txt'``, for example
    ``path + 'B02fld1.txt'``.

    Parameters
    ----------
    path : str
        Common prefix of the result files (directory plus file-name stem).
    rows : iterable of str
        Plate row letters to read, e.g. ``['B']``.
    cols : iterable of int
        Plate column numbers to read, e.g. ``range(2, 5)``.

    Returns
    -------
    df_list : list of pandas.DataFrame
        One frame per file, restricted to the ``Label`` and ``IntDen``
        columns, ordered well by well and, within a well, field by field.
    WELLS : list of str
        Well IDs that were read; column numbers are zero-padded to two digits.
    FIELD : list of str
        Field IDs ``'fld1'`` to ``'fld9'``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when an expected file is missing.
    """
    # Use the rows/cols the caller asked for (they used to be overwritten with
    # a fixed ['B'], range(2, 5)). '{:02d}' pads single-digit columns exactly
    # like the old '0{}' form and also labels columns above 10 correctly.
    WELLS = ['{}{:02d}'.format(r, k) for r in rows for k in cols]

    # Nine fields per well.
    FIELD = ['fld{}'.format(k) for k in range(1, 10)]

    files = [path + '{}{}.txt'.format(w, f)
             for w in WELLS for f in FIELD]

    # Tab-separated result tables; only the two columns used downstream are kept.
    df_list = [pd.read_csv(file, sep='\t', engine='python',
                           usecols=['Label', 'IntDen']) for file in files]

    return df_list, WELLS, FIELD


def process(df_list):
    """Split the ``Label`` column of every frame into separate metadata columns.

    Each entry of ``df_list`` is replaced, in place, by a frame with the
    columns ``Cell ID, Well, Row, Col, Field, Channel, Cycle, IntDen``.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames that each provide a ``Label`` and an ``IntDen`` column.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in, with its entries replaced.

    Raises
    ------
    AttributeError
        If a label does not contain one of the expected parts (the regex
        search then yields ``None``).
    ValueError
        If a well ID does not end in digits, or a label does not end in a digit.
    KeyError
        If a frame has no ``Label`` column, e.g. when the list was already
        processed once.
    """
    # Capital letter(s), then as little as possible up to the first '_'.
    well_re = re.compile(r'[A-Z]+.*?(?=_)')
    # A single digit sitting between a 'd' and a ':'.
    field_re = re.compile(r'(?<=d)[\d]{1,1}?(?=:)')
    # Two digit runs joined by '-'.
    cell_id_re = re.compile(r'(\d+)-(\d+)')
    # Text starting with a capital right after a ':' and ending before the next '-'.
    channel_re = re.compile(r'(?<=:)[A-Z]+.*?(?=-)')
    trailing_digits_re = re.compile(r'\d+$')

    def _first_match(pattern):
        # Build a per-label extractor returning the full text of the first match.
        return lambda label: pattern.search(label).group()

    def _column_number(well):
        # Read the whole numeric suffix of the well ID. Looking only at the
        # last character turned 'B11' into 1 and 'B20' into 10.
        digits = trailing_digits_re.search(well)
        if digits is None:
            raise ValueError('well ID {!r} does not end in a column number'.format(well))
        return int(digits.group())

    for k, raw in enumerate(df_list):
        labels = raw['Label']
        wells = labels.apply(_first_match(well_re))

        parsed = raw.assign(**{
            'Well': wells,
            'Row': wells.apply(lambda w: w[0]),
            'Col': wells.apply(_column_number),
            'Field': labels.apply(_first_match(field_re)),
            'Cell ID': labels.apply(_first_match(cell_id_re)),
            'Channel': labels.apply(_first_match(channel_re)),
            # The cycle number is the last character of the label.
            'Cycle': labels.apply(lambda x: int(x[-1])),
        })

        # Drop 'Label' and fix the column order. Assigning back into the list
        # keeps callers that ignore the return value working.
        df_list[k] = parsed[['Cell ID', 'Well', 'Row', 'Col', 'Field',
                             'Channel', 'Cycle', 'IntDen']]

    return df_list

In [2]:
banner = '* ' * 16

# ---- Cyto dataset ----
rows_cyto = ['B']
cols_cyto = range(2, 5)
path_cyto = './032018_48hrs/Results-Cyto-'
df_cyto, WELLS_cyto, FIELD_cyto = read_df(path_cyto, rows_cyto, cols_cyto)

# Sanity check: which wells and fields were requested for the Cyto dataset,
# and how many files were actually read.
print('\n', banner, 'Cyto Reading ', banner)
n_cyto = len(df_cyto)
print('\nWells:', WELLS_cyto)
print('\nFields:', FIELD_cyto)
print('\nNumber of files that have been read: ', n_cyto)

# ---- Nucl dataset ----
rows_nucl = ['B']
cols_nucl = range(2, 5)
path_nucl = './032018_48hrs/Results-Nuc-'
df_nucl, WELLS_nucl, FIELD_nucl = read_df(path_nucl, rows_nucl, cols_nucl)

# Same sanity check for the Nucl dataset.
print('\n', banner, 'Nucl Reading ', banner)
n_nucl = len(df_nucl)
print('\nWells:', WELLS_nucl)
print('\nFields:', FIELD_nucl)
print('\nNumber of files that have been read: ', n_nucl)



 * * * * * * * * * * * * * * * *  Cyto Reading  * * * * * * * * * * * * * * * * 

Wells: ['B02', 'B03', 'B04']

Fields: ['fld1', 'fld2', 'fld3', 'fld4', 'fld5', 'fld6', 'fld7', 'fld8', 'fld9']

Number of files that have been read:  27

 * * * * * * * * * * * * * * * *  Nucl Reading  * * * * * * * * * * * * * * * * 

Wells: ['B02', 'B03', 'B04']

Fields: ['fld1', 'fld2', 'fld3', 'fld4', 'fld5', 'fld6', 'fld7', 'fld8', 'fld9']

Number of files that have been read:  27


In [3]:
# process() replaces the entries of each list in place, so the return values
# are not needed. The trailing ';' keeps the notebook from echoing the
# returned list of frames.
process(df_cyto)
process(df_nucl);

''

In [4]:
# Column names, dtypes and non-null counts of the first Cyto frame.
df_cyto[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8720 entries, 0 to 8719
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cell ID  8720 non-null   object 
 1   Well     8720 non-null   object 
 2   Row      8720 non-null   object 
 3   Col      8720 non-null   int64  
 4   Field    8720 non-null   object 
 5   Channel  8720 non-null   object 
 6   Cycle    8720 non-null   int64  
 7   IntDen   8720 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 545.1+ KB


In [5]:
# Column names, dtypes and non-null counts of the first Nucl frame.
df_nucl[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8720 entries, 0 to 8719
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Cell ID  8720 non-null   object 
 1   Well     8720 non-null   object 
 2   Row      8720 non-null   object 
 3   Col      8720 non-null   int64  
 4   Field    8720 non-null   object 
 5   Channel  8720 non-null   object 
 6   Cycle    8720 non-null   int64  
 7   IntDen   8720 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 545.1+ KB


In [6]:
# Number of rows per channel in the Cyto frame at list index 8.
df_cyto[8]['Channel'].value_counts()

Cy5     2292
FITC    2292
Cy3     2292
DAPI    2292
Name: Channel, dtype: int64

In [7]:
# Number of rows per channel in the Nucl frame at list index 8.
df_nucl[8]['Channel'].value_counts()

Cy5     2292
FITC    2292
Cy3     2292
DAPI    2292
Name: Channel, dtype: int64

In [8]:
# All rows recorded for cell 0087-0121 in the first Cyto frame.
cyto_first = df_cyto[0]
cyto_first.loc[cyto_first['Cell ID'] == '0087-0121']

Unnamed: 0,Cell ID,Well,Row,Col,Field,Channel,Cycle,IntDen
86,0087-0121,B02,B,2,1,DAPI,1,910000.0
631,0087-0121,B02,B,2,1,DAPI,2,1200000.0
1176,0087-0121,B02,B,2,1,DAPI,3,880000.0
1721,0087-0121,B02,B,2,1,DAPI,4,740000.0
2266,0087-0121,B02,B,2,1,Cy3,1,40000.0
2811,0087-0121,B02,B,2,1,Cy3,2,140000.0
3356,0087-0121,B02,B,2,1,Cy3,3,35000.0
3901,0087-0121,B02,B,2,1,Cy3,4,36000.0
4446,0087-0121,B02,B,2,1,Cy5,1,29000.0
4991,0087-0121,B02,B,2,1,Cy5,2,69000.0


In [9]:
# All rows recorded for cell 0087-0121 in the first Nucl frame.
nucl_first = df_nucl[0]
nucl_first.loc[nucl_first['Cell ID'] == '0087-0121']

Unnamed: 0,Cell ID,Well,Row,Col,Field,Channel,Cycle,IntDen
86,0087-0121,B02,B,2,1,DAPI,1,2700000.0
631,0087-0121,B02,B,2,1,DAPI,2,4400000.0
1176,0087-0121,B02,B,2,1,DAPI,3,2800000.0
1721,0087-0121,B02,B,2,1,DAPI,4,2500000.0
2266,0087-0121,B02,B,2,1,Cy3,1,64000.0
2811,0087-0121,B02,B,2,1,Cy3,2,220000.0
3356,0087-0121,B02,B,2,1,Cy3,3,53000.0
3901,0087-0121,B02,B,2,1,Cy3,4,49000.0
4446,0087-0121,B02,B,2,1,Cy5,1,41000.0
4991,0087-0121,B02,B,2,1,Cy5,2,100000.0


In [10]:
# Check whether any Nucl frame contains measurements for the TRITC channel.
found = False
for df in df_nucl:
    if (df['Channel'] == 'TRITC').any():
        found = True
        break  # one hit is enough

# The success message used to say 'FITC' although the test is for 'TRITC'.
if found:
    print('TRITC found in Nuc dataset')
else:
    print('TRITC not found in Nuc dataset')

TRITC not found in Nuc dataset


In [11]:
# Check whether any Cyto frame contains measurements for the TRITC channel.
found = False
for df in df_cyto:
    channels_present = set(df['Channel'])
    if 'TRITC' in channels_present:
        print('TRITC found in Cyto dataset')
        found = True
        break

if found is False:
    print('TRITC not found in Cyto dataset')

TRITC not found in Cyto dataset
