In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
pd.set_option('precision', 1)

IntDen_new = [('Nucleus (Cycle 1 - DAPI)', 1, 'DAPI'),
              ('BT474-H2BeGFP (Cycle 1 - FITC)', 1, 'FITC'),
              ('MCL1 (Cycle 2 - Cy3)', 2, 'Cy3'),
              ('p-p65NFkB (Cycle 2 - Cy5)', 2, 'Cy5'),
              ('p-Akt (Cycle 3 - FITC)', 3, 'FITC'),
              ('aSMA (Cycle 3 - Cy3)', 2, 'Cy3'),
              ('p-gH2AX (Cycle 3 - Cy5)', 3, 'Cy5'),
              ('GRP78 (Cycle 4 - FITC)', 4, 'FITC'),
              ('Vimentin (Cycle 4 - Cy3)', 4, 'Cy3')]


''' Construct the universal Results-Cyto- data file path in local drive named path
    Enter the row and a list of column indice you want to extract data 
    (e.g row E, column indice = [8, 9, 10])
    Construct a list of well IDs: B02-B10 named WELLS
    Construct a list of field IDs: fld1-fld9 named FIELD
'''

def read_df(path, rows, cols):
    WELLS = []
    
    for r in rows:
        for k in cols:
            if k < 10:
                WELLS.append(r + '0{}'.format(k))
            else:
                WELLS.append(r + '10')
    
    FIELD = ['fld{}'.format(k) for k in range(1, 10)]

    files = [path + '{}{}.txt'.format(w, f) 
             for w in WELLS for f in FIELD]

    ''' Read all Cyto data files in the dataset into a list of dataframes
    '''
    
    df_list = [pd.read_csv(file, sep='\t', engine='python', 
                            usecols=['Label', 'IntDen']) for file in files]
    
    return df_list, WELLS, FIELD

def extract(df):
    well = r'[A-Z]+.*?(?=_)'
    field = r'(?<=d)[\d]{1,1}?(?=:)'
    cell_id = r'(\d+)-(\d+)'
    channel = r'(?<=:)[A-Z]+.*?(?=-)'
    
    df['Cell ID'] = df['Label'].apply(lambda x: int((re.search(cell_id, x).group()).\
                                                        replace('-', '')))
    
    df['Cycle'] = df['Label'].apply(lambda x: int(x[-1]))
    
    df['Channel'] = df['Label'].apply(lambda x: re.search(channel, x).group())
    
    df['Well'] = df['Label'].apply(lambda x: re.search(well, x).group())
    
    df['Field'] = df['Label'].apply(lambda x: re.search(field, x).group())
    
    df['Col'] = df['Well'].apply(lambda w: 10 if w[-1] == '0' else int(w[-1]))
    df['Group'] = df['Col'].apply(lambda c: (c - 2)//3)
    
    df = df[['Cell ID', 'Cycle', 'Channel', 'IntDen', 'Well', 'Field', 'Group']]
    
    return df

def process(df):
    dt = extract(df)

    df_field = dt[(dt['Cycle']==IntDen_new[0][1]) & 
              (dt['Channel']==IntDen_new[0][2])][['Cell ID', 'Well', 'Field', 'Group']]

    for col in IntDen_new:
        dy = dt[(dt['Cycle']==col[1]) & (dt['Channel']==col[2])]
        df_field[col[0]] = dy[['IntDen']].values

    return df_field

In [2]:
path_cyto = './032018_48hrs/Results-Cyto-'
rows_cyto, cols_cyto = ['B'], range(2, 5)
df_cyto, WELLS_cyto, FIELD_cyto = read_df(path_cyto, rows_cyto, cols_cyto)

''' Double check WELLS, FIELDs that have been processed in Cyto dataset
    Double check the number of files in the dataset that have been read
'''
print('\n', '* ' * 16, 'Cyto Reading ', '* ' * 16)
n_cyto = len(df_cyto)
print('\nWells:', WELLS_cyto)
print('\nFields:', FIELD_cyto)
print('\nNumber of files that have been read: ', n_cyto)

path_nucl = './032018_48hrs/Results-Nuc-'
rows_nucl, cols_nucl = ['B'], range(2, 5)
df_nucl, WELLS_nucl, FIELD_nucl = read_df(path_nucl, rows_nucl, cols_nucl)

''' Double check WELLS, FIELDs that have been processed in Nucl dataset
    Double check the number of files in the dataset that have been read
'''
print('\n', '* ' * 16, 'Nucl Reading ', '* ' * 16)
n_nucl = len(df_nucl)
print('\nWells:', WELLS_nucl)
print('\nFields:', FIELD_nucl)
print('\nNumber of files that have been read: ', n_nucl)


 * * * * * * * * * * * * * * * *  Cyto Reading  * * * * * * * * * * * * * * * * 

Wells: ['B02', 'B03', 'B04']

Fields: ['fld1', 'fld2', 'fld3', 'fld4', 'fld5', 'fld6', 'fld7', 'fld8', 'fld9']

Number of files that have been read:  27

 * * * * * * * * * * * * * * * *  Nucl Reading  * * * * * * * * * * * * * * * * 

Wells: ['B02', 'B03', 'B04']

Fields: ['fld1', 'fld2', 'fld3', 'fld4', 'fld5', 'fld6', 'fld7', 'fld8', 'fld9']

Number of files that have been read:  27


In [3]:
df_cells = process(df_cyto[0])
for df in df_cyto[1:]:
    dt = process(df)
    df_cells = df_cells.append(dt, ignore_index=True)
df_cells

Unnamed: 0,Cell ID,Well,Field,Group,Nucleus (Cycle 1 - DAPI),BT474-H2BeGFP (Cycle 1 - FITC),MCL1 (Cycle 2 - Cy3),p-p65NFkB (Cycle 2 - Cy5),p-Akt (Cycle 3 - FITC),aSMA (Cycle 3 - Cy3),p-gH2AX (Cycle 3 - Cy5),GRP78 (Cycle 4 - FITC),Vimentin (Cycle 4 - Cy3)
0,10018,B02,1,0,6.1e+05,34418.9,54596.2,25364.8,28330.5,54596.2,1913.7,9510.9,13633.8
1,20013,B02,1,0,1.4e+06,311103.3,101851.6,43709.0,73056.7,101851.6,6120.7,16078.1,12079.2
2,30022,B02,1,0,1.9e+06,162176.2,183881.8,104492.1,206216.4,183881.8,23716.0,23564.9,31449.9
3,40016,B02,1,0,1.3e+06,538824.0,84439.7,45970.9,70549.5,84439.7,1426.4,20359.0,16116.3
4,50019,B02,1,0,1.3e+06,217196.1,58911.0,26316.6,27367.7,58911.0,2993.7,17980.0,21591.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10613,4631033,B04,9,0,2.3e+05,11638.5,151306.9,63075.5,35282.4,151306.9,2543.7,11439.0,15167.1
10614,4641033,B04,9,0,2.8e+05,21242.8,132232.6,36378.5,22053.6,132232.6,2306.8,18401.1,16474.6
10615,4651034,B04,9,0,8.2e+04,11778.6,26821.8,7778.8,5967.0,26821.8,6034.1,2763.6,6554.5
10616,4661036,B04,9,0,3.8e+05,87891.0,131880.2,41151.8,28778.0,131880.2,1563.9,22780.4,16340.5
