# Piloting a script that takes:
- YAML/JSON experiment metadata (captured from form)
- idx file metadata (user upload)
- images (user upload)
- plate map (user upload)

and creates an easy-to-read Excel sheet for users to check (similar to GEO)

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import glob
import os
import json
import xlsxwriter
import re
import javabridge
import bioformats
import xmljson
from xmljson import badgerfish as bf
from xml.etree.ElementTree import fromstring
from xml.etree import ElementTree as ETree
from json import dumps
# from lxml import etree

# Start the Java VM through javabridge with the bioformats JARs on its class path.
javabridge.start_vm(class_path=bioformats.JARS)

In [3]:
# Example inputs are read from, and generated files written to, the same directory.
input_dir = '/projects/ps-yeolab4/NCRCRG/example_files/'
output_dir = input_dir

# Get Experiment-level metadata from the YAML/JSON form 

In [4]:
# Path of the experiment-level metadata JSON file inside input_dir.
expt_metadata_json = os.path.join(input_dir, 'imaging_metadata_v10.json')

In [5]:
def read_expt_metadata(fn):
    """
    Reads experiment-level metadata from a JSON file.

    A leading line starting with '#' (ie. the '#!/usr/bin/env ...' line) is
    skipped if present; files that start directly with JSON are parsed from
    their first line.

    param fn: string
        path to the JSON metadata file
    returns: the parsed JSON data, or None (after printing the error) if the
        contents are not valid JSON
    """
    with open(fn) as f:
        first_line = f.readline()
        if not first_line.startswith('#'):
            # No header line: the first line belongs to the JSON document, so rewind.
            f.seek(0)
        try:
            return json.load(f)
        except ValueError as e:  # JSON decoding errors are ValueErrors
            print(e)
            return None
        
def generate_plate_map():
    """ Placeholder for plate map generation: not implemented yet, always returns None. """
    return None

In [6]:
# Load the experiment metadata and list its top-level keys.
# NOTE: read_expt_metadata() returns None when the JSON cannot be parsed,
# in which case the .keys() call below fails.
expt_dict = read_expt_metadata(expt_metadata_json)
print(expt_dict.keys())

[u'wells_per_plate', u'investigator', u'number_of_plates', u'image_capture_date', u'Channel_Type', u'microscope', u'experiment_type', u'contact_email', u'magnification_power', u'samples', u'treatment_protocol_description', u'imaging_and_segmentation_software', u'pi_name', u'experiment_nickname', u'organism', u'cell_growth_protocol', u'experiment_summary']


In [7]:
# let's parse the top level metadata fields out first, leave samples for another day.
# Each field is listed exactly once: a repeated label would make the
# df.loc[top_level_metadata_fields,] lookup return that row twice.
# NOTE(review): 'wells_per_plate' is also a top-level key of the metadata but is
# not listed here - confirm whether that omission is intentional.
top_level_metadata_fields = [
    u'Channel_Type',
    u'contact_email',
    u'cell_growth_protocol',
    u'investigator',
    u'number_of_plates',
    u'image_capture_date',
    u'microscope',
    u'experiment_type',
    u'magnification_power',
    u'treatment_protocol_description',
    u'imaging_and_segmentation_software',
    u'pi_name',
    u'experiment_nickname',
    u'organism',
    u'experiment_summary',
]

In [8]:
# Create a Pandas dataframe from the data.
df = pd.DataFrame(expt_dict).T
# Grab the toplevel metadata rows; the 'samples' row is kept separately.
top_level_metadata = df.loc[top_level_metadata_fields,]
sample_level_metadata = df.loc[u'samples']
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter(
    os.path.join(output_dir, 'imaging_metadata_v10.xlsx'),
    engine='xlsxwriter'
)
# Convert the dataframe to an XlsxWriter Excel object.
top_level_metadata.to_excel(writer, sheet_name='Sheet1')
# Close the Pandas Excel writer and output the Excel file.
# close() is used rather than save(): save() is deprecated and later removed
# in newer pandas releases, while close() writes the file in old and new versions.
writer.close()


# Get Plate-level metadata from idx and platemap file

In [9]:
def read_platemap_from_excel(fn, sheet='Sheet1'):
    """
    Reads the plate layout out of an Excel workbook.

    Only rows whose 'Plate Layout' column equals 'Plate Layout' are kept;
    the values of the 'Unnamed: 1' column become the index and the
    'Plate Layout' marker column is dropped.

    param fn: string
        path to the Excel workbook
    param sheet: string
        sheet to parse; if parsing it fails, the error is printed and the
        first sheet in the workbook is parsed instead
    returns: pandas.DataFrame
    """
    # ExcelFile has no index_col option, so none is passed; the index is set
    # explicitly below. The context manager closes the workbook handle.
    with pd.ExcelFile(fn) as excel:
        try:
            df = excel.parse(sheet)
        except Exception as e:
            # NOTE(review): kept broad because the error raised for a missing
            # sheet presumably depends on the Excel engine in use - confirm.
            print("{}. Parsing first sheet in file.".format(e))
            df = excel.parse(excel.sheet_names[0])

    # select only the "Plate Layout" rows, leaving everything else as comment.
    layout = df[df['Plate Layout'] == 'Plate Layout']
    # use the plate row names as index, dropping the 'Plate Layout' column.
    # Non-inplace calls avoid modifying a filtered slice of df in place.
    return layout.set_index('Unnamed: 1').drop('Plate Layout', axis=1)

# Read the example plate map workbook and display the resulting dataframe.
plate_map = read_platemap_from_excel(
    os.path.join(input_dir, 'from_anthony_vu', '181221_H9_G3BP1gfp_SGopt_TEMPLATED.xlsx')
)
plate_map

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,17,18,19,20,21,22,23,24,Unnamed: 26,Unnamed: 27
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,,,,min stressed: 0,min stressed: 15,min stressed: 15,min stressed: 15,min stressed: 30,min stressed: 30,min stressed: 30,...,min stressed: 75,min stressed: 75,min stressed: 75,min stressed: 90,min stressed: 90,min stressed: 90,NaAsO2 (uM),NaAsO2 (uM),,
B,,negative control,negative control,negative control,negative control,negative control,negative control,negative control,negative control,negative control,...,negative control,negative control,negative control,negative control,negative control,negative control,0,,,
C,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,100,,,
D,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,250,,,
E,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,500,,,(orange) stressed cells
F,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,750,,,(red) negative control
G,,negative control,negative control,negative control,negative control,negative control,negative control,negative control,negative control,negative control,...,negative control,negative control,negative control,negative control,negative control,negative control,0,,,
H,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,100,,,
I,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,250,,,
J,,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,...,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,stressed cells,500,,,


In [10]:
def read_idx_csv(fn, software="Columbus"):
    """
    Reads in an idx file (for now just Columbus-formatted IDX files).

    Leading spaces are stripped from the column names and from the values
    of every text column.

    param fn: string
        path to the comma-separated index file
    param software: string
        software that produced the index file; only "Columbus" is supported
    returns: pandas.DataFrame, or None if the software is unsupported
    """
    if software != "Columbus":
        print("Unsupported software!")
        return None

    def strip_leading_spaces(column):
        # is_string_dtype covers object columns as well as pandas' dedicated
        # string dtypes, so text values are stripped whichever one is inferred.
        if pd.api.types.is_string_dtype(column.dtype):
            return column.str.lstrip(' ')
        return column

    df = pd.read_csv(fn, sep=',')
    df.columns = [c.lstrip(' ') for c in df.columns]
    return df.apply(strip_leading_spaces)

# Read the example Columbus index file and preview its first rows.
fn = os.path.join(input_dir, 'from_anthony_vu', 'ImageIndex.ColumbusIDX.csv')
idx = read_idx_csv(fn)
idx.head()

Unnamed: 0,AbsoluteTime@s,Channel,ChannelName,Column,Field,ImageResolutionX@um,ImageResolutionY@um,MeasurementDate,Plane,PlateName,PositionX@um,PositionY@um,PositionZ@um,Row,TimeOffset@s,Timepoint,WellName,sourcefilename
0,1516331533,0,DAPI,2,1,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,832.0,0.0,2,0,0,B2,DAPI__B_002_r_0001_c_0002_t_00000000_z_0000-00...
1,1516331541,0,DAPI,2,2,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,832.0,0.0,2,0,0,B2,DAPI__B_002_r_0001_c_0003_t_00000000_z_0000-00...
2,1516331536,0,DAPI,2,3,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,1664.0,0.0,2,0,0,B2,DAPI__B_002_r_0002_c_0002_t_00000000_z_0000-00...
3,1516331539,0,DAPI,2,4,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,1664.0,0.0,2,0,0,B2,DAPI__B_002_r_0002_c_0003_t_00000000_z_0000-00...
4,1516331533,1,GFP,2,1,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,832.0,0.0,2,0,0,B2,GFP__B_002_r_0001_c_0002_t_00000000_z_0000-000...


In [11]:
def get_sample_id_from_plate_map(plate_map, row, col):
    """ Returns the plate_map entry found at the given row label and column label. """
    well = (row, col)
    return plate_map.loc[well]

def get_filled_well_positions_from_idx(idx, software="Columbus"):
    """
    Returns the distinct values of the 'WellName' column of a Columbus index
    dataframe as a list (in no particular order).
    Any other software yields an empty list.
    """
    if software != "Columbus":
        return []
    unique_wells = set(idx['WellName'])
    return list(unique_wells)

def get_list_of_associated_files(idx, row, col, software="Columbus"):
    """
    Returns the 'sourcefilename' values of every index row whose 'WellName'
    equals the row and column joined together (ie. 'B' and 7 -> 'B7').
    For software other than Columbus a message is printed and None is returned.
    """
    if software != "Columbus":
        print("Unsupported software! Assumes no leading whitespace in well positions")
        return None
    well_name = '{}{}'.format(row, col)
    in_well = idx['WellName'] == well_name
    return idx[in_well]['sourcefilename'].values
    
def get_template(well_format='standard96well'):
    """
    Returns an empty well template: a dataframe with lettered rows and numbered columns.
    'standard96well' gives rows A-H x columns 1-12, 'standard384well' gives rows A-P x columns 1-24.
    Any other format prints a message and returns None.
    """
    if well_format == 'standard96well':
        n_rows, n_cols = 8, 12
    elif well_format == 'standard384well':
        n_rows, n_cols = 16, 24
    else:
        print("Unsupported well range!")
        return None
    row_labels = [chr(ord('A') + offset) for offset in range(n_rows)]
    col_labels = list(range(1, n_cols + 1))
    return pd.DataFrame(index=row_labels, columns=col_labels)
    
def split_letter_number(position):
    """
    Splits a position (ie. A3) into letter and number (ie. 'A', 3).
    Letters must come before numbers

    param position: string
    returns: letter, number
    raises: ValueError if position is not non-digits followed by digits
    """
    # Raw string keeps \D and \d as regex escapes; \Z anchors at the very end
    # so anything trailing the digits is rejected.
    match = re.match(r'(\D+)(\d+)\Z', position)
    if match is None:
        raise ValueError(
            "Cannot split position {!r} into letter and number".format(position)
        )
    letter, number = match.groups()
    return letter, int(number)

def return_platemap_based_on_idx(idx, well_format='standard384well', software="Columbus"):
    """
    Returns a dataframe containing an 'x' in every well position that is
    listed in the index file.

    idx: pandas.DataFrame
        DataFrame of an index file
        @see: read_idx_csv()
    well_format: string
        plate layout, passed to get_template()
    software: string
        passed to get_filled_well_positions_from_idx()
    returns: pandas.DataFrame, or None if get_template() does not support
        the well format
    """
    template = get_template(well_format=well_format)
    if template is None:
        # get_template() has already reported the unsupported format.
        return None

    all_image_positions = get_filled_well_positions_from_idx(idx=idx, software=software)

    for position in all_image_positions:
        letter, number = split_letter_number(position)
        template.loc[letter, number] = 'x'
    return template

def label_from_plate_map(row, plate_map, software):
    """
    Looks up the plate_map label for one IDX row.
    For Columbus, the row's 'WellName' is split into its letter and number,
    which are used as the plate_map row and column identifiers.
    Returns None for any other software.
    """
    if software != "Columbus":
        return None
    well_row, well_col = split_letter_number(row['WellName'])
    return get_sample_id_from_plate_map(plate_map=plate_map, row=well_row, col=well_col)
    
def label_idx_with_plate_map_identifiers(plate_map, idx, well_format='standard384well', software="Columbus"):
    """
    Adds a 'label' column to the index IDX dataframe, filled row by row with the
    plate map identifier of each row's well.
    The idx dataframe is modified in place and also returned.
    well_format is accepted but not used here.
    """
    labels = idx.apply(label_from_plate_map, axis=1, args=(plate_map, software))
    idx['label'] = labels
    return idx

In [12]:
# Label every row of the index with its plate map identifier.
# The call adds the 'label' column to idx itself and returns idx.
label_idx_with_plate_map_identifiers(
    plate_map=plate_map, 
    idx=idx, 
    well_format='standard384well', 
    software="Columbus"
)

Unnamed: 0,AbsoluteTime@s,Channel,ChannelName,Column,Field,ImageResolutionX@um,ImageResolutionY@um,MeasurementDate,Plane,PlateName,PositionX@um,PositionY@um,PositionZ@um,Row,TimeOffset@s,Timepoint,WellName,sourcefilename,label
0,1516331533,0,DAPI,2,1,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,832.0,0.0,2,0,0,B2,DAPI__B_002_r_0001_c_0002_t_00000000_z_0000-00...,negative control
1,1516331541,0,DAPI,2,2,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,832.0,0.0,2,0,0,B2,DAPI__B_002_r_0001_c_0003_t_00000000_z_0000-00...,negative control
2,1516331536,0,DAPI,2,3,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,1664.0,0.0,2,0,0,B2,DAPI__B_002_r_0002_c_0002_t_00000000_z_0000-00...,negative control
3,1516331539,0,DAPI,2,4,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,1664.0,0.0,2,0,0,B2,DAPI__B_002_r_0002_c_0003_t_00000000_z_0000-00...,negative control
4,1516331533,1,GFP,2,1,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,832.0,0.0,2,0,0,B2,GFP__B_002_r_0001_c_0002_t_00000000_z_0000-000...,negative control
5,1516331541,1,GFP,2,2,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,832.0,0.0,2,0,0,B2,GFP__B_002_r_0001_c_0003_t_00000000_z_0000-000...,negative control
6,1516331536,1,GFP,2,3,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,1664.0,0.0,2,0,0,B2,GFP__B_002_r_0002_c_0002_t_00000000_z_0000-000...,negative control
7,1516331538,1,GFP,2,4,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,1664.0,0.0,2,0,0,B2,GFP__B_002_r_0002_c_0003_t_00000000_z_0000-000...,negative control
8,1516331544,0,DAPI,3,1,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,1404.0,832.0,0.0,2,0,0,B3,DAPI__B_003_r_0001_c_0002_t_00000000_z_0000-00...,negative control
9,1516331552,0,DAPI,3,2,0.65,0.65,2018-01-19T03:12:13.000Z,0,AnthinyVu_plate1_20x,2106.0,832.0,0.0,2,0,0,B3,DAPI__B_003_r_0001_c_0003_t_00000000_z_0000-00...,negative control


In [13]:
# List the image files recorded in the index for well B7.
get_list_of_associated_files(idx=idx, row='B', col=7)

array(['DAPI__B_007_r_0001_c_0002_t_00000000_z_0000-00000040.tif',
       'DAPI__B_007_r_0001_c_0003_t_00000000_z_0000-00000041.tif',
       'DAPI__B_007_r_0002_c_0002_t_00000000_z_0000-00000042.tif',
       'DAPI__B_007_r_0002_c_0003_t_00000000_z_0000-00000043.tif',
       'GFP__B_007_r_0001_c_0002_t_00000000_z_0000-00000044.tif',
       'GFP__B_007_r_0001_c_0003_t_00000000_z_0000-00000045.tif',
       'GFP__B_007_r_0002_c_0002_t_00000000_z_0000-00000046.tif',
       'GFP__B_007_r_0002_c_0003_t_00000000_z_0000-00000047.tif'], dtype=object)

# Getting Image-level metadata from each TIF file

In [14]:
def read_tif_metadata(fn):
    """
    Reads the OME-XML metadata of an image file.

    param fn: string
        path to the image file
    returns: bioformats.OMEXML object built from the file's metadata
    """
    # Only the metadata is needed here, so the image itself is not loaded
    # (use read_tif() for that).
    md = bioformats.get_omexml_metadata(path=fn)
    return bioformats.OMEXML(xml=md)

def read_tif(fn):
    """ Loads the image stored at path fn with bioformats and returns it. """
    return bioformats.load_image(path=fn)

# Read metadata from two example images.
# NOTE: img is reassigned by the second call, so only the metadata of the
# second file (B0D6R.tiff) is kept.
img = read_tif_metadata(
    '/projects/ps-yeolab4/SG_imaging_data/aqvu/rawdata/180123_AnthonyVu_plate1_20x-1516382552/GFP__J_021_r_0002_c_0003_t_00000000_z_0000-00001567.tif'
)
img = read_tif_metadata(
    '/projects/ps-yeolab3/ecwheele/images/cellraft_Air/20180416_100um_g3bp_staining_fullscan/B0D6R.tiff'
)

In [15]:
# Print a few fields of the metadata object: image name, pixels ID,
# physical pixel size in X and the number of images.
print(img.image().Name)
print(img.image().Pixels.ID)
print(img.image().Pixels.PhysicalSizeX)
print(img.image_count)


B0D6R.tiff
Pixels:0
None
1


In [16]:
# Write the XML serialization of the metadata to a scratch file
# (mode 'w' replaces any existing file at that path).
with open('/oasis/tscc/scratch/bay001/example_xml.xml', 'w') as o:
    o.write(img.to_xml())

In [17]:
# Read the metadata of another example image; this replaces the previous img.
img = read_tif_metadata(
    '/projects/ps-yeolab4/NCRCRG/example_files/from_regina_powers/HCS_6.18.18_20x_W1NeunFR_W2NestinR_W3Tuj1G_W4DAPIB_E09_s1_w1.TIF'
)

In [18]:
# Print the same metadata fields as before for the newly read image.
print(img.image().Name)
print(img.image().Pixels.ID)
print(img.image().Pixels.PhysicalSizeX)
print(img.image_count)


HCS_6.18.18_20x_W1NeunFR_W2NestinR_W3Tuj1G_W4DAPIB_E09_s1_w1.TIF
Pixels:0
None
1


In [19]:
# Write this image's metadata XML to the scratch file; the path is the same as
# in the earlier dump cell, so that earlier output is overwritten.
with open('/oasis/tscc/scratch/bay001/example_xml.xml', 'w') as o:
    o.write(img.to_xml())