# Collect Metadata from "czi" files

This script uses "czifile", "lxml", and pandas to collect metadata of czi files <br> 
(obtained using a Zeiss Z1 lightsheet) in xml format,
<br> subsequently parse individual tree elements into a pandas dataframe and export as excel file.

In [2]:
import czifile
from lxml import etree
import re
import pandas as pd
import os
import time

Function bellow generates a list of all .czi files (or other filetypes with extension <span style="color:blue"><b>ext</b></span>)  <br> in a given <span style="color:blue"><b>directory</b></span>. It also includes all files in subdirectories

In [3]:
def file_ls(directory, ext = ('.czi')):
    flist = list()
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(subdir, file)
            if file.endswith(ext):
                size = os.path.getsize(filepath)
                if (size > 10^6):
                    flist.append(filepath)  
            else:
                continue
    return(flist)

Function below collects a variety of metadata from a given file <br> (use absolute path for 
<span style="color:blue"><b>filepath</b></span>).

In [11]:
def metadataDF(filepath):
    #get filename from path
    filename = os.path.split(filepath)[1]
    #read czi file
    czi_file = czifile.CziFile(filepath)
    #get embryo name from file name
    embryo = re.sub(".czi", "", filename.split("_img")[0])
    #get metadata
    czi_parsed = etree.fromstring(czi_file.metadata()) # print(czi_file.metadata()) to get all metadata
    #get indivisual elements
    creation_date = czi_parsed.xpath("//CreationDate")[0].text
    sizex = int(float(czi_parsed.xpath("//SizeX")[0].text))
    sizey = int(float(czi_parsed.xpath("//SizeY")[0].text))
    if len(czi_parsed.xpath("//SizeZ")) > 0:
        sizez = int(float(czi_parsed.xpath("//SizeZ")[0].text))
    else:
        sizez = 0
    if len(czi_parsed.xpath("//SizeC")) > 0:
        sizec = int(float(czi_parsed.xpath("//SizeC")[0].text))
    else:
        sizec = 0
    if len(czi_parsed.xpath("//I//StartPosition")) > 0:
        ill_sides = round(int(float(czi_parsed.xpath("//I//StartPosition")[0].text)))
    else:
        ill_sides = 0
    dist_ls = []
    for distances in czi_parsed.iter('Distance'):
        dist_ls.append(distances[0].text)
    distx = float(dist_ls[0])
    disty = float(dist_ls[1])
    if sizez !=0:
        distz = float(dist_ls[2])
    else:
        distz = ""
    Comments = czi_parsed.xpath("//Comment")[0].text
    Comments = Comments.split("\n")
    com_dic = dict()
    for element in Comments:
        if len(element.split(": "))>1:
            com_dic[element.split(": ")[0]] = element.split(": ")[1]
    if "Light sheet thickness" in com_dic:
        LS_thickness = com_dic["Light sheet thickness"]
    
    if "Illumination mode" in com_dic:
        ill_mode = com_dic['Illumination mode']
        
    metadata = dict()
    
    #collect the metadata in a dictionary
    metadata[1] = {'embryo':embryo,'file':filename, 'path':os.path.abspath(filepath),'date':creation_date,
                   'X':sizex, 'Y':sizey, 'Z':sizez, 'C':sizec, 
                   'I':ill_sides, 
                   'X_dist': distx, 'Y_dist': disty, 'Z_dist':distz,
                   'LS_thickness' : LS_thickness, 'I_mode' :ill_mode
                  }
    #loop through different channels and append to dictionary
    for i in range(0,len(czi_parsed.xpath("//Dimensions//Channels//IlluminationWavelength//SinglePeak"))):
        wavelen = czi_parsed.xpath("//Dimensions//Channels//IlluminationWavelength//SinglePeak")[i].text
        wavelen = str(round(int(float(wavelen))))
        exposuretime = float(czi_parsed.xpath("//ExposureTime")[i].text)
        NA_ls = float(czi_parsed.xpath("//NALightSheet")[i].text)
        power = float(czi_parsed.xpath("//LaserPower")[i].text)
        metadata[1]["_".join([wavelen, 'expt'])] = exposuretime
        metadata[1]["_".join([wavelen, 'lp'])] = power
        metadata[1]["_".join([wavelen, 'NA_ls'])] = NA_ls

    return(pd.DataFrame.from_dict(metadata, orient='index'))

Function below uses the functions above to generate an excel file <br>
containing the extracted metadata of all files in directory "<span style="color:blue"><b>mypath</b></span>". <br>
The file will be saved to the <span style="color:blue"><b>saveto</b></span> directory. If <span style="color:blue"><b>saveto</b></span> does not exist, it will be created. <br>
Use <span style="color:blue"><b>timestamp</b></span> = False to disable the timestamp in the filename.

In [22]:
def metatoexel(mypath, saveto, timestamp = True):
    #create list of czi files in path
    all_czi_files = file_ls(mypath)
    #collect all metadata
    for i in enumerate(all_czi_files):
        if i[0] == 0:
            all_metadata = metadataDF(all_czi_files[0])
        all_metadata = pd.concat([all_metadata, metadataDF(i[1])], axis = 0) 
    #export to excel
    mainfolder =  os.path.basename(mypath)
    if timestamp == True:
        timestr = time.strftime("%Y%m%d%H%M")
        filename = ".".join(["_".join([ "Metadata", mainfolder, timestr]), "xlsx"])
    else:
        filename = ".".join(["_".join([ "Metadata", mainfolder]), "xlsx"])
    if os.path.exists(saveto)== False: 
        os.mkdir(saveto)
    all_metadata.to_excel(os.path.join(saveto, filename))
    return(all_metadata)

Test the script

In [24]:
path = '..\\Testfiles'
save_dir = os.path.realpath('..\\Output')
metatoexel(path, save_dir, timestamp=False) 

Unnamed: 0,embryo,file,path,date,X,Y,Z,C,I,X_dist,Y_dist,Z_dist,LS_thickness,I_mode
1,P01_A_bf,P01_A_bf.czi,C:\Users\rueegga\Desktop\Testfiles\P01_A_bf.czi,2022-09-08T17:06:52,1024,1024,0,0,0,2.313481e-07,2.313481e-07,,3.92 µm,single
1,P01_A_bf,P01_A_bf.czi,C:\Users\rueegga\Desktop\Testfiles\P01_A_bf.czi,2022-09-08T17:06:52,1024,1024,0,0,0,2.313481e-07,2.313481e-07,,3.92 µm,single
1,P01_B,P01_B_img_bf.czi,C:\Users\rueegga\Desktop\Testfiles\P01_B_img_b...,2022-09-08T17:19:33,1920,1920,0,0,0,2.313481e-07,2.313481e-07,,5.37 µm,single
1,P01_C_bf,P01_C_bf.czi,C:\Users\rueegga\Desktop\Testfiles\P01_C_bf.czi,2022-09-08T17:29:14,960,960,0,0,0,2.313481e-07,2.313481e-07,,3.80 µm,single
1,P01_D_bf,P01_D_bf.czi,C:\Users\rueegga\Desktop\Testfiles\P01_D_bf.czi,2022-09-08T17:37:24,1024,1024,0,0,0,2.313481e-07,2.313481e-07,,3.92 µm,single
