In [1]:
import os
import numpy as np
from glob import glob
from tqdm import tqdm_notebook

# Add extra code to run things in parallel.
Combining the exposure maps takes a REALLY long time, so I added this bit so that I can run it all in parallel.

You don't need to run any of this if you don't want to run things in parallel. I'll try to comment out code at the bottom so you can run things in parallel or not, depending on what you want to do.

In [2]:
from concurrent.futures import ProcessPoolExecutor, as_completed

def parallel_process(array, function, n_jobs=None, use_kwargs=False, front_num=0):
    """
        A parallel version of the map function with a progress bar.

        Args:
            array (array-like): A sliceable sequence to iterate over.
            function (function): A python function to apply to the elements of array.
                Unless n_jobs == 1 it is submitted to a ProcessPoolExecutor.
            n_jobs (int, default=None): The number of worker processes. The value is
                passed straight to ProcessPoolExecutor(max_workers=...). n_jobs=1 skips
                the pool entirely and runs everything serially in this process.
            use_kwargs (boolean, default=False): Whether to consider the elements of array
                as dictionaries of keyword arguments to function
            front_num (int, default=0): The number of iterations to run serially before
                kicking off the parallel job. This can be useful for catching bugs
        Returns:
            [function(array[0]), function(array[1]), ...] in input order. In the
            parallel path a task that raised contributes its exception object in place
            of a result, so one failure does not hide the other results. In the serial
            paths (front_num items and n_jobs == 1) exceptions propagate to the caller.
    """
    def call(item):
        return function(**item) if use_kwargs else function(item)

    # We run the first few iterations serially to catch bugs
    front = [call(a) for a in array[:front_num]] if front_num > 0 else []
    remaining = array[front_num:]

    # If we set n_jobs to 1, just run a list comprehension. This is useful for
    # benchmarking and debugging. Returning here is essential: without it the
    # same items would be processed a second time by the pool below.
    if n_jobs == 1:
        return front + [call(a) for a in tqdm_notebook(remaining)]

    # Assemble the workers
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        # Pass the elements of array into function
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in remaining]
        else:
            futures = [pool.submit(function, a) for a in remaining]
        kwargs = {
            'total': len(futures),
            'unit': 'it',
            'unit_scale': True,
            'leave': True
        }
        # Print out the progress as tasks complete
        for _ in tqdm_notebook(as_completed(futures), **kwargs):
            pass

    # Get the results from the futures in submission order (not completion order).
    out = []
    for future in futures:
        try:
            out.append(future.result())
        except Exception as e:
            out.append(e)
    return front + out

In [3]:
def combineXRT(name, outpath):
    ''' This combines the individual observations

    Writes an xselect command script to {outpath}/{name}/{name}_xsel.in and
    then runs `xselect` on it, redirecting its output to
    {outpath}/{name}/{name}_xsel.log. The script reads every
    *xpcw3po_cl.evt file found under {outpath}/{name}/reduced, saves the
    combined event list, and saves unbinned / 8x / 4x binned images for
    several PI upper cutoffs.

    IMPORTANT! The scripts that this (and other functions) create are designed
    to be run from the same directory as this notebook. They WILL NOT work
    if you try to run them from the individual data directories.

    Args:
        name: target name; also the sub-directory of outpath and the
            xselect session name.
        outpath: directory that holds one sub-directory per target.

    Returns:
        None. Nothing is written or run when no event files are found.
    '''

    # Everything for this target lives here. Built from the arguments rather
    # than from the first three '/'-separated parts of a globbed path, which
    # was only correct when outpath had exactly two components ('./data_full').
    data_dir = f'{outpath}/{name}'

    # find the x-ray files and sort them so we can control their input order.
    # this is used to make sure all of the data products are the same
    # when we are combining observations
    files = sorted(glob(f'{data_dir}/reduced/**/*xpcw3po_cl.evt', recursive=True))

    if len(files) < 1:
        return

    # if you try to read more than 20 exposures, it says "more?"; one blank
    # line is written for every threshold the current index has reached.
    # Thresholds are 19, 42, ..., 249, exactly as previously hard-coded.
    # NOTE(review): presumably further prompts appear past 249 + 23 files -- TODO confirm
    more_prompt_at = range(19, 250, 23)

    events_file = f'{data_dir}/{name}_events.fits'

    # write xsel.in
    with open(f'{data_dir}/{name}_xsel.in', 'w') as f:
        # this is the session name... specify random letters to run many at once.
        f.write(f'{name}\n')

        for i, f_in in enumerate(files):
            # xselect is given the data directory once, then file names relative to it
            rel_path = os.path.relpath(f_in, data_dir)

            f.write('read events\n')
            if i == 0:
                # set the data directory
                f.write(f'{data_dir}\n')
                # first entry
                f.write(f'{rel_path}\n')
                f.write('yes\n')
                continue

            f.write(f'{rel_path}\n')
            f.write('\n' * sum(i >= threshold for threshold in more_prompt_at))

        f.write('extract events\n')
        f.write(f'save events {events_file}\n')
        # an extra 'yes' is written when the file already exists (overwrite)
        if os.path.isfile(events_file):
            f.write('yes\n')
        f.write('yes\n')

        f.write('set phaname PI\n')
        # here we are going to make a few binned images for a few different energy ranges
        # NOTE(review): xybinsize is never reset before the "unbinned" image, so
        # from the second energy on it presumably still carries the 4 set at the
        # end of the previous pass -- confirm against xselect behaviour.
        for eng in [200, 300, 400, 500, 600]:
            f.write(f'filter pha_cutoff 50 {eng}\n')

            # (bin size to set, file-name suffix); None means "do not touch xybinsize"
            for binsize, suffix in [(None, ''), (8, '_bl8'), (4, '_bl4')]:
                if binsize is not None:
                    f.write(f'set xybinsize {binsize}\n')
                f.write('extract image\n')
                image_file = f'{data_dir}/{name}_img_50-{eng}{suffix}.fits'
                f.write(f'save image {image_file}\n')
                # the yes is to overwrite if the file is already there
                if os.path.isfile(image_file):
                    f.write('yes\n')

        f.write('exit\n')
        f.write('no\n')

    # log the output
    log_file = f'{data_dir}/{name}_xsel.log'

    # call xselect
    os.system(f'xselect < {data_dir}/{name}_xsel.in > {log_file}')

    return

In [4]:
def combineXRT_exp(name, outpath):
    ''' This combines the exposure maps

    Writes an ximage command script to {outpath}/{name}/{name}_ximg_exp.in
    that reads every *xpcw3po_ex.img file found under
    {outpath}/{name}/reduced, issues `sum` / `save` after each one past the
    first, and writes the result to {outpath}/{name}/{name}_exp.fits. Then
    runs `ximage` on the script, logging to {outpath}/{name}/{name}_ximg_exp.log.

    Args:
        name: target name; also the sub-directory of outpath.
        outpath: directory that holds one sub-directory per target.

    Returns:
        name, whether or not any exposure maps were found.
    '''

    # Built from the arguments so the output path always matches the path
    # checked for a stale file below, whatever the depth of outpath.
    data_dir = f'{outpath}/{name}'
    combined_file = f'{data_dir}/{name}_exp.fits'

    # find the exposure maps and sort them so the input order is reproducible
    files = sorted(glob(f'{data_dir}/reduced/**/*xpcw3po_ex.img', recursive=True))

    if len(files) < 1:
        return name

    # remove the old file if it is there
    if os.path.isfile(combined_file):
        os.remove(combined_file)

    # write the ximage script
    with open(f'{data_dir}/{name}_ximg_exp.in', 'w') as f:
        for i, f_in in enumerate(files):
            f.write(f'read {f_in}\n')
            # nothing to add to yet after the very first map
            if i == 0:
                continue
            f.write('sum\n')
            f.write('save\n')

        f.write(f'write/fits {combined_file}\n')

        f.write('exit\n')

    # log the output
    log_file = f'{data_dir}/{name}_ximg_exp.log'
    # call ximage
    os.system(f'ximage < {data_dir}/{name}_ximg_exp.in > {log_file}')

    return name

In [5]:
def load_PSZcatalog():
    """Load the PSZ1 and PSZ2 FITS catalogs and return one merged DataFrame.

    Reads PSZ1v2.1.fits and PSZ2v1.fits from ./../planckClusters/catalogs/,
    outer-merges them on PSZ1 `INDEX` == PSZ2 `PSZ`, keeps twelve columns
    selected by position, and then left-merges PSZ2 a second time on
    `INDEX_PSZ1` == `PSZ` to fill gaps in the `*_PSZ2` columns.

    Returns:
        pandas.DataFrame with the first twelve columns of the merged frame
        plus `NAME`, `RA` and `DEC`, where those three take the `*_PSZ2`
        value and fall back to the `*_PSZ1` value when it is missing.
    """
    from astropy.table import Table
    from numpy import append as npappend

    datapath = './../planckClusters/catalogs/'

    def _decode_bytes(frame):
        # DataFrame.map replaces the deprecated DataFrame.applymap in newer
        # pandas; fall back to applymap where map does not exist yet.
        elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
        return elementwise(lambda x: x.decode() if isinstance(x, bytes) else x)

    ps1 = Table.read(f'{datapath}/PSZ1v2.1.fits')
    ps2 = Table.read(f'{datapath}/PSZ2v1.fits')

    # convert to pandas and clean up strings -- not required
    df1 = _decode_bytes(ps1.to_pandas())
    df2 = _decode_bytes(ps2.to_pandas())

    # merge the catalogs together
    df_m = df1.merge(df2, how='outer', left_on='INDEX', right_on='PSZ', suffixes=('_PSZ1', '_PSZ2'))

    # get the columns that we want
    # NOTE(review): positional selection -- depends on the column order of the
    # two catalog files; verify if the catalog versions change.
    cols = df_m.columns[[0, 1, 4, 5, 8, 29, 33, 34, 37, 38, 40, 51]]
    df_final = df_m[cols]

    # remerge to find bits that were missing
    df_final_bigger = df_final.merge(df2, how='left', left_on='INDEX_PSZ1',
                                     right_on='PSZ')

    # fill in nans of the suffixed PSZ2 columns from the freshly merged,
    # unsuffixed PSZ2 columns (one pass is enough; a repeat changes nothing)
    for col in ['NAME', 'RA', 'DEC', 'SNR', 'REDSHIFT', 'INDEX']:
        df_final_bigger[col+'_PSZ2'] = df_final_bigger[col+'_PSZ2'].fillna(df_final_bigger[col])

    # single best value per row: PSZ2 when present, otherwise PSZ1
    for col in ['NAME', 'RA', 'DEC']:
        df_final_bigger[col] = df_final_bigger[col+'_PSZ2'].fillna(df_final_bigger[col+'_PSZ1'])

    df_final_bigger = df_final_bigger[npappend(df_final_bigger.columns[:12].values, ['NAME', 'RA', 'DEC'])]

    return df_final_bigger


In [6]:
# get file data; the columns are reordered alphabetically by sort_index(axis=1)
data = load_PSZcatalog()
data = data.sort_index(axis=1)

outpath = './data_full'

# one keyword-argument dict per catalog entry; spaces in the catalog names are
# replaced with underscores to form the per-target directory names
arr = [{'name':n.replace(' ', '_'), 'outpath':outpath} for n in data['NAME']]

# Keep whatever the runner returns in a variable instead of leaving the call as
# the cell's last expression, so a per-task result list is never echoed here.
results = parallel_process(arr, combineXRT, use_kwargs=True)
#results = parallel_process(arr, combineXRT_exp, use_kwargs=True)



HBox(children=(IntProgress(value=0, max=1943), HTML(value='')))




In [23]:
# NOTE(review): leftover scratch cell -- its execution count (23) is out of
# sequence with the cells above, and no random seed is set in the visible
# cells, so the value shown below is presumably not reproducible.
np.random.randint(0,1000)

517