In [1]:
#------------------------------------------------------------------------------
# This script uploads local files to a blob container in the Azure cloud.
# (It first downloads compressed .Z blobs and uncompresses them locally.)
# It needs account information:
#   - Account name.
#   - Account key.
# It needs the blob container information
#   - Container name
#   - Container sub-directory
#------------------------------------------------------------------------------
import os, uuid, sys
import subprocess
import tqdm
import astropy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import multiprocessing
import json

from io import BytesIO
from astropy.io import fits
from functools import partial
from azure.storage.blob import BlockBlobService, PublicAccess

In [2]:
# Read the storage-account credentials from a local JSON file so that no
# secret is written in the notebook itself. The file must provide the keys
# 'accountName' and 'accountKey'.
with open('credentials.json') as f:
    data = json.load(f)

# The file handle is no longer needed once the JSON has been parsed.
accountName = data['accountName']
accountKey = data['accountKey']

In [4]:
# Create the BlockBlobService client used for every call to the Blob
# service of the storage account.
block_blob_service = BlockBlobService(account_name=accountName, account_key=accountKey)

# Containers used by this notebook. Both are already present in the storage
# account, so create_container() is not called here.
#   'raw'       -> source container (container_name_Z)
#   'processed' -> destination container (container_name_fits)
ContainerNameZ = 'raw'
ContainerNameFits = 'processed'
container_name_Z = ContainerNameZ
container_name_fits = ContainerNameFits

# Set the permission of both containers so the blobs are public.
block_blob_service.set_container_acl(container_name_Z, public_access=PublicAccess.Container)
block_blob_service.set_container_acl(container_name_fits, public_access=PublicAccess.Container)

<azure.storage.blob.models.ResourceProperties at 0x11216a4e0>

In [5]:
# Create a list "filelist" with the blob content
# inside the "Azure:container/folder" location 
def BlobList(container, folder, filelist=None, verbose=False):
    """List the blobs stored under the prefix ``folder`` of ``container``.

    Parameters
    ----------
    container : str
        Name of the blob container to query.
    folder : str
        Prefix passed to ``list_blobs``. It is removed from the start of
        every blob name before the name is stored.
    filelist : list, optional
        List the names are appended to (it is modified in place). A new
        list is created when it is omitted.
    verbose : bool, optional
        When true, print the full name of every blob found.

    Returns
    -------
    list
        ``filelist`` extended with the blob names relative to ``folder``.
    """
    if filelist is None:
        filelist = []

    for blob in block_blob_service.list_blobs(container, prefix=folder):
        name = str(blob.name)
        # Strip only the leading prefix: str.replace() would also delete any
        # later occurrence of ``folder`` inside the name (e.g. the second
        # 'test/' of 'test/sub/test/file') and corrupt the relative path.
        if folder and name.startswith(folder):
            name = name[len(folder):]
        filelist.append(name)
        if verbose:
            print("\t Blob name: " + blob.name)

    return filelist

# Download a file "blobfile" from "container" and save it 
# in the file "locfile"
def DownBlob(container, blobfile, locfile, verbose=False):
    """Fetch the blob ``blobfile`` of ``container`` into the local file ``locfile``.

    A one-line progress message is printed first when ``verbose`` equals
    True. The transfer itself is delegated to the module-level
    ``block_blob_service`` client.
    """
    if verbose == True:
        message = 'Downloading ' + blobfile + ' to ' + locfile
        print(message)

    block_blob_service.get_blob_to_path(container, blobfile, locfile)

# Uncompress data 
def UnCompress(file, verbose=False):
    """Run the external ``uncompress`` command on ``file``.

    Parameters
    ----------
    file : str
        Path of the compressed file handed to ``uncompress``.
    verbose : bool, optional
        When true, print a progress message before running the command.

    Raises
    ------
    subprocess.CalledProcessError
        If ``uncompress`` exits with a non-zero status.
    """
    if verbose:
        print('Uncompressing ' + file)

    # check_call waits for the command to finish and raises on a non-zero
    # exit status. With subprocess.call the failure was silently ignored and
    # the caller kept waiting for an output file that would never appear.
    subprocess.check_call(['uncompress', file])

# Upload file "locfile" to the blob "blobfile" in container
def UpBlob(container, blobfile, locfile, verbose=False):
    """Send the local file ``locfile`` to the blob ``blobfile`` of ``container``.

    A one-line progress message is printed first when ``verbose`` equals
    True. The upload is delegated to the module-level ``block_blob_service``
    client with ``validate_content=True``.
    """
    if verbose == True:
        message = 'Uploading ' + locfile + ' to ' + blobfile
        print(message)

    block_blob_service.create_blob_from_path(
        container,
        blobfile,
        locfile,
        validate_content=True,
    )

# Process Blob
def ProcessBlob(path_loc, blob_sub_dir, Z_file, verbose=False):
    """Download one blob, uncompress it locally and upload the result.

    The blob ``blob_sub_dir/Z_file`` is fetched from the container
    ``container_name_Z`` into ``path_loc``, uncompressed, and the resulting
    file is uploaded to ``container_name_fits`` under the same
    sub-directory, without the trailing ``.Z``. A name that does not end
    with ``.Z`` is uploaded unchanged.

    Parameters
    ----------
    path_loc : str
        Local working directory (must already exist).
    blob_sub_dir : str
        Sub-directory (blob name prefix) used in both containers.
    Z_file : str
        Blob name relative to ``blob_sub_dir``.
    verbose : bool, optional
        When true, print the size of the uncompressed file and let the
        download / uncompress / upload helpers print their messages.

    Raises
    ------
    FileNotFoundError
        If the uncompressed file is missing after the uncompress step.
    """
    # Blob names always use '/' as separator, whatever the local OS uses.
    import posixpath

    # Strip only a trailing '.Z': str.replace('.Z', '') would also remove
    # any other '.Z' found in the directory or in the file name.
    is_compressed = Z_file.endswith('.Z')
    fits_file = Z_file[:-len('.Z')] if is_compressed else Z_file

    path_to_file_loc = os.path.join(path_loc, Z_file)
    path_to_fits_file = os.path.join(path_loc, fits_file)

    try:
        # Download the data
        blob_name = posixpath.join(blob_sub_dir, Z_file)
        DownBlob(container_name_Z, blob_name, path_to_file_loc, verbose)

        # Uncompress the data
        if is_compressed:
            UnCompress(path_to_file_loc, verbose)

        # UnCompress returns only once the command has finished, so a
        # missing file means the step failed; waiting for it would hang.
        if not os.path.exists(path_to_fits_file):
            raise FileNotFoundError(
                'Uncompressed file not found: ' + path_to_fits_file)

        if verbose:
            statinfo = os.stat(path_to_fits_file)
            print("File size {} MB".format(statinfo.st_size/1024**2))

        # Upload the data
        blob_name = posixpath.join(blob_sub_dir, fits_file)
        UpBlob(container_name_fits, blob_name, path_to_fits_file, verbose)
    finally:
        # Remove the local files even when a step failed, so a later run
        # does not trip over leftovers in the working directory.
        for leftover in (path_to_file_loc, path_to_fits_file):
            if os.path.exists(leftover):
                os.remove(leftover)

In [6]:
# For every sub-directory of the raw container:
#   1. list the blob content,
#   2. skip the files whose uncompressed version is already uploaded,
#   3. download from the raw blob, uncompress the .Z files and upload them
#      to the processed blob, using a pool of worker processes.

#Test
BlobSubDirs = ['test']
#BlobSubDirs = ['red_arc_flat']#, 'bias_red', 'blue_arc_flat','red_arc_flat']
path_loc = '../Temp'
# makedirs(exist_ok=True) also creates missing parent directories and does
# not fail when the directory already exists.
os.makedirs(path_loc, exist_ok=True)

start_time_out = time.time()

for blob_sub_dir in BlobSubDirs:

    # List the data
    folder_rem = blob_sub_dir + '/'
    Z_files = []
    BlobList(container_name_Z, folder_rem, Z_files)

    #-----------------------------------------------------------------------
    # Patch to continue an interrupted run: drop the files whose output is
    # already in the processed container.
    fits_files = []
    BlobList(container_name_fits, folder_rem, fits_files)
    # Compare on the output name (input name without its trailing '.Z').
    # A set keeps each lookup O(1) instead of scanning a list per file.
    already_uploaded = set(fits_files)
    Z_files = [file for file in Z_files
               if (file[:-2] if file.endswith('.Z') else file) not in already_uploaded]
    #-----------------------------------------------------------------------

    #Test
    #Z_files=Z_files[0:30]
    start_time_dir = time.time()

    print('Working on ' + blob_sub_dir + '...')

    # Each worker handles one file; imap keeps the results in input order so
    # the progress bar advances as the files complete.
    tasks = partial(ProcessBlob, path_loc, blob_sub_dir)
    with multiprocessing.Pool(10) as p:
        result = list(tqdm.tqdm(p.imap(tasks, Z_files), total=len(Z_files)))

    end_time_dir = time.time()
    total_time_dir = end_time_dir - start_time_dir
    print('Total dir time: ', total_time_dir)

end_time = time.time()
total_time_out = end_time - start_time_out
print('Total out time: ', total_time_out)

Working on test...


100%|██████████| 10/10 [01:21<00:00,  8.14s/it]

Total dir time:  81.50801587104797
Total out time:  82.1095540523529



