In [None]:
import json
from calitp_data.storage import get_fs
from tqdm.notebook import tqdm
from google.cloud import storage

In [None]:
# Name of the GCS bucket that the cells below pass to list_subpath_blobs()
# and change_file_storage_class().
read_from_bucket = "gtfs-data-test"

In [None]:
# Top-level subpaths that list_subpath_blobs() handles in its "larger" pass,
# written as '<bucket>/<name>'.

# For these, child directory names are cut at the first 'T' before being
# de-duplicated and used as listing prefixes.
dt_subpaths = [f'{read_from_bucket}/{name}' for name in ('rt', 'schedule')]

# For these, child directory names are used as listing prefixes unchanged.
fixed_timestamp_subpaths = [f'{read_from_bucket}/rt-fixed-timestamp']

# Every large subpath, in processing order.
all_large_subpaths = dt_subpaths + fixed_timestamp_subpaths

In [None]:
def _listing_stem(subpath):
    """Return the listing name for a ``bucket/name`` path.

    The name is the first path component after the bucket, truncated at its
    first ``.``. A name that begins with ``.`` is returned whole, because
    truncating it would leave an empty string.
    """
    name = subpath.split('/')[1]
    return name.partition('.')[0] or name


def _write_lines_to_blob(blob, lines):
    """Write every string in ``lines`` to ``blob``, one per line."""
    # The context manager closes the writer; no explicit close() is needed.
    with blob.open('w') as f:
        for line in tqdm(lines):
            f.write(line + '\n')


def list_subpath_blobs(read_from_bucket, run_larger_subpaths=True, run_smaller_subpaths=True):
    """Write text listings of the object names found in ``read_from_bucket``.

    Listings are written as ``.txt`` blobs to the bucket named
    ``'cold-storage-outputs-' + read_from_bucket``, one object name per line.

    * Smaller subpaths (top-level entries not in ``all_large_subpaths``) get
      one listing each, named ``<stem>.txt``.
    * Larger subpaths (the entries of ``all_large_subpaths``) are split by
      their child directory names: for ``dt_subpaths`` each name is cut at
      the first ``'T'``, for ``fixed_timestamp_subpaths`` it is used as is.
      The de-duplicated names are saved to a local file
      ``top_<subpath>_directories.txt`` and each one produces a listing blob
      ``<subpath>/<name>.txt``.

    Relies on the module-level names ``all_large_subpaths``, ``dt_subpaths``,
    ``fixed_timestamp_subpaths`` and ``unique``; the three lists must have
    been built for the same bucket that is passed in here.

    Args:
        read_from_bucket: Name of the bucket to inventory (no ``gs://``).
        run_larger_subpaths: Process the entries of ``all_large_subpaths``.
        run_smaller_subpaths: Process every other top-level entry.

    Returns:
        None.
    """
    fs = get_fs()
    subpaths = fs.ls('gs://' + read_from_bucket)

    smaller_subpaths = [subpath for subpath in subpaths if subpath not in all_large_subpaths]

    # One client and one destination-bucket handle serve the whole run
    # instead of being rebuilt on every loop iteration.
    storage_client = storage.Client()
    try:
        bucket = storage_client.bucket('cold-storage-outputs-' + read_from_bucket)

        if run_smaller_subpaths:
            print(f'starting smaller subpaths')
            for subpath in tqdm(smaller_subpaths):
                print(f'starting {subpath}')

                # destination blob: the subpath name without its extension
                blob = bucket.blob(f'{_listing_stem(subpath)}.txt')

                # the listing prefix keeps the file extension, unlike the blob name.
                # NOTE(review): a directory prefix without a trailing '/' also
                # matches sibling names that merely start with it - confirm
                # that cannot happen in this bucket.
                prefix = subpath.split('/')[1]

                print('listing contents in read-from bucket')
                # Materialise the listing before opening the destination so a
                # failed listing never leaves a partial blob behind.
                names = [found.name for found in storage_client.list_blobs(read_from_bucket, prefix=prefix)]

                print('writing results to files')
                _write_lines_to_blob(blob, names)

                print(f'finished with {subpath}')

            print('finished with smaller subpaths')

        if run_larger_subpaths:
            for subpath in all_large_subpaths:
                print(f'starting {subpath}')
                shortened_subpath = subpath.split('/')[1]

                print(f'walking top subpaths')
                # maxdepth=1 yields only the top level. Older fsspec releases
                # treated 0 the same way; newer ones reject values below 1.
                top_subpaths = list(fs.walk(f'gs://{read_from_bucket}/{shortened_subpath}/', maxdepth=1))
                just_subpath_names = top_subpaths[0][1]

                print(f'finding unique dates')
                split_subpath_names = []
                if subpath in dt_subpaths:
                    # keep only the part before 'T' so names collapse to one entry each
                    split_subpath_names = [sp.split('T')[0] for sp in just_subpath_names]
                if subpath in fixed_timestamp_subpaths:
                    split_subpath_names = just_subpath_names

                unique_split_subpath_names = unique(split_subpath_names)

                print(f'writing unique subpath names')
                # Local record of the prefixes about to be processed. The
                # in-memory list is used below, so the file is not read back.
                with open(f'top_{shortened_subpath}_directories.txt', 'w') as fp:
                    for item in tqdm(unique_split_subpath_names):
                        fp.write(item + '\n')
                print(f'finished writing top_{shortened_subpath}_directories.txt')

                for unique_subpath in tqdm(unique_split_subpath_names):
                    # destination blob for this prefix
                    blob = bucket.blob(f'{shortened_subpath}/{unique_subpath}.txt')

                    # every object whose name starts with '<subpath>/<name>'
                    prefix = f'{shortened_subpath}/' + unique_subpath.strip()
                    all_target_files = [adf.name for adf in storage_client.list_blobs(read_from_bucket, prefix=prefix)]

                    _write_lines_to_blob(blob, all_target_files)
                print(f'finished with {subpath}')

            print('finished with larger subpaths')
    finally:
        storage_client.close()

In [None]:
def unique(list):
    """Return the distinct elements of ``list`` in first-seen order.

    Args:
        list: Any iterable. The parameter name shadows the builtin and is
            kept only so existing callers are unaffected.

    Returns:
        A new list holding each distinct element once, ordered by first
        occurrence.
    """
    # ``list`` is shadowed by the parameter, hence unpacking instead of list().
    items = [*list]
    try:
        # dicts keep insertion order, which gives an ordered de-dup in O(n).
        return [*dict.fromkeys(items)]
    except TypeError:
        # Unhashable elements (e.g. nested lists): fall back to linear scans.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [None]:
# Inventory only the smaller top-level subpaths on this run; swap the two
# flags to process the larger subpaths instead.
list_subpath_blobs(
    read_from_bucket,
    run_larger_subpaths=False,
    run_smaller_subpaths=True,
)

In [None]:
# Paths in the listings bucket that change_file_storage_class() treats as
# "larger" and walks for per-prefix listing files.
larger_blobs = [
    f'cold-storage-outputs-{read_from_bucket}/{name}'
    for name in ('rt-fixed-timestamp', 'rt', 'schedule')
]

In [None]:
def _read_listing_names(listing_blob):
    """Return the non-empty lines of ``listing_blob`` without their newlines."""
    with listing_blob.open("r") as f:
        stripped = (line.replace("\n", "") for line in f)
        # Blank lines would otherwise become empty blob names.
        return [name for name in stripped if name]


def _archive_blob(storage_client, bucket_name, blob_name):
    """Set one object's storage class to ARCHIVE and return its Blob."""
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    # Fetch current metadata so the rewrite can be pinned to this generation;
    # the request is aborted if the object changes in between.
    blob.reload()
    blob.update_storage_class('ARCHIVE', if_generation_match=blob.generation)

    print(
        'Blob {} in bucket {} had its storage class set to {}'.format(
            blob_name,
            bucket_name,
            blob.storage_class
        )
    )
    return blob


def change_file_storage_class(read_from_bucket, run_larger_subpaths=True, run_smaller_subpaths=True):
    '''Set the storage class of every listed object in ``read_from_bucket`` to ARCHIVE.

    Object names are read from the listing blobs stored in the bucket
    ``cold-storage-outputs-<read_from_bucket>``, one name per line.
    Top-level listing blobs (entries not in ``larger_blobs``) are handled
    when ``run_smaller_subpaths`` is true. The listing files directly under
    each ``larger_blobs`` entry are handled when ``run_larger_subpaths`` is
    true.

    Relies on the module-level ``larger_blobs`` list, which must have been
    built for the same bucket that is passed in here.

    Args:
        read_from_bucket: Name of the bucket whose objects are re-classed.
        run_larger_subpaths: Process the listings under ``larger_blobs``.
        run_smaller_subpaths: Process the top-level listing blobs.

    Returns:
        The Blob that was updated last, or None when nothing was updated.
        Every listed object is processed; the function no longer returns
        after the first one.
    '''
    fs = get_fs()
    listings_bucket_name = f'cold-storage-outputs-{read_from_bucket}'
    subpaths = fs.ls(f'gs://{listings_bucket_name}')
    smaller_blobs = [subpath for subpath in subpaths if subpath not in larger_blobs]
    clean_smaller_blobs = [smaller_blob.split('/')[1] for smaller_blob in smaller_blobs]
    clean_larger_blobs = [file_blob.split('/')[1] for file_blob in larger_blobs]

    # A single client is shared by both passes and always closed, even when
    # a pass has nothing to iterate over or an update fails.
    storage_client = storage.Client()
    last_blob = None
    try:
        # bucket holding the listing files produced from the source bucket
        listings_bucket = storage_client.bucket(listings_bucket_name)

        if run_smaller_subpaths:
            for file_blob in tqdm(clean_smaller_blobs):
                for item in _read_listing_names(listings_bucket.blob(file_blob)):
                    last_blob = _archive_blob(storage_client, read_from_bucket, item)

        if run_larger_subpaths:
            for file_blob in tqdm(clean_larger_blobs):
                print(f'walking top subpaths')
                # maxdepth=1 yields only the top level. Older fsspec releases
                # treated 0 the same way; newer ones reject values below 1.
                top_subpaths = list(fs.walk(f'gs://{listings_bucket_name}/{file_blob}/', maxdepth=1))

                # index 2 of a walk entry is the list of file names
                just_subpath_names = top_subpaths[0][2]

                for subpath in just_subpath_names:
                    listing_blob = listings_bucket.blob(f'{file_blob}/{subpath}')
                    for item in _read_listing_names(listing_blob):
                        last_blob = _archive_blob(storage_client, read_from_bucket, item)
    finally:
        storage_client.close()

    return last_blob

In [None]:
# Re-class only the objects named in the larger-subpath listings on this run.
change_file_storage_class(
    read_from_bucket,
    run_larger_subpaths=True,
    run_smaller_subpaths=False,
)

In [None]:
# use this to check the storage class type
# for each_blob in class_change_list[:3]:
#     bucket = storage_client.bucket('gtfs-data-test')
#     blob = bucket.blob(each_blob)
#     blob.reload()
#     print(blob.storage_class)
# storage_client.close()

---

In [None]:
## use this if you need to create a new bucket

#storage_client = storage.Client()

#storage_client.create_bucket('cold-storage-outputs-gtfs-data')

In [None]:
# # use the lines below for memory monitoring (requires `import tracemalloc`)

# # starting the monitoring
# tracemalloc.start()

# # displaying the memory
# print(tracemalloc.get_traced_memory())
 
# # stopping the library
# tracemalloc.stop()