<a href="https://colab.research.google.com/github/carloslme/data-engineering/blob/main/validate_data_buckets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Introduction
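This notebook contains the code to validate a bucket after a migration from AWS S3 to an S3-compatible endpoint, in this case Wasabisys (also called Wasabi). For each prefix, it compares the object count and total size reported by the S3 bucket against those of the Wasabi bucket.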

# Contents
* Prerequisites
* Step 1: Import libraries
* Step 2: Setting credentials
* Step 3: Declare functions
* Step 4: Validate S3 objects in the Wasabi bucket


# Prerequisites

In [1]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.18.31-py3-none-any.whl (131 kB)
[?25l[K     |██▌                             | 10 kB 19.4 MB/s eta 0:00:01[K     |█████                           | 20 kB 22.2 MB/s eta 0:00:01[K     |███████▌                        | 30 kB 24.4 MB/s eta 0:00:01[K     |██████████                      | 40 kB 26.0 MB/s eta 0:00:01[K     |████████████▌                   | 51 kB 27.3 MB/s eta 0:00:01[K     |███████████████                 | 61 kB 28.6 MB/s eta 0:00:01[K     |█████████████████▍              | 71 kB 30.1 MB/s eta 0:00:01[K     |████████████████████            | 81 kB 29.7 MB/s eta 0:00:01[K     |██████████████████████▍         | 92 kB 31.3 MB/s eta 0:00:01[K     |█████████████████████████       | 102 kB 32.9 MB/s eta 0:00:01[K     |███████████████████████████▍    | 112 kB 32.9 MB/s eta 0:00:01[K     |█████████████████████████████▉  | 122 kB 32.9 MB/s eta 0:00:01[K     |████████████████████████████████| 131 kB 32.9 MB/s 
[?25h

# Step 1: Import libraries
Import the library installed above, along with some libraries that come preinstalled in the notebook session.

In [2]:
import boto3
import multiprocessing
from multiprocessing import Pool
import time
import io
from tqdm import tqdm

# Step 2: Setting credentials


In [None]:
import boto3

# Credentials for the source AWS S3 account.
# The values are left blank here; presumably they are filled in before running.
ACCESS_ID_RAW = ''
ACCESS_KEY_RAW = ''
AWS_S3_CREDS = dict(
    aws_access_key_id=ACCESS_ID_RAW,
    aws_secret_access_key=ACCESS_KEY_RAW,
)

# Credentials and endpoint for the Wasabi account.
# NOTE: the constant name ENPOINT_URL (sic) is kept as-is because it is a
# module-level name that other cells may reference.
ACCESS_ID_W = ''
ACCESS_KEY_W = ''
ENPOINT_URL = 'https://s3.YOUR-REGION-HERE.wasabisys.com'
AWS_WASABI_CREDS = dict(
    endpoint_url=ENPOINT_URL,
    aws_access_key_id=ACCESS_ID_W,
    aws_secret_access_key=ACCESS_KEY_W,
)

# Step 3: Declare functions

In [None]:
def get_all_s3_objects(s3, **base_kwargs):
    """Yield every object entry returned by ``s3.list_objects_v2``, page by page.

    Args:
        s3: Client object exposing ``list_objects_v2(**kwargs)``
            (e.g. a boto3 S3 client).
        **base_kwargs: Keyword arguments forwarded to every
            ``list_objects_v2`` call (e.g. ``Bucket``, ``Prefix``).
            ``MaxKeys`` defaults to 1000 and may be overridden here.

    Yields:
        Each item of the ``Contents`` list of every response page, in order.
        A response without ``Contents`` contributes nothing.

    Raises:
        RuntimeError: If a response reports ``IsTruncated`` but carries no
            ``NextContinuationToken``.
    """
    continuation_token = None
    while True:
        # A dict literal lets a caller-supplied MaxKeys override the default;
        # dict(MaxKeys=1000, **base_kwargs) would raise TypeError on the
        # duplicate keyword.
        list_kwargs = {'MaxKeys': 1000, **base_kwargs}
        if continuation_token:
            list_kwargs['ContinuationToken'] = continuation_token
        response = s3.list_objects_v2(**list_kwargs)
        yield from response.get('Contents', [])
        if not response.get('IsTruncated'):  # At the end of the list?
            break
        continuation_token = response.get('NextContinuationToken')
        if not continuation_token:
            # Without a token the next request would start over from the
            # first page and loop forever, yielding duplicates.
            raise RuntimeError(
                'list_objects_v2 response is truncated but has no '
                'NextContinuationToken'
            )

def getHumanReadable(size,precision=2):
    """Format a byte count as a string with a binary-scaled unit suffix.

    Args:
        size: Number of bytes.
        precision: Number of digits after the decimal point (default 2).

    Returns:
        A string such as ``'1.50KB'``. The value is divided by 1024 until it
        drops below 1024 or the largest suffix (``TB``) is reached.
    """
    suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
    suffixIndex = 0
    # ">=" so that exactly 1024 of a unit rolls over to the next one
    # (1024 bytes -> "1.00KB", not "1024.00B"). The bound is derived from the
    # suffix list instead of a hard-coded 4.
    while size >= 1024 and suffixIndex < len(suffixes) - 1:
        suffixIndex += 1  # move to the next larger unit
        size = size / 1024.0  # scale the value to that unit
    return "%.*f%s" % (precision, size, suffixes[suffixIndex])

def _count_and_size(s3_client, bucket, prefix):
    """Return ``(object_count, total_bytes)`` for the objects listed under bucket/prefix."""
    count = 0
    total_bytes = 0
    for obj in get_all_s3_objects(s3=s3_client, Bucket=bucket, Prefix=prefix):
        count += 1
        total_bytes += obj['Size']
    return count, total_bytes


def get_info_obj(bucket_s3, prefix_s3, bucket_wasabi, prefix_wasabi):
    """Summarize the objects under a prefix in the S3 bucket and in the Wasabi bucket.

    Args:
        bucket_s3: Bucket name queried with the ``AWS_S3_CREDS`` client.
        prefix_s3: Key prefix to list in ``bucket_s3``.
        bucket_wasabi: Bucket name queried with the ``AWS_WASABI_CREDS`` client.
        prefix_wasabi: Key prefix to list in ``bucket_wasabi``.

    Returns:
        A pair ``((size_s3, count_s3), (size_wasabi, count_wasabi))`` where
        each size is the total of the listed objects' ``Size`` values formatted
        by ``getHumanReadable`` and each count is the number of listed objects.
        Because sizes are returned as rounded strings, differences smaller
        than the rounding precision are not visible in the result.
    """
    # The S3 side is listed completely before the Wasabi client is created.
    count_s3, sum_s3 = _count_and_size(
        boto3.client('s3', **AWS_S3_CREDS), bucket_s3, prefix_s3
    )
    count_wasabi, sum_wasabi = _count_and_size(
        boto3.client('s3', **AWS_WASABI_CREDS), bucket_wasabi, prefix_wasabi
    )

    return (getHumanReadable(sum_s3),count_s3), (getHumanReadable(sum_wasabi),count_wasabi)

# Step 4: Validate S3 objects in the Wasabi bucket


In [None]:
# Prefixes to compare; the same prefix is looked up in both buckets.
list_prefixes = ['prefix-1', 'prefix-2', 'prefix-3']

# Each entry is [prefix, S3 summary, Wasabi summary].
successful_results = []
error_results = []

for prefix in list_prefixes:
    res_s3, res_wasabi = get_info_obj(
        bucket_s3='S3-BUCKET',
        prefix_s3=prefix,
        bucket_wasabi='WASABI-BUCKET',
        prefix_wasabi=prefix,
    )
    record = [prefix, res_s3, res_wasabi]

    # Mismatch between the two summaries: report it and move on.
    if res_s3 != res_wasabi:
        print('ERROR: Take a look at {}'.format(prefix))
        error_results.append(record)
        continue

    print('All good with {} - {}'.format(prefix, res_s3))
    successful_results.append(record)