In [None]:
# 1_LIST OBJECTS
# 2_CHECKING IF OBJECT EXISTS
# 3_DOWNLOAD OBJECTS
# 4_UPLOAD OBJECTS
# 5_DELETE OBJECTS
# 6_WRITE OBJECTS
# 7_READ OBJECTS

In [2]:
# CONNECTING TO S3 BUCKET
import os
import io
import boto3
import awswrangler as wr
import pandas as pd

# Never hard-code AWS credentials in a notebook. With no explicit keys, boto3
# resolves them through its default provider chain: the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN environment variables, the shared
# credentials file (~/.aws/credentials), or an attached IAM role.
boto3.setup_default_session()

# Name of the S3 bucket used by every example in this notebook
bucket = 'coding-tutorials'

In [4]:
# 1. LISTING OBJECTS (BOTO3 WINS)
# Prints every object in the bucket (name and size in MB) using boto3's client
# API, boto3's resource API and AWS Wrangler.

BYTES_PER_MB = 1024 * 1024
s3_prefix = f's3://{bucket}/'  # derived from `bucket` so the cell follows the configured bucket

print('--BOTO3--')
# BOTO3 - Preferred Method
client = boto3.client('s3')

# A single list call returns at most 1,000 keys, so page through the results.
# Pages of an empty bucket carry no 'Contents' key, hence the .get() default.
paginator = client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket):
    for obj in page.get('Contents', []):
        print('File Name:', obj['Key'], 'Size:', round(obj['Size'] / BYTES_PER_MB, 2), 'MB')

print('----')
# BOTO3 - Alternative Method (the resource collection paginates on its own)
resource = boto3.resource('s3')

for obj in resource.Bucket(bucket).objects.all():
    print('File Name:', obj.key, 'Size:', round(obj.size / BYTES_PER_MB, 2), 'MB')

print('\n')
print('--AWS_WRANGLER--')
# AWS WRANGLER

# Wrangler returns full s3:// URIs; strip the bucket prefix to show the bare key
for obj in wr.s3.list_objects(s3_prefix):
    print('File Name:', obj[len(s3_prefix):])

print('----')

# size_objects maps each object URI to its size in bytes
for obj, size in wr.s3.size_objects(s3_prefix).items():
    print('File Name:', obj[len(s3_prefix):], 'Size:', round(size / BYTES_PER_MB, 2), 'MB')

--BOTO3--
File Name: account_balances_feb2023.parquet Size: 0.15 MB
File Name: account_balances_jan2023.parquet Size: 0.15 MB
File Name: account_balances_mar2023.parquet Size: 0.15 MB
----
File Name: account_balances_feb2023.parquet Size: 0.15 MB
File Name: account_balances_jan2023.parquet Size: 0.15 MB
File Name: account_balances_mar2023.parquet Size: 0.15 MB


--AWS_WRANGLER--
File Name: account_balances_feb2023.parquet
File Name: account_balances_jan2023.parquet
File Name: account_balances_mar2023.parquet
----
File Name: account_balances_feb2023.parquet Size: 0.15 MB
File Name: account_balances_jan2023.parquet Size: 0.15 MB
File Name: account_balances_mar2023.parquet Size: 0.15 MB


In [10]:
# 2. CHECKING IF OBJECT EXISTS (AWS WRANGLER WINS)
# Reports whether `object_key` is present in the bucket, first with boto3 and
# then with AWS Wrangler.
object_key = 'account_balances_jan2023.parquet'

print('--BOTO3--')
# BOTO3
client = boto3.client('s3')
try:
    client.head_object(Bucket=bucket, Key=object_key)
    print(f"The object exists in the bucket {bucket}.")
except client.exceptions.ClientError as err:
    # A HEAD response has no body, so a missing key surfaces as a generic
    # ClientError with code '404' instead of the modelled NoSuchKey exception.
    if err.response['Error']['Code'] not in ('404', 'NoSuchKey', 'NotFound'):
        # Anything else (access denied, throttling, ...) is a real failure
        raise
    print(f"The object does not exist in the bucket {bucket}.")

print('\n')
print('--AWS_WRANGLER--')
# AWS WRANGLER
# does_object_exist returns a boolean rather than raising for a missing object,
# so branch on its result.
if wr.s3.does_object_exist(f's3://{bucket}/{object_key}'):
    print(f"The object exists in the bucket {bucket}.")
else:
    print(f"The object does not exist in the bucket {bucket}.")


--BOTO3--
The object exists in the bucket coding-tutorials.


--AWS_WRANGLER--
The object exists in the bucket coding-tutorials.


In [18]:
# 3. DOWNLOAD OBJECTS (DRAW)
# Downloads the same object twice into ./tmp, once per library.
object_key = 'account_balances_jan2023.parquet'

# Neither library creates missing local directories, so make sure the target
# folder exists before downloading (no-op when it is already there).
os.makedirs('tmp', exist_ok=True)

# BOTO3
client = boto3.client('s3')
client.download_file(bucket, object_key, 'tmp/account_balances_jan2023_v2.parquet')

# AWS WRANGLER
wr.s3.download(path=f's3://{bucket}/{object_key}', local_file='tmp/account_balances_jan2023_v3.parquet')

In [22]:
# 4. UPLOAD OBJECTS (DRAW)
# Uploads two local parquet files from the working directory: the first with
# boto3, the second with AWS Wrangler.
object_key_1, object_key_2 = (
    'account_balances_apr2023.parquet',
    'account_balances_may2023.parquet',
)

# Local path of each file: directory of the resolved path + '/' + file name
file_path_1, file_path_2 = (
    f'{os.path.dirname(os.path.realpath(key))}/{key}'
    for key in (object_key_1, object_key_2)
)

# BOTO3
client = boto3.client('s3')
client.upload_file(Filename=file_path_1, Bucket=bucket, Key=object_key_1)

# AWS WRANGLER
wr.s3.upload(path=f's3://{bucket}/{object_key_2}', local_file=file_path_2)

In [21]:
# 5. DELETE OBJECTS (BOTO3 WINS FOR METADATA, AWS WRANGLER WINS FOR SIMPLICITY)
# Deletes a single object and then a batch of objects, first with boto3 and
# then with AWS Wrangler.
# BOTO3
print('--BOTO3--')
client = boto3.client('s3')
object_key = 'account_balances_jan2023.parquet'

# Delete Single object
response = client.delete_object(Bucket=bucket, Key=object_key)
deletion_date = response['ResponseMetadata']['HTTPHeaders']['date']

# delete_object answers 204 (No Content) on success
if response['ResponseMetadata']['HTTPStatusCode'] == 204:
    print(f'Object {object_key} deleted successfully on {deletion_date}.')
else:
    print('Object could not be deleted.')

# Delete Multiple Objects
object_keys = ['account_balances_jan2023.parquet',
               'account_balances_feb2023.parquet',
               'account_balances_mar2023.parquet']

objects = [{'Key': key} for key in object_keys]

response = client.delete_objects(Bucket=bucket, Delete={'Objects': objects})
deletion_date = response['ResponseMetadata']['HTTPHeaders']['date']

# 'Deleted' and 'Errors' are each omitted from the response when empty, so
# read both with a default instead of indexing.
deleted = response.get('Deleted', [])
errors = response.get('Errors', [])

if not errors and len(deleted) == len(object_keys):
    print(f'All objects were deleted successfully on {deletion_date}')
else:
    failed_keys = [err.get('Key') for err in errors]
    print(f'Objects could not be deleted: {failed_keys}')

print('\n')
# AWS WRANGLER
print('--AWS_WRANGLER--')
wr.s3.delete_objects(path=f's3://{bucket}/{object_key}')

# Delete Multiple Objects
try:
    wr.s3.delete_objects(path=[f's3://{bucket}/{key}' for key in object_keys])
    print('All objects deleted successfully.')
except Exception as exc:
    # Report the failure cause; a bare except would also swallow KeyboardInterrupt
    print(f'Objects could not be deleted: {exc}')

--BOTO3--
Object account_balances_jan2023.parquet deleted successfully on Wed, 07 Jun 2023 17:38:18 GMT.
All objects were deleted successfully on Wed, 07 Jun 2023 17:38:19 GMT


--AWS_WRANGLER--
All objects deleted successfully.


In [27]:
# 6 WRITE OBJECTS (AWS WRANGLER)
#####
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so that re-running the cell (or Restart & Run All)
# regenerates exactly the same mock data.
SEED = 42
np.random.seed(SEED)

# Generate mock data: one random value per row for every column
num_rows = 10000
data = {
    # Random day in July 2023
    'AS_OF_DATE': np.random.choice(pd.date_range(start='2023-07-01', end='2023-07-31'), size=num_rows),
    'COMPANY_CODE': np.random.choice(['ABC', 'XYZ', 'CBA', 'ZYX'], size=num_rows),
    'ACCOUNT_NAME': np.random.choice(['Account A', 'Account B', 'Account C'], size=num_rows),
    # Six-digit account numbers stored as strings (randint's upper bound is exclusive)
    'ACCOUNT_NUMBER': np.random.randint(100000, 999999, size=num_rows).astype(str),
    'BALANCE': np.random.uniform(low=0, high=500000, size=num_rows).astype(np.float32),
    'CURRENCY_CODE': np.random.choice(['USD', 'EUR', 'GBP'], size=num_rows),
}

# Create the dataframe
df = pd.DataFrame(data)
#####

# Key (prefix) under which the partitioned dataset is written
object_key = 'account_balances_july2023.parquet'

# AWS WRANGLER
# Write `df` as a gzip-compressed parquet dataset partitioned by COMPANY_CODE.
# The call is the cell's last expression, so its return value is displayed.
wr.s3.to_parquet(
    df=df,
    path=f's3://{bucket}/{object_key}',
    dataset=True,
    partition_cols=['COMPANY_CODE'],
    compression='gzip',
)

{'paths': ['s3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=ABC/449e625480224a5398e89a323997bf00.gz.parquet',
  's3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=CBA/449e625480224a5398e89a323997bf00.gz.parquet',
  's3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=XYZ/449e625480224a5398e89a323997bf00.gz.parquet',
  's3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=ZYX/449e625480224a5398e89a323997bf00.gz.parquet'],
 'partitions_values': {'s3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=ABC/': ['ABC'],
  's3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=CBA/': ['CBA'],
  's3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=XYZ/': ['XYZ'],
  's3://coding-tutorials/account_balances_july2023.parquet/COMPANY_CODE=ZYX/': ['ZYX']}}

In [28]:
# 7.1 READ OBJECTS (AWS WRANGLER)
# Loads the same parquet object into a DataFrame with boto3 and with AWS Wrangler.
# BOTO3
s3_client = boto3.client('s3')
object_key = 'account_balances_may2023.parquet'


# Read the Parquet file: fetch the raw bytes, then parse them from memory
response = s3_client.get_object(Bucket=bucket, Key=object_key)
parquet_object = response['Body'].read()

df = pd.read_parquet(io.BytesIO(parquet_object))

# Only a cell's last expression is rendered automatically, so show this
# intermediate frame explicitly (display is provided by the notebook kernel).
display(df.head())

# AWS WRANGLER
df = wr.s3.read_parquet(path=f's3://{bucket}/{object_key}')
df.head()

# Other AWS Wrangler readers:
# wr.s3.read_csv()
# wr.s3.read_json()
# wr.s3.read_parquet_table()
# wr.s3.read_deltalake()

Unnamed: 0,AS_OF_DATE,COMPANY_CODE,ACCOUNT_NAME,ACCOUNT_NUMBER,BALANCE,CURRENCY_CODE
0,2023-05-27,ZYX,Account C,430605,181860.203125,GBP
1,2023-05-13,ZYX,Account C,269927,323294.84375,GBP
2,2023-05-27,CBA,Account B,348429,128884.3125,EUR
3,2023-05-31,CBA,Account C,532127,438921.0,EUR
4,2023-05-25,XYZ,Account A,598786,433218.28125,USD


In [43]:
# 7.2 READ OBJECTS WITH SQL (AWS WRANGLER)
# Runs the same S3 Select query against a parquet object with boto3 and with
# AWS Wrangler, loading the matching rows into a DataFrame.

object_key = 'account_balances_may2023.parquet'
query = "SELECT * FROM s3object s WHERE AS_OF_DATE > CAST('2023-05-13T' AS TIMESTAMP)"

# BOTO3
client = boto3.client('s3')

resp = client.select_object_content(
        Bucket=bucket,
        Key=object_key,
        Expression= query,
        ExpressionType='SQL',
        InputSerialization={"Parquet": {}},
        OutputSerialization={'JSON': {}},
)

# Collect the raw bytes of every 'Records' event in the response stream
payload_chunks = []

# Process the response
for event in resp['Payload']:
    if 'Records' in event:
        payload_chunks.append(event['Records']['Payload'])

# Join the chunks BEFORE decoding: a chunk boundary may fall in the middle of
# a multi-byte UTF-8 character, which would make per-chunk decoding fail.
json_string = b''.join(payload_chunks).decode('utf-8')

# Load the JSON lines into a Pandas DataFrame; wrap the text in a file-like
# object because passing literal JSON strings to read_json is deprecated.
df = pd.read_json(io.StringIO(json_string), lines=True)

# Only a cell's last expression is rendered automatically, so show this
# intermediate frame explicitly (display is provided by the notebook kernel).
display(df.head())

# AWS WRANGLER
df = wr.s3.select_query(
        sql=query,
        path=f's3://{bucket}/{object_key}',
        input_serialization="Parquet",
        input_serialization_params={}
)
df.head()

Unnamed: 0,AS_OF_DATE,COMPANY_CODE,ACCOUNT_NAME,ACCOUNT_NUMBER,BALANCE,CURRENCY_CODE
0,2023-05-27T00:00:00.000Z,ZYX,Account C,430605,181860.203125,GBP
1,2023-05-27T00:00:00.000Z,CBA,Account B,348429,128884.3125,EUR
2,2023-05-31T00:00:00.000Z,CBA,Account C,532127,438921.0,EUR
3,2023-05-25T00:00:00.000Z,XYZ,Account A,598786,433218.28125,USD
4,2023-05-31T00:00:00.000Z,XYZ,Account A,137437,60378.207031,GBP
