In [1]:
import boto3

## AWS S3 `Client` Interface

- The `Client` interface is a low-level library whose functions return mostly dictionaries that we need to manually parse. It gives you more granular control over AWS operations and can provide better performance but requires more programming work.

In [2]:
# Instantiate a low-level S3 client (no explicit region or credentials are passed here)
s3_client = boto3.client("s3")

# Display the client object to confirm it was created
s3_client

<botocore.client.S3 at 0x7f8b69200450>

In [3]:
# Send a request to list all buckets; the result is kept so the next cells can inspect it
response = s3_client.list_buckets()

# Display the raw response
response

{'ResponseMetadata': {'RequestId': 'N6Z4X98V5R87P0D9',
  'HostId': 'm0UtvXrXCJ8+YUfy9qOepCdMjI09k8AvyBhVxjFZcLj1TqRZBhQ8Y5/W1ykjKIYjpZNiU7UrZ5k=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'm0UtvXrXCJ8+YUfy9qOepCdMjI09k8AvyBhVxjFZcLj1TqRZBhQ8Y5/W1ykjKIYjpZNiU7UrZ5k=',
   'x-amz-request-id': 'N6Z4X98V5R87P0D9',
   'date': 'Wed, 13 Sep 2023 13:28:28 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Buckets': [{'Name': 'aycy-recipe-classifier',
   'CreationDate': datetime.datetime(2023, 9, 12, 21, 18, 43, tzinfo=tzlocal())},
  {'Name': 'aycy-velocipede-481502',
   'CreationDate': datetime.datetime(2023, 7, 18, 8, 38, 44, tzinfo=tzlocal())}],
 'Owner': {'ID': 'cbf718b577d52ecf2bfe907804ec072cdab387c715b97b9fe55c18187fa05541'}}

In [4]:
# The client hands the response back as a plain Python dictionary
type(response)

dict

In [5]:
# Top-level keys of the response
response.keys()

dict_keys(['ResponseMetadata', 'Buckets', 'Owner'])

In [6]:
# Top-level values: request metadata, the list of buckets, and the owner
response.values()

dict_values([{'RequestId': 'N6Z4X98V5R87P0D9', 'HostId': 'm0UtvXrXCJ8+YUfy9qOepCdMjI09k8AvyBhVxjFZcLj1TqRZBhQ8Y5/W1ykjKIYjpZNiU7UrZ5k=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'm0UtvXrXCJ8+YUfy9qOepCdMjI09k8AvyBhVxjFZcLj1TqRZBhQ8Y5/W1ykjKIYjpZNiU7UrZ5k=', 'x-amz-request-id': 'N6Z4X98V5R87P0D9', 'date': 'Wed, 13 Sep 2023 13:28:28 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, [{'Name': 'aycy-recipe-classifier', 'CreationDate': datetime.datetime(2023, 9, 12, 21, 18, 43, tzinfo=tzlocal())}, {'Name': 'aycy-velocipede-481502', 'CreationDate': datetime.datetime(2023, 7, 18, 8, 38, 44, tzinfo=tzlocal())}], {'ID': 'cbf718b577d52ecf2bfe907804ec072cdab387c715b97b9fe55c18187fa05541'}])

In [7]:
# The "Buckets" entry holds one dictionary (name and creation date) per bucket
response["Buckets"]

[{'Name': 'aycy-recipe-classifier',
  'CreationDate': datetime.datetime(2023, 9, 12, 21, 18, 43, tzinfo=tzlocal())},
 {'Name': 'aycy-velocipede-481502',
  'CreationDate': datetime.datetime(2023, 7, 18, 8, 38, 44, tzinfo=tzlocal())}]

## AWS S3 `Resource` Interface

The `Resource` interface is a higher-level abstraction that deals with the dictionaries behind the scenes and often returns more readable outputs.
- Code will be easier to read and understand with this method, but not every Client operation is available from the resource interface.
- If you have written a lot of code in the Resource interface and realize you need a function only available through the Client, you can use `s3_resource.meta.client` to access Client functions without rewriting your other code.
    - An example of this is the `.generate_presigned_url()` Client method, which can be used to give users access to an S3 bucket for a limited amount of time without AWS credentials.

In [8]:
# Instantiate the high-level S3 service resource
s3_resource = boto3.resource("s3")

# Iterate over the buckets
for bucket in s3_resource.buckets.all():

    # `name` is an attribute of each bucket object (it is not called as a method)
    print(bucket.name)

aycy-recipe-classifier
aycy-velocipede-481502


## Creating a New Bucket

In [9]:
# Create a default boto3 session; its repr shows the session's region
# (this is the session's region, not the region of any particular bucket)
boto3.session.Session()

Session(region_name='ca-central-1')

Define a helper function that creates a bucket and blocks public access to it.

In [16]:
def create_bucket(bucket_name, s3_connection):
    """Create an S3 bucket in the client's region and block public access to it.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to create.
    s3_connection : S3 client
        Low-level S3 client (e.g. ``boto3.client("s3")`` or
        ``s3_resource.meta.client``). Every request made by this function,
        and the exception class it catches, comes from this object.

    Returns
    -------
    dict or None
        The response dictionary from the bucket creation request, or ``None``
        when the bucket already exists and is owned by the caller.
    """
    # Use the region the client itself is bound to, so the location constraint
    # always matches the endpoint the request is sent to.
    current_region = s3_connection.meta.region_name

    create_kwargs = {"Bucket": bucket_name}

    # S3 rejects an explicit "us-east-1" location constraint; for that region
    # (or when no region is known) the configuration must be left out.
    if current_region and current_region != "us-east-1":
        create_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": current_region
        }

    # Try to create the bucket
    try:
        bucket_response = s3_connection.create_bucket(**create_kwargs)

    # The bucket was already created by this account
    except s3_connection.exceptions.BucketAlreadyOwnedByYou:
        print(f"Bucket Name: '{bucket_name}' already exists!")
        return None

    # Block public access to the bucket, using the same client that created it
    s3_connection.put_public_access_block(
        Bucket=bucket_name,
        PublicAccessBlockConfiguration={
            "BlockPublicAcls": True,
            "IgnorePublicAcls": True,
            "BlockPublicPolicy": True,
            "RestrictPublicBuckets": True,
        },
    )

    # Print successful addition of Bucket
    print(f"Bucket Name: '{bucket_name}' was created in Region: '{current_region}'")

    # Return response (dictionary) associated with bucket creation
    return bucket_response

In [17]:
# Test creating a bucket that already exists
bucket_name = "aycy-recipe-classifier"

# Pass the low-level client that backs the resource interface; in the
# already-exists path create_bucket() prints a message and returns None
bucket_response = create_bucket(
    bucket_name,
    s3_resource.meta.client
)

Bucket Name: 'aycy-recipe-classifier' already exists!


In [19]:
# Test creating a new bucket
bucket_name = "aycy-recipe-classifier-test"

# On success create_bucket() returns the response dictionary from the creation request
bucket_response = create_bucket(
    bucket_name,
    s3_resource.meta.client
)

Bucket Name: 'aycy-recipe-classifier-test' was created in Region: 'ca-central-1'


In [20]:
# Re-create the S3 service resource (same call as in the earlier listing cell)
s3_resource = boto3.resource("s3")

# Iterate over the buckets to confirm the new bucket now shows up
for bucket in s3_resource.buckets.all():

    # `name` is an attribute of each bucket object (it is not called as a method)
    print(bucket.name)

aycy-recipe-classifier
aycy-recipe-classifier-test
aycy-velocipede-481502


## Uploading a File to a Bucket

In [21]:
import joblib

In [26]:
# Path of the local pickle file, relative to the notebook
filename = "../data/recipe_url_df.pkl"
# NOTE: joblib.load unpickles the file, so only load files from a trusted source
df = joblib.load(filename)

In [27]:
# (rows, columns) of the loaded frame
df.shape

(40001, 1)

In [28]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,extracted_url
0,https://www.allrecipes.com/recipe/83646/corned...
1,https://www.allrecipes.com/recipe/158799/stout...
2,https://www.allrecipes.com/recipe/8509102/chic...
3,https://www.allrecipes.com/recipe/8508920/miss...
4,https://www.allrecipes.com/recipe/255462/lasag...


In [30]:
# Re-create the low-level S3 client used for the upload
s3_client = boto3.client("s3")

# Local file to upload, and the destination bucket
# (bucket_name was last set to the "-test" bucket, so it is reset here)
filename = "../data/recipe_url_df.pkl"
bucket_name = "aycy-recipe-classifier"

# Upload the local file; `Key` is the name the object gets inside the bucket
s3_client.upload_file(
    Filename = filename,
    Bucket = bucket_name,
    Key = "recipe_url_df.pkl"
)

## Accessing a Specific Bucket and Listing Its Objects

The objects stored in a specific bucket can be listed using the client's `.list_objects_v2()` method.

In [32]:
# Using the AWS client, send a request to list the objects in the bucket
# NOTE(review): a single call may not return every object in a large bucket —
# confirm whether pagination is needed
object_response = s3_client.list_objects_v2(
    Bucket = bucket_name
)

# Check: the "Contents" entry holds one dictionary per object
object_response["Contents"]

[{'Key': 'recipe_url_df.pkl',
  'LastModified': datetime.datetime(2023, 9, 13, 13, 54, 12, tzinfo=tzlocal()),
  'ETag': '"cccbb4b930b9b8e33998c2dcf172089d"',
  'Size': 3089204,
  'StorageClass': 'STANDARD'}]

## Reading Files from a Bucket

In [34]:
# Given a key (the object's name in the bucket), get the object from the bucket
s3_object = s3_client.get_object(
    Bucket = bucket_name,
    Key = "recipe_url_df.pkl"
)

# Check the response dictionary
s3_object

{'ResponseMetadata': {'RequestId': 'GJAXTKJVPNC2VVX1',
  'HostId': '+pEEvJRTe1cuiPVpR5NWP4KtT/4Kh/NjSLw/tO1SaFjN4sk/35FF8rHrb7IoCYzwLmYAHpW7Euo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '+pEEvJRTe1cuiPVpR5NWP4KtT/4Kh/NjSLw/tO1SaFjN4sk/35FF8rHrb7IoCYzwLmYAHpW7Euo=',
   'x-amz-request-id': 'GJAXTKJVPNC2VVX1',
   'date': 'Wed, 13 Sep 2023 14:02:08 GMT',
   'last-modified': 'Wed, 13 Sep 2023 13:54:12 GMT',
   'etag': '"cccbb4b930b9b8e33998c2dcf172089d"',
   'x-amz-server-side-encryption': 'AES256',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'server': 'AmazonS3',
   'content-length': '3089204'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2023, 9, 13, 13, 54, 12, tzinfo=tzutc()),
 'ContentLength': 3089204,
 'ETag': '"cccbb4b930b9b8e33998c2dcf172089d"',
 'ContentType': 'binary/octet-stream',
 'ServerSideEncryption': 'AES256',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody at 0x7f8b68606d10>}

In [52]:
# The "Body" entry of the response is a StreamingBody object that wraps the target file's contents
s3_object["Body"]

<botocore.response.StreamingBody at 0x7f8b68606d10>

In [45]:
# Confirm which bucket the following cells operate on
bucket_name

'aycy-recipe-classifier'

### Write to disk then read

In [49]:
# Reference: https://stackoverflow.com/questions/48964181/how-to-load-a-pickle-file-from-s3-to-use-in-aws-lambda

# `pickle` is only needed by the alternative noted below, which is not used here
import pickle
# Alternative (not used): read the object's bytes directly and unpickle them, i.e.
# pickle.loads(s3_resource.Bucket(bucket_name).Object("recipe_url_df.pkl").get()["Body"].read())

# Download the object into a local file in the current working directory
with open("recipe_url_df.pkl", "wb") as data:
    s3_resource.Bucket(bucket_name).download_fileobj("recipe_url_df.pkl", data)

# Re-open the local copy and deserialize it with joblib
# NOTE: joblib.load unpickles the file, so only load objects from a trusted bucket
with open("recipe_url_df.pkl", "rb") as data:
    df = joblib.load(data)

### Read without writing to disk

In [54]:
import pickle
import boto3
from io import BytesIO

# Download the object into an in-memory buffer instead of a file on disk
with BytesIO() as data:
    s3_resource.Bucket(bucket_name).download_fileobj("recipe_url_df.pkl", data)
    data.seek(0)    # move back to the beginning after writing
    # Deserialize straight from the buffer, before the `with` block closes it
    df = joblib.load(data)

In [55]:
# Display the frame that was read back from S3
df

Unnamed: 0,extracted_url
0,https://www.allrecipes.com/recipe/83646/corned...
1,https://www.allrecipes.com/recipe/158799/stout...
2,https://www.allrecipes.com/recipe/8509102/chic...
3,https://www.allrecipes.com/recipe/8508920/miss...
4,https://www.allrecipes.com/recipe/255462/lasag...
...,...
39996,https://www.allrecipes.com/recipe/276834/salsa...
39997,https://www.allrecipes.com/recipe/67002/sweet-...
39998,https://www.allrecipes.com/recipe/53211/wilder...
39999,https://www.allrecipes.com/recipe/274770/insta...


## Delete a bucket

In [57]:
def delete_all_objects(bucket_name, resource=None):
    """Delete every object (and every object version) stored in an S3 bucket.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to empty.
    resource : S3 service resource, optional
        Resource used to reach the bucket. When omitted, the notebook-level
        ``s3_resource`` is used, so existing one-argument calls keep working.

    Returns
    -------
    None
        One line is printed for each object version that was submitted for
        deletion. Nothing is requested or printed when the bucket is empty.
    """
    # Fall back to the resource created earlier in the notebook
    if resource is None:
        resource = s3_resource

    # Define the bucket object whose contents will be deleted
    bucket = resource.Bucket(bucket_name)

    # Collect the key and version of every stored object.
    # With versioning turned on, each version of a file shows up separately;
    # without it, the version id is simply null.
    files_to_delete = [
        {
            "Key": object_version.object_key,
            "VersionId": object_version.id
        }
        for object_version in bucket.object_versions.all()
    ]

    # A single delete request accepts at most 1000 keys, so send the list in
    # batches. An empty bucket produces no batches and therefore no request.
    batch_size = 1000
    for start in range(0, len(files_to_delete), batch_size):
        batch = files_to_delete[start:start + batch_size]

        # Delete this batch of files from the S3 bucket
        bucket.delete_objects(
            Delete = {
                "Objects": batch
            }
        )

        # Report each file only after its delete request has been sent
        for file in batch:
            print(f"Deleted {file['Key']} version {file['VersionId']}")

In [None]:
# Check the bucket contents again: print the key of every object that is still listed
try:
    for file in s3_client.list_objects_v2(Bucket=bucket_name)['Contents']:
        print(file['Key'])

# The KeyError handled here comes from the ['Contents'] lookup on the response
# (not from file['Key']); it is treated as the sign that the bucket is empty
except KeyError:
    print('This bucket is empty!')

In [None]:
# Delete the bucket itself.
# NOTE(review): delete_all_objects() is defined above but never called in the visible cells;
# S3 is expected to refuse deleting a bucket that still holds objects — confirm it is emptied first.
# NOTE(review): bucket_name was last set to "aycy-recipe-classifier" (not the "-test" bucket) — confirm the target.
s3_resource.Bucket(bucket_name).delete()

In [None]:
# Print the names of all of our buckets (run after the delete cell above to see what remains):
for bucket in s3_resource.buckets.all():
    print(bucket.name)