This notebook documents the process of loading a recipe dataset, subsetting it to the columns of interest, and then uploading the result to an AWS S3 bucket.

# Import Packages

In [26]:
import pandas as pd # For general data processing
import joblib       # For loading data
import boto3        # For AWS S3 processes

# For getting AWS S3 credentials
from dotenv import load_dotenv
import os

# Import and Subset Data

In [2]:
# Load the raw recipe DataFrame from its joblib pickle.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
df = joblib.load("../data/raw_data_df.pkl")

In [3]:
# Report the size of the raw data: rows are counted as recipes, columns as fields
print(f"There are {df.shape[0]} recipes with {df.shape[1]} columns.")

There are 40001 recipes with 18 columns.


In [4]:
# Columns to keep from the raw data
target_columns = ["recipe_url", "title", "label", "rating_average", "rating_count"]

# Keep only the columns of interest (all rows are retained)
df = df[target_columns]

# Transpose the first row so each kept column is shown on its own line
df.head(1).T

Unnamed: 0,0
recipe_url,https://www.allrecipes.com/recipe/83646/corned...
title,Corned Beef Roast
label,"['Recipes', 'Main Dishes', 'Beef', 'Corned Bee..."
rating_average,4.4
rating_count,68


In [5]:
# Report the row and column counts of the data after column subsetting
print(f"After subsetting the data, there are {df.shape[0]} recipes with {df.shape[1]} columns.")

After subsetting the data, there are 40001 recipes with 5 columns.


In [49]:
# Write the subsetted DataFrame to disk; `filename` is kept as a variable
# because the S3 upload further down reads the same path
filename = "../data/recipe_label_df.pkl"
joblib.dump(df, filename)

['../data/recipe_label_df.pkl']

# Uploading Data to AWS S3

## Instantiate Client, Resources and Session

In [36]:
# Load variables from a local .env file into the process environment so the
# os.environ lookups below can find the credentials. By default load_dotenv
# does not override variables that are already set.
load_dotenv()

# Initialize the low-level S3 client interface.
# If either variable is unset, os.environ.get returns None and boto3 falls
# back to its default credential lookup.
s3_client = boto3.client(
    "s3",
    aws_access_key_id     = os.environ.get("aws_access_key_id"),
    aws_secret_access_key = os.environ.get("aws_secret_access_key")
)

# Display the client object to confirm it was created
s3_client

<botocore.client.S3 at 0x7fd773bb6a50>

In [34]:
# Create the S3 resource interface, reading the same credential
# variables from the environment
s3_resource = boto3.resource(
    "s3",
    aws_access_key_id=os.getenv("aws_access_key_id"),
    aws_secret_access_key=os.getenv("aws_secret_access_key"),
)

# Echo the resource to confirm it was created
s3_resource

s3.ServiceResource()

In [29]:
# Open a boto3 session using credentials read from environment variables.
# An aws_session_token argument is only needed when using temporary credentials.
session = boto3.Session(
    aws_access_key_id=os.getenv("aws_access_key_id"),
    aws_secret_access_key=os.getenv("aws_secret_access_key"),
)

## Check AWS S3 Buckets

In [43]:
# Request the list of buckets from S3
response = s3_client.list_buckets()

# Print each bucket name alongside its position in the listing
buckets = response["Buckets"]
for index, bucket in enumerate(buckets):
    print(f"# {index}: {bucket['Name']}")

# 0: aycy-recipe-classifier
# 1: aycy-recipe-classifier-test
# 2: aycy-velocipede-481502


In [48]:
# List the objects currently stored in the bucket.
# NOTE: a single list_objects_v2 call returns at most 1,000 keys; use a
# paginator if the bucket can hold more.
object_response = s3_client.list_objects_v2(
    Bucket = "aycy-recipe-classifier"
)

# "Contents" is omitted from the response when the bucket is empty, so default
# to an empty list instead of raising KeyError. The loop variable avoids the
# name `object`, which would shadow the builtin for the rest of the kernel session.
for index, s3_object in enumerate(object_response.get("Contents", [])):
    print(f"# {index}: {s3_object['Key']}, {s3_object['Size']} bytes")

# 0: recipe_url_df.pkl, 3089204 bytes


## Upload Filtered Data File to AWS S3

In [50]:
# Upload the local file at `filename` to the bucket, stored under the given object key
s3_client.upload_file(
    Bucket="aycy-recipe-classifier",
    Key="recipe_label_df.pkl",
    Filename=filename,
)

In [51]:
# Re-list the bucket contents to check that the file has been uploaded.
# NOTE: a single list_objects_v2 call returns at most 1,000 keys; use a
# paginator if the bucket can hold more.
object_response = s3_client.list_objects_v2(
    Bucket = "aycy-recipe-classifier"
)

# "Contents" is omitted from the response when the bucket is empty, so default
# to an empty list instead of raising KeyError. The loop variable avoids the
# name `object`, which would shadow the builtin for the rest of the kernel session.
for index, s3_object in enumerate(object_response.get("Contents", [])):
    print(f"# {index}: {s3_object['Key']}, {s3_object['Size']} bytes")

# 0: recipe_label_df.pkl, 5539560 bytes
# 1: recipe_url_df.pkl, 3089204 bytes
