In [None]:
# Wilmar Perez

# Import the Python AWS SDK
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/boto3.html

import boto3
import os
import pandas as pd

In [None]:
## Configuration: every value the rest of the notebook reads

# Profile name handed to boto3.Session
aws_profile = "LOCAL_PROFILE"
# Service name used for both the resource and the client
aws_service = "s3"
# Bucket that the notebook creates, queries and finally deletes
aws_bucket = "RANDOM_BUCKET_NAME"

# Path of the CSV that is read locally and uploaded
local_dataset = "dc-wikia-data.csv"
# Key the uploaded CSV is stored under inside the bucket
object_name = "myS3dataset.csv"

In [None]:
# Earlier, untyped load kept for reference; a later cell performs the same
# read with explicit dtypes and date parsing.
# df_local = pd.read_csv(local_dataset)

In [None]:
# df_local.head()

In [None]:
# Load the local CSV for a first look.
# page_id, appearances and year are forced to strings, and the
# 'first appearance' column is handed to pandas' date parser.
df_local = pd.read_csv(
    local_dataset,
    dtype=dict.fromkeys(['page_id', 'appearances', 'year'], str),
    parse_dates=['first appearance'],
)

df_local.head()

In [None]:
# Build an explicit session bound to the configured profile
# (needed whenever that profile is not the default one).
session = boto3.Session(profile_name=aws_profile)

In [None]:
# Two handles on the same service, both derived from the profile-bound session:
# a client, and a resource.
s3client = session.client(service_name=aws_service)
s3Resource = session.resource(service_name=aws_service)

In [None]:
# Let's create a bucket in the session's region.
# S3 rejects an explicit LocationConstraint of 'us-east-1' (that region is
# selected by omitting the configuration altogether), and a profile with no
# configured region gives session.region_name = None, which is not a valid
# constraint either. The constraint is therefore only sent when it names
# some other region.
create_kwargs = {}
if session.region_name not in (None, 'us-east-1'):
    create_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': session.region_name
    }

# Last expression of the cell, so the create response is still displayed.
s3Resource.Bucket(aws_bucket).create(**create_kwargs)

In [None]:
# Send the local CSV to the bucket, stored under `object_name`.
# The file is opened in binary mode and closed by the context manager.
with open(local_dataset, mode="rb") as dataset_stream:
    s3client.upload_fileobj(
        Fileobj=dataset_stream,
        Bucket=aws_bucket,
        Key=object_name,
    )

In [None]:
# Confirm the upload by asking for the object under the key we just wrote
s3client.head_object(Key=object_name, Bucket=aws_bucket)

In [None]:
# Filter on the S3 side instead of downloading the whole file:
# keep the rows whose `year` compares greater than the string '1980'.
# The response is kept in `query_result` for the next cell to read.
my_query = "SELECT * FROM S3Object WHERE year > '1980'"

query_result = s3client.select_object_content(
    Bucket=aws_bucket,
    Key=object_name,
    Expression=my_query,
    ExpressionType="SQL",
    InputSerialization={"CSV": {"FileHeaderInfo": "Use"}},
    OutputSerialization={"CSV": {}},
)

In [None]:
# Walk the "Payload" of the query response and show every item unchanged
for event in query_result["Payload"]:
    print(event)

In [None]:
# Let's pull only the rows whose name mentions Superman.
# The column is lower-cased before matching because LIKE compares
# case-sensitively: the bare pattern '%superman%' would skip capitalised
# values such as "Superman".
my_query = """SELECT * FROM S3Object WHERE LOWER(name) LIKE '%superman%'"""

# Same call shape as the year query; only the SQL expression differs.
query_result = s3client.select_object_content(
        Bucket=aws_bucket,
        Key=object_name,
        ExpressionType="SQL",
        Expression=my_query,
        InputSerialization={'CSV':{"FileHeaderInfo":"Use"}},
        OutputSerialization={'CSV':{}}
    )

In [None]:
# Show every item from the "Payload" of the latest query response, unchanged
for event in query_result["Payload"]:
    print(event)

In [None]:
# Clean-up, step 1: remove the uploaded object before touching the bucket
s3client.delete_object(Key=object_name, Bucket=aws_bucket)

In [None]:
# Clean-up, step 2: with the object deleted, remove the bucket itself
s3client.delete_bucket(
    Bucket=aws_bucket,
)