In [9]:
import io

import boto3
import pandas as pd

In [3]:
# Module-level S3 client and its list_objects_v2 paginator.
# NOTE(review): the helper functions visible in this notebook create their own
# client and paginator, so these two names look unused -- confirm before removing.
client = boto3.client("s3")

paginator = client.get_paginator("list_objects_v2")

In [4]:
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.

    Lists the bucket with the ``list_objects_v2`` paginator, once per prefix,
    and yields every listed object whose key ends with ``suffix``.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).  Either a single string or a tuple/list of
        prefixes, which are listed one after another.
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional).  Either a single string or a tuple/list of
        suffixes; a key matches when it ends with any of them.
    :return: Generator over the entries found under ``"Contents"`` in each
        listing page.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # We can pass the prefix directly to the S3 API.  If the user has passed
    # a tuple or list of prefixes, we go through them one by one.
    prefixes = (prefix, ) if isinstance(prefix, str) else prefix

    # str.endswith() accepts a str or a tuple but raises TypeError for a list,
    # so normalise the suffix to accept the same shapes as the prefix.
    suffixes = suffix if isinstance(suffix, str) else tuple(suffix)

    for key_prefix in prefixes:
        for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
            # A page without "Contents" ends the listing for this prefix;
            # the outer loop then moves on to the next prefix.
            if "Contents" not in page:
                break

            for obj in page["Contents"]:
                if obj["Key"].endswith(suffixes):
                    yield obj


def get_matching_s3_keys(bucket, prefix="", suffix=""):
    """
    Yield the key of every matching object in an S3 bucket.

    Thin wrapper around ``get_matching_s3_objects`` that keeps only the
    ``"Key"`` entry of each object it produces.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    matching_objects = get_matching_s3_objects(bucket, prefix, suffix)
    yield from (s3_object["Key"] for s3_object in matching_objects)

In [5]:
bucket = "covid19-lake"
prefix = "rearc-covid-19-nyt-data-in-usa/json/us-counties/"
suffix = ".json"

# Reset first, so a re-run that matches nothing cannot silently reuse a key
# left in the kernel by an earlier execution of this cell.
key_set = None

# Only the last matching key is kept: despite its name, key_set ends up as a
# single key string, not a collection.
for key in get_matching_s3_keys(bucket, prefix=prefix, suffix=suffix):
    key_set = key

# Fail here, with the search parameters, rather than later with a NameError.
if key_set is None:
    raise LookupError(
        "No keys in bucket {!r} start with {!r} and end with {!r}".format(
            bucket, prefix, suffix
        )
    )

In [6]:
# Show the key selected by the listing loop (bare expression -> cell output).
key_set

'rearc-covid-19-nyt-data-in-usa/json/us-counties/part-00000-02e67fb4-7c63-418c-aec6-57ba4306f06a-c000.json'

In [7]:
def get_s3_object(bucket, key):
    """
    Download a JSON Lines object from S3 and load it into a DataFrame.

    :param bucket: Name of the S3 bucket.
    :param key: Key of the object to download.
    :return: ``pandas.DataFrame`` parsed from the object body, read with
        ``lines=True`` (one JSON document per line).
    """
    s3 = boto3.resource('s3')
    body = s3.Object(bucket, key).get()['Body'].read()

    # Recent pandas versions deprecate passing a literal JSON string to
    # read_json; hand it a file-like object over the decoded text instead.
    json_buffer = io.StringIO(body.decode("utf-8"))

    return pd.read_json(json_buffer, lines=True)

In [10]:
# Fetch from the same bucket the key was listed in, instead of repeating the
# bucket name as a second literal that could drift from `bucket`.
df_test = get_s3_object(bucket, key_set)
df_test

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0
...,...,...,...,...,...,...
537709,2020-09-15,Sweetwater,Wyoming,56037,317,2
537710,2020-09-15,Teton,Wyoming,56039,478,1
537711,2020-09-15,Uinta,Wyoming,56041,312,2
537712,2020-09-15,Washakie,Wyoming,56043,110,6


In [11]:
# Plain-text rendering of the same frame shown in the previous cell's output.
print(df_test)

             date      county       state   fips  cases  deaths
0      2020-01-21   Snohomish  Washington  53061      1       0
1      2020-01-22   Snohomish  Washington  53061      1       0
2      2020-01-23   Snohomish  Washington  53061      1       0
3      2020-01-24        Cook    Illinois  17031      1       0
4      2020-01-24   Snohomish  Washington  53061      1       0
...           ...         ...         ...    ...    ...     ...
537709 2020-09-15  Sweetwater     Wyoming  56037    317       2
537710 2020-09-15       Teton     Wyoming  56039    478       1
537711 2020-09-15       Uinta     Wyoming  56041    312       2
537712 2020-09-15    Washakie     Wyoming  56043    110       6
537713 2020-09-15      Weston     Wyoming  56045     23       0

[537714 rows x 6 columns]


In [14]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd

# Build (or reuse) the Spark session for the Delta test: the delta-core 0.7.0
# package is requested through spark.jars.packages, and the Delta SQL
# extension and catalog are registered on the session.
spark = (
    SparkSession.builder
    .appName("DeltaTest")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# NOTE(review): this import sits after the session is created -- presumably
# because the `delta` Python module only becomes importable once the package
# above has been loaded; confirm before hoisting it to the imports cell.
from delta.tables import *