In [1]:
import os
from datetime import datetime, timezone

import boto3
from dateutil import tz

In [2]:
# Authenticate before running, e.g.: aws sso login --profile "$AWS_PROFILE"
# NOTE(review): this hint used to name the "beta" profile while the code uses
# "sandbox" -- confirm which SSO profile is the right one to log in with.
#
# Single source of truth for the profile: an AWS_PROFILE that is already set in
# the environment wins, otherwise "sandbox" is stored and used. The session is
# bound to that same value so it cannot disagree with the environment.
profile_name = os.environ.setdefault("AWS_PROFILE", "sandbox")
session = boto3.Session(profile_name=profile_name)

In [3]:
# Build the S3 client from the profile-bound `session` created above, so every
# S3 call uses that session's credentials rather than boto3's default
# credential chain (which would leave `session` unused).
s3_client = session.client("s3")

In [None]:
# S3 location to inspect: object versions are listed under `prefix` inside
# `bucket_name`, and restored copies are written back to the same bucket.
prefix = "test_data/date=2024-11-06/"
bucket_name = "bb2-sandbox-datalake-raw"

In [5]:
def list_object_versions(bucket_name, prefix=None, client=None):
    """
    List every object version and delete marker in a bucket.

    Args:
        bucket_name (str): Bucket to list.
        prefix (str, optional): Only keys starting with this prefix are listed.
            A falsy value (None or "") lists the whole bucket.
        client (optional): S3 client to use. Defaults to the notebook-level
            ``s3_client``; passing one explicitly avoids the dependency on
            global state (e.g. for a different session or a stub).

    Returns:
        list[dict]: One record per entry, in page order (within each page the
        object versions come first, then the delete markers).
        Object versions carry "Key", "VersionId", "IsLatest", "LastModified"
        and "Size". Delete markers carry "Key", "VersionId", "IsLatest",
        "LastModified" and "IsDeleteMarker": True (they have no "Size").
    """
    # The global is only looked up when no client was injected.
    s3 = s3_client if client is None else client

    paginate_kwargs = {"Bucket": bucket_name}
    if prefix:
        paginate_kwargs["Prefix"] = prefix

    versions = []
    paginator = s3.get_paginator("list_object_versions")

    for page in paginator.paginate(**paginate_kwargs):
        # A page may omit either list entirely, hence the empty defaults.
        for version in page.get("Versions", []):
            versions.append(
                {
                    "Key": version["Key"],
                    "VersionId": version["VersionId"],
                    "IsLatest": version["IsLatest"],
                    "LastModified": version["LastModified"],
                    "Size": version["Size"],
                }
            )
        for delete_marker in page.get("DeleteMarkers", []):
            versions.append(
                {
                    "Key": delete_marker["Key"],
                    "VersionId": delete_marker["VersionId"],
                    "IsLatest": delete_marker["IsLatest"],
                    "LastModified": delete_marker["LastModified"],
                    "IsDeleteMarker": True,
                }
            )

    return versions

In [6]:
# Collect every version and delete marker under the configured prefix,
# then display the resulting records.
object_versions = list_object_versions(bucket_name=bucket_name, prefix=prefix)
object_versions

[{'Key': 'test_data/date=2024-11-06/60e53455ff1c4d648bd49371c9924411.snappy.parquet',
  'VersionId': 'HmJiahMwBewopNdrqutWea0xGbAx2rY5',
  'IsLatest': True,
  'LastModified': datetime.datetime(2024, 11, 28, 7, 44, 25, tzinfo=tzutc()),
  'Size': 1167532},
 {'Key': 'test_data/date=2024-11-06/60e53455ff1c4d648bd49371c9924411.snappy.parquet',
  'VersionId': 'rCmEMlHvcc6dNmgqBmzN9rQzL7qEJ0kX',
  'IsLatest': False,
  'LastModified': datetime.datetime(2024, 11, 28, 7, 41, 31, tzinfo=tzutc()),
  'Size': 1167532},
 {'Key': 'test_data/date=2024-11-06/c7e81600-736f-43c1-943b-69fc27f9f424.csv',
  'VersionId': 's7hQgQeTTLyQiu12o_TzagC3LfAQkz5x',
  'IsLatest': True,
  'LastModified': datetime.datetime(2024, 11, 28, 8, 13, 4, tzinfo=tzutc()),
  'Size': 1162866},
 {'Key': 'test_data/date=2024-11-06/c7e81600-736f-43c1-943b-69fc27f9f424.csv',
  'VersionId': '_Baw1ttJGaSpDPU1AJOTycBGR_fgW9t4',
  'IsLatest': False,
  'LastModified': datetime.datetime(2024, 11, 28, 8, 5, 48, tzinfo=tzutc()),
  'Size': 1162

In [7]:
def convert_to_utc_components(date_string):
    """
    Convert a datetime string carrying a UTC offset into UTC date/time components.

    Args:
        date_string (str): Timestamp in the form
            "November 20, 2024, 19:07:25 (UTC+02:00)". Leading and trailing
            whitespace is ignored.

    Returns:
        tuple: (year, month, day, hour, minute, second) of the same instant
        expressed in UTC.

    Raises:
        ValueError: If the string does not match the expected format.
    """
    # strip() tolerates stray whitespace/newlines (common in values read from
    # environment variables); the replaces drop the parentheses around the zone
    # so strptime sees "... UTC+02:00".
    cleaned_date_string = date_string.strip().replace(" (", " ").replace(")", "")

    # %Z consumes the zone name ("UTC") and %z the numeric offset, which yields
    # a timezone-aware datetime. %B matches month names of the current locale.
    datetime_obj = datetime.strptime(cleaned_date_string, "%B %d, %Y, %H:%M:%S %Z%z")

    # Shift the aware datetime to UTC using the standard-library UTC zone.
    datetime_obj_utc = datetime_obj.astimezone(timezone.utc)

    return (
        datetime_obj_utc.year,
        datetime_obj_utc.month,
        datetime_obj_utc.day,
        datetime_obj_utc.hour,
        datetime_obj_utc.minute,
        datetime_obj_utc.second,
    )

In [8]:
# Bounds of the time window, stored as environment variables and read back
# through os.environ by the cells that build start_date / end_date.
# setdefault() keeps any START_DATE / END_DATE that is already set and only
# falls back to the values below otherwise. The cell displays the value
# returned by the last call (END_DATE).
os.environ.setdefault("START_DATE", "November 28, 2024, 09:41:31 (UTC+02:00)")
os.environ.setdefault("END_DATE", "November 28, 2024, 10:13:04 (UTC+02:00)")

'November 28, 2024, 10:13:04 (UTC+02:00)'

In [9]:
# Lower bound of the window: START_DATE converted to UTC and rebuilt as a
# naive datetime (no tzinfo attached).
start_utc_components = convert_to_utc_components(os.environ["START_DATE"])
start_date = datetime(*start_utc_components)
start_date

datetime.datetime(2024, 11, 28, 7, 41, 31)

In [10]:
# Upper bound of the window: END_DATE converted to UTC and rebuilt as a
# naive datetime (no tzinfo attached).
end_utc_components = convert_to_utc_components(os.environ["END_DATE"])
end_date = datetime(*end_utc_components)
end_date

datetime.datetime(2024, 11, 28, 8, 13, 4)

In [11]:
# Keep the records last modified inside [start_date, end_date], both inclusive.
# The bounds are naive datetimes, so tzinfo is dropped from LastModified
# (without shifting the clock time) before comparing.
filtered_objects = list(
    filter(
        lambda version: start_date
        <= version["LastModified"].replace(tzinfo=None)
        <= end_date,
        object_versions,
    )
)

In [12]:
# Display the records that fell inside the [start_date, end_date] window.
filtered_objects

[{'Key': 'test_data/date=2024-11-06/60e53455ff1c4d648bd49371c9924411.snappy.parquet',
  'VersionId': 'HmJiahMwBewopNdrqutWea0xGbAx2rY5',
  'IsLatest': True,
  'LastModified': datetime.datetime(2024, 11, 28, 7, 44, 25, tzinfo=tzutc()),
  'Size': 1167532},
 {'Key': 'test_data/date=2024-11-06/60e53455ff1c4d648bd49371c9924411.snappy.parquet',
  'VersionId': 'rCmEMlHvcc6dNmgqBmzN9rQzL7qEJ0kX',
  'IsLatest': False,
  'LastModified': datetime.datetime(2024, 11, 28, 7, 41, 31, tzinfo=tzutc()),
  'Size': 1167532},
 {'Key': 'test_data/date=2024-11-06/c7e81600-736f-43c1-943b-69fc27f9f424.csv',
  'VersionId': 's7hQgQeTTLyQiu12o_TzagC3LfAQkz5x',
  'IsLatest': True,
  'LastModified': datetime.datetime(2024, 11, 28, 8, 13, 4, tzinfo=tzutc()),
  'Size': 1162866},
 {'Key': 'test_data/date=2024-11-06/c7e81600-736f-43c1-943b-69fc27f9f424.csv',
  'VersionId': '_Baw1ttJGaSpDPU1AJOTycBGR_fgW9t4',
  'IsLatest': False,
  'LastModified': datetime.datetime(2024, 11, 28, 8, 5, 48, tzinfo=tzutc()),
  'Size': 1162

In [15]:
def _restored_key(key, version_id):
    """Return the key under which version ``version_id`` of ``key`` is restored.

    The version id is inserted before the file extension so the copy sits next
    to the original, e.g. ``a/b.csv`` -> ``a/b_restored_<version_id>.csv``.
    """
    compound_suffix = ".snappy.parquet"
    if key.endswith(compound_suffix):
        # Keep the two-part suffix together instead of splitting at the last dot.
        base_key, extension = key[: -len(compound_suffix)], compound_suffix
    else:
        # splitext only inspects the last path component, so dots in "folder"
        # names are ignored and a key without an extension yields "" (manual
        # splitting on "." mangled both of those cases).
        base_key, extension = os.path.splitext(key)
    return f"{base_key}_restored_{version_id}{extension}"


# Copy every non-current object version in the window to a sibling
# "<name>_restored_<version id><ext>" key in the same bucket.
for obj in filtered_objects:
    # Current versions need no restore, and delete markers carry no data:
    # S3 rejects a delete marker as a copy source, which would abort the loop.
    if obj["IsLatest"] or obj.get("IsDeleteMarker"):
        continue

    key = obj["Key"]
    version_id = obj["VersionId"]
    new_key = _restored_key(key, version_id)

    # Server-side copy of that exact version to the new key.
    # NOTE(review): copy_object is a single-request copy; objects above its
    # size limit would need the managed/multipart copy instead -- confirm
    # whether such objects can occur here.
    s3_client.copy_object(
        Bucket=bucket_name,
        CopySource={"Bucket": bucket_name, "Key": key, "VersionId": version_id},
        Key=new_key,
    )

    print(
        f"Copied object {key} (version {version_id}) to {new_key} in bucket {bucket_name}/{prefix}."
    )

Copied object test_data/date=2024-11-06/60e53455ff1c4d648bd49371c9924411.snappy.parquet (version rCmEMlHvcc6dNmgqBmzN9rQzL7qEJ0kX) to test_data/date=2024-11-06/60e53455ff1c4d648bd49371c9924411_restored_rCmEMlHvcc6dNmgqBmzN9rQzL7qEJ0kX.snappy.parquet in bucket bb2-sandbox-datalake-raw/test_data/date=2024-11-06/.
Copied object test_data/date=2024-11-06/c7e81600-736f-43c1-943b-69fc27f9f424.csv (version _Baw1ttJGaSpDPU1AJOTycBGR_fgW9t4) to test_data/date=2024-11-06/c7e81600-736f-43c1-943b-69fc27f9f424_restored__Baw1ttJGaSpDPU1AJOTycBGR_fgW9t4.csv in bucket bb2-sandbox-datalake-raw/test_data/date=2024-11-06/.
