#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Interact with AWS S3, using the boto3 library."""
from __future__ import annotations
import asyncio
import fnmatch
import gzip as gz
import logging
import os
import re
import shutil
import time
import warnings
from contextlib import suppress
from copy import deepcopy
from datetime import datetime
from functools import wraps
from inspect import signature
from io import BytesIO
from pathlib import Path
from tempfile import NamedTemporaryFile, gettempdir
from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
from urllib.parse import urlsplit
from uuid import uuid4
if TYPE_CHECKING:
with suppress(ImportError):
from aiobotocore.client import AioBaseClient
from asgiref.sync import sync_to_async
from boto3.s3.transfer import S3Transfer, TransferConfig
from botocore.exceptions import ClientError
from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
from airflow.providers.amazon.aws.exceptions import S3HookUriParseFailure
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.providers.amazon.aws.utils.tags import format_tags
from airflow.utils.helpers import chunks
if TYPE_CHECKING:
from mypy_boto3_s3.service_resource import Bucket as S3Bucket, Object as S3ResourceObject
T = TypeVar("T", bound=Callable)
logger = logging.getLogger(__name__)
def provide_bucket_name(func: T) -> T:
"""Provide a bucket name taken from the connection if no bucket name has been passed to the function."""
if hasattr(func, "_unify_bucket_name_and_key_wrapped"):
logger.warning("`unify_bucket_name_and_key` should wrap `provide_bucket_name`.")
function_signature = signature(func)
@wraps(func)
def wrapper(*args, **kwargs) -> T:
bound_args = function_signature.bind(*args, **kwargs)
if "bucket_name" not in bound_args.arguments:
self = args[0]
if "bucket_name" in self.service_config:
bound_args.arguments["bucket_name"] = self.service_config["bucket_name"]
elif self.conn_config and self.conn_config.schema:
warnings.warn(
"s3 conn_type, and the associated schema field, is deprecated. "
"Please use aws conn_type instead, and specify `bucket_name` "
"in `service_config.s3` within `extras`.",
AirflowProviderDeprecationWarning,
stacklevel=2,
)
bound_args.arguments["bucket_name"] = self.conn_config.schema
return func(*bound_args.args, **bound_args.kwargs)
return cast(T, wrapper)
def provide_bucket_name_async(func: T) -> T:
"""Provide a bucket name taken from the connection if no bucket name has been passed to the function."""
function_signature = signature(func)
@wraps(func)
async def wrapper(*args: Any, **kwargs: Any) -> Any:
bound_args = function_signature.bind(*args, **kwargs)
if "bucket_name" not in bound_args.arguments:
self = args[0]
if self.aws_conn_id:
connection = await sync_to_async(self.get_connection)(self.aws_conn_id)
if connection.schema:
bound_args.arguments["bucket_name"] = connection.schema
return await func(*bound_args.args, **bound_args.kwargs)
return cast(T, wrapper)
def unify_bucket_name_and_key(func: T) -> T:
    """Unify bucket name and key when only a key (and no bucket name) has been passed to the function."""
function_signature = signature(func)
@wraps(func)
def wrapper(*args, **kwargs) -> T:
bound_args = function_signature.bind(*args, **kwargs)
if "wildcard_key" in bound_args.arguments:
key_name = "wildcard_key"
elif "key" in bound_args.arguments:
key_name = "key"
else:
raise ValueError("Missing key parameter!")
if "bucket_name" not in bound_args.arguments:
with suppress(S3HookUriParseFailure):
bound_args.arguments["bucket_name"], bound_args.arguments[key_name] = S3Hook.parse_s3_url(
bound_args.arguments[key_name]
)
return func(*bound_args.args, **bound_args.kwargs)
# set attr _unify_bucket_name_and_key_wrapped so that we can check at
# class definition that unify is the first decorator applied
# if provide_bucket_name is applied first, and there's a bucket defined in conn
# then if user supplies full key, bucket in key is not respected
wrapper._unify_bucket_name_and_key_wrapped = True # type: ignore[attr-defined]
return cast(T, wrapper)
class S3Hook(AwsBaseHook):
"""
Interact with Amazon Simple Storage Service (S3).
    Provide a thick wrapper around :external+boto3:py:class:`boto3.client("s3") <S3.Client>`
and :external+boto3:py:class:`boto3.resource("s3") <S3.ServiceResource>`.
:param transfer_config_args: Configuration object for managed S3 transfers.
:param extra_args: Extra arguments that may be passed to the download/upload operations.
.. seealso::
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#s3-transfers
- For allowed upload extra arguments see ``boto3.s3.transfer.S3Transfer.ALLOWED_UPLOAD_ARGS``.
- For allowed download extra arguments see ``boto3.s3.transfer.S3Transfer.ALLOWED_DOWNLOAD_ARGS``.
Additional arguments (such as ``aws_conn_id``) may be specified and
are passed down to the underlying AwsBaseHook.
.. seealso::
- :class:`airflow.providers.amazon.aws.hooks.base_aws.AwsBaseHook`
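
    **Example** (an illustrative sketch; the connection id and tuning values
    are assumptions, not required defaults):

    .. code-block:: python

        # Hypothetical hook with multipart-transfer tuning and SSE on uploads.
        hook = S3Hook(
            aws_conn_id="aws_default",
            transfer_config_args={"multipart_threshold": 8 * 1024 * 1024, "max_concurrency": 4},
            extra_args={"ServerSideEncryption": "AES256"},
        )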
"""
def __init__(
self,
aws_conn_id: str | None = AwsBaseHook.default_conn_name,
transfer_config_args: dict | None = None,
extra_args: dict | None = None,
*args,
**kwargs,
) -> None:
kwargs["client_type"] = "s3"
kwargs["aws_conn_id"] = aws_conn_id
if transfer_config_args and not isinstance(transfer_config_args, dict):
raise TypeError(f"transfer_config_args expected dict, got {type(transfer_config_args).__name__}.")
self.transfer_config = TransferConfig(**transfer_config_args or {})
if extra_args and not isinstance(extra_args, dict):
raise TypeError(f"extra_args expected dict, got {type(extra_args).__name__}.")
self._extra_args = extra_args or {}
super().__init__(*args, **kwargs)
@property
def extra_args(self):
"""Return hook's extra arguments (immutable)."""
return deepcopy(self._extra_args)
@staticmethod
def parse_s3_url(s3url: str) -> tuple[str, str]:
"""
        Parse the S3 URL into a bucket name and key.

        See https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-bucket-intro.html
        for valid URL formats.

        :param s3url: The S3 URL to parse.
:return: the parsed bucket name and key
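
        **Example** (a minimal sketch; bucket and key names are illustrative):

        .. code-block:: python

            # Both the s3:// scheme and the virtual-hosted-style URL parse the same way.
            S3Hook.parse_s3_url("s3://my-bucket/path/to/file.txt")
            # -> ("my-bucket", "path/to/file.txt")
            S3Hook.parse_s3_url("https://my-bucket.s3.us-east-1.amazonaws.com/path/to/file.txt")
            # -> ("my-bucket", "path/to/file.txt")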
"""
valid_s3_format = "S3://bucket-name/key-name"
valid_s3_virtual_hosted_format = "https://bucket-name.s3.region-code.amazonaws.com/key-name"
        url_parts = s3url.split("//")
        if re.match(r"s3[na]?:", url_parts[0], re.IGNORECASE):
parsed_url = urlsplit(s3url)
if not parsed_url.netloc:
raise S3HookUriParseFailure(
"Please provide a bucket name using a valid format of the form: "
f'{valid_s3_format} or {valid_s3_virtual_hosted_format} but provided: "{s3url}"'
)
bucket_name = parsed_url.netloc
key = parsed_url.path.lstrip("/")
        elif url_parts[0] == "https:":
            temp_split = url_parts[1].split(".")
if temp_split[0] == "s3":
# "https://s3.region-code.amazonaws.com/bucket-name/key-name"
                _, bucket_name, key = url_parts[1].split("/", 2)
elif temp_split[1] == "s3":
# "https://bucket-name.s3.region-code.amazonaws.com/key-name"
bucket_name = temp_split[0]
                key = url_parts[1].partition("/")[-1]
else:
raise S3HookUriParseFailure(
"Please provide a bucket name using a valid virtually hosted format which should "
f'be of the form: {valid_s3_virtual_hosted_format} but provided: "{s3url}"'
)
else:
raise S3HookUriParseFailure(
"Please provide a bucket name using a valid format of the form: "
f'{valid_s3_format} or {valid_s3_virtual_hosted_format} but provided: "{s3url}"'
)
return bucket_name, key
@staticmethod
def get_s3_bucket_key(
bucket: str | None, key: str, bucket_param_name: str, key_param_name: str
) -> tuple[str, str]:
"""
Get the S3 bucket name and key.
From either:
- bucket name and key. Return the info as it is after checking `key` is a relative path.
- key. Must be a full s3:// url.
:param bucket: The S3 bucket name
:param key: The S3 key
:param bucket_param_name: The parameter name containing the bucket name
:param key_param_name: The parameter name containing the key name
:return: the parsed bucket name and key
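
        **Example** (a minimal sketch; names are illustrative):

        .. code-block:: python

            # Explicit bucket plus a relative key...
            S3Hook.get_s3_bucket_key("my-bucket", "data/file.csv", "bucket_name", "key")
            # -> ("my-bucket", "data/file.csv")
            # ...or no bucket and a full s3:// url.
            S3Hook.get_s3_bucket_key(None, "s3://my-bucket/data/file.csv", "bucket_name", "key")
            # -> ("my-bucket", "data/file.csv")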
"""
if bucket is None:
return S3Hook.parse_s3_url(key)
parsed_url = urlsplit(key)
if parsed_url.scheme != "" or parsed_url.netloc != "":
raise TypeError(
f"If `{bucket_param_name}` is provided, {key_param_name} should be a relative path "
"from root level, rather than a full s3:// url"
)
return bucket, key
@provide_bucket_name
def check_for_bucket(self, bucket_name: str | None = None) -> bool:
"""
Check if bucket_name exists.
.. seealso::
- :external+boto3:py:meth:`S3.Client.head_bucket`
:param bucket_name: the name of the bucket
:return: True if it exists and False if not.
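
        **Example** (a minimal sketch; the bucket name is illustrative):

        .. code-block:: python

            hook = S3Hook()
            if not hook.check_for_bucket("my-bucket"):
                raise ValueError("Bucket is missing or access is forbidden")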
"""
try:
self.get_conn().head_bucket(Bucket=bucket_name)
return True
except ClientError as e:
# The head_bucket api is odd in that it cannot return proper
# exception objects, so error codes must be used. Only 200, 404 and 403
# are ever returned. See the following links for more details:
# https://github.com/boto/boto3/issues/2499
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_bucket
return_code = int(e.response["Error"]["Code"])
if return_code == 404:
self.log.info('Bucket "%s" does not exist', bucket_name)
elif return_code == 403:
self.log.error(
'Access to bucket "%s" is forbidden or there was an error with the request', bucket_name
)
self.log.error(e)
return False
@provide_bucket_name
def get_bucket(self, bucket_name: str | None = None) -> S3Bucket:
"""
Return a :py:class:`S3.Bucket` object.
.. seealso::
- :external+boto3:py:meth:`S3.ServiceResource.Bucket`
:param bucket_name: the name of the bucket
        :return: the bucket object for the given bucket name.
"""
s3_resource = self.get_session().resource(
"s3",
endpoint_url=self.conn_config.endpoint_url,
config=self.config,
verify=self.verify,
)
return s3_resource.Bucket(bucket_name)
@provide_bucket_name
def create_bucket(self, bucket_name: str | None = None, region_name: str | None = None) -> None:
"""
Create an Amazon S3 bucket.
.. seealso::
- :external+boto3:py:meth:`S3.Client.create_bucket`
:param bucket_name: The name of the bucket
        :param region_name: The name of the AWS region in which to create the bucket.
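
        **Example** (a minimal sketch; bucket name and region are assumptions):

        .. code-block:: python

            hook = S3Hook()
            hook.create_bucket(bucket_name="my-new-bucket", region_name="eu-west-1")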
"""
if not region_name:
if self.conn_region_name == "aws-global":
raise AirflowException(
"Unable to create bucket if `region_name` not set "
"and boto3 configured to use s3 regional endpoints."
)
region_name = self.conn_region_name
if region_name == "us-east-1":
self.get_conn().create_bucket(Bucket=bucket_name)
else:
self.get_conn().create_bucket(
Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": region_name}
)
@provide_bucket_name
def check_for_prefix(self, prefix: str, delimiter: str, bucket_name: str | None = None) -> bool:
"""
Check that a prefix exists in a bucket.
:param bucket_name: the name of the bucket
:param prefix: a key prefix
:param delimiter: the delimiter marks key hierarchy.
:return: False if the prefix does not exist in the bucket and True if it does.
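
        **Example** (a minimal sketch; names are illustrative):

        .. code-block:: python

            hook = S3Hook()
            # True only if at least one key exists under "raw/2024/" in the bucket.
            hook.check_for_prefix(prefix="raw/2024", delimiter="/", bucket_name="my-bucket")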
"""
if not prefix.endswith(delimiter):
prefix += delimiter
        prefix_split = re.split(rf"(\w+[{delimiter}])$", prefix, maxsplit=1)
previous_level = prefix_split[0]
plist = self.list_prefixes(bucket_name, previous_level, delimiter)
return prefix in plist
@provide_bucket_name
def list_prefixes(
self,
bucket_name: str | None = None,
prefix: str | None = None,
delimiter: str | None = None,
page_size: int | None = None,
max_items: int | None = None,
) -> list:
"""
List prefixes in a bucket under prefix.
.. seealso::
- :external+boto3:py:class:`S3.Paginator.ListObjectsV2`
:param bucket_name: the name of the bucket
:param prefix: a key prefix
:param delimiter: the delimiter marks key hierarchy.
:param page_size: pagination size
:param max_items: maximum items to return
:return: a list of matched prefixes
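
        **Example** (a minimal sketch; names are illustrative):

        .. code-block:: python

            hook = S3Hook()
            # Returns the first-level "subdirectories", e.g. ["raw/", "staging/"].
            hook.list_prefixes(bucket_name="my-bucket", prefix="", delimiter="/")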
"""
prefix = prefix or ""
delimiter = delimiter or ""
config = {
"PageSize": page_size,
"MaxItems": max_items,
}
paginator = self.get_conn().get_paginator("list_objects_v2")
response = paginator.paginate(
Bucket=bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
)
prefixes: list[str] = []
for page in response:
if "CommonPrefixes" in page:
prefixes.extend(common_prefix["Prefix"] for common_prefix in page["CommonPrefixes"])
return prefixes
@provide_bucket_name_async
@unify_bucket_name_and_key
async def get_head_object_async(
self, client: AioBaseClient, key: str, bucket_name: str | None = None
) -> dict[str, Any] | None:
"""
Retrieve metadata of an object.
:param client: aiobotocore client
:param bucket_name: Name of the bucket in which the file is stored
:param key: S3 key that will point to the file
"""
head_object_val: dict[str, Any] | None = None
try:
head_object_val = await client.head_object(Bucket=bucket_name, Key=key)
return head_object_val
except ClientError as e:
if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
return head_object_val
else:
raise e
async def list_prefixes_async(
self,
client: AioBaseClient,
bucket_name: str | None = None,
prefix: str | None = None,
delimiter: str | None = None,
page_size: int | None = None,
max_items: int | None = None,
) -> list[Any]:
"""
List prefixes in a bucket under prefix.
        :param client: aiobotocore client
:param bucket_name: the name of the bucket
:param prefix: a key prefix
:param delimiter: the delimiter marks key hierarchy.
:param page_size: pagination size
:param max_items: maximum items to return
:return: a list of matched prefixes
"""
prefix = prefix or ""
delimiter = delimiter or ""
config = {
"PageSize": page_size,
"MaxItems": max_items,
}
paginator = client.get_paginator("list_objects_v2")
response = paginator.paginate(
Bucket=bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
)
prefixes = []
async for page in response:
if "CommonPrefixes" in page:
for common_prefix in page["CommonPrefixes"]:
prefixes.append(common_prefix["Prefix"])
return prefixes
@provide_bucket_name_async
async def get_file_metadata_async(self, client: AioBaseClient, bucket_name: str, key: str) -> list[Any]:
"""
        Asynchronously get a list of files in the bucket that may match the given wildcard key.
:param client: aiobotocore client
:param bucket_name: the name of the bucket
:param key: the path to the key
"""
        prefix = re.split(r"[\[*?]", key, maxsplit=1)[0]
delimiter = ""
paginator = client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter=delimiter)
files = []
async for page in response:
if "Contents" in page:
files += page["Contents"]
return files
async def _check_key_async(
self,
client: AioBaseClient,
bucket_val: str,
wildcard_match: bool,
key: str,
) -> bool:
"""
        Check whether a key exists, either by wildcard match or by a head-object call.

        If wildcard_match is True, asynchronously get the list of files in the bucket,
        match them against the wildcard expression and return the boolean result. If
        wildcard_match is False, get the head object for the key and return whether it exists.

        :param client: aiobotocore client
        :param bucket_val: the name of the bucket
        :param key: S3 key that will point to the file
        :param wildcard_match: whether the key should be treated as a wildcard expression
"""
bucket_name, key = self.get_s3_bucket_key(bucket_val, key, "bucket_name", "bucket_key")
if wildcard_match:
keys = await self.get_file_metadata_async(client, bucket_name, key)
key_matches = [k for k in keys if fnmatch.fnmatch(k["Key"], key)]
if not key_matches:
return False
else:
obj = await self.get_head_object_async(client, key, bucket_name)
if obj is None:
return False
return True
async def check_key_async(
self,
client: AioBaseClient,
bucket: str,
bucket_keys: str | list[str],
wildcard_match: bool,
) -> bool:
"""
        Check whether one or several keys exist, by wildcard match or head-object calls.

        If wildcard_match is True, asynchronously get the list of files in the bucket,
        match them against the wildcard expression and return the boolean result. If
        wildcard_match is False, get the head object for each key and return whether all exist.

        :param client: aiobotocore client
        :param bucket: the name of the bucket
        :param bucket_keys: S3 key or list of keys that will point to the file(s)
        :param wildcard_match: whether the keys should be treated as wildcard expressions
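
        **Example** (a sketch; assumes this is awaited from a trigger, where
        ``async_conn`` yields an aiobotocore client, and the names are illustrative):

        .. code-block:: python

            async with self.hook.async_conn as client:
                exists = await self.hook.check_key_async(client, "my-bucket", "data/*.csv", True)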
"""
if isinstance(bucket_keys, list):
return all(
await asyncio.gather(
*(self._check_key_async(client, bucket, wildcard_match, key) for key in bucket_keys)
)
)
return await self._check_key_async(client, bucket, wildcard_match, bucket_keys)
async def check_for_prefix_async(
self, client: AioBaseClient, prefix: str, delimiter: str, bucket_name: str | None = None
) -> bool:
"""
Check that a prefix exists in a bucket.
        :param client: aiobotocore client
        :param bucket_name: the name of the bucket
:param prefix: a key prefix
:param delimiter: the delimiter marks key hierarchy.
:return: False if the prefix does not exist in the bucket and True if it does.
"""
if not prefix.endswith(delimiter):
prefix += delimiter
        prefix_split = re.split(rf"(\w+[{delimiter}])$", prefix, maxsplit=1)
previous_level = prefix_split[0]
plist = await self.list_prefixes_async(client, bucket_name, previous_level, delimiter)
return prefix in plist
async def _check_for_prefix_async(
self, client: AioBaseClient, prefix: str, delimiter: str, bucket_name: str | None = None
) -> bool:
return await self.check_for_prefix_async(
client, prefix=prefix, delimiter=delimiter, bucket_name=bucket_name
)
async def get_files_async(
self,
client: AioBaseClient,
bucket: str,
bucket_keys: str | list[str],
wildcard_match: bool,
delimiter: str | None = "/",
) -> list[Any]:
"""Get a list of files in the bucket."""
        keys: list[Any] = []
        if isinstance(bucket_keys, str):
            # A single key may be passed as a plain string; avoid iterating it char by char.
            bucket_keys = [bucket_keys]
        for key in bucket_keys:
prefix = key
if wildcard_match:
                prefix = re.split(r"[\[*?]", key, maxsplit=1)[0]
paginator = client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter=delimiter)
async for page in response:
if "Contents" in page:
keys.extend(k for k in page["Contents"] if isinstance(k.get("Size"), (int, float)))
return keys
@staticmethod
async def _list_keys_async(
client: AioBaseClient,
bucket_name: str | None = None,
prefix: str | None = None,
delimiter: str | None = None,
page_size: int | None = None,
max_items: int | None = None,
) -> list[str]:
"""
List keys in a bucket under prefix and not containing delimiter.
        :param client: aiobotocore client
        :param bucket_name: the name of the bucket
:param prefix: a key prefix
:param delimiter: the delimiter marks key hierarchy.
:param page_size: pagination size
:param max_items: maximum items to return
:return: a list of matched keys
"""
prefix = prefix or ""
delimiter = delimiter or ""
config = {
"PageSize": page_size,
"MaxItems": max_items,
}
paginator = client.get_paginator("list_objects_v2")
response = paginator.paginate(
Bucket=bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
)
keys = []
async for page in response:
if "Contents" in page:
for k in page["Contents"]:
keys.append(k["Key"])
return keys
def _list_key_object_filter(
self, keys: list, from_datetime: datetime | None = None, to_datetime: datetime | None = None
) -> list:
        """Filter object metadata, keeping keys whose LastModified is within (from_datetime, to_datetime]."""

        def _is_in_period(input_date: datetime) -> bool:
if from_datetime is not None and input_date <= from_datetime:
return False
if to_datetime is not None and input_date > to_datetime:
return False
return True
return [k["Key"] for k in keys if _is_in_period(k["LastModified"])]
async def is_keys_unchanged_async(
self,
client: AioBaseClient,
bucket_name: str,
prefix: str,
inactivity_period: float = 60 * 60,
min_objects: int = 1,
previous_objects: set[str] | None = None,
inactivity_seconds: int = 0,
allow_delete: bool = True,
last_activity_time: datetime | None = None,
) -> dict[str, Any]:
"""
Check if new objects have been uploaded and the period has passed; update sensor state accordingly.
:param client: aiobotocore client
:param bucket_name: the name of the bucket
:param prefix: a key prefix
:param inactivity_period: the total seconds of inactivity to designate
keys unchanged. Note, this mechanism is not real time and
this operator may not return until a poke_interval after this period
has passed with no additional objects sensed.
:param min_objects: the minimum number of objects needed for keys unchanged
sensor to be considered valid.
:param previous_objects: the set of object ids found during the last poke.
:param inactivity_seconds: number of inactive seconds
:param allow_delete: Should this sensor consider objects being deleted
between pokes valid behavior. If true a warning message will be logged
when this happens. If false an error will be raised.
:param last_activity_time: last activity datetime.
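
        **Example** of the "pending" state returned to the caller (a sketch;
        values are illustrative):

        .. code-block:: python

            {
                "status": "pending",  # or "success" / "error" with a "message" key
                "previous_objects": {"raw/a.csv"},
                "last_activity_time": datetime(2024, 1, 1, 12, 0),
                "inactivity_seconds": 30,
            }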
"""
if not previous_objects:
previous_objects = set()
list_keys = await self._list_keys_async(client=client, bucket_name=bucket_name, prefix=prefix)
current_objects = set(list_keys)
current_num_objects = len(current_objects)
if current_num_objects > len(previous_objects):
# When new objects arrived, reset the inactivity_seconds
# and update previous_objects for the next poke.
self.log.info(
"New objects found at %s, resetting last_activity_time.",
os.path.join(bucket_name, prefix),
)
self.log.debug("New objects: %s", current_objects - previous_objects)
last_activity_time = datetime.now()
inactivity_seconds = 0
previous_objects = current_objects
return {
"status": "pending",
"previous_objects": previous_objects,
"last_activity_time": last_activity_time,
"inactivity_seconds": inactivity_seconds,
}
        if len(previous_objects) > len(current_objects):
# During the last poke interval objects were deleted.
if allow_delete:
deleted_objects = previous_objects - current_objects
previous_objects = current_objects
last_activity_time = datetime.now()
self.log.info(
"Objects were deleted during the last poke interval. Updating the "
"file counter and resetting last_activity_time:\n%s",
deleted_objects,
)
return {
"status": "pending",
"previous_objects": previous_objects,
"last_activity_time": last_activity_time,
"inactivity_seconds": inactivity_seconds,
}
            return {
                "status": "error",
                "message": f"Illegal behavior: objects were deleted in {os.path.join(bucket_name, prefix)} between pokes.",
            }
if last_activity_time:
inactivity_seconds = int((datetime.now() - last_activity_time).total_seconds())
else:
# Handles the first poke where last inactivity time is None.
last_activity_time = datetime.now()
inactivity_seconds = 0
if inactivity_seconds >= inactivity_period:
path = os.path.join(bucket_name, prefix)
if current_num_objects >= min_objects:
                success_message = (
                    f"SUCCESS: Sensor found {current_num_objects} objects at {path}. "
                    f"Waited at least {inactivity_period} seconds, with no new objects uploaded."
                )
self.log.info(success_message)
return {
"status": "success",
"message": success_message,
}
self.log.error("FAILURE: Inactivity Period passed, not enough objects found in %s", path)
return {
"status": "error",
"message": f"FAILURE: Inactivity Period passed, not enough objects found in {path}",
}
return {
"status": "pending",
"previous_objects": previous_objects,
"last_activity_time": last_activity_time,
"inactivity_seconds": inactivity_seconds,
}
@provide_bucket_name
def list_keys(
self,
bucket_name: str | None = None,
prefix: str | None = None,
delimiter: str | None = None,
page_size: int | None = None,
max_items: int | None = None,
start_after_key: str | None = None,
from_datetime: datetime | None = None,
to_datetime: datetime | None = None,
object_filter: Callable[..., list] | None = None,
apply_wildcard: bool = False,
) -> list:
"""
List keys in a bucket under prefix and not containing delimiter.
.. seealso::
- :external+boto3:py:class:`S3.Paginator.ListObjectsV2`
:param bucket_name: the name of the bucket
:param prefix: a key prefix
:param delimiter: the delimiter marks key hierarchy.
:param page_size: pagination size
:param max_items: maximum items to return
:param start_after_key: should return only keys greater than this key
        :param from_datetime: should return only keys with LastModified attr strictly greater than
            from_datetime
        :param to_datetime: should return only keys with LastModified attr less than or equal to
            to_datetime
        :param object_filter: Function that receives the list of the S3 objects, from_datetime and
            to_datetime and returns the list of matched keys.
:param apply_wildcard: whether to treat '*' as a wildcard or a plain symbol in the prefix.
        **Example**: Returns the list of S3 objects with LastModified attr no earlier than
        from_datetime and no later than to_datetime:
.. code-block:: python
def object_filter(
keys: list,
from_datetime: datetime | None = None,
to_datetime: datetime | None = None,
) -> list:
def _is_in_period(input_date: datetime) -> bool:
if from_datetime is not None and input_date < from_datetime:
return False
if to_datetime is not None and input_date > to_datetime:
return False
return True
return [k["Key"] for k in keys if _is_in_period(k["LastModified"])]
:return: a list of matched keys
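
        **Example** of a wildcard listing (a sketch; names are illustrative):

        .. code-block:: python

            hook = S3Hook()
            # With apply_wildcard=True, the prefix up to the first "*" is listed
            # server-side and the full pattern is then fnmatch-ed client-side.
            keys = hook.list_keys(bucket_name="my-bucket", prefix="logs/2024-*.gz", apply_wildcard=True)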
"""
_original_prefix = prefix or ""
_apply_wildcard = bool(apply_wildcard and "*" in _original_prefix)
_prefix = _original_prefix.split("*", 1)[0] if _apply_wildcard else _original_prefix
delimiter = delimiter or ""
start_after_key = start_after_key or ""
object_filter_usr = object_filter
config = {
"PageSize": page_size,
"MaxItems": max_items,
}
paginator = self.get_conn().get_paginator("list_objects_v2")
response = paginator.paginate(
Bucket=bucket_name,
Prefix=_prefix,
Delimiter=delimiter,
PaginationConfig=config,
StartAfter=start_after_key,
)
keys: list[str] = []
for page in response:
if "Contents" in page:
new_keys = page["Contents"]
if _apply_wildcard:
new_keys = (k for k in new_keys if fnmatch.fnmatch(k["Key"], _original_prefix))
keys.extend(new_keys)
if object_filter_usr is not None:
return object_filter_usr(keys, from_datetime, to_datetime)
return self._list_key_object_filter(keys, from_datetime, to_datetime)
@provide_bucket_name
def get_file_metadata(
self,
prefix: str,
bucket_name: str | None = None,
page_size: int | None = None,
max_items: int | None = None,
) -> list:
"""
List metadata objects in a bucket under prefix.
.. seealso::
- :external+boto3:py:class:`S3.Paginator.ListObjectsV2`
:param prefix: a key prefix
:param bucket_name: the name of the bucket
:param page_size: pagination size
:param max_items: maximum items to return
:return: a list of metadata of objects
"""
config = {
"PageSize": page_size,
"MaxItems": max_items,
}
paginator = self.get_conn().get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix=prefix, PaginationConfig=config)
files = []
for page in response:
if "Contents" in page:
files += page["Contents"]
return files
@unify_bucket_name_and_key
@provide_bucket_name
def head_object(self, key: str, bucket_name: str | None = None) -> dict | None:
"""
Retrieve metadata of an object.
.. seealso::
- :external+boto3:py:meth:`S3.Client.head_object`
:param key: S3 key that will point to the file
:param bucket_name: Name of the bucket in which the file is stored
:return: metadata of an object
"""
try:
return self.get_conn().head_object(Bucket=bucket_name, Key=key)
except ClientError as e:
if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
return None
else:
raise e
@unify_bucket_name_and_key
@provide_bucket_name
def check_for_key(self, key: str, bucket_name: str | None = None) -> bool:
"""
Check if a key exists in a bucket.
.. seealso::
- :external+boto3:py:meth:`S3.Client.head_object`
:param key: S3 key that will point to the file
:param bucket_name: Name of the bucket in which the file is stored
:return: True if the key exists and False if not.
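
        **Example** (a minimal sketch; names are illustrative):

        .. code-block:: python

            hook = S3Hook()
            # A full s3:// url also works, courtesy of unify_bucket_name_and_key.
            hook.check_for_key("s3://my-bucket/data/file.csv")
            hook.check_for_key("data/file.csv", bucket_name="my-bucket")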
"""
obj = self.head_object(key, bucket_name)
return obj is not None
@unify_bucket_name_and_key
@provide_bucket_name
def get_key(self, key: str, bucket_name: str | None = None) -> S3ResourceObject:
"""
Return a :py:class:`S3.Object`.
.. seealso::
- :external+boto3:py:meth:`S3.ServiceResource.Object`
:param key: the path to the key
:param bucket_name: the name of the bucket
:return: the key object from the bucket
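
        **Example** (a minimal sketch; names are illustrative):

        .. code-block:: python

            obj = S3Hook().get_key("data/file.csv", bucket_name="my-bucket")
            print(obj.content_length, obj.last_modified)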
"""
def sanitize_extra_args() -> dict[str, str]:
"""Parse extra_args and return a dict with only the args listed in ALLOWED_DOWNLOAD_ARGS."""
return {
arg_name: arg_value
for (arg_name, arg_value) in self.extra_args.items()
if arg_name in S3Transfer.ALLOWED_DOWNLOAD_ARGS
}
s3_resource = self.get_session().resource(
"s3",
endpoint_url=self.conn_config.endpoint_url,
config=self.config,
verify=self.verify,
)
obj = s3_resource.Object(bucket_name, key)
obj.load(**sanitize_extra_args())
return obj
@unify_bucket_name_and_key
@provide_bucket_name
def read_key(self, key: str, bucket_name: str | None = None) -> str:
"""
Read a key from S3.
.. seealso::
- :external+boto3:py:meth:`S3.Object.get`
:param key: S3 key that will point to the file
:param bucket_name: Name of the bucket in which the file is stored
:return: the content of the key
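
        **Example** (a minimal sketch; the key is illustrative and must contain UTF-8 text):

        .. code-block:: python

            content = S3Hook().read_key("configs/settings.json", bucket_name="my-bucket")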
"""
obj = self.get_key(key, bucket_name)
return obj.get()["Body"].read().decode("utf-8")
@unify_bucket_name_and_key
@provide_bucket_name
def select_key(
self,
key: str,
bucket_name: str | None = None,
expression: str | None = None,
expression_type: str | None = None,
input_serialization: dict[str, Any] | None = None,
output_serialization: dict[str, Any] | None = None,
) -> str:
"""
Read a key with S3 Select.
.. seealso::
- :external+boto3:py:meth:`S3.Client.select_object_content`
:param key: S3 key that will point to the file
:param bucket_name: Name of the bucket in which the file is stored
:param expression: S3 Select expression
:param expression_type: S3 Select expression type
:param input_serialization: S3 Select input data serialization format
:param output_serialization: S3 Select output data serialization format
:return: retrieved subset of original data by S3 Select
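
        **Example** (a sketch; the key, query and serialization settings are assumptions):

        .. code-block:: python

            hook = S3Hook()
            # Project a single CSV column server-side instead of downloading the whole object.
            subset = hook.select_key(
                key="data/file.csv",
                bucket_name="my-bucket",
                expression="SELECT s._1 FROM S3Object s",
                input_serialization={"CSV": {"FileHeaderInfo": "NONE"}},
            )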
"""
expression = expression or "SELECT * FROM S3Object"
expression_type = expression_type or "SQL"
if input_serialization is None:
input_serialization = {"CSV": {}}
if output_serialization is None:
output_serialization = {"CSV": {}}
response = self.get_conn().select_object_content(
Bucket=bucket_name,
Key=key,
Expression=expression,
ExpressionType=expression_type,
InputSerialization=input_serialization,
OutputSerialization=output_serialization,
)
return b"".join(
event["Records"]["Payload"] for event in response["Payload"] if "Records" in event
).decode("utf-8")