In [1]:
import requests, boto3

In [2]:
# Remote source URLs (static.openfoodfacts.org) of the two files that the later
# cells copy to S3: the openfoodfacts-products.jsonl.gz archive and data-fields.txt.
url_data = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"
# NOTE(review): "fileds" looks like a typo for "fields"; the name is kept unchanged
# because the final cell reads this variable.
url_fileds = "https://static.openfoodfacts.org/data/data-fields.txt"

In [3]:
# Destination in S3: the target bucket plus one object key per uploaded file
BUCKET = 'nutriflow-pipeline'
# Key of the object written by the multipart upload of url_data
KEY = 'phase-a/raw/openfoodfacts-products.jsonl.gz'
# Key of the object written by the upload of url_fileds
KEY_Field = 'phase-a/raw/data-fields.txt'

In [4]:
# Create the S3 client, then open a multipart upload for the dataset object.
# `mpu` keeps the service response; later cells read mpu['UploadId'] from it.
s3_client = boto3.client('s3')
mpu = s3_client.create_multipart_upload(
    Key=KEY,
    Bucket=BUCKET,
)

In [5]:
# Bookkeeping for the multipart upload: `parts` collects the ETag/PartNumber pairs
# needed to complete the upload, `part_no` is the next part number (S3 starts at 1).
parts, part_no = [], 1

# Open the download as a stream so the archive is never held in memory at once.
# - timeout=(connect, read): without it a stalled server blocks the kernel forever.
# - raise_for_status(): without it an HTTP error page would be uploaded as the dataset.
# If the request fails, the already-opened multipart upload is aborted so it does
# not linger in the bucket; re-run the cell that creates `mpu` before retrying.
try:
    response = requests.get(url_data, stream=True, timeout=(10, 300))
    response.raise_for_status()
except requests.RequestException:
    s3_client.abort_multipart_upload(
        Bucket=BUCKET,
        Key=KEY,
        UploadId=mpu['UploadId'],
    )
    raise

In [6]:
# Download the dataset and upload to s3 in parts
PART_SIZE = 100 * 1024 * 1024  # 100MB parts
READ_SIZE = 8 * 1024 * 1024    # size of each read from the HTTP stream


def _iter_parts(chunks, part_size):
    """Regroup an iterable of byte chunks into parts of exactly `part_size` bytes.

    iter_content() does not guarantee the size of the chunks it yields, while S3
    rejects a multipart upload whose non-final parts are smaller than 5 MiB.
    Buffering here makes every part exactly `part_size` bytes; only the last
    part may be shorter. Empty chunks are ignored.
    """
    pending = bytearray()
    for piece in chunks:
        pending.extend(piece)
        while len(pending) >= part_size:
            yield bytes(pending[:part_size])
            del pending[:part_size]
    if pending:
        yield bytes(pending)


try:
    for chunk in _iter_parts(response.iter_content(chunk_size=READ_SIZE), PART_SIZE):
        resp = s3_client.upload_part(
            Bucket=BUCKET,
            Key=KEY,
            UploadId=mpu['UploadId'],
            PartNumber=part_no,
            Body=chunk
        )
        print(f"Uploaded part {part_no} — {len(chunk):,} bytes, ETag={resp['ETag']}")
        parts.append({'ETag': resp['ETag'], 'PartNumber': part_no})
        part_no += 1
    if not parts:
        # Completing a multipart upload with zero parts fails; report the real cause.
        raise RuntimeError("Download produced no data; nothing was uploaded")
except BaseException:
    # Includes KeyboardInterrupt (kernel interrupt): abort so the parts already
    # stored are released instead of lingering as an unfinished upload.
    # After an abort, restart from the cell that creates `mpu`.
    s3_client.abort_multipart_upload(
        Bucket=BUCKET,
        Key=KEY,
        UploadId=mpu['UploadId'],
    )
    raise
finally:
    # Release the HTTP connection whether the transfer succeeded or not.
    response.close()

Uploaded part 1 — 104,857,600 bytes, ETag="a84d0da6dbb2dae066f08478512f240d"
Uploaded part 2 — 104,857,600 bytes, ETag="6b678f85b325b9e2d3b7ef548d2dbc3f"
Uploaded part 3 — 104,857,600 bytes, ETag="142b40e6c20a142a59d5a7c65daeee23"
Uploaded part 4 — 104,857,600 bytes, ETag="aea3fb878631770468edb8fe8d673f33"
Uploaded part 5 — 104,857,600 bytes, ETag="292f900802ed84eeb8f63385e7c27254"
Uploaded part 6 — 104,857,600 bytes, ETag="cb4a754143f26d11ef19351e76170684"
Uploaded part 7 — 104,857,600 bytes, ETag="6bd0bc66479a4418dba5be509f33850e"
Uploaded part 8 — 104,857,600 bytes, ETag="4aea418cdc65fe40d00549c6503bea57"
Uploaded part 9 — 104,857,600 bytes, ETag="607a1e0abe7002e5444759f8fd18c87d"
Uploaded part 10 — 104,857,600 bytes, ETag="e64432c42b06e5fd96a35a83e5bcfef6"
Uploaded part 11 — 104,857,600 bytes, ETag="57053b90914744f192cf24fec9a1cbb2"
Uploaded part 12 — 104,857,600 bytes, ETag="f73103d09922c2e8fd193cefa44751f6"
Uploaded part 13 — 104,857,600 bytes, ETag="4d1ecaf223e1fda9fcd11f7a14e1f

In [7]:
# Close the multipart upload: S3 assembles the object from the parts recorded in
# `parts` (ETag + PartNumber of every uploaded part), then show the raw response.
completed = s3_client.complete_multipart_upload(
    Bucket=BUCKET, Key=KEY, UploadId=mpu['UploadId'],
    MultipartUpload={'Parts': parts},
)

print("CompleteMultipartUpload response:", completed, sep="\n")

CompleteMultipartUpload response:
{'ResponseMetadata': {'RequestId': 'KJTTEJDVK0NRQ2QM', 'HostId': '0qQ6gn8K/RoKdh111p4vuUrhUqGRIOUVIRWwpBPJBtq7ukftIkDdNQNGeyh1rKD4mlO2y7yGuZw=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '0qQ6gn8K/RoKdh111p4vuUrhUqGRIOUVIRWwpBPJBtq7ukftIkDdNQNGeyh1rKD4mlO2y7yGuZw=', 'x-amz-request-id': 'KJTTEJDVK0NRQ2QM', 'date': 'Fri, 13 Jun 2025 20:53:01 GMT', 'x-amz-server-side-encryption': 'AES256', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'ServerSideEncryption': 'AES256', 'Location': 'https://nutriflow-pipeline.s3.amazonaws.com/phase-a%2Fraw%2Fopenfoodfacts-products.jsonl.gz', 'Bucket': 'nutriflow-pipeline', 'Key': 'phase-a/raw/openfoodfacts-products.jsonl.gz', 'ETag': '"923434b766c2a517fdcecf3136c0fe0c-92"', 'ChecksumCRC64NVME': 'ZPvEudaqLHo=', 'ChecksumType': 'FULL_OBJECT'}


In [8]:
# Stream data-fields.txt straight from the HTTP response into S3.
# The `with` block closes the connection once the upload has finished (or failed).
with requests.get(url_fileds, stream=True, timeout=(10, 300)) as resp_field:
    # Fail fast instead of storing an HTTP error page under the .txt key.
    resp_field.raise_for_status()
    # `.raw` bypasses requests' content decoding; without this flag a response sent
    # with Content-Encoding: gzip/deflate would be stored still compressed.
    resp_field.raw.decode_content = True
    s3_client.upload_fileobj(Bucket=BUCKET, Key=KEY_Field, Fileobj=resp_field.raw)