### **1. Upload 10 parquets**

#### Flow the following steps:
1. Install pkgs
pip install pandas pyarrow boto3 python-dotenv
2. .env
set all the credentials in that file
3. Create and upload parquets

In [1]:
#-- Load modules --#
import pandas as pd
import numpy as np
import boto3
import datetime

In [3]:
#-- Create parquets --#
def s3_upload_parquet(initDate: str):
    #-- Create dataframe --#
    rngDate = pd.date_range(start=initDate.strftime('%Y-%m-%d'), periods=18, freq='1MS')
    nmember = 200
    file_name = f"Hydro-LongFc_{initDate:%Y-%m-%d}.parquet"
    basinName = ['QN-Mantaro', 'QN-Santa', 'QN-Rimac', 'QN-Vilcanota']
    initDates = [initDate]*len(rngDate)*nmember*len(basinName)
    #-- Dataframe --#
    sampleData = pd.DataFrame(
        {
            'name': [s for s in basinName for _ in range(len(rngDate)) for member in range(nmember)],
            'initDate': initDates,
            'date': np.tile(rngDate.tolist(),len(basinName)*nmember),
            'member': [i+1 for _ in range(len(rngDate)) for a in range(len(basinName)) for i in range(nmember)],
            'QNfc': 100+5*np.random.randn((len(rngDate)),len(basinName),nmember).flatten(),
            'QN': 100+5.5*np.random.randn((len(rngDate)),len(basinName),nmember).flatten(),
            'QNclim': 90+5.3*np.random.randn((len(rngDate)),len(basinName),nmember).flatten(),
        }
    )
    #-- Save locally --#
    local_file = f'forecast_{initDate:%Y-%m-%d}.parquet'
    sampleData.to_parquet(local_file, index=False)
    #-- Set credentials --#
    import boto3
    from dotenv import load_dotenv
    load_dotenv('.env')
    #-- Fix parameters --#
    bucket = 'cdh-hydrolongterm-514438'
    base = 'longterm-forecast'
    version = 'v1.0'
    #-- Upload to S3 --#
    s3 = boto3.client('s3')
    key = f"{base}/monthly/run_date={initDate.strftime('%Y-%m-%d')}/version={version}/ensemble/forecast.parquet"
    s3.upload_file(local_file, bucket, key)

#-- Run through dates --#
initDate = '2024-01-01'
endDate = '2024-12-01'
rngDateIndex = pd.date_range(start=initDate, end=endDate, freq='1MS')
for initDate in rngDateIndex:
    print(f"Uploading {initDate:%Y-%m-%d}...")
    s3_upload_parquet(initDate)
    print(f"Uploaded {initDate:%Y-%m-%d}!")

Uploading 2024-01-01...


S3UploadFailedError: Failed to upload forecast_2024-01-01.parquet to cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-01-01/version=v1.0/ensemble/forecast.parquet: An error occurred (ExpiredToken) when calling the PutObject operation: The provided token has expired.

### **2. Query Buckets**

In [4]:
#-- Connect to DuckDB and set credentials --#
import duckdb, os
con = duckdb.connect()
con.execute("SET s3_region=?", [os.getenv("AWS_DEFAULT_REGION")])
con.execute("SET s3_access_key_id=?", [os.getenv("AWS_ACCESS_KEY_ID")])
con.execute("SET s3_secret_access_key=?", [os.getenv("AWS_SECRET_ACCESS_KEY")])
con.execute("SET s3_session_token=?", [os.getenv("AWS_SESSION_TOKEN")])

<duckdb.duckdb.DuckDBPyConnection at 0x7c9967c889b0>

In [5]:
#-- Build the list of dates to request --#
bucket = 'cdh-hydrolongterm-514438'
base = 'longterm-forecast'
version = 'v1.0'
uris = [
    f"s3://{bucket}/{base}/monthly/run_date={initDate.strftime('%Y-%m-%d')}/version={version}/ensemble/forecast.parquet" 
    for initDate in rngDateIndex
]

In [6]:
uris

['s3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-01-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-02-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-03-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-04-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-05-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-06-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-07-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-08-01/version=v1.0/ensemble/forecast.parquet',
 's3://cdh-hydrolongterm-514438/longterm-forecas

In [7]:
's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-01-01/version=v1.0/ensemble/forecast.parquet'

's3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-01-01/version=v1.0/ensemble/forecast.parquet'

In [8]:
#-- Create a view merging all the parquets --#
sql_union = " UNION ALL ".join([f"SELECT * FROM read_parquet('{u}')" for u in uris])
con.execute(f"CREATE OR REPLACE VIEW raw_long AS {sql_union}")

<duckdb.duckdb.DuckDBPyConnection at 0x7c9967c889b0>

In [9]:
#-- How many rows per run --#
q1 = con.execute("""
    SELECT initDate AS run_date, COUNT(*) AS rows
    FROM raw_long
    GROUP BY 1 ORDER BY 1
""").df()
print(q1)

     run_date   rows
0  2024-01-01  14400
1  2024-02-01  14400
2  2024-03-01  14400
3  2024-04-01  14400
4  2024-05-01  14400
5  2024-06-01  14400
6  2024-07-01  14400
7  2024-08-01  14400
8  2024-09-01  14400
9  2024-10-01  14400
10 2024-11-01  14400
11 2024-12-01  14400


In [10]:
#-- Mean of ensemble --#
q2 = con.execute("""
    SELECT name, date AS targetDate, AVG(QNfc) AS QN_mean
    FROM raw_long
    GROUP BY name, targetDate
    ORDER BY name, targetDate
""").df()
print(q2.head())

         name targetDate     QN_mean
0  QN-Mantaro 2024-01-01   99.778802
1  QN-Mantaro 2024-02-01   99.736616
2  QN-Mantaro 2024-03-01  100.071649
3  QN-Mantaro 2024-04-01   99.972756
4  QN-Mantaro 2024-05-01  100.082035


In [11]:
#-- Anomalies --#
q3 = con.execute("""
    SELECT name, date AS targetDate, AVG(QNfc) - AVG(QNclim) AS anom
    FROM raw_long
    GROUP BY name, targetDate
    ORDER BY name, targetDate
""").df()
print(q3.head())

         name targetDate       anom
0  QN-Mantaro 2024-01-01   9.061734
1  QN-Mantaro 2024-02-01  10.287259
2  QN-Mantaro 2024-03-01  10.204871
3  QN-Mantaro 2024-04-01  10.013447
4  QN-Mantaro 2024-05-01   9.966547


In [12]:
q3

Unnamed: 0,name,targetDate,anom
0,QN-Mantaro,2024-01-01,9.061734
1,QN-Mantaro,2024-02-01,10.287259
2,QN-Mantaro,2024-03-01,10.204871
3,QN-Mantaro,2024-04-01,10.013447
4,QN-Mantaro,2024-05-01,9.966547
...,...,...,...
111,QN-Vilcanota,2026-01-01,10.211200
112,QN-Vilcanota,2026-02-01,10.261312
113,QN-Vilcanota,2026-03-01,9.675485
114,QN-Vilcanota,2026-04-01,10.130510


In [14]:
# --- Parámetros de TU dataset / región ---
DB            = "lake"
REGION        = "eu-west-1"
WORKGROUP     = "primary"  # cambia si usas otro
BUCKET        = "cdh-hydrolongterm-514438"
BASE          = "longterm-forecasts"
VERSION       = "v1.0"

# Athena necesita un staging S3 donde pueda escribir resultados de consulta:
# Usa una carpeta del MISMO bucket si tienes permiso de escritura; si no, usa otro bucket propio.
ATHENA_STAGING = f"s3://{BUCKET}/athena-staging/"   # ajústalo si te da AccessDenied

# --- Conexión Athena ---
from pyathena import connect
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta

conn = connect(
    s3_staging_dir=ATHENA_STAGING,
    region_name=REGION,
    work_group=WORKGROUP,
)
cur = conn.cursor()

# 1) DB (si no existe)
cur.execute(f"CREATE DATABASE IF NOT EXISTS {DB}")

# 2) Tabla externa sobre RAW (particionada por ruta)
# Ubicación física:
RAW_LOCATION = f"s3://{BUCKET}/{BASE}/raw/"

cur.execute(f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {DB}.hydro_longterm_raw (
  name        string,
  initDate    date,
  date        date,
  member      int,
  QNfc        double,
  QN          double,
  QNclim      double
)
PARTITIONED BY (
  run_date date,
  version  string
)
STORED AS PARQUET
LOCATION '{RAW_LOCATION}'
""")

# 3) Agregar particiones explícitas (enero..octubre 2024, v1.0)
run_dates = [date(2024,1,1) + relativedelta(months=i) for i in range(10)]

for rd in run_dates:
    part_loc = (
        f"s3://{BUCKET}/{BASE}/raw/"
        f"run_date={rd:%Y-%m-%d}/version={VERSION}/"
    )
    cur.execute(f"""
    ALTER TABLE {DB}.hydro_longterm_raw
      ADD IF NOT EXISTS PARTITION (run_date=DATE '{rd:%Y-%m-%d}', version='{VERSION}')
      LOCATION '{part_loc}'
    """)

# 4) Consulta de prueba (mean del ensamble por cuenca y mes objetivo)
sql_mean = f"""
SELECT
  name,
  date AS targetDate,
  AVG(QNfc) AS QN_mean
FROM {DB}.hydro_longterm_raw
WHERE version = '{VERSION}'
  AND run_date BETWEEN DATE '2024-01-01' AND DATE '2024-10-01'
GROUP BY name, targetDate
ORDER BY name, targetDate
"""
df_mean = pd.read_sql(sql_mean, conn)
print(df_mean.head())

# 5) Crear CURATED con CTAS (Parquet + particionado + tu ubicación)
CURATED_MEAN_LOCATION = f"s3://{BUCKET}/{BASE}/curated/meanEnsemble/"

# Si la tabla ya existe, puedes DROP TABLE primero o hacer INSERT en una ya creada
cur.execute(f"""
CREATE TABLE IF NOT EXISTS {DB}.longterm_mean
WITH (
  format = 'PARQUET',
  external_location = '{CURATED_MEAN_LOCATION}',
  partitioned_by = ARRAY['run_date','version']
) AS
SELECT
  name,
  date AS targetDate,
  AVG(QNfc) AS QN_mean,
  run_date,
  version
FROM {DB}.hydro_longterm_raw
WHERE version = '{VERSION}'
  AND run_date BETWEEN DATE '2024-01-01' AND DATE '2024-10-01'
GROUP BY name, date, run_date, version
""")

# (Opcional) Leer el curated recién creado:
df_curated = pd.read_sql(f"SELECT * FROM {DB}.longterm_mean ORDER BY name, targetDate LIMIT 20", conn)
print(df_curated.head())


  df_mean = pd.read_sql(sql_mean, conn)


DatabaseError: Execution failed on sql: 
SELECT
  name,
  date AS targetDate,
  AVG(QNfc) AS QN_mean
FROM lake.hydro_longterm_raw
WHERE version = 'v1.0'
  AND run_date BETWEEN DATE '2024-01-01' AND DATE '2024-10-01'
GROUP BY name, targetDate
ORDER BY name, targetDate

COLUMN_NOT_FOUND: line 8:16: Column 'targetdate' cannot be resolved or requester is not authorized to access requested resources
unable to rollback

In [None]:
s3://cdh-hydrolongterm-514438/longterm-forecast/monthly/run_date=2024-01-01/version=v1.0/ensemble/forecast.parquet

360

In [55]:
s3_upload_parquet(datetime.datetime(2024, 1, 1))

Unnamed: 0,name
0,QN-Mantaro
1,QN-Mantaro
2,QN-Mantaro
3,QN-Mantaro
4,QN-Mantaro
...,...
355,QN-Vilcanota
356,QN-Vilcanota
357,QN-Vilcanota
358,QN-Vilcanota


In [None]:
np.random.randn(18, )

In [None]:
data = np.random.randn(3, 12)
merged_array = data.flatten()[:26]
print(merged_array)
print(merged_array.shape)  # (26,)

In [34]:
import numpy as np
np.random.randn(3, 12).flatten().shape

(36,)

In [None]:
rng_date = pd.date_range(start="2024-01-01", end="2024-01-10", freq='D')
for single_date in rng_date:
    # print(single_date.strftime("%Y-%m-%d"))
    print(f"InitDate_{single_date:%Y-%m-%d}")

InitDate_2024-01-01
InitDate_2024-01-02
InitDate_2024-01-03
InitDate_2024-01-04
InitDate_2024-01-05
InitDate_2024-01-06
InitDate_2024-01-07
InitDate_2024-01-08
InitDate_2024-01-09
InitDate_2024-01-10


In [None]:
https://cdh-hydrolongterm-514438.s3.eu-west-1.amazonaws.com/longterm-forecasts/raw/20250911_160434_input.parquet

In [1]:
from dotenv import load_dotenv
load_dotenv(".env")

True

In [2]:
import boto3
s3 = boto3.client("s3")
s3.head_object(
    Bucket="cdh-hydrolongterm-514438",
    Key="longterm-forecasts/raw/20250911_160434_input.parquet"
)

{'ResponseMetadata': {'RequestId': 'R8YQRC84SW39NQ3M',
  'HostId': 'UThAGF2Jx5y8+R+pH9VWk3pZqslIy7xgpT6uTpINUXeYQ3lvwysyWjgM/WphbbFBudvOf1BVwLE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'UThAGF2Jx5y8+R+pH9VWk3pZqslIy7xgpT6uTpINUXeYQ3lvwysyWjgM/WphbbFBudvOf1BVwLE=',
   'x-amz-request-id': 'R8YQRC84SW39NQ3M',
   'date': 'Thu, 11 Sep 2025 16:35:32 GMT',
   'last-modified': 'Thu, 11 Sep 2025 16:04:38 GMT',
   'etag': '"54054e8ae83393e08e2503d0244b30aa"',
   'x-amz-server-side-encryption': 'aws:kms',
   'x-amz-server-side-encryption-aws-kms-key-id': 'arn:aws:kms:eu-west-1:228119973315:key/95a16e40-70e3-4520-baba-7a90da7bcc5a',
   'x-amz-server-side-encryption-bucket-key-enabled': 'true',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'content-length': '18687',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2025, 9, 11, 16, 4, 38, tzinfo=tzutc()),
 'ContentLength': 18687,
 'ETag': '"54

In [3]:
import pandas as pd

uri = "s3://cdh-hydrolongterm-514438/longterm-forecasts/raw/20250911_160434_input.parquet"
df = pd.read_parquet(uri)   # con s3fs/pyarrow y tus env vars, debería funcionar
df.head()

Unnamed: 0,date,NombreEmpresa,Tipoinfoabrev,power,name
0,2025-09-05 00:30:00,ENGIE,MW,107.083,W.F. Punta Lomitas
1,2025-09-05 01:00:00,ENGIE,MW,107.083,W.F. Punta Lomitas
2,2025-09-05 01:30:00,ENGIE,MW,102.6775,W.F. Punta Lomitas
3,2025-09-05 02:00:00,ENGIE,MW,102.6775,W.F. Punta Lomitas
4,2025-09-05 02:30:00,ENGIE,MW,99.3825,W.F. Punta Lomitas


### **Subir parquet**

In [7]:
import boto3

s3 = boto3.client("s3")

# Ruta exacta en tu bucket (ajusta run_date y version si quieres)
bucket = "cdh-hydrolongterm-514438"
key = "longterm-forecasts/raw/test_forecast.parquet"

# Subir archivo local al S3/CDH
s3.upload_file("../dataset/currentGen.parquet", bucket, key)

print("Archivo subido a:", f"s3://{bucket}/{key}")

Archivo subido a: s3://cdh-hydrolongterm-514438/longterm-forecasts/raw/test_forecast.parquet


In [8]:
# Leer directo desde S3 (usando pandas + s3fs)
uri = f"s3://{bucket}/{key}"
df_check = pd.read_parquet(uri)
print(df_check)

                    date       NombreEmpresa Tipoinfoabrev     power  \
0    2025-09-05 00:30:00               ENGIE            MW  107.0830   
1    2025-09-05 01:00:00               ENGIE            MW  107.0830   
2    2025-09-05 01:30:00               ENGIE            MW  102.6775   
3    2025-09-05 02:00:00               ENGIE            MW  102.6775   
4    2025-09-05 02:30:00               ENGIE            MW   99.3825   
...                  ...                 ...           ...       ...   
2875 2025-09-10 22:00:00  ORYGEN PERU S.A.A.            MW  115.1340   
2876 2025-09-10 22:30:00  ORYGEN PERU S.A.A.            MW  114.4970   
2877 2025-09-10 23:00:00  ORYGEN PERU S.A.A.            MW  114.4970   
2878 2025-09-10 23:30:00  ORYGEN PERU S.A.A.            MW  112.3075   
2879 2025-09-11 00:00:00  ORYGEN PERU S.A.A.            MW  112.3075   

                    name  
0     W.F. Punta Lomitas  
1     W.F. Punta Lomitas  
2     W.F. Punta Lomitas  
3     W.F. Punta Lomitas  


In [4]:
!pwd

/Users/carlosenciso/Documents/ENGIE/windShortTermForecast/Notebooks


In [None]:
import os
from dotenv import load_dotenv

# Cargar variables del archivo .env
load_dotenv(".env")

# Verificación rápida
print("Access Key:", os.getenv("AWS_ACCESS_KEY_ID")[:5], "...")
print("Region:", os.getenv("AWS_DEFAULT_REGION"))

In [None]:
import boto3

s3 = boto3.client("s3")
resp = s3.list_objects_v2(
    Bucket="cdh-dsdatalakecoesprod-514438",
    Prefix="projects/hydroForecast-Peru/datasources/Hydro-LongTerm/datasets/LongTerm-Forecasts/raw/"
)
for obj in resp.get("Contents", []):
    print(obj["Key"])

In [None]:
#-- Send to CDH --#
import boto3
import pandas as pd
import os
from dotenv import load_dotenv

In [None]:
#-- Load dotenv --#
load_dotenv()
#-- Parquet file --#
parquetFile = '../dataset/currentGen.parquet'
print(f'File to send: {parquetFile}')
#-- The file exits --#
if os.path.exists(parquetFile):
    size_mb = os.path.getsize(parquetFile) / (1024 * 1024)
    print(f'File size: {size_mb:.2f} MB')
else:
    print(f'File {parquetFile} does not exist.')

In [None]:
#-- Credentials --#
aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_token = os.getenv('AWS_SESSION_TOKEN')
aws_region = os.getenv('AWS_REGION', 'us-east-1')
bucket_name = os.getenv('BUCKET_NAME', 'hydroforecast-peru-data')
print(f"🔑 Usando región: {aws_region}")
print(f"🪣 Bucket destino: {bucket_name}")

In [None]:
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    aws_session_token=aws_token,
    region_name=aws_region
)
print("✅ Cliente S3 creado")

In [None]:
filename = os.path.basename(parquetFile) 
s3_key = f"hydroForecast-Peru/data/{filename}"
print(f"📍 Se guardará en: s3://{bucket_name}/{s3_key}")

In [None]:
try:
    print("📤 Subiendo archivo...")
    s3_client.upload_file(
        parquetFile,    
        bucket_name,    
        s3_key          
    )
    print("🎉 ¡SUBIDA EXITOSA!")
    print(f"🌐 Tu archivo está en: s3://{bucket_name}/{s3_key}")
except Exception as error:
    print(f"❌ Error: {str(error)}")
print("🏁 Proceso terminado")

In [None]:
#-- Send to CDH --#
import boto3
import pandas as pd
import os
from dotenv import load_dotenv

#-- Load dotenv --#
load_dotenv()
#-- Parquet file --#
parquetFile = '../dataset/currentGen.parquet'
print(f'File to send: {parquetFile}')
#-- The file exits --#
if os.path.exists(parquetFile):
    size_mb = os.path.getsize(parquetFile) / (1024 * 1024)
    print(f'File size: {size_mb:.2f} MB')
else:
    print(f'File {parquetFile} does not exist.')
    exit()

#-- Credentials --#
aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_token = os.getenv('AWS_SESSION_TOKEN')
aws_region = os.getenv('AWS_REGION', 'us-east-1')
bucket_name = os.getenv('BUCKET_NAME', 'hydroforecast-peru-data')
print(f"🔑 Usando región: {aws_region}")
print(f"🪣 Bucket destino: {bucket_name}")

s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    aws_session_token=aws_token,
    region_name=aws_region
)
print("✅ Cliente S3 creado")

# Función para verificar y crear el bucket si no existe
def check_and_create_bucket(bucket_name, region):
    try:
        # Verificar si el bucket existe
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"✅ Bucket '{bucket_name}' existe")
        return True
    except s3_client.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            print(f"❌ Bucket '{bucket_name}' no existe, intentando crearlo...")
            try:
                if region == 'us-east-1':
                    # us-east-1 tiene una sintaxis especial
                    s3_client.create_bucket(Bucket=bucket_name)
                else:
                    s3_client.create_bucket(
                        Bucket=bucket_name,
                        CreateBucketConfiguration={'LocationConstraint': region}
                    )
                print(f"✅ Bucket '{bucket_name}' creado exitosamente")
                return True
            except Exception as create_error:
                print(f"❌ Error creando bucket: {create_error}")
                return False
        else:
            print(f"❌ Error accediendo al bucket: {e}")
            return False

# Verificar y crear el bucket si es necesario
if not check_and_create_bucket(bucket_name, aws_region):
    print("No se pudo acceder al bucket, terminando ejecución.")
    exit()

filename = os.path.basename(parquetFile) 
s3_key = f"hydroForecast-Peru/data/{filename}"
print(f"📍 Se guardará en: s3://{bucket_name}/{s3_key}")

try:
    print("📤 Subiendo archivo...")
    s3_client.upload_file(
        parquetFile,    
        bucket_name,    
        s3_key          
    )
    print("🎉 ¡SUBIDA EXITOSA!")
    print(f"🌐 Tu archivo está en: s3://{bucket_name}/{s3_key}")
except Exception as error:
    print(f"❌ Error: {str(error)}")
print("🏁 Proceso terminado")

In [None]:
arn:aws:iam::228119973315:role/cdh_hydroforecastperu_78495

In [None]:
#-- Read dataset from Athena S3 --#
import boto3
import pandas as pd
#-- Main code --#
s3 = boto3.client('s3')
"""
bucket = 'your-bucket-name'

"""

In [None]:
import boto3

In [None]:
import boto3

# Initialize S3 client (requires AWS credentials configured in ~/.aws/credentials or env vars)
s3 = boto3.client('s3')

bucket = "cdh-dsdatalakecoesprod-514438"
prefix = "central_generadora/"

# List objects
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
for obj in response.get('Contents', []):
    print(obj['Key'])

# Read one file (example CSV/JSON/Parquet)
obj = s3.get_object(Bucket=bucket, Key="central_generadora/example.csv")
data = obj['Body'].read().decode('utf-8')

print(data[:500])  # preview first 500 chars
