In [1]:
import boto3
import pandas as pd
import io
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
# --------- SETTINGS ---------
# Target bucket and the AWS region used when creating the S3 client.
region = "us-east-1"
bucket_name = "your-bucket-name"

# Object keys inside the bucket: the CSV that is read and the Parquet file
# that is written.
input_key, output_key = "data/sample_input.csv", "data/output.parquet"

In [None]:
# --------- CONNECT TO S3 ---------
# Low-level S3 client bound to the configured region. No credentials are
# passed explicitly, so boto3 is left to resolve them on its own.
s3 = boto3.client(service_name="s3", region_name=region)

In [None]:
# --------- READ CSV FROM S3 ---------
# Download the entire object into memory, then parse those bytes as CSV.
response = s3.get_object(Key=input_key, Bucket=bucket_name)
csv_data = response['Body'].read()
with io.BytesIO(csv_data) as csv_buffer:
    df = pd.read_csv(csv_buffer)

# Preview the first rows as a quick sanity check of the load.
print("✅ Original data loaded:", df.head(), sep="\n")

In [None]:
# --------- TRANSFORM DATA ---------
# Keep only fully populated rows: a row is dropped as soon as any of its
# values is missing. The result is a new frame; `df` is not modified.
df_cleaned = df.dropna(axis="index", how="any")

# Preview the first rows that survived the cleaning step.
print("✅ Cleaned data:", df_cleaned.head(), sep="\n")

In [None]:
# --------- CONVERT TO PARQUET ---------
# Serialize the cleaned frame to Parquet entirely in memory.
#
# preserve_index=False: once dropna() has removed rows, the frame's index is
# generally no longer a plain RangeIndex, and pyarrow's default behaviour
# would then store it as an extra `__index_level_0__` data column. The index
# here is only the auto-generated row number from read_csv, so it is dropped
# to keep the output schema equal to the CSV columns regardless of how many
# rows were removed.
table = pa.Table.from_pandas(df_cleaned, preserve_index=False)

# write_table accepts a file-like sink; the bytes accumulate in this buffer
# and are retrieved later with parquet_buffer.getvalue().
parquet_buffer = io.BytesIO()
pq.write_table(table, parquet_buffer)

In [None]:
# --------- UPLOAD PARQUET TO S3 ---------
# Send the complete in-memory Parquet payload as a single object, then
# report the destination URI.
s3.put_object(
    Body=parquet_buffer.getvalue(),
    Bucket=bucket_name,
    Key=output_key,
)
print(f"✅ Parquet file uploaded to s3://{bucket_name}/{output_key}")