In [1]:
import psycopg2
import pandas as pd
import boto3
import awswrangler as wr
import resource 

#######################################################
######## CONNECTING TO PG DB + CREATING CURSORS #######
connection = psycopg2.connect(user="postgres",
                              password="postgres",
                              port="5432",
                              database="mainDB")
cursor = connection.cursor()

query = "select * from transactions;"

#######################################################
######## CONNECTING TO MINIO BUCKET ###################

boto3.setup_default_session(aws_access_key_id = 'your_minio_access_key',
                            aws_secret_access_key = 'your_minio_secret_key')

bucket = 'generators-test-bucket'
folder_batch = 'data_batch'
folder_gen = 'data_gen'
parquet_file_name = 'transactions'
batch_size = 1000000

wr.config.s3_endpoint_url = 'http://minio:9000'

In [20]:
# 1.1. CREATE DF USING BATCHES
def create_df_batch(cursor, batch_size):
    print('Creating pandas DF using generator...')
    colnames = ['transaction_id', 'user_id', 'product_name', 'transaction_date', 'amount_gbp']
    
    df = pd.DataFrame(columns=colnames)
    
    cursor.execute(query)

    while True:
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break
        # some tramsformation
        batch_df = pd.DataFrame(data = rows, columns=colnames)        
        df = pd.concat([df, batch_df], ignore_index=True)

    print('DF successfully created!\n')

    return df
    
# 1.2 WRITING DF TO MINIO BUCKET IN PARQUET FORMAT USING BATCHES
def write_df_to_s3_batch(cursor, bucket, folder, parquet_file_name, batch_size):
    colnames = ['transaction_id', 'user_id', 'product_name', 'transaction_date', 'amount_gbp']
    cursor.execute(query)
    batch_num = 1
    while True:
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break
        print(f"Writing DF batch #{batch_num} to S3 bucket...")
        wr.s3.to_parquet(df= pd.DataFrame(data = rows, columns=colnames),
                         path=f's3://{bucket}/{folder}/{parquet_file_name}',
                         compression='gzip',
                         mode = 'append',
                         dataset=True)
        print('Batch successfully written to S3 bucket!\n')
        batch_num += 1

In [6]:
%%time 
df_batch = create_df_batch(cursor, batch_size)
df_batch.head()

Creating pandas DF using generator...
DF successfully created!

CPU times: user 10.5 s, sys: 6.4 s, total: 16.9 s
Wall time: 20.1 s


Unnamed: 0,transaction_id,user_id,product_name,transaction_date,amount_gbp
0,62772546,e8737da1-1faf-4310-826d-472d76a1cb62,CARD_TRANSACTION,2023-11-13,538.73
1,36104234,a7fb9822-feb9-4d51-8f84-a5f73a42a32c,TRANSFER,2023-11-18,732.24
2,74659386,d53d7409-7be4-4a9b-96b2-6c064767dee2,CARD_TRANSACTION,2023-11-10,583.6
3,83601298,6a20e4bd-ed37-4557-924e-128e47805f28,CARD_TRANSACTION,2023-11-13,643.83
4,37411306,0d75d948-bbf6-4261-8da7-fee61c4dd9ac,TRANSFER,2023-11-16,925.85


In [22]:
write_df_to_s3_batch(cursor, bucket, folder_batch, parquet_file_name, batch_size)

Writing DF batch #1 to S3 bucket...
Batch successfully written to S3 bucket!

Writing DF batch #2 to S3 bucket...
Batch successfully written to S3 bucket!

Writing DF batch #3 to S3 bucket...
Batch successfully written to S3 bucket!

Writing DF batch #4 to S3 bucket...
Batch successfully written to S3 bucket!

Writing DF batch #5 to S3 bucket...
Batch successfully written to S3 bucket!



In [28]:
# AUXILIARY FUNCTION
def generate_dataset(cursor):
    
    cursor.execute(query)
    
    for row in cursor.fetchall():
        # some tramsformation
        yield row 

# 2.1. CREATE DF USING GENERATORS
def create_df_gen(cursor):
    print('Creating pandas DF using generator...')

    colnames = ['transaction_id', 'user_id', 'product_name', 'transaction_date', 'amount_gbp']
    
    df = pd.DataFrame(data = generate_dataset(cursor), columns=colnames)

    print('DF successfully created!\n')
    
    return df 

# 2.2 WRITING DF TO MINIO BUCKET IN PARQUET FORMAT USING GENERATORS
def write_df_to_s3_gen(cursor, bucket, folder, parquet_file_name):
    print('Writing DF to S3 bucket...')

    colnames = ['transaction_id', 
                'user_id', 
                'product_name', 
                'transaction_date', 
                'amount_gbp']
    
    wr.s3.to_parquet(df= pd.DataFrame(data = generate_dataset(cursor), columns=colnames),
             path=f's3://{bucket}/{folder}/{parquet_file_name}',
             compression='gzip',
             mode = 'append',
             dataset=True)
    print('Data successfully written to S3 bucket!\n')

In [24]:
%%time 
df_gen = create_df_gen(cursor)
df_gen.head()

Creating pandas DF using generator...
DF successfully created!

CPU times: user 9.04 s, sys: 2.1 s, total: 11.1 s
Wall time: 14.4 s


Unnamed: 0,transaction_id,user_id,product_name,transaction_date,amount_gbp
0,62772546,e8737da1-1faf-4310-826d-472d76a1cb62,CARD_TRANSACTION,2023-11-13,538.73
1,36104234,a7fb9822-feb9-4d51-8f84-a5f73a42a32c,TRANSFER,2023-11-18,732.24
2,74659386,d53d7409-7be4-4a9b-96b2-6c064767dee2,CARD_TRANSACTION,2023-11-10,583.6
3,83601298,6a20e4bd-ed37-4557-924e-128e47805f28,CARD_TRANSACTION,2023-11-13,643.83
4,37411306,0d75d948-bbf6-4261-8da7-fee61c4dd9ac,TRANSFER,2023-11-16,925.85


In [29]:
write_df_to_s3_gen(cursor, bucket, folder_gen, parquet_file_name)

Writing DF to S3 bucket...
Data successfully written to S3 bucket!

