In [185]:
import logging
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import sqlalchemy
import yaml
from sqlalchemy import create_engine, text

In [186]:
# Parse docker-compose.yml once; DBCred reads the service definitions from the
# module-level `yml` mapping. Note that `file` stays bound as a global afterwards.
with open("docker-compose.yml", "r") as file:
    yml = yaml.safe_load(file)

class DBCred():
    """
        Connection settings for one Postgres service declared in docker-compose.yml.

        Reads POSTGRES_DB / POSTGRES_USER / POSTGRES_PASSWORD from the service's
        'environment' and the host port from its first 'ports' entry, then builds
        a SQLAlchemy URL ('conn_str') that targets that port on localhost.
    """

    def __init__(self, service, compose=None) -> None:
        """
            service : key of the service under 'services' in the compose data.
            compose : parsed compose mapping; defaults to the module-level 'yml'.

            Any lookup/parsing error is logged and then re-raised, so a
            half-initialised object (without 'conn_str') is never handed out.
        """
        try:
            if compose is None:
                compose = yml
            db = compose["services"][service]
            env = db["environment"]
            # Compose also allows the list form: ["KEY=value", ...].
            if isinstance(env, list):
                env = dict(item.split("=", 1) for item in env)

            self.db_prefix = "POSTGRES"
            self.eng_driver = "postgresql+psycopg2"
            self.db_name = env[f"{self.db_prefix}_DB"]
            self.user = env[f"{self.db_prefix}_USER"]
            self.passwd = env[f"{self.db_prefix}_PASSWORD"]

            # Short port syntax is [IP:]HOST:CONTAINER; the host port is the
            # second-to-last field. str() covers entries YAML parsed as int.
            port_parts = str(db["ports"][0]).rsplit(":", 2)
            self.port = port_parts[-2] if len(port_parts) > 1 else port_parts[0]
            self.host = "localhost"

            # Percent-encode the credentials so characters such as '@', ':' or
            # '/' cannot be mistaken for URL delimiters.
            url_user = quote(str(self.user), safe="")
            url_passwd = quote(str(self.passwd), safe="")
            self.conn_str = (
                f"{self.eng_driver}://{url_user}:{url_passwd}"
                f"@{self.host}:{self.port}/{self.db_name}"
            )

        except Exception as e:
            logging.error(f"Invalid docker-compose.yml: {e}")
            raise

In [187]:
# Credentials for the two compose services: "pg_db_in" is bound to
# source_db_cred and "pg_db_out" to dest_db_cred.
source_db_cred = DBCred("pg_db_in")
dest_db_cred = DBCred("pg_db_out")

In [227]:
class Database():
    """
        Thin wrapper around a SQLAlchemy engine used by the extract/load steps.

        Files are exchanged through CSVs under ./data/<source>/... .
    """

    def __init__(self, db_cred):
        """
            db_cred : object exposing a 'conn_str' SQLAlchemy URL.
        """
        self.conn_str = db_cred.conn_str
        self.engine = create_engine(self.conn_str)

    def get_table_names(self, sql):
        """
            Retrieve name of tables from source DB.
            sql : query whose first column holds the table names.
            return 'list' : 'table_names' (empty list if the query fails)
        """
        try:
            with self.engine.connect() as connection:
                # Read the rows while the connection is still open.
                table_names = [row[0] for row in connection.execute(text(sql))]

            logging.info(f"Got {len(table_names)} tables: {table_names} \n")
            return table_names
        except Exception as e:
            logging.error(f"Error retrieving table names, check database status or SQL syntax: \n{e}")
            return []

    def extract_db(self, table_names, user_date):
        """
            Extract data from source DB.
            Write each table into ./data/postgres/<table>/<user_date>/<table>.csv.
            Missing values are written as the literal text 'NULL'.
            Errors are logged, not raised.
        """
        try:
            with self.engine.connect() as connection:
                for table in table_names:
                    path = f"./data/postgres/{table}/{user_date}/{table}.csv"
                    os.makedirs(os.path.dirname(path), exist_ok=True)
                    # The table name is interpolated as-is, so it must be a
                    # trusted identifier (here it comes from get_table_names).
                    result = connection.execute(text(f"SELECT * FROM {table};"))
                    df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
                    final_df = df.astype(object).where(pd.notnull(df), 'NULL')
                    final_df.to_csv(path, index=False, sep=',', encoding='utf-8')
            logging.info("Tables extracted.\n")
        except Exception as e:
            logging.error(f"Error while extracting or saving data from Postgres DB: {e}")

    def insert_into_db(self, source, table, user_date):
        """
            Insert data into destination DB, replacing the table if it exists.
            source : "postgres" or "csv" (selects the CSV location under ./data).
            return 'bool' : success (False for an unknown source or any load error)
        """
        if source == "postgres":
            path = f"./data/{source}/{table}/{user_date}/{table}.csv"
        elif source == "csv":
            path = f"./data/{source}/{user_date}/{table}.csv"
        else:
            logging.error(f"Unknown source '{source}': expected 'postgres' or 'csv'.")
            return False

        try:
            df = pd.read_csv(path, encoding="utf-8")
            # NOTE(review): NaN cells become the string 'NULL', so the
            # destination stores that text rather than SQL NULL -- confirm
            # this is intended.
            df = df.replace(to_replace=np.nan, value='NULL')

            # to_sql manages its own connection when given the engine.
            df.to_sql(name=table, con=self.engine, if_exists='replace', index=False)
            return True
        except Exception as e:
            logging.error(f"Error loading data into destination database: {e}")
            return False

    def exec_sql(self, sql):
        """
            Execute a raw SQL string on this database and commit.
        """
        with self.engine.connect() as connection:
            connection.execute(text(sql))
            connection.commit()

    def final_query(self, user_date, sql):
        """
            Execute final query on destination DB and write result into 'final_query.csv'
            under ./data/track/<user_date>/.
        """
        path = f"./data/track/{user_date}/final_query.csv"
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with self.engine.connect() as connection:
            result = connection.execute(text(sql))
            df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
        df.to_csv(path, index=False, sep=',', encoding='utf-8')

In [228]:
# Engine wrapper for the source database (service "pg_db_in").
source_db = Database(source_db_cred)

In [229]:
# Engine wrapper for the destination database (service "pg_db_out").
dest_db = Database(dest_db_cred)

In [230]:
from scripts.constants import sql_PG_TABLE_NAMES_QUERY

In [231]:
# Table names returned by the imported table-name query, run against the source DB.
src_tables = source_db.get_table_names(sql_PG_TABLE_NAMES_QUERY)

In [232]:
# Run date; it is interpolated into the ./data/... paths used by the extract and load steps.
user_date = "2023-01-30"

In [233]:
from scripts.functions import create_db_path, create_csv_path, extract_csv
# Helpers from scripts.functions (not visible here). Judging by the cell output,
# create_db_path (re)creates ./data/postgres/<table>/<user_date> for each table;
# create_csv_path presumably does the same for the CSV source -- TODO confirm.
create_db_path(src_tables, user_date)
create_csv_path(user_date)

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/suppliers/2023-01-30 recreated.

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/employees/2023-01-30 recreated.

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/shippers/2023-01-30 recreated.

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/categories/2023-01-30 recreated.

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/employee_territories/2023-01-30 recreated.

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/region/2023-01-30 recreated.

Step 1 already executed for this date. Reprocessing it for the selected day (2023-01-30).
./data/postgres/customer_demographics/2023-01-30 recreated.

Step 1 

In [234]:
# Dump every source table to ./data/postgres/<table>/<user_date>/<table>.csv.
source_db.extract_db(src_tables, user_date)

In [235]:
# extract_csv comes from scripts.functions and is not visible here; the cell
# output only reports "CSV file extracted."
extract_csv(user_date)

CSV file extracted.



In [236]:
# Load each table extracted from the source Postgres DB into the destination DB.
# `source` selects the ./data/postgres/... CSV layout and is the same for every table.
source = "postgres"
for table in src_tables:
    success = dest_db.insert_into_db(source, table, user_date)
    if success:
        print(f"Table {table} loaded into destination database.")


Table suppliers loaded into destination database.
Table employees loaded into destination database.
Table shippers loaded into destination database.
Table categories loaded into destination database.
Table employee_territories loaded into destination database.
Table region loaded into destination database.
Table customer_demographics loaded into destination database.
Table us_states loaded into destination database.
Table products loaded into destination database.
Table territories loaded into destination database.
Table customer_customer_demo loaded into destination database.
Table customers loaded into destination database.
Table orders loaded into destination database.


In [237]:

# Load the order_details CSV into the destination DB.
# The message names the table being loaded rather than the unrelated global `file`.
csv_table = "order_details"
success = dest_db.insert_into_db("csv", csv_table, user_date)
if success:
    logging.info(f"Table {csv_table} from CSV folder loaded into destination database.")

In [238]:
from scripts.constants import sql_PG_TABLE_NAMES_QUERY

In [239]:
# NOTE(review): repeats the table-name lookup already stored in `src_tables`;
# `result` is not read again in the visible cells.
result = source_db.get_table_names(sql_PG_TABLE_NAMES_QUERY)

In [240]:
from scripts.constants import sql_FINAL_QUERY

In [241]:
# Run the imported final query on the destination DB and save the result to
# ./data/track/<user_date>/final_query.csv.
dest_db.final_query(user_date, sql_FINAL_QUERY)

In [242]:
# Path of the SQL script that the next cell executes on the destination DB.
# NOTE: this rebinds the global `file`, which the docker-compose.yml
# `with open(...)` cell had bound to a file handle.
file = "./data/constraints_db_out.sql"
file

'./data/constraints_db_out.sql'

In [243]:
# Read the whole SQL script and execute it on the destination DB in a single call.
with open(file, "r") as f:
    dest_db.exec_sql(f.read())