In [1]:
from urllib.parse import quote

import pandas as pd
import yaml
from sqlalchemy import create_engine

import data_cleaning
import data_extraction
import database_utils

# Show every column when a DataFrame is displayed (None removes pandas' column cap).
pd.options.display.max_columns = None

# User Data

In [2]:
# Initialize the DatabaseConnector. The same instance is reused in later cells,
# both when reading from RDS and when uploading the cleaned tables.
db_connector = database_utils.DatabaseConnector()

# List all tables in the RDS database so the table names used in later cells
# can be checked against what is actually available.
tables = db_connector.list_db_tables()
print("Available tables:", tables)

Available tables: ['legacy_store_details', 'dim_card_details', 'legacy_users', 'orders_table']


In [3]:
# Pull the raw user records out of RDS. The table name is one of those
# reported by list_db_tables() in the previous cell.
users_table_name = 'legacy_users'
extractor = data_extraction.DataExtractor()
user_df = extractor.read_rds_table(db_connector, users_table_name)

In [4]:
# Clean the user data. The DataCleaning instance is kept in `data_cleaner`
# because the card and store sections further down reuse it.
data_cleaner = data_cleaning.DataCleaning()
cleaned_user_df = data_cleaner.clean_user_data(user_df)

In [5]:
# Initialize a connection to the local PostgreSQL database.
# Credentials are read from a YAML file kept outside the notebook.
file_path = "postgres_creds.yaml"
with open(file_path, 'r') as file:
    post_creds = yaml.safe_load(file)

# Percent-encode the credentials before placing them in the URL: a raw '@',
# ':', '/', '#' or '?' in the username or password would otherwise be parsed
# as URL structure and break (or silently misdirect) the connection.
# str() guards against YAML loading an all-digit value as an int.
# The YAML values are therefore expected to be the raw, unencoded credentials.
db_user = quote(str(post_creds['username']), safe='')
db_password = quote(str(post_creds['password']), safe='')
local_db_engine = create_engine(f"postgresql://{db_user}:{db_password}@localhost:5432/sales_data")

# Upload the cleaned data to the local database
db_connector.upload_to_db(cleaned_user_df, 'dim_users', local_db_engine)

# Card Details Data

In [6]:
# Extract the card data from the specified link (a PDF hosted on Amazon S3).
# `extractor` is the DataExtractor created in the user-data section.
pdf_link = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
card_df = extractor.retrieve_pdf_data(pdf_link)

In [7]:
# Run the card-specific cleaning step on the data extracted from the PDF.
cleaned_card_df = data_cleaner.clean_card_data(card_df)

# Write the cleaned card data to the local database under its target table name.
card_table_name = 'dim_card_details'
db_connector.upload_to_db(cleaned_card_df, card_table_name, local_db_engine)

# Stores Data

In [8]:
# API endpoints: one returns the store count, the other returns the details
# of a single store identified by `store_number`.
number_of_stores_endpoint = "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores"
store_details_endpoint = "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/{store_number}"

# Get the number of stores
number_of_stores = extractor.list_number_of_stores(number_of_stores_endpoint)

# Stop here if the count is missing or zero. Skipping silently would leave
# `store_df` undefined (or stale from an earlier run), and the cleaning cell
# below would then fail with a NameError or upload outdated data.
if not number_of_stores:
    raise RuntimeError(
        f"Could not determine the number of stores from {number_of_stores_endpoint}; "
        "store data was not retrieved."
    )

# Retrieve data for all stores
# NOTE(review): the recorded run logged a 500 error for store_details/451.
# Presumably store numbers are 0-based and the request range inside
# retrieve_stores_data overshoots by one -- confirm against that method.
store_df = extractor.retrieve_stores_data(store_details_endpoint, number_of_stores)

Error retrieving store data from https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/451: 500 Server Error: Internal Server Error for url: https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/451


In [9]:
# Run the store-specific cleaning step on the data retrieved from the API.
cleaned_store_df = data_cleaner.clean_store_data(store_df)

# Write the cleaned store data to the local database under its target table name.
store_table_name = 'dim_store_details'
db_connector.upload_to_db(cleaned_store_df, store_table_name, local_db_engine)

# Product Data

In [19]:
# Fetch the products CSV from S3 using the shared extractor.
products_s3_address = 's3://data-handling-public/products.csv'
product_df = extractor.extract_from_s3(products_s3_address)