In [9]:
import duckdb
import time
import json
import os
import logging
import shutil
import warnings
import pandas as pd
import numpy as np
from datetime import datetime
from functools import wraps

In [10]:
# set the basic logging level, for our purposes we will use INFO
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [11]:
# suppress warnings about the data validations
warnings.simplefilter(action='ignore', category=UserWarning)

In [12]:
# establish the load date
load_date = datetime.now()

# this function allows for ' with the mysql insert statement - essential
def replace_text(text):
  """Escapes single quotes within a string for safe MySQL insertion."""
  return text.replace("'", "\\'")

# motherduck config
server_config = "/home/asha/airflow/duckdb-config.json"

with open(server_config, "r") as fp:
    config = json.load(fp)
token = config['token']

def log_execution(func):
    """
    """
    
    @wraps(func)
    def etl_task_time(*args, **kwargs):
        start_time = time.time()
        print(f"Starting '{func.__name__}'...")
        result = func(*args, **kwargs)
        print(f"Finished '{func.__name__}' in {time.time() - start_time} seconds.")
        return result

    return etl_task_time

def motherduck_connection(token):
    def connection_decorator(func):
        con = duckdb.connect(f'md:?motherduck_token={token}')
        
        @wraps(func)
        def wrapper(*args, **kwargs):
            # pass con as a keyword argument for use in other functions
            return func(*args, con=con, **kwargs)
    
        return wrapper
    return connection_decorator

@log_execution
@motherduck_connection(token=token)
def job_load_yearly_tenant_data(bronze_schema, bronze_table_name, con, **kwargs):
    """
    """

    # connect to motherduck
    con.sql("USE asha_production;")
    con.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema};")
    
    # iterate over files
    all_files = []
    current_dir = os.getcwd()

    source_path = f'{current_dir}/source'
    processing_path = f'{current_dir}/processing'
    processed_path = f'{current_dir}/processed'
    error_path = f'{current_dir}/errors'

    for root, dirs, files in os.walk(source_path):
        for file in files:
            file_path = os.path.join(root, file)
            shutil.move(file_path, f'{processing_path}/{file}')
            
            if file.endswith('.xlsx'):
                try:
                    df = pd.read_excel(f'{processing_path}/{file}', sheet_name='template')
                    df['Source'] = file
                    df['LoadDate'] = load_date
                    all_files.append(df)
                    shutil.move(f'{processing_path}/{file}', f'{processed_path}/{file}')
                    logging.info(f"Finished processing -> {file}")
                except:
                    logging.warning(f"Error processing file -> {file}")
                    shutil.move(f'{processing_path}/{file}', f'{error_path}/{file}')
            
            if file.endswith('.csv'):
                try:
                    df = pd.read_csv(f'{processing_path}/{file}')
                    df['Source'] = file
                    df['LoadDate'] = load_date
                    all_files.append(df)
                    shutil.move(f'{processing_path}/{file}', f'{processed_path}/{file}')
                    logging.info(f"Finished processing -> {file}")
                except:
                    logging.warning(f"Error processing file -> {file}")
                    shutil.move(f'{processing_path}/{file}', f'{error_path}/{file}')
    
    # concat everything together
    fdf = pd.concat(all_files)
    
    # minor transformation
    fdf.dropna(subset=['FirstName', 'LastName'], inplace=True)
    fdf['Disability'] = fdf['Disability'].str.replace(',', ' |')
    fdf = fdf.replace(to_replace={np.nan : "UNKNOWN", '' : "UNKNOWN", ' ' : "UNKNOWN", "nan" : "UNKNOWN"})
    for col in fdf.columns:
        fdf[col] = fdf[col].apply(lambda x: replace_text(str(x)))
    
    column_data_types = {
        'PropertyAddress' : 'VARCHAR (100)',
        'Room' : 'VARCHAR(30)', 
        'FirstName' : 'VARCHAR(100)', 
        'MiddleName': 'VARCHAR(100)', 
        'LastName': 'VARCHAR(100)',
        'DateOfBirth' : 'VARCHAR(30)', 
        'NINumber' : 'VARCHAR(30)', 
        'CheckinDate' : 'VARCHAR(30)', 
        'CheckoutDate': 'VARCHAR(30)' ,
        'NewHBClaim': 'VARCHAR(30)',
        'HBClaimRefNumber': 'VARCHAR(30)', 
        'ReferralAgency': 'VARCHAR(100)', 
        'Age' : "VARCHAR(10)", 
        'Gender' : 'VARCHAR(30)', 
        'Religion' : 'VARCHAR(200)',
        'Ethnicity' : 'VARCHAR(200)', 
        'Nationality' : 'VARCHAR(200)', 
        'Disability' : 'VARCHAR(200)', 
        'SexualOrientation' : 'VARCHAR(100)',
        'SpokenLanguage': 'VARCHAR(100)', 
        'RiskAssessment': 'VARCHAR(100)', 
        'LengthOfStay': 'VARCHAR(100)', 
        'CycleNumber' : 'VARCHAR(10)',
        'RecordStatus' : 'VARCHAR(100)',
        'Source' : 'VARCHAR(100)',
        'LoadDate': 'DATETIME'
    }
    
    # create the column headers and convert to a string
    column_headers = [f"{col} {data_type}" for col, data_type in column_data_types.items()]
    column_headers_string = ", ".join(column_headers)
    
    # create the table if it does not exist
    try:
       con.sql(f"CREATE OR REPLACE TABLE {bronze_schema}.{bronze_table_name} AS SELECT * FROM fdf;")
    finally:
        # commit changes and close connections
        con.close()
      
if __name__ == '__main__':
    
    # this is the ETL task
    bronze_schema = 'bronze'
    bronze_table_name = 'yearly_tenant_data'

    job_load_yearly_tenant_data(
        token=token,
        bronze_schema=bronze_schema,
        bronze_table_name=bronze_table_name
    )

Starting 'job_load_yearly_tenant_data'...


ValueError: No objects to concatenate

: 