In [1]:
import mysql.connector
import os
import logging
import shutil
import warnings
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Configure the root logger: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# Silence every UserWarning for the rest of the session (added to hide the
# spreadsheet data-validation warnings emitted while reading the workbooks).
warnings.simplefilter('ignore', UserWarning)

In [4]:
# Load timestamp: captured once when this cell runs (datetime.now() with no
# tz argument, i.e. naive local time) and stamped into the LoadDate column of
# every file read by the job below.
load_date: datetime = datetime.now()

# Escapes text so it can sit inside a single-quoted MySQL string literal.
def replace_text(text):
    """Escape backslashes and single quotes for a single-quoted MySQL literal.

    Backslashes are doubled first so that the backslash added in front of each
    quote is not itself re-escaped, and so that a value containing or ending
    in a backslash cannot swallow the closing quote or be read by MySQL as an
    escape sequence (e.g. "\\n").

    Args:
        text: The string to escape.

    Returns:
        The escaped string (without surrounding quotes).
    """
    return text.replace("\\", "\\\\").replace("'", "\\'")

# Column name -> MySQL column type for the base table. The dict order is the
# column order used for both CREATE TABLE and the INSERT column list.
_TENANT_COLUMN_TYPES = {
    'PropertyAddress' : 'VARCHAR (100)',
    'Room' : 'VARCHAR(30)',
    'FirstName' : 'VARCHAR(100)',
    'MiddleName': 'VARCHAR(100)',
    'LastName': 'VARCHAR(100)',
    'DateOfBirth' : 'VARCHAR(30)',
    'NINumber' : 'VARCHAR(30)',
    'CheckinDate' : 'VARCHAR(30)',
    'CheckoutDate': 'VARCHAR(30)' ,
    'NewHBClaim': 'VARCHAR(30)',
    'HBClaimRefNumber': 'VARCHAR(30)',
    'ReferralAgency': 'VARCHAR(100)',
    'Age' : "VARCHAR(10)",
    'Gender' : 'VARCHAR(30)',
    'Religion' : 'VARCHAR(200)',
    'Ethnicity' : 'VARCHAR(200)',
    'Nationality' : 'VARCHAR(200)',
    'Disability' : 'VARCHAR(200)',
    'SexualOrientation' : 'VARCHAR(100)',
    'SpokenLanguage': 'VARCHAR(100)',
    'RiskAssessment': 'VARCHAR(100)',
    'LengthOfStay': 'VARCHAR(100)',
    'CycleNumber' : 'VARCHAR(10)',
    'RecordStatus' : 'VARCHAR(100)',
    'Source' : 'VARCHAR(100)',
    'LoadDate': 'DATETIME'
}


def _read_source_file(path):
    """Read one staged file into a DataFrame; return None for unsupported extensions."""
    lowered = path.lower()
    if lowered.endswith('.xlsx'):
        return pd.read_excel(path, sheet_name='template')
    if lowered.endswith('.csv'):
        return pd.read_csv(path)
    return None


def _stage_and_read_source_files(base_dir):
    """Move each file under <base_dir>/source through processing/ and return the frames read."""
    source_path = os.path.join(base_dir, 'source')
    processing_path = os.path.join(base_dir, 'processing')
    processed_path = os.path.join(base_dir, 'processed')
    error_path = os.path.join(base_dir, 'errors')

    # shutil.move fails if the destination folder is missing, so create the
    # working folders up front.
    for folder in (processing_path, processed_path, error_path):
        os.makedirs(folder, exist_ok=True)

    frames = []
    for root, _dirs, files in os.walk(source_path):
        for file in files:
            staged_path = os.path.join(processing_path, file)
            shutil.move(os.path.join(root, file), staged_path)

            try:
                df = _read_source_file(staged_path)
                if df is None:
                    # Route unsupported files to errors/ instead of leaving
                    # them stranded in processing/.
                    raise ValueError("unsupported file type")
                df['Source'] = file
                df['LoadDate'] = load_date
            except Exception as exc:
                logging.warning(f"Error processing file -> {file}: {exc}")
                shutil.move(staged_path, os.path.join(error_path, file))
                continue

            frames.append(df)
            # NOTE(review): the file is marked processed once it has been
            # read, before the database insert runs; a later insert failure
            # does not move it back.
            shutil.move(staged_path, os.path.join(processed_path, file))
            logging.info(f"Finished processing -> {file}")

    return frames


def _prepare_tenant_frame(frames):
    """Concatenate the per-file frames, apply the minor clean-up and stringify every cell."""
    fdf = (
        pd.concat(frames, ignore_index=True)
        .dropna(subset=['FirstName', 'LastName'])
        .copy()
    )
    fdf['Disability'] = fdf['Disability'].str.replace(',', ' |', regex=False)
    fdf = fdf.replace(to_replace={np.nan : "UNKNOWN", '' : "UNKNOWN", ' ' : "UNKNOWN", "nan" : "UNKNOWN"})
    # Every value is sent to MySQL as text; the values are bound as query
    # parameters, so no manual quote escaping is applied here.
    for col in fdf.columns:
        fdf[col] = fdf[col].apply(str)
    return fdf


def job_load_yearly_tenant_data(host, user, root_pass, base_table):
    """Load the yearly tenant spreadsheets from ./source into a MySQL table.

    Steps:
      1. Connect to the ``base`` database (done first, so no file is moved
         when the database is unreachable).
      2. Move every file found under ``<cwd>/source`` to ``<cwd>/processing``,
         read ``.xlsx`` (sheet ``template``) and ``.csv`` files, tag each row
         with ``Source`` (file name) and ``LoadDate`` (module-level
         ``load_date``), then move the file to ``processed`` or, on failure or
         unsupported type, to ``errors``.
      3. Drop rows without FirstName/LastName, replace commas in Disability
         with " |", replace missing/blank values with "UNKNOWN" and convert
         every value to a string.
      4. Create ``base_table`` if it does not exist and insert the rows with a
         parameterised, explicitly column-listed INSERT.

    Database errors in step 4 are logged as warnings and the transaction is
    rolled back; the table and rows from earlier loads are left untouched.
    (Rollback only undoes the inserts on a transactional engine such as
    InnoDB.) Errors while staging or transforming the files propagate to the
    caller after the connection has been closed.

    Args:
        host: MySQL server host.
        user: MySQL user name.
        root_pass: Password for ``user``.
        base_table: Name of the target table inside the ``base`` database.
            It is interpolated into the SQL text verbatim (identifiers cannot
            be bound as parameters), so it must come from a trusted source.

    Returns:
        None.
    """
    # Establish connection to base
    base_db = mysql.connector.connect(
        host=host,
        user=user,
        password=root_pass,
        database='base'
    )
    base_cursor = base_db.cursor()

    try:
        frames = _stage_and_read_source_files(os.getcwd())
        if not frames:
            # pd.concat raises on an empty list; an empty source folder is
            # not an error.
            logging.info("No readable source files found; nothing to load")
            return

        fdf = _prepare_tenant_frame(frames)

        columns = list(_TENANT_COLUMN_TYPES)
        column_headers_string = ", ".join(
            f"{col} {data_type}" for col, data_type in _TENANT_COLUMN_TYPES.items()
        )
        create_query = f"create table if not exists {base_table} ({column_headers_string});"
        placeholders = ", ".join(["%s"] * len(columns))
        insert_query = (
            f"insert into {base_table} ({', '.join(columns)}) values ({placeholders})"
        )

        try:
            missing = [col for col in columns if col not in fdf.columns]
            if missing:
                raise ValueError(f"source data is missing columns: {missing}")

            extra = [col for col in fdf.columns if col not in _TENANT_COLUMN_TYPES]
            if extra:
                logging.warning(f"Ignoring columns not in the table definition -> {extra}")

            # Select by name so spreadsheet column order cannot shift values
            # into the wrong table columns.
            rows = list(fdf[columns].itertuples(index=False, name=None))

            base_cursor.execute(create_query)
            if rows:
                base_cursor.executemany(insert_query, rows)
            base_db.commit()

            logging.info(f"Created -> {base_table} with {len(rows)} rows")
        except Exception as e:
            # Undo the partial insert rather than dropping the table, which
            # would also destroy rows loaded by earlier runs.
            base_db.rollback()
            logging.warning(f"Error -> {e}")
    finally:
        # always release the cursor and connection, even when staging fails
        base_cursor.close()
        base_db.close()
        
if __name__ == '__main__':

    # Connection details are taken from the environment when set; the
    # fallbacks reproduce the previous local-development values so existing
    # runs keep working.
    # NOTE(review): the password fallback is still a hardcoded credential -
    # set MYSQL_PASSWORD and remove the default before sharing this notebook.
    host = os.environ.get("MYSQL_HOST", "localhost")
    user = os.environ.get("MYSQL_USER", "root")
    root_pass = os.environ.get("MYSQL_PASSWORD", "admin")
    base_table = 'yearly_tenant_data'

    job_load_yearly_tenant_data(host, user, root_pass, base_table)

2024-09-23 09:57:59,476 - INFO - Finished processing -> TL-yearly SURECITY.xlsx
2024-09-23 09:58:01,865 - INFO - Finished processing -> TL-yearly ACCESS (errors).xlsx
2024-09-23 09:58:03,719 - INFO - Finished processing -> TL-yearly THORNTON HOUSING.xlsx
2024-09-23 09:58:05,619 - INFO - Finished processing -> TL-yearly GREENGATES.xlsx
2024-09-23 09:58:07,555 - INFO - Finished processing -> TL-yearly MERCIAN HEART.xlsx
2024-09-23 09:58:09,390 - INFO - Finished processing -> TL-yearly FORWARD HOUSING.xlsx
2024-09-23 09:58:11,290 - INFO - Finished processing -> TL-yearly SELECT HOMES.xlsx
2024-09-23 09:58:13,284 - INFO - Finished processing -> TL-yearly WESTMIDLANDS.xlsx
2024-09-23 09:58:15,249 - INFO - Finished processing -> TL-yearly DREAM.xlsx
2024-09-23 09:58:17,169 - INFO - Finished processing -> TL-yearly SECOND CITY.xlsx
2024-09-23 09:58:19,127 - INFO - Finished processing -> TL-template-yearly HOLTE.xlsx
2024-09-23 09:58:20,985 - INFO - Finished processing -> TL-Yearly SERENITY CI