# START - 26th Sept

In [1]:
import pandas as pd
import re
import warnings

# Silence every UserWarning for the rest of the run; these may be raised
# when reading older Excel file formats.
warnings.simplefilter('ignore', UserWarning)

print("--- Starting Automated Pipeline for Month 1 ---")

# --- 1. Configuration: Define File Paths and Sheet Names ---
# All input locations live here, so a renamed file or worksheet needs a
# single edit. Both mappings are keyed by the same short dataset identifier.
file_paths = dict(
    register="All time data from Register.xlsx",
    non_lodger="ato_tax_transparency_non_lodger.xlsx",
    lodge_once_comp="lodge_once.csv",
    lodge_once_details="lodge_once_cont.xlsx",
)
# Worksheet to read from each Excel workbook (the CSV input has no sheet).
sheet_names = dict(
    register="Statements",
    non_lodger="Non-Lodger",
    lodge_once_details="lodge_once",
)

# --- 2. Data Loading ---
def _read_workbook(dataset):
    """Read the configured worksheet of the Excel workbook registered as *dataset*."""
    return pd.read_excel(file_paths[dataset], sheet_name=sheet_names[dataset])


try:
    df_register = _read_workbook("register")
    df_non_lodger = _read_workbook("non_lodger")
    df_lodge_once_comp = pd.read_csv(file_paths["lodge_once_comp"])
    df_lodge_once_details = _read_workbook("lodge_once_details")
    print("Step 1/5: All source files loaded successfully.")
except Exception as e:
    print(f"ERROR: A file could not be loaded. Please check paths and names. Details: {e}")
    # Nothing downstream can run without all four inputs, so report the
    # problem and let the original exception abort the pipeline.
    raise

# --- 3. Data Preparation and Merging (Incorporating all fixes) ---
def _standardise_abn(abn_column):
    """Return *abn_column* as comparable ABN strings, leaving missing values missing.

    A plain ``astype(str)`` is not enough to make the two sources agree:

    * a numeric column that was loaded as float stringifies as
      ``'12345678901.0'`` and would never equal ``'12345678901'``;
    * embedded or surrounding whitespace (``'12 345 678 901'``) prevents a
      match, while ``extract_abn`` later in this pipeline produces
      whitespace-free ABNs;
    * missing values become the literal text ``'nan'``.

    Whitespace and a trailing ``.0`` are therefore removed, and entries that
    were missing in the input are restored to NaN afterwards.
    """
    is_missing = abn_column.isna()
    as_text = (
        abn_column.astype(str)
        .str.replace(r'\s+', '', regex=True)
        .str.replace(r'\.0$', '', regex=True)
    )
    return as_text.mask(is_missing)


# Merge the two 'lodge_once' files on a standardised ABN key.
df_lodge_once_details['abn'] = _standardise_abn(df_lodge_once_details['abn'])
df_lodge_once_comp['abn'] = _standardise_abn(df_lodge_once_comp['abn'])

# pandas joins null keys to each other, so rows without an ABN are left out
# of the merge; otherwise every ABN-less row on one side would pair with
# every ABN-less row on the other.
df_lodge_once_merged = pd.merge(
    df_lodge_once_details.dropna(subset=['abn']),
    df_lodge_once_comp.dropna(subset=['abn']),
    on='abn',
    how='inner',
)

# Discard rows whose ABN starts with 'dummy_' (presumably placeholder
# records rather than real entities -- confirm with the data owner).
has_dummy_abn = df_lodge_once_merged['abn'].str.startswith('dummy_')
df_lodge_once_merged = df_lodge_once_merged[~has_dummy_abn].copy()

def _to_match_key(names):
    """Cast *names* to str, upper-case and trim them so they compare consistently."""
    return names.astype(str).str.upper().str.strip()


# Clean the master register
# Header text: trim surrounding whitespace first, then remove line breaks.
trimmed_headers = df_register.columns.str.strip()
df_register.columns = trimmed_headers.str.replace('\n', '')
# Rows without a reporting period are removed from the register in place.
df_register.dropna(subset=['Reporting Period'], inplace=True)
df_register['entity_name_clean'] = _to_match_key(df_register['Reporting entities'])

# Clean the non-lodger list
df_non_lodger['entity_name_clean'] = _to_match_key(df_non_lodger['Entity Name'])
print("Step 2/5: Data has been cleaned and prepared for analysis.")

# --- 4. Core Analysis: Identify Non-Lodgers and Single-Lodgers ---
# Non-lodgers: entries of the non-lodger list whose cleaned name does not
# appear anywhere in the register.
lodged_entity_names = set(df_register['entity_name_clean'])
appears_in_register = df_non_lodger['entity_name_clean'].isin(lodged_entity_names)
df_never_lodged = df_non_lodger.loc[~appears_in_register].copy()

# Single-lodgers: register rows whose cleaned name occurs exactly once.
lodgement_counts = df_register['entity_name_clean'].value_counts()
single_lodger_names = lodgement_counts.loc[lodgement_counts.eq(1)].index
lodged_exactly_once = df_register['entity_name_clean'].isin(single_lodger_names)
df_single_lodgers_base = df_register.loc[lodged_exactly_once].copy()

# Enrich Single-Lodger data by extracting ABN and merging
# An 11-digit ABN, written contiguously or in 2-3-3-3 groups separated by
# whitespace. The digit guards on both sides stop the pattern from matching
# a slice of a longer number (for example a 13-digit reference).
_ABN_PATTERN = re.compile(r'(?<!\d)(\d{2}\s*\d{3}\s*\d{3}\s*\d{3})(?!\d)')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def extract_abn(text):
    """Return the first ABN found in *text* as an 11-digit string, or None.

    Args:
        text: Free text that may contain an ABN. Any non-string value
            (None, NaN, numbers) is treated as "no ABN present".

    Returns:
        The ABN with all whitespace removed, or None when *text* is not a
        string or holds no standalone 11-digit number in the accepted
        layout. No checksum validation is performed.
    """
    if not isinstance(text, str):
        return None
    match = _ABN_PATTERN.search(text)
    if match is None:
        return None
    return _WHITESPACE_PATTERN.sub('', match.group(1))

# Pull an ABN out of each register entry's free-text entity field; entries
# where extract_abn finds none receive None.
entity_text = df_single_lodgers_base['Reporting entities']
df_single_lodgers_base['extracted_abn'] = entity_text.apply(extract_abn)

# Left join: every single-lodger row is kept, and the lodge-once details
# are attached wherever the extracted ABN equals the merged data's 'abn'.
df_single_lodgers_enriched = df_single_lodgers_base.merge(
    df_lodge_once_merged,
    how='left',
    left_on='extracted_abn',
    right_on='abn',
)
print("Step 3/5: Cohort identification and data enrichment complete.")

# --- 5. Final Deliverable Preparation and Export ---
# Columns requested for each sheet of the Month 1 deliverable, in output
# order. Names that turn out to be absent from the data are filtered out
# later, so these lists describe the wish list rather than a guarantee.
non_lodger_columns = [
    'ABN',
    'Entity Name',
    'Total Income',
    'Entity size',
    'ASX listed?',
    'ASX300',
    'Industry_desc',
    'Division_Description',
    'State',
    'Mn_bus_addr_ln_1',
    'Mn_bus_sbrb',
    'Mn_bus_pc',
    'Ent_eml',
    'ACN',
]
single_lodger_columns = [
    'Reporting entities',
    'extracted_abn',
    'abn',
    'company_name',
    'Reporting Period',
    'Submitted',
    'Status',
    'Voluntary?',
    'Revenue',
    'abn_regn_dt',
    'abn_cancn_dt',
    'industry_desc',
    'last_submission_dttm',
    'num_compliant',
    'num_non_compliant',
    'expected_due_date',
]

def _available_columns(wanted, frame):
    """Return the entries of *wanted* that are columns of *frame*, in the requested order."""
    return [name for name in wanted if name in frame.columns]


# Requested columns missing from the data are skipped silently, so the
# selection below cannot fail on an absent column.
final_non_lodger_cols = _available_columns(non_lodger_columns, df_never_lodged)
final_single_lodger_cols = _available_columns(single_lodger_columns, df_single_lodgers_enriched)

# Build the two deliverable tables.
final_non_lodgers_df = df_never_lodged[final_non_lodger_cols]
final_single_lodgers_df = df_single_lodgers_enriched[final_single_lodger_cols]

# Write both tables to one workbook, one worksheet each, without the index.
output_filename = 'Month_1_Analysis_Deliverable_Automated.xlsx'
deliverable_sheets = (
    ('Never Lodged Entities', final_non_lodgers_df),
    ('Single Lodgement Entities', final_single_lodgers_df),
)
with pd.ExcelWriter(output_filename) as writer:
    for sheet_title, sheet_frame in deliverable_sheets:
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)
print(f"Step 4/5: Final datasets prepared and exported to '{output_filename}'.")

# --- 6. Final Summary Report ---
# Closing report: the row counts of the two exported tables followed by
# fixed notes about the deliverable.
summary_lines = (
    "\n--- Month 1 Pipeline Execution Summary ---",
    f"Identified {len(final_non_lodgers_df)} potential non-lodger entities.",
    f"Identified {len(final_single_lodgers_df)} single-lodgement entities.",
    "The final deliverable includes all available supporting details.",
    "Key Finding: 'Responsible persons' data is not explicitly available but can be proxied by 'associates' data in later analysis.",
    "--- Pipeline Complete ---",
)
for summary_line in summary_lines:
    print(summary_line)

--- Starting Automated Pipeline for Month 1 ---
Step 1/5: All source files loaded successfully.
Step 2/5: Data has been cleaned and prepared for analysis.
Step 3/5: Cohort identification and data enrichment complete.
Step 4/5: Final datasets prepared and exported to 'Month_1_Analysis_Deliverable_Automated.xlsx'.

--- Month 1 Pipeline Execution Summary ---
Identified 0 potential non-lodger entities.
Identified 0 single-lodgement entities.
The final deliverable includes all available supporting details.
Key Finding: 'Responsible persons' data is not explicitly available but can be proxied by 'associates' data in later analysis.
--- Pipeline Complete ---


# END
