# Migration Status Spreadsheet Notebook

## Overview
This notebook generates the data for the migration tracking spreadsheet.

## What it does
- Extracts migration data from COLIN Extract database
- Retrieves filing information from LEAR database
- Retrieves affiliation information from Auth database
- Retrieves freeze status and early adopter information from COLIN database
- Merges and exports data to Excel format
- Composes a batch summary tab indicating migration overview of each batch

## Output
A formatted Excel spreadsheet tracking corporation migration status.

In [None]:
%pip install pandas
%pip install sqlalchemy>=2.0
%pip install oracledb
%pip install dotenv
%pip install psycopg2-binary
%pip install openpyxl

## Import Libraries and Load Configuration

Import required libraries and load environment variables. 

In [None]:
import oracledb
import os
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError, OperationalError
from dotenv import load_dotenv
from datetime import datetime

load_dotenv()

# Header text of every column on the "Migration Status" tab, keyed by a short
# internal name that the rest of the notebook uses to refer to the column.
COLUMN_NAMES = dict(
    group="Group",
    batch="Batch",
    email="Admin Email",
    corp_num="Incorporation Number",
    corp_name="Company Name",
    corp_type="Type",
    frozen_in_colin="Frozen in COLIN",
    banner_updated_in_colin="COLIN Banner Updated",
    status="Migration Status",
    date="Migrated Date",
    affiliated="Affiliated",
    account="Account ID",
    account_name="Account Name",
    filings="Filings Done",
    filing_date="Last Filing Date",
    legacy_outputs_uploaded_drs="Legacy Outputs Uploaded DRS",
    legacy_outputs_document_entries_created="Legacy Outputs Document Entries Created in LEAR",
)

# Header text of the "Batch Summary" tab, keyed by the column names returned
# by the batch summary query.
SUMMARY_COL_NAMES = dict(
    group_display_name="Group",
    batch_display_name="Batch",
    requested_date="Requested Date",
    batch_status="Migration Status",
    migrated_date="Migrated Date",
    batch_size="Batch Size",
    migrated_businesses="Migrated Businesses",
    notes="Notes",
)

# Worksheet (tab) titles.
TAB_NAMES = dict(
    status="Migration Status",
    summary="Batch Summary",
)

CONFIG = {
    # Maximum number of corporation numbers sent with one batched query.
    'batch_size': 5000,
    # Columns of the exported "Migration Status" tab, in left-to-right order.
    'final_excel_fields': [
        COLUMN_NAMES[field_key]
        for field_key in (
            "group",
            "batch",
            "email",
            "corp_num",
            "corp_name",
            "corp_type",
            "frozen_in_colin",
            "banner_updated_in_colin",
            "status",
            "date",
            "affiliated",
            "account",
            "account_name",
            "filings",
            "filing_date",
            "legacy_outputs_uploaded_drs",
            "legacy_outputs_document_entries_created",
        )
    ],
    # Formatting and destination settings for the Excel export.
    'excel_export': {
        'font_size': 14,
        'max_column_width': 65,
        'filled_color': 'FFCCCC',
        'output_dir': os.getenv('EXPORT_OUTPUT_DIR'),
    },
}

# Migration states shown in the spreadsheet; each key maps to itself.
MIGRATION_STATUS = {
    state: state
    for state in ('COMPLETED', 'FAILED', 'PENDING', 'PARTIAL')
}

# Values used in the yes / no / partial flag columns.
FLAG_STATUS = dict(
    YES='Y',
    NO='N',
    PARTIAL='PARTIAL',
)

# Configuration
BATCH_SIZE = CONFIG['batch_size']
FINAL_EXCEL_FIELDS = CONFIG['final_excel_fields']

# MIG_GROUP_IDS is read as a comma-separated list of integer group ids.
# An unset variable is treated as an empty string so that the check below
# raises the intended ValueError instead of an AttributeError on None.
# Tokens that are not purely digits are skipped.
MIG_GROUP_IDS = [
    int(token.strip())
    for token in (os.getenv('MIG_GROUP_IDS') or '').split(',')
    if token.strip().isdigit()
]

if not MIG_GROUP_IDS:
    raise ValueError("MIG_GROUP_IDS is empty! Need at least one group id.")

# Comma-separated form, interpolated into the SQL "IN (...)" filters.
mig_group_ids = ','.join(str(x) for x in MIG_GROUP_IDS)

# Oracle schema that qualifies the COLIN table names.
ORACLE_SCHEMA = os.getenv('DATABASE_COLIN_ORACLE_SCHEMA')

if not ORACLE_SCHEMA:
    raise ValueError("DATABASE_COLIN_ORACLE_SCHEMA is not set.")

print("Libraries imported and configuration loaded successfully.")

## Database Setup

Configure database connections using environment variables.

In [None]:
from urllib.parse import quote

# Connection settings for every database the notebook talks to, read from
# environment variables.  After the loop below each entry is replaced by a
# dict holding only the assembled connection URI: {'uri': ...}.
DATABASE_CONFIG = {
    'colin_extract': {
        'username': os.getenv("DATABASE_COLIN_EXTRACT_USERNAME"),
        'password': os.getenv("DATABASE_COLIN_EXTRACT_PASSWORD"),
        'host': os.getenv("DATABASE_COLIN_EXTRACT_HOST"),
        'port': os.getenv("DATABASE_COLIN_EXTRACT_PORT"),
        'name': os.getenv("DATABASE_COLIN_EXTRACT_NAME")
    },
    'lear': {
        'username': os.getenv("DATABASE_LEAR_USERNAME"),
        'password': os.getenv("DATABASE_LEAR_PASSWORD"),
        'host': os.getenv("DATABASE_LEAR_HOST"),
        'port': os.getenv("DATABASE_LEAR_PORT"),
        'name': os.getenv("DATABASE_LEAR_NAME")
    },
    'auth': {
        'username': os.getenv("DATABASE_AUTH_USERNAME"),
        'password': os.getenv("DATABASE_AUTH_PASSWORD"),
        'host': os.getenv("DATABASE_AUTH_HOST"),
        'port': os.getenv("DATABASE_AUTH_PORT"),
        'name': os.getenv("DATABASE_AUTH_NAME")
    },
    'doc': {
        'username': os.getenv("DATABASE_DOC_USERNAME"),
        'password': os.getenv("DATABASE_DOC_PASSWORD"),
        'host': os.getenv("DATABASE_DOC_HOST"),
        'port': os.getenv("DATABASE_DOC_PORT"),
        'name': os.getenv("DATABASE_DOC_NAME")
    },
    'colin_oracle': {
        'username': os.getenv("DATABASE_COLIN_ORACLE_USERNAME"),
        'password': os.getenv("DATABASE_COLIN_ORACLE_PASSWORD"),
        'host': os.getenv("DATABASE_COLIN_ORACLE_HOST"),
        'port': os.getenv("DATABASE_COLIN_ORACLE_PORT"),
        'name': os.getenv("DATABASE_COLIN_ORACLE_NAME"),
    },
}


def build_database_uri(db_key, db_config):
    """Return the SQLAlchemy connection URI for one database.

    Args:
        db_key: Key of the database in ``DATABASE_CONFIG``; ``'colin_oracle'``
            selects the ``oracle+oracledb`` driver, anything else
            ``postgresql``.
        db_config: Mapping with ``username``, ``password``, ``host``, ``port``
            and ``name``.

    Returns:
        str: The URI, with username and password percent-encoded so that
        characters such as ``@``, ``:``, ``/`` or ``#`` in a credential cannot
        be mistaken for URI delimiters.

    NOTE: credentials are expected in raw form; a value that is already
    percent-encoded in the environment would be encoded a second time.
    """
    driver = 'oracle+oracledb' if db_key == 'colin_oracle' else 'postgresql'
    # safe='' also encodes "/", which quote() leaves untouched by default.
    username = quote(str(db_config['username']), safe='')
    password = quote(str(db_config['password']), safe='')
    return f"{driver}://{username}:{password}@{db_config['host']}:{db_config['port']}/{db_config['name']}"


for db_key, db_config in DATABASE_CONFIG.items():
    # Surface unset settings here; otherwise they only show up later as the
    # text "None" inside the URI when the connection attempt fails.
    missing_settings = [setting for setting, value in db_config.items() if not value]
    if missing_settings:
        print(f"WARNING: {db_key} is missing settings: {', '.join(missing_settings)}")

    # Replacing the value of an existing key is safe while iterating items().
    DATABASE_CONFIG[db_key] = {'uri': build_database_uri(db_key, db_config)}

print("Database configurations successfully.")


## Create Database Engines

Create and test database connections for all configured databases.

In [None]:
# Initialise the Oracle client libraries before any engine is created.
oracledb.init_oracle_client()

# db_key -> SQLAlchemy engine, filled only with engines that passed the probe.
engines = {}

for db_key, config in DATABASE_CONFIG.items():
    label = db_key.upper()
    # The COLIN Oracle database is probed against DUAL, the others with a bare SELECT.
    probe_sql = "SELECT 1 FROM DUAL" if db_key == 'colin_oracle' else "SELECT 1"

    try:
        engine = create_engine(config['uri'])

        # Open a connection and run the probe to prove the engine is usable.
        with engine.connect() as conn:
            conn.execute(text(probe_sql))
    except Exception as e:
        # OperationalError is tested first: it is a subclass of SQLAlchemyError.
        if isinstance(e, OperationalError):
            print(f"{label} database connection failed: {e}")
        elif isinstance(e, SQLAlchemyError):
            print(f"{label} database engine creation failed: {e}")
        else:
            print(f"{label} unexpected error: {e}")
        raise
    else:
        engines[db_key] = engine
        print(f"{label} database engine created and tested successfully.")

# Reverse lookup (engine -> db_key), used to label log output.
ENGINE_NAMES = dict(zip(engines.values(), engines.keys()))

print("All database engines ready for use.")


## Extract Migration Data

Query COLIN Extract database to get list of migrated corporations with their details.

In [None]:
# One row per corporation / batch assignment in the selected migration groups.
# Rows are kept when the corporation has a COMPLETED or FAILED processing
# record for the 'prod' environment, or a NULL processed_status; the latter
# are reported as PENDING.
colin_extract_query = f"""
SELECT
    g.display_name AS "{COLUMN_NAMES['group']}",
    b.display_name AS "{COLUMN_NAMES['batch']}",
    mcb.corp_num AS "{COLUMN_NAMES['corp_num']}",
    c.admin_email AS "{COLUMN_NAMES['email']}",
    cn.corp_name AS "{COLUMN_NAMES['corp_name']}",
    c.corp_type_cd AS "{COLUMN_NAMES['corp_type']}",
    CASE
        WHEN cp.processed_status = 'COMPLETED' THEN '{MIGRATION_STATUS['COMPLETED']}'
        WHEN cp.processed_status = 'FAILED' THEN '{MIGRATION_STATUS['FAILED']}'
        WHEN cp.processed_status IS NULL THEN '{MIGRATION_STATUS['PENDING']}'
        ELSE '{MIGRATION_STATUS['PENDING']}'
    END AS "{COLUMN_NAMES['status']}",
    cp.create_date::date AS "{COLUMN_NAMES['date']}"
FROM
    mig_corp_batch mcb
    JOIN 
        mig_batch b ON mcb.mig_batch_id = b.id
    JOIN 
        mig_group g ON b.mig_group_id = g.id
    LEFT JOIN 
        corporation c ON mcb.corp_num = c.corp_num
    LEFT JOIN 
        corp_processing cp ON mcb.corp_num = cp.corp_num
    LEFT JOIN 
        corp_name cn ON c.corp_num = cn.corp_num 
            AND cn.corp_name_typ_cd IN ('CO', 'NB') 
            AND cn.end_event_id IS NULL
WHERE
    g.id IN ({mig_group_ids})
    AND (
        (cp.processed_status = 'COMPLETED' AND cp.environment = 'prod')
        OR (cp.processed_status = 'FAILED' AND cp.environment = 'prod')
        OR cp.processed_status IS NULL
    )
ORDER BY
    g.display_name, 
    b.display_name,
    CASE
        WHEN cp.processed_status = 'COMPLETED' THEN 0
        WHEN cp.processed_status = 'FAILED' THEN 1
        ELSE 2
    END, 
    cp.create_date DESC,
    cn.corp_name;
"""

try:
    with engines['colin_extract'].connect() as extract_conn:
        colin_extract_df = pd.read_sql(colin_extract_query, extract_conn)

    # An empty extract leaves nothing to report on, so it is treated as an error.
    if not colin_extract_df.empty:
        print(f"Fetched {len(colin_extract_df)} rows from COLIN Extract database.")
    else:
        raise ValueError("COLIN Extract database query returned empty result")
except Exception as e:
    print(f"Error fetching data from COLIN Extract: {e}")
    raise

# Show every fetched row, without pandas' default row truncation.
with pd.option_context('display.max_rows', None):
    display(colin_extract_df)

## Batch Query Function
A function to perform batch queries across multiple databases.

In [None]:
def batch_query(query_sql, db_engine, batch_size, columns, is_colin_oracle=False, additional_params=None, dedup=True):
    """Run ``query_sql`` against ``db_engine`` in chunks of corporation numbers.

    The corporation numbers are the unique values of the incorporation number
    column of the notebook-level ``colin_extract_df``.  A chunk whose query
    fails is reported and skipped, so the combined result can be partial; a
    summary warning is printed when that happens.

    Args:
        query_sql: SQL text.  Non-Oracle queries receive the chunk as the
            ``identifiers`` bind parameter.  Oracle queries must contain the
            literal text ``{identifiers}``, which is replaced by a quoted,
            comma-separated list of corporation numbers.
        db_engine: SQLAlchemy engine to run the query on.
        batch_size: Maximum number of corporation numbers per query; must be
            at least 1.  For Oracle it is capped at 1000.
        columns: Column names of the empty DataFrame returned when no chunk
            produced a result.
        is_colin_oracle: When True, a leading ``BC`` is stripped from the
            corporation numbers before querying and restored in the result,
            and the identifiers are inlined into the SQL text.
        additional_params: Extra bind parameters merged into every non-Oracle
            query.  Not used on the Oracle path.
        dedup: When True, keep only the last row per corporation number.

    Returns:
        pandas.DataFrame: Concatenated rows of all successful chunks, or an
        empty frame with ``columns`` when none succeeded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Oracle rejects IN lists with more than 1000 expressions (ORA-01795).
    oracle_in_list_limit = 1000

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    corp_col = COLUMN_NAMES['corp_num']

    # Get unique corporation numbers from the dataset
    unique_corp_nums = colin_extract_df[corp_col].unique().tolist()

    if is_colin_oracle:
        # Map the Oracle form (without the BC prefix) back to the original number.
        corp_num_mapping = {
            corp_num[2:] if corp_num.startswith('BC') else corp_num: corp_num
            for corp_num in unique_corp_nums
        }
        unique_corp_nums = list(corp_num_mapping)
        # Without this cap an oversized chunk fails and is silently skipped below.
        batch_size = min(batch_size, oracle_in_list_limit)
    else:
        corp_num_mapping = None

    corp_number_batches = [
        unique_corp_nums[start:start + batch_size]
        for start in range(0, len(unique_corp_nums), batch_size)
    ]
    db_name = ENGINE_NAMES.get(db_engine, "Unknown database")
    batch_results = []
    failed_batches = []

    # Process each batch of corporation numbers
    for batch_idx, current_batch_corp_numbers in enumerate(corp_number_batches, start=1):
        try:
            with db_engine.connect() as conn:
                if is_colin_oracle:
                    # Identifiers are inlined as SQL string literals; embedded
                    # single quotes are doubled so the statement stays well formed.
                    corp_nums_str = ', '.join(
                        "'" + str(x).replace("'", "''") + "'"
                        for x in current_batch_corp_numbers
                    )
                    actual_query = query_sql.replace('{identifiers}', corp_nums_str)
                    df = pd.read_sql(actual_query, conn)
                else:
                    params = {'identifiers': current_batch_corp_numbers}
                    if additional_params:
                        params.update(additional_params)

                    df = pd.read_sql(query_sql, conn, params=params)
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"{db_name} Batch {batch_idx}/{len(corp_number_batches)} failed: {e}")
            failed_batches.append(batch_idx)
            continue

        # Store results from this batch
        batch_results.append(df)
        print(f"{db_name} Batch {batch_idx}: {len(df)} records fetched")

    if failed_batches:
        print(
            f"WARNING: {db_name} {len(failed_batches)} of {len(corp_number_batches)} "
            f"batches failed ({failed_batches}); results are incomplete"
        )

    # Process combined results
    if batch_results:
        combined_df = pd.concat(batch_results, ignore_index=True)

        # Convert back to corp format starts with BC
        if is_colin_oracle and corp_num_mapping:
            combined_df[corp_col] = combined_df[corp_col].map(corp_num_mapping)

        if dedup:
            combined_df = combined_df.drop_duplicates(corp_col, keep='last')
        print(f"Total records fetched: {len(combined_df)}")
    else:
        combined_df = pd.DataFrame(columns=columns)
        print("No records fetched")

    return combined_df

## Get Filing Data

Retrieve and aggregate filing information from LEAR database for migrated corporations.

In [None]:
# Per business: the completed filings whose source is 'LEAR', aggregated into
# one comma-separated string, plus the date of the most recent such filing.
lear_combined_query = f"""
SELECT 
    b.id,
    b.identifier AS "{COLUMN_NAMES['corp_num']}",
    COALESCE(
        STRING_AGG(f.filing_type, ', ' ORDER BY f.filing_type), 
        ''
    ) AS "{COLUMN_NAMES['filings']}",
    MAX(f.filing_date)::date AS "{COLUMN_NAMES['filing_date']}"
FROM businesses b
LEFT JOIN filings f ON b.id = f.business_id 
    AND f.source = 'LEAR' 
    AND f.status = 'COMPLETED'
WHERE b.identifier = ANY(%(identifiers)s)
GROUP BY b.id, b.identifier;
"""

# Column layout used when the query yields no data at all.
lear_result_columns = [
    'id',
    COLUMN_NAMES['corp_num'],
    COLUMN_NAMES["filings"],
    COLUMN_NAMES["filing_date"],
]

lear_combined_df = batch_query(
    lear_combined_query,
    engines['lear'],
    BATCH_SIZE,
    lear_result_columns,
)

# Show the full result without row truncation.
with pd.option_context('display.max_rows', None):
    display(lear_combined_df)

In [None]:
def validate_and_fill_missing_data(colin_extract_df, source_df, source_name, default_values):
    """Append default rows for processed corporations absent from ``source_df``.

    Args:
        colin_extract_df: Migration extract holding the incorporation number
            and migration status columns.
        source_df: Data fetched from another database, holding the
            incorporation number column.
        source_name: Database name used in the log message.
        default_values: Column -> value mapping applied to every added row.

    Returns:
        ``source_df`` itself when nothing is missing, otherwise a new frame
        with one default row appended per missing corporation.
    """
    corp_col = COLUMN_NAMES['corp_num']

    # Corporations whose migration status is anything other than PENDING.
    is_processed = colin_extract_df[COLUMN_NAMES['status']] != MIGRATION_STATUS['PENDING']
    non_pending_corps = colin_extract_df.loc[is_processed, corp_col].tolist()

    # Processed corporations for which the source returned no row.
    missing_corps = set(non_pending_corps) - set(source_df[corp_col].tolist())
    if not missing_corps:
        return source_df

    print(f" {len(missing_corps)} corporations missing from {source_name} database")

    # One filler row per missing corporation, carrying the default values.
    filler_rows = [{corp_col: corp_num, **default_values} for corp_num in missing_corps]
    return pd.concat([source_df, pd.DataFrame(filler_rows)], ignore_index=True)

## Get Affiliation Data

Query the Auth database to get affiliation information, including whether corporations are affiliated and their account IDs.

In [None]:
# Per business identifier: whether at least one affiliation exists, plus the
# affiliated org ids and org names as comma-separated strings.
auth_query = f"""
SELECT
    e.business_identifier AS "{COLUMN_NAMES['corp_num']}",
    CASE WHEN COUNT(a.id) > 0 THEN '{FLAG_STATUS['YES']}' ELSE '{FLAG_STATUS['NO']}' END AS "{COLUMN_NAMES['affiliated']}",
    COALESCE(
        STRING_AGG(a.org_id::text, ', ' ORDER BY a.org_id),
        ''
    ) AS "{COLUMN_NAMES['account']}",
    COALESCE(
        STRING_AGG(o.name, ', ' ORDER BY a.org_id),
        ''
    ) AS "{COLUMN_NAMES['account_name']}"
FROM
    entities e
LEFT JOIN
    affiliations a ON e.id = a.entity_id 
LEFT JOIN
    orgs o ON a.org_id = o.id
WHERE
    e.business_identifier = ANY(%(identifiers)s)
GROUP BY
    e.business_identifier
"""

# Column layout used when the query yields no data at all.
auth_result_columns = [
    COLUMN_NAMES['corp_num'],
    COLUMN_NAMES['affiliated'],
    COLUMN_NAMES['account'],
    COLUMN_NAMES['account_name'],
]

auth_combined_df = batch_query(
    auth_query,
    engines['auth'],
    BATCH_SIZE,
    auth_result_columns,
)

# Values given to processed corporations that have no row in Auth.
AUTH_DEFAULT_VALUES = {
    COLUMN_NAMES['affiliated']: FLAG_STATUS['NO'],
    **dict.fromkeys([COLUMN_NAMES['account'], COLUMN_NAMES['account_name']], ''),
}

# Add a default row for every processed corporation Auth did not return.
auth_combined_df = validate_and_fill_missing_data(
    colin_extract_df, auth_combined_df, 'Auth', AUTH_DEFAULT_VALUES
)

# Show the full result without row truncation.
with pd.option_context('display.max_rows', None):
    display(auth_combined_df)

## Get COLIN Data

Retrieve corporation freeze status and early adopter information from Oracle COLIN database.

In [None]:
# Per corporation: 'Y' when CORP_FROZEN_TYP_CD is 'C', and 'Y' when a matching
# CORP_EARLY_ADOPTERS row exists.  The {identifiers} placeholder is left in
# the text for batch_query to substitute.
colin_oracle_query = f"""
SELECT
    c.corp_num AS "{COLUMN_NAMES['corp_num']}",
    CASE WHEN c.CORP_FROZEN_TYP_CD = 'C' THEN '{FLAG_STATUS['YES']}' ELSE '{FLAG_STATUS['NO']}' END AS "{COLUMN_NAMES['frozen_in_colin']}",
    CASE WHEN cea.corp_num IS NOT NULL THEN '{FLAG_STATUS['YES']}' ELSE '{FLAG_STATUS['NO']}' END AS "{COLUMN_NAMES['banner_updated_in_colin']}"
FROM
    {ORACLE_SCHEMA}.CORPORATION c
    LEFT JOIN {ORACLE_SCHEMA}.CORP_EARLY_ADOPTERS cea ON c.corp_num = cea.corp_num
WHERE
    c.corp_num IN ({{identifiers}})
"""

# Column layout used when the query yields no data at all.
colin_oracle_result_columns = [
    COLUMN_NAMES['corp_num'],
    COLUMN_NAMES['frozen_in_colin'],
    COLUMN_NAMES['banner_updated_in_colin'],
]

colin_oracle_combined_df = batch_query(
    colin_oracle_query,
    engines['colin_oracle'],
    BATCH_SIZE,
    colin_oracle_result_columns,
    is_colin_oracle=True,
)

# Values given to processed corporations that have no row in COLIN.
COLIN_DEFAULT_VALUES = dict.fromkeys(
    [COLUMN_NAMES['frozen_in_colin'], COLUMN_NAMES['banner_updated_in_colin']],
    FLAG_STATUS['NO'],
)

# Add a default row for every processed corporation COLIN did not return.
colin_oracle_combined_df = validate_and_fill_missing_data(
    colin_extract_df, colin_oracle_combined_df, 'COLIN', COLIN_DEFAULT_VALUES
)

# Show the full result without row truncation.
with pd.option_context('display.max_rows', None):
    display(colin_oracle_combined_df)

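## Get Legacy Outputs

Retrieve COLIN event ids and document file keys from the LEAR database, match them against the records in the Doc (DRS) database, and derive the legacy outputs status of each corporation.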

In [None]:
# One row per (business, colin event id, document file key) for filings whose
# source is 'COLIN', excluding the 'lear_tombstone' filing type.  file_key is
# NULL when a filing has no document row.
lear_colin_event_detail_query = f"""
SELECT 
    b.identifier AS "{COLUMN_NAMES['corp_num']}",
    cei.colin_event_id,
    d.file_key
FROM businesses b
JOIN filings f ON b.id = f.business_id
JOIN colin_event_ids cei ON f.id = cei.filing_id
LEFT JOIN documents d ON f.id = d.filing_id
WHERE b.identifier = ANY(%(identifiers)s)
AND f.source = 'COLIN'
AND f.filing_type != 'lear_tombstone'
"""

# Column layout used when the query yields no data at all.
lear_event_result_columns = [COLUMN_NAMES['corp_num'], 'colin_event_id', 'file_key']

# dedup is off: a corporation legitimately has many event rows.
lear_colin_event_detail_df = batch_query(
    lear_colin_event_detail_query,
    engines['lear'],
    BATCH_SIZE,
    lear_event_result_columns,
    dedup=False,
)

# Show the full LEAR result without row truncation.
with pd.option_context('display.max_rows', None):
    display(lear_colin_event_detail_df)

In [None]:
# DRS rows whose entity id is one of the corporations in the current batch
# and whose event id is one of the COLIN event ids found in LEAR.
drs_detail_query = f"""
    SELECT 
        ar.entity_id AS "{COLUMN_NAMES['corp_num']}",
        ar.event_id,
        ar.document_service_id
    FROM application_reports ar
    WHERE ar.entity_id = ANY(%(identifiers)s)
    AND ar.event_id = ANY(%(event_ids)s)
    """

if lear_colin_event_detail_df.empty:
    # Without COLIN events there is nothing to look up in DRS.
    drs_detail_df = pd.DataFrame(columns=[COLUMN_NAMES['corp_num'], 'event_id', 'document_service_id'])
    print("No colin event data found in LEAR, skipping DRS query.")
else:
    colin_event_ids = lear_colin_event_detail_df['colin_event_id'].unique().tolist()
    corp_nums = lear_colin_event_detail_df[COLUMN_NAMES['corp_num']].unique().tolist()

    # The complete event id list travels with every batch as an extra bind parameter.
    drs_detail_df = batch_query(
        drs_detail_query,
        engines['doc'],
        BATCH_SIZE,
        [COLUMN_NAMES['corp_num'], 'event_id', 'document_service_id'],
        additional_params={'event_ids': colin_event_ids},
        dedup=False,
    )

    # Show the full DRS result without row truncation.
    print(f"Fetched {len(drs_detail_df)} rows from Doc database for DRS legacy outputs.")
    with pd.option_context('display.max_rows', None):
        display(drs_detail_df)

In [None]:
def calculate_legacy_status_logic(corp_num, corp_colin_event_lear, corp_drs):
    """Derive the two legacy outputs flags for a single corporation.

    Args:
        corp_num: Corporation number, used only in the data quality message.
        corp_colin_event_lear: LEAR rows of the corporation, with the columns
            ``colin_event_id`` and ``file_key``.
        corp_drs: DRS rows of the corporation, with the columns ``event_id``
            and ``document_service_id``.

    Returns:
        tuple: ``(drs_status, lear_status)``, each a ``FLAG_STATUS`` value.
    """
    yes = FLAG_STATUS['YES']
    no = FLAG_STATUS['NO']
    partial = FLAG_STATUS['PARTIAL']
    has_lear_rows = not corp_colin_event_lear.empty

    # DRS flag: no rows -> N; the DRS event ids equal the LEAR colin event
    # ids -> Y; any other combination -> PARTIAL.
    if corp_drs.empty:
        drs_status = no
    else:
        lear_event_ids = (
            set(corp_colin_event_lear['colin_event_id'].tolist()) if has_lear_rows else set()
        )
        drs_event_ids = set(corp_drs['event_id'].tolist())
        if not drs_event_ids:
            drs_status = no
        else:
            drs_status = yes if drs_event_ids == lear_event_ids else partial

    # LEAR flag: without LEAR rows no document entry can exist.
    if not has_lear_rows:
        return drs_status, no

    # NULL file keys are ignored.
    lear_file_keys = set(corp_colin_event_lear['file_key'].dropna().tolist())

    if drs_status == no:
        # LEAR document entries without any DRS upload are inconsistent data.
        if lear_file_keys:
            print(f"DATA QUALITY ISSUE: Corp {corp_num} - DRS=N but LEAR has {len(lear_file_keys)} document entries")
        return drs_status, no

    # NULL document service ids are ignored.
    drs_document_ids = set(corp_drs['document_service_id'].dropna().tolist())
    matched_count = len(lear_file_keys & drs_document_ids)

    if drs_status == yes and matched_count == len(drs_document_ids):
        # Every DRS document id has a matching LEAR file key.
        lear_status = yes
    elif matched_count > 0:
        lear_status = partial
    else:
        lear_status = no

    return drs_status, lear_status

In [None]:
def calculate_corp_legacy_outputs_status():
    """Build the legacy outputs status frame for all non-pending corporations.

    Reads the notebook-level frames ``colin_extract_df``,
    ``lear_colin_event_detail_df`` and ``drs_detail_df``.

    Returns:
        pandas.DataFrame: One row per non-pending corporation with the
        incorporation number and the two legacy outputs flags.  A corporation
        whose calculation raised gets empty strings for both flags.
    """
    corp_col = COLUMN_NAMES['corp_num']
    drs_col = COLUMN_NAMES['legacy_outputs_uploaded_drs']
    lear_col = COLUMN_NAMES['legacy_outputs_document_entries_created']

    not_pending = colin_extract_df[COLUMN_NAMES['status']] != MIGRATION_STATUS['PENDING']
    processed_corps = colin_extract_df.loc[not_pending, corp_col].unique().tolist()

    if not processed_corps:
        print("No non-pending corporations found")
        return pd.DataFrame(columns=[corp_col, drs_col, lear_col])

    # Group both detail frames once so each corporation is a single lookup.
    lear_by_corp = lear_colin_event_detail_df.groupby(corp_col)
    drs_by_corp = drs_detail_df.groupby(corp_col)
    corps_in_lear = set(lear_by_corp.groups.keys())
    corps_in_drs = set(drs_by_corp.groups.keys())

    # Stand-ins for corporations that have no rows in a source.
    no_lear_rows = pd.DataFrame(columns=lear_colin_event_detail_df.columns)
    no_drs_rows = pd.DataFrame(columns=drs_detail_df.columns)

    def rows_for(grouped, known_corps, fallback, corp_num):
        """Return the group of ``corp_num``, or ``fallback`` if it has none."""
        if corp_num in known_corps:
            return grouped.get_group(corp_num)
        return fallback

    result = []
    for corp_num in processed_corps:
        try:
            lear_rows = rows_for(lear_by_corp, corps_in_lear, no_lear_rows, corp_num)
            drs_rows = rows_for(drs_by_corp, corps_in_drs, no_drs_rows, corp_num)
            drs_status, lear_status = calculate_legacy_status_logic(corp_num, lear_rows, drs_rows)
        except Exception as e:
            # Keep the corporation in the output, with blank flags.
            print(f"Error processing corporation {corp_num}: {e}")
            drs_status, lear_status = '', ''

        result.append({corp_col: corp_num, drs_col: drs_status, lear_col: lear_status})

    print(f"Processing completed. Generated status for {len(result)} corporations.")
    return pd.DataFrame(result)

# Status flags for every non-pending corporation.
legacy_outputs_df = calculate_corp_legacy_outputs_status()

# Both legacy outputs flags default to "N" for a corporation without a row.
LEGACY_OUTPUTS_DEFAULT_VALUES = dict.fromkeys(
    [
        COLUMN_NAMES['legacy_outputs_uploaded_drs'],
        COLUMN_NAMES['legacy_outputs_document_entries_created'],
    ],
    FLAG_STATUS['NO'],
)

# Add a default row for every processed corporation that is still missing.
legacy_outputs_df = validate_and_fill_missing_data(
    colin_extract_df, legacy_outputs_df, 'Legacy Outputs', LEGACY_OUTPUTS_DEFAULT_VALUES
)

# Show the full result without row truncation.
print(f"Generated legacy outputs status for {len(legacy_outputs_df)} corporations.")
with pd.option_context('display.max_rows', None):
    display(legacy_outputs_df)

## Merge Data

Combine the data from the COLIN Extract, LEAR, Auth, and COLIN Oracle databases with the legacy outputs status into a single merged dataset.

In [None]:
# Left-join every enrichment frame onto the COLIN Extract rows by
# incorporation number, then keep only the export columns in export order.
try:
    result = (colin_extract_df
              .merge(lear_combined_df,
                     on=COLUMN_NAMES['corp_num'],
                     how='left')
              .merge(auth_combined_df,
                     on=COLUMN_NAMES['corp_num'],
                     how='left')
              .merge(colin_oracle_combined_df,
                     on=COLUMN_NAMES['corp_num'],
                     how='left')
              .merge(legacy_outputs_df,
                     on=COLUMN_NAMES['corp_num'],
                     how='left')
              )

    # Select final fields
    merged_df = result[FINAL_EXCEL_FIELDS]

    print(f"Data merged successfully: {len(merged_df)} rows")

except Exception as e:
    print(f"Error merging data: {e}")
    # Re-raise: continuing would either fail on an undefined merged_df or
    # reuse a stale merged_df left over from an earlier run of this cell.
    raise

# Display merged results
with pd.option_context('display.max_rows', None):
    display(merged_df)

## Get Batch Summary Dataframe
Query the COLIN Extract database to compose the Batch Summary tab.
<br />Currently including 8 columns: Group, Batch, Requested Date, Migration Status, Migrated Date, Batch Size, Migrated Businesses, Notes

In [None]:
# One row per batch of the selected groups: its size, the number of
# corporations with a COMPLETED 'prod' processing record, and a batch status
# derived from those two counts.
batch_summary_query = f"""
WITH batch_status AS (
            SELECT 
                b.id as batch_id,
                g.display_name as group_display_name,
                b.display_name as batch_display_name,
                b.requested_date,
                b.migrated_date,
                b.notes,
                COUNT(DISTINCT mcb.corp_num) as batch_size,
                COUNT(DISTINCT CASE 
                    WHEN cp.processed_status = 'COMPLETED' AND cp.environment = 'prod'
                    THEN mcb.corp_num 
                END) as completed_corps,
                COUNT(DISTINCT CASE 
                    WHEN cp.corp_num IS NOT NULL AND cp.environment = 'prod'
                    THEN mcb.corp_num 
                END) as has_processing_records
            FROM mig_group g
            JOIN mig_batch b ON g.id = b.mig_group_id
            LEFT JOIN mig_corp_batch mcb ON b.id = mcb.mig_batch_id
            LEFT JOIN corp_processing cp ON mcb.corp_num = cp.corp_num
            WHERE g.id IN ({mig_group_ids})
            GROUP BY b.id, g.display_name, b.display_name, b.requested_date, b.migrated_date
        )
        SELECT 
            group_display_name,
            batch_display_name,
            requested_date,
            migrated_date,
            batch_size,
            completed_corps as migrated_businesses,
            notes,
            CASE
                WHEN has_processing_records = 0 THEN '{MIGRATION_STATUS['PENDING']}'
                WHEN batch_size = completed_corps THEN '{MIGRATION_STATUS['COMPLETED']}'
                WHEN completed_corps = 0 AND has_processing_records > 0 THEN '{MIGRATION_STATUS['FAILED']}'
                WHEN completed_corps > 0 AND completed_corps < batch_size THEN '{MIGRATION_STATUS['PARTIAL']}'
                ELSE '{MIGRATION_STATUS['PENDING']}'
            END as batch_status
        FROM batch_status
        ORDER BY group_display_name, batch_display_name
"""
try:
    with engines['colin_extract'].connect() as summary_conn:
        batch_summary_df = pd.read_sql(batch_summary_query, summary_conn)

    # A summary without any batch is treated as an error.
    if not batch_summary_df.empty:
        print(f"Composed {len(batch_summary_df)} entries for batch summary")
    else:
        raise ValueError("batch summary data query returned 0 result")

    # Put the columns into spreadsheet order, then swap in the display headers.
    column_order = [
        'group_display_name',
        'batch_display_name',
        'requested_date',
        'batch_status',
        'migrated_date',
        'batch_size',
        'migrated_businesses',
        'notes',
    ]
    batch_summary_df = batch_summary_df[column_order].rename(columns=SUMMARY_COL_NAMES)
except Exception as e:
    print(f"Error fetching data to compose batch summary: {e}")
    raise

# Show the full summary without row truncation.
with pd.option_context('display.max_rows', None):
    display(batch_summary_df)

## Export to Excel

Generate formatted Excel file with the merged migration tracking data.

In [None]:
def _highlight_rule(column_name, condition_value, **extras):
    """Return one highlighting rule that uses the configured fill colour."""
    return {
        'column_name': column_name,
        'condition_value': condition_value,
        'fill_color': CONFIG['excel_export']['filled_color'],
        **extras,
    }


def _processed_rows_only():
    """Return the extra condition limiting a rule to COMPLETED / FAILED rows."""
    return {
        'additional_condition': {
            'column': COLUMN_NAMES['status'],
            'values': [MIGRATION_STATUS['COMPLETED'], MIGRATION_STATUS['FAILED']],
        }
    }


# Highlighting rules for the Migration Status tab.  A cell is filled when its
# value matches 'condition_value' (a single value or a list of values) and,
# where an 'additional_condition' is present, the row's value in that column
# is one of the listed values.
MIGRATION_STATUS_HIGHLIGHTING_RULES = [
    _highlight_rule(COLUMN_NAMES['affiliated'], FLAG_STATUS['NO'], **_processed_rows_only()),
    _highlight_rule(COLUMN_NAMES['banner_updated_in_colin'], FLAG_STATUS['NO'], **_processed_rows_only()),
    # A failed migration is always highlighted, with no extra condition.
    _highlight_rule(COLUMN_NAMES['status'], MIGRATION_STATUS['FAILED']),
    _highlight_rule(COLUMN_NAMES['frozen_in_colin'], FLAG_STATUS['NO'], **_processed_rows_only()),
    _highlight_rule(
        COLUMN_NAMES['legacy_outputs_uploaded_drs'],
        [FLAG_STATUS['NO'], FLAG_STATUS['PARTIAL']],
        **_processed_rows_only(),
    ),
    _highlight_rule(
        COLUMN_NAMES['legacy_outputs_document_entries_created'],
        [FLAG_STATUS['NO'], FLAG_STATUS['PARTIAL']],
        **_processed_rows_only(),
    ),
]

# Highlighting rules for the Batch Summary tab: one rule each for the PARTIAL
# and FAILED batch states, both naming the migrated businesses column as a
# related column to highlight.
BATCH_SUMMARY_HIGHLIGHTING_RULES = [
    _highlight_rule(
        SUMMARY_COL_NAMES['batch_status'],
        batch_state,
        highlight_related_columns=[SUMMARY_COL_NAMES['migrated_businesses']],
    )
    for batch_state in (MIGRATION_STATUS['PARTIAL'], MIGRATION_STATUS['FAILED'])
]

In [None]:
from openpyxl.styles import Font, PatternFill, Alignment

def apply_cell_highlighting(worksheet, highlighting_rules):
    """
    Apply conditional highlighting to worksheet cells based on rules.

    Row 1 is treated as the header row: it is used to resolve column names
    to column positions and is never highlighted itself.

    Args:
        worksheet: The openpyxl worksheet
        highlighting_rules: List of dicts. Each rule supports these keys:
            column_name: Header of the column whose cells are tested.
            condition_value: A single value, or a list/tuple/set of values;
                the rule fires when the cell holds one of them.
            fill_color: Colour passed to PatternFill for the solid fill.
            additional_condition (optional): Dict with 'column' (a header
                name) and 'values'. The rule only fires when the same row's
                cell in that column holds one of those values. If the column
                is not present in the header row the condition is ignored.
            highlight_related_columns (optional): Headers of extra columns
                whose cells in the same row are also filled when the rule
                fires.

    Returns:
        int: Total number of cells highlighted
    """
    highlighted_count = 0

    # Collect every header any rule refers to. Additional-condition columns
    # must be included here, otherwise they can only be resolved when some
    # other rule happens to mention the same column.
    referenced_names = set()
    for rule in highlighting_rules:
        referenced_names.add(rule['column_name'])
        referenced_names.update(rule.get('highlight_related_columns', ()))
        if 'additional_condition' in rule:
            referenced_names.add(rule['additional_condition']['column'])

    # Map header text -> 1-based column position
    column_indices = {
        cell.value: col_idx
        for col_idx, cell in enumerate(worksheet[1], 1)
        if cell.value in referenced_names
    }

    # Resolve each rule once, grouped by the column it tests, so the row loop
    # only visits cells that can actually match.
    rules_by_column = {}
    for rule in highlighting_rules:
        target_idx = column_indices.get(rule['column_name'])
        if target_idx is None:
            # The tested column is not on this sheet; the rule can never fire
            continue

        condition_values = rule['condition_value']
        if not isinstance(condition_values, (list, tuple, set, frozenset)):
            condition_values = [condition_values]

        extra_idx = None
        extra_values = ()
        if 'additional_condition' in rule:
            extra_idx = column_indices.get(rule['additional_condition']['column'])
            extra_values = rule['additional_condition']['values']

        related_indices = [
            column_indices[name]
            for name in rule.get('highlight_related_columns', ())
            if name in column_indices
        ]

        fill = PatternFill(
            start_color=rule['fill_color'],
            end_color=rule['fill_color'],
            fill_type='solid'
        )
        rules_by_column.setdefault(target_idx, []).append(
            (condition_values, extra_idx, extra_values, related_indices, fill)
        )

    # Ascending column order, then rule order, matches a left-to-right scan of
    # each row, so a later fill still overrides an earlier one on the same cell.
    ordered_rules = sorted(rules_by_column.items())

    # Apply highlighting based on rules (min_row=2 skips the header row)
    for row in worksheet.iter_rows(min_row=2):
        for col_idx, column_rules in ordered_rules:
            cell = row[col_idx - 1]
            for condition_values, extra_idx, extra_values, related_indices, fill in column_rules:
                if cell.value not in condition_values:
                    continue
                if extra_idx is not None and row[extra_idx - 1].value not in extra_values:
                    continue

                cell.fill = fill
                highlighted_count += 1

                # Related cells are filled only when the rule itself fired
                for related_idx in related_indices:
                    row[related_idx - 1].fill = fill
                    highlighted_count += 1

    return highlighted_count


def format_worksheet(worksheet) -> None:
    """Highlight, style and size the given worksheet in place.

    Steps, in order: rule-based cell highlighting (Batch Summary rules when
    the sheet title equals TAB_NAMES['summary'], Migration Status rules
    otherwise), header/data fonts and left alignment, frozen header row,
    auto-filter over the current dimensions, a "Last Updated" stamp in the
    first free column of row 1, and column widths.
    """
    font_size = CONFIG['excel_export']['font_size']
    bold_header = Font(size=font_size, bold=True)
    body_font = Font(size=CONFIG['excel_export']['font_size'])
    left_aligned = Alignment(horizontal='left')

    # Pick the rule set that belongs to this tab and apply it
    is_summary_tab = worksheet.title == TAB_NAMES['summary']
    apply_cell_highlighting(
        worksheet,
        BATCH_SUMMARY_HIGHLIGHTING_RULES if is_summary_tab else MIGRATION_STATUS_HIGHLIGHTING_RULES,
    )

    # Fonts: bold for the header row, normal and left-aligned for data rows
    for row_index, row_cells in enumerate(worksheet.iter_rows(), start=1):
        if row_index == 1:
            for cell in row_cells:
                cell.font = bold_header
            continue
        for cell in row_cells:
            cell.font = body_font
            cell.alignment = left_aligned

    # Keep the header visible while scrolling
    worksheet.freeze_panes = 'A2'

    # Filter covers the data as it stands now, i.e. before the stamp is added
    worksheet.auto_filter.ref = worksheet.dimensions

    # Timestamp goes into the first unused column of the header row
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    stamp_cell = worksheet.cell(
        row=1,
        column=worksheet.max_column + 1,
        value=f"Last Updated: {stamp}",
    )
    stamp_cell.font = body_font

    # Size each column to its longest value plus padding, up to the configured cap
    for column_cells in worksheet.columns:
        letter = column_cells[0].column_letter
        worksheet.column_dimensions[letter].alignment = left_aligned

        widest = 0
        for cell in column_cells:
            try:
                if cell.value:
                    widest = max(widest, len(str(cell.value)))
            except (TypeError, AttributeError):
                continue

        worksheet.column_dimensions[letter].width = min(
            widest + 12,
            CONFIG['excel_export']['max_column_width'],
        )


In [None]:
# Export both dataframes to the migration tracking workbook.
# Refuse to write a workbook with an empty tab.
if merged_df.empty:
    raise ValueError("Data is empty, cannot export")

if batch_summary_df.empty:
    raise ValueError("Batch Summary dataframe is empty, nothing to export")

# Create output directory
os.makedirs(CONFIG['excel_export']['output_dir'], exist_ok=True)

# Decide between creating a fresh workbook and updating an existing one.
# An existing workbook is only used when both READ_FILE_DIR and
# EXCEL_FILE_READ are set in the environment.
read_path = os.getenv('READ_FILE_DIR')
read_file = os.getenv('EXCEL_FILE_READ')
writer_mode = 'create'

if not read_file or not read_path:
    excel_filename = "migration_status.xlsx"
    excel_filepath = os.path.join(CONFIG['excel_export']['output_dir'], excel_filename)
    print("No file to read. Or reading file path not configured. Creating migration tracking spreadsheet.")
else:
    excel_filepath = os.path.join(read_path, read_file)
    if not os.path.exists(excel_filepath):
        # The workbook to update was configured but is missing
        raise FileNotFoundError("Configured file reading path, but file doesn't exist.")
    writer_mode = 'update'
    print("Updating migration tracking spreadsheet.")

try:
    writer_kwargs = {'engine': 'openpyxl'}
    if writer_mode != 'create':
        # Append to the existing workbook, replacing the tabs written below
        writer_kwargs.update({'mode': 'a', 'if_sheet_exists': 'replace'})

    with pd.ExcelWriter(excel_filepath, **writer_kwargs) as writer:
        print(f"Mode: {writer_mode}")
        # Export Batch Summary tab data
        batch_summary_df.to_excel(writer, sheet_name=TAB_NAMES['summary'], index=False)
        b_sum_worksheet = writer.sheets[TAB_NAMES['summary']]
        format_worksheet(b_sum_worksheet)

        # Export Migration Status tab data
        merged_df.to_excel(writer, sheet_name=TAB_NAMES['status'], index=False)
        mig_status_worksheet = writer.sheets[TAB_NAMES['status']]
        format_worksheet(mig_status_worksheet)

    print(f"Excel export successful: {excel_filepath}")

except Exception as e:
    # Report the failure in the cell output, then let the error propagate
    print(f"Excel export failed: {e}")
    raise