# Migration Status Spreadsheet Notebook

## Overview
This notebook generates the data for the migration tracking spreadsheet.

## What it does
- Extracts migration data from COLIN Extract database
- Retrieves filing information from LEAR database
- Retrieves affiliation information from Auth database
- Merges and exports data to Excel format

## Output
A formatted Excel spreadsheet tracking corporation migration status.

In [1]:
%pip install pandas
%pip install sqlalchemy
%pip install dotenv
%pip install psycopg2-binary
%pip install openpyxl


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m2

## Import Libraries and Load Configuration

Import required libraries and load environment variables. 

In [18]:
import os
from datetime import datetime
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError, OperationalError

# Populate os.environ from a .env file (python-dotenv) so the os.getenv()
# lookups in this notebook can see those settings.
load_dotenv()

# Header text of the "Migration Status" tab, keyed by a short internal name.
# The SQL aliases and the DataFrame merges below all go through this mapping.
COLUMN_NAMES = dict(
    group="Group",
    batch="Batch",
    email="Admin Email",
    corp_num="Incorporation Number",
    corp_name="Company Name",
    corp_type="Type",
    status="Migration Status",
    date="Migrated Date",
    affiliated="Affiliated",
    account="Account ID",
    filings="Filings Done",
    filing_date="Last Filing Date",
)

# Batch summary query column -> header text of the "Batch Summary" tab.
SUMMARY_COL_NAMES = dict(
    group_display_name="Group",
    batch_display_name="Batch",
    requested_date="Requested Date",
    batch_status="Migration Status",
    migrated_date="Migrated Date",
    total_corps="Business Count",
    notes="Batch Notes",
)

# Worksheet (tab) titles in the exported workbook.
TAB_NAMES = dict(
    status="Migration Status",
    summary="Batch Summary",
)

CONFIG = {
    # Maximum number of corporation numbers sent to a database per query
    'batch_size': 5000,
    # Left-to-right column order of the "Migration Status" tab
    'final_excel_fields': [
        COLUMN_NAMES[key]
        for key in (
            "group", "batch", "email", "corp_num", "corp_name", "corp_type",
            "status", "date", "affiliated", "account", "filings", "filing_date",
        )
    ],
    'excel_export': {
        'font_size': 12,
        'max_column_width': 50,
        # None when EXPORT_OUTPUT_DIR is not set in the environment
        'output_dir': os.getenv('EXPORT_OUTPUT_DIR'),
    },
}

# Configuration
BATCH_SIZE = CONFIG['batch_size']
FINAL_EXCEL_FIELDS = CONFIG['final_excel_fields']

# Default to '' so that an unset MIG_GROUP_IDS reaches the explicit ValueError
# below instead of failing with AttributeError on None.split().
# Tokens that are not plain digit strings are dropped.
MIG_GROUP_IDS = [
    int(token)
    for token in (piece.strip() for piece in os.getenv('MIG_GROUP_IDS', '').split(','))
    if token.isdigit()
]

if not MIG_GROUP_IDS:
    raise ValueError("MIG_GROUP_IDS is empty! Need at least one group id.")

# Comma-separated form interpolated into the SQL "IN (...)" filters; every
# element went through int() above, so the string contains only digits and commas.
mig_group_ids = ','.join(str(x) for x in MIG_GROUP_IDS)

print("Libraries imported and configuration loaded successfully.")


Libraries imported and configuration loaded successfully.


## Database Setup

Configure database connections using environment variables.

In [3]:
def build_database_uri(db_config):
    """Build a PostgreSQL connection URI from one database's settings.

    Args:
        db_config: mapping with 'username', 'password', 'host', 'port' and 'name'.

    Returns:
        A ``postgresql://user:password@host:port/name`` string. The username and
        password are percent-encoded so characters such as ``@``, ``:``, ``/`` or
        ``#`` cannot be mistaken for URI delimiters. Values must therefore be
        supplied raw, not already percent-encoded.
    """
    # str() keeps a missing (None) setting rendering as "None", so a missing
    # variable surfaces when the connection is attempted rather than here.
    username = quote(str(db_config['username']), safe='')
    password = quote(str(db_config['password']), safe='')
    return (
        f"postgresql://{username}:{password}"
        f"@{db_config['host']}:{db_config['port']}/{db_config['name']}"
    )


# Raw connection settings, read from the environment for each database.
DATABASE_CONFIG = {
    'colin_extract': {
        'username': os.getenv("DATABASE_COLIN_EXTRACT_USERNAME"),
        'password': os.getenv("DATABASE_COLIN_EXTRACT_PASSWORD"),
        'host': os.getenv("DATABASE_COLIN_EXTRACT_HOST"),
        'port': os.getenv("DATABASE_COLIN_EXTRACT_PORT"),
        'name': os.getenv("DATABASE_COLIN_EXTRACT_NAME")
    },
    'lear': {
        'username': os.getenv("DATABASE_LEAR_USERNAME"),
        'password': os.getenv("DATABASE_LEAR_PASSWORD"),
        'host': os.getenv("DATABASE_LEAR_HOST"),
        'port': os.getenv("DATABASE_LEAR_PORT"),
        'name': os.getenv("DATABASE_LEAR_NAME")
    },
    'auth': {
        'username': os.getenv("DATABASE_AUTH_USERNAME"),
        'password': os.getenv("DATABASE_AUTH_PASSWORD"),
        'host': os.getenv("DATABASE_AUTH_HOST"),
        'port': os.getenv("DATABASE_AUTH_PORT"),
        'name': os.getenv("DATABASE_AUTH_NAME")
    }
}

# Replace each settings dict with {'uri': ...}. A new dict is built so the
# mapping is not rewritten while it is being iterated.
DATABASE_CONFIG = {
    db_key: {'uri': build_database_uri(db_config)}
    for db_key, db_config in DATABASE_CONFIG.items()
}

print("Database configurations successfully.")


Database configurations successfully.


## Create Database Engines

Create and test database connections for all configured databases.

In [4]:
# Engines that passed the connectivity check, keyed like DATABASE_CONFIG.
engines = {}

for db_key, config in DATABASE_CONFIG.items():
    label = db_key.upper()
    try:
        engine = create_engine(config['uri'])

        # Run a trivial statement so an unreachable database fails in this cell
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
    except OperationalError as e:
        print(f"{label} database connection failed: {e}")
        raise
    except SQLAlchemyError as e:
        print(f"{label} database engine creation failed: {e}")
        raise
    except Exception as e:
        print(f"{label} unexpected error: {e}")
        raise
    else:
        # Only engines that passed the check are registered
        engines[db_key] = engine
        print(f"{label} database engine created and tested successfully.")

# Reverse lookup (engine object -> config key), used for log labels
ENGINE_NAMES = dict(zip(engines.values(), engines.keys()))

print("All database engines ready for use.")


COLIN_EXTRACT database engine created and tested successfully.
LEAR database engine created and tested successfully.
AUTH database engine created and tested successfully.
All database engines ready for use.


## Extract Migration Data

Query COLIN Extract database to get list of migrated corporations with their details.

In [None]:
# One row per corporation assigned to a batch in the selected migration groups.
# Column aliases come from COLUMN_NAMES so they match the final spreadsheet headers.
# A row is labelled 'Migrated' when corp_processing.processed_status is 'COMPLETED'
# (the WHERE clause additionally requires environment = 'prod') and 'Pending' when
# processed_status is NULL, e.g. because the LEFT JOIN found no corp_processing row.
# NOTE(review): the WHERE clause drops corporations whose corp_processing rows are
# neither NULL-status nor COMPLETED in 'prod' (for example another status, or
# COMPLETED in a different environment) - confirm that exclusion is intended.
# mig_group_ids is interpolated directly into the SQL; the configuration cell builds
# it from int() values, so it contains only digits and commas.
colin_extract_query = f"""
SELECT
    g.display_name AS "{COLUMN_NAMES['group']}",
    b.display_name AS "{COLUMN_NAMES['batch']}",
    mcb.corp_num AS "{COLUMN_NAMES['corp_num']}",
    c.admin_email AS "{COLUMN_NAMES['email']}",
    cn.corp_name AS "{COLUMN_NAMES['corp_name']}",
    c.corp_type_cd AS "{COLUMN_NAMES['corp_type']}",
    CASE
        WHEN cp.processed_status = 'COMPLETED' THEN 'Migrated'
        WHEN cp.processed_status IS NULL THEN 'Pending'
    END AS "{COLUMN_NAMES['status']}",
    cp.create_date::date AS "{COLUMN_NAMES['date']}"
FROM
    mig_corp_batch mcb
    JOIN 
        mig_batch b ON mcb.mig_batch_id = b.id
    JOIN 
        mig_group g ON b.mig_group_id = g.id
    LEFT JOIN 
        corporation c ON mcb.corp_num = c.corp_num
    LEFT JOIN 
        corp_processing cp ON mcb.corp_num = cp.corp_num
    LEFT JOIN 
        corp_name cn ON c.corp_num = cn.corp_num 
            AND cn.corp_name_typ_cd IN ('CO', 'NB') 
            AND cn.end_event_id IS NULL
WHERE
    g.id IN ({mig_group_ids})
    AND (
        (cp.processed_status = 'COMPLETED' AND cp.environment = 'prod')
        OR cp.processed_status IS NULL
    )
ORDER BY
    g.display_name, 
    b.display_name,
    CASE
        WHEN cp.processed_status = 'COMPLETED' THEN 0
        ELSE 1
    END, 
    cp.create_date DESC,
    cn.corp_name;
"""

try:
    with engines['colin_extract'].connect() as conn:
        colin_extract_df = pd.read_sql(colin_extract_query, conn)

    # An empty result would make every later lookup and the export meaningless,
    # so it is treated as an error.
    if colin_extract_df.empty:
        raise ValueError("COLIN Extract database query returned empty result")
    
    print(f"Fetched {len(colin_extract_df)} rows from COLIN Extract database.")
    
except Exception as e:
    # Log for the notebook reader, then re-raise so execution stops here
    print(f"Error fetching data from COLIN Extract: {e}")
    raise

# Display results
# max_rows=None lifts pandas' row truncation for this display only
with pd.option_context('display.max_rows', None):
    display(colin_extract_df)


Fetched 152 rows from COLIN Extract database.


Unnamed: 0,Group,Batch,Incorporation Number,Admin Email,Company Name,Type,Migration Status,Migrated Date
0,Early Adopters,Batch 1,BC1196188,vannotices@mcmillan.ca,METROPOINTE CAPITAL INC.,BC,Migrated,2025-04-29
1,Early Adopters,Batch 1,BC1245585,vannotices@mcmillan.ca,PFU CANADA INC.,BC,Migrated,2025-04-29
2,Early Adopters,Batch 1,BC1302343,vannotices@mcmillan.ca,1302343 B.C. LTD.,BC,Migrated,2025-04-16
3,Early Adopters,Batch 1,BC1201211,VanCorp@bennettjones.com,1661 OLIVE WAY GP INC.,BC,Migrated,2025-04-16
4,Early Adopters,Batch 1,BC1249698,vannotices@mcmillan.ca,ATLANTIS BATTERY METALS CORP.,BC,Migrated,2025-04-16
5,Early Adopters,Batch 1,BC1105588,vancorp@bennettjones.com,BALFOUR PACIFIC CAPITAL INC.,BC,Migrated,2025-04-16
6,Early Adopters,Batch 1,BC0754828,vancorp@bennettjones.com,CLINSCAPE CONSULTING INC.,BC,Migrated,2025-04-16
7,Early Adopters,Batch 1,BC1246637,VanCorp@bennettjones.com,LANE 4 ENTERTAINMENT LTD.,BC,Migrated,2025-04-16
8,Early Adopters,Batch 1,BC0934782,VanCorp@bennettjones.com,LOW TIDE PROPERTIES TRUSTEE LTD.,BC,Migrated,2025-04-16
9,Early Adopters,Batch 1,BC1475529,Vancorp@bennettjones.com,OAK 37 TENANT LIMITED,BC,Migrated,2025-04-16


## Batch Query Function
A helper that queries one database in batches of corporation numbers and combines the results into a single DataFrame.

In [6]:
def batch_query(query_sql, db_engine, batch_size, columns, corp_nums=None):
    """Run ``query_sql`` once per chunk of corporation numbers and combine the results.

    Args:
        query_sql: SQL text executed for every chunk; the chunk is bound to the
            query parameter named ``identifiers`` as a list.
        db_engine: SQLAlchemy engine to query. Its key in ``ENGINE_NAMES`` is
            used as the label in progress messages.
        batch_size: Maximum number of corporation numbers per query; must be positive.
        columns: Column names of the empty DataFrame returned when no batch
            produced a result.
        corp_nums: Optional iterable of corporation numbers to look up. When
            omitted, the unique values of the corporation-number column of the
            global ``colin_extract_df`` are used.

    Returns:
        A DataFrame with at most one row per corporation number (when a number
        appears in several batch results the last one is kept), or an empty
        DataFrame with ``columns`` when nothing was fetched.

    Raises:
        ValueError: if ``batch_size`` is not positive.

    A failing batch is reported and skipped rather than aborting the run, and a
    summary warning is printed at the end so partial results are not mistaken
    for complete ones.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    if corp_nums is None:
        # Default source: the corporation numbers fetched from COLIN Extract
        unique_corp_nums = colin_extract_df[COLUMN_NAMES['corp_num']].unique().tolist()
    else:
        # De-duplicate while keeping the caller's order
        unique_corp_nums = list(dict.fromkeys(corp_nums))

    corp_number_batches = [
        unique_corp_nums[start:start + batch_size]
        for start in range(0, len(unique_corp_nums), batch_size)
    ]
    total_batches = len(corp_number_batches)
    db_name = ENGINE_NAMES.get(db_engine, "Unknown database")
    batch_results = []
    failed_batches = []

    # Process each batch of corporation numbers
    for batch_idx, current_batch_corp_numbers in enumerate(corp_number_batches, start=1):
        try:
            with db_engine.connect() as conn:
                df = pd.read_sql(query_sql, conn, params={'identifiers': current_batch_corp_numbers})
        except Exception as e:
            # Best effort: keep going, but remember the failure for the summary
            print(f"{db_name} Batch {batch_idx}/{total_batches} failed: {e}")
            failed_batches.append(batch_idx)
            continue

        batch_results.append(df)
        print(f"{db_name} Batch {batch_idx}: {len(df)} records fetched")

    if failed_batches:
        print(f"WARNING: {db_name} results are incomplete; failed batches: {failed_batches}")

    # Process combined results
    if batch_results:
        combined_df = pd.concat(batch_results, ignore_index=True)
        combined_df = combined_df.drop_duplicates(COLUMN_NAMES['corp_num'], keep='last')
        print(f"Total records fetched: {len(combined_df)}")
    else:
        combined_df = pd.DataFrame(columns=columns)
        print("No records fetched")

    return combined_df


## Get Filing Data

Retrieve and aggregate filing information from LEAR database for migrated corporations.

In [7]:
# One row per LEAR business whose identifier is in the current batch.
# "Filings Done" is the comma-separated, alphabetically ordered list of filing
# types of that business's filings with source 'LEAR' and status 'COMPLETED';
# "Last Filing Date" is the latest filing_date among them.
# Because of the LEFT JOIN, a business without such filings still gets a row,
# with '' as the filing list and NULL as the date.
# %(identifiers)s is bound by batch_query to the list of corporation numbers.
lear_combined_query = f"""
SELECT 
    b.id,
    b.identifier AS "{COLUMN_NAMES['corp_num']}",
    COALESCE(
        STRING_AGG(f.filing_type, ', ' ORDER BY f.filing_type), 
        ''
    ) AS "{COLUMN_NAMES['filings']}",
    MAX(f.filing_date)::date AS "{COLUMN_NAMES['filing_date']}"
FROM businesses b
LEFT JOIN filings f ON b.id = f.business_id 
    AND f.source = 'LEAR' 
    AND f.status = 'COMPLETED'
WHERE b.identifier = ANY(%(identifiers)s)
GROUP BY b.id, b.identifier;
"""

# `columns` only shapes the empty DataFrame returned when nothing is fetched
lear_combined_df = batch_query(
    query_sql=lear_combined_query,
    db_engine=engines['lear'],
    batch_size=BATCH_SIZE,
    columns=['id', COLUMN_NAMES['corp_num'], COLUMN_NAMES["filings"], COLUMN_NAMES["filing_date"]]
)

# Display results
with pd.option_context('display.max_rows', None):
    display(lear_combined_df)


lear Batch 1: 64 records fetched
Total records fetched: 64


Unnamed: 0,id,Incorporation Number,Filings Done,Last Filing Date
0,631316,BC0754828,,
1,631317,BC0769801,,
2,631318,BC0910591,,
3,631319,BC0934777,,
4,631320,BC0971192,,
5,631321,BC1034551,,
6,631322,BC0988623,,
7,631323,BC0934782,,
8,631324,BC1033896,,
9,631325,BC1072742,,


## Get Affiliation Data

Query the Auth database to get affiliation information, including whether corporations are affiliated and their account IDs.

In [8]:
# One row per business identifier found in the Auth `entities` table.
# "Affiliated" is 'Y' when at least one `affiliations` row joins to the entity,
# otherwise 'N'. "Account ID" is the comma-separated list of the affiliated
# org_id values ('' when there are none).
# %(identifiers)s is bound by batch_query to the list of corporation numbers.
auth_query = f"""
SELECT
    e.business_identifier AS "{COLUMN_NAMES['corp_num']}",
    CASE WHEN COUNT(a.id) > 0 THEN 'Y' ELSE 'N' END AS "{COLUMN_NAMES['affiliated']}",
    COALESCE(
        STRING_AGG(a.org_id::text, ', ' ORDER BY a.org_id),
        ''
    ) AS "{COLUMN_NAMES['account']}"
FROM
    entities e
LEFT JOIN
    affiliations a ON e.id = a.entity_id
WHERE
    e.business_identifier = ANY(%(identifiers)s)
GROUP BY
    e.business_identifier
"""

# `columns` only shapes the empty DataFrame returned when nothing is fetched
auth_combined_df = batch_query(
    query_sql=auth_query,
    db_engine=engines['auth'],
    batch_size=BATCH_SIZE,
    columns=[COLUMN_NAMES['corp_num'], COLUMN_NAMES['affiliated'], COLUMN_NAMES['account']]
)

# Display results
with pd.option_context('display.max_rows', None):
    display(auth_combined_df)


auth Batch 1: 140 records fetched
Total records fetched: 140


Unnamed: 0,Incorporation Number,Affiliated,Account ID
0,BC0754828,Y,378
1,BC0758544,Y,378
2,BC0769801,Y,378
3,BC0910324,Y,378
4,BC0910327,Y,378
5,BC0910591,Y,378
6,BC0934777,Y,378
7,BC0934782,Y,378
8,BC0945732,Y,378
9,BC0945932,Y,378


## Merge Data

Combine data from COLIN Extract, LEAR, and Auth databases into a merged dataset.

In [9]:
try:
    # Left joins keep every COLIN Extract row even when LEAR or Auth has no
    # matching corporation; the missing columns are then NaN.
    result = (colin_extract_df
              .merge(lear_combined_df,
                     on=COLUMN_NAMES['corp_num'],
                     how='left')
              .merge(auth_combined_df,
                     on=COLUMN_NAMES['corp_num'],
                     how='left')
              )

    # Select final fields
    merged_df = result[FINAL_EXCEL_FIELDS]

    print(f"Data merged successfully: {len(merged_df)} rows")

except Exception as e:
    print(f"Error merging data: {e}")
    # Re-raise like the other cells do: continuing would either hit a NameError
    # on merged_df or display and export a stale frame from an earlier run.
    raise

# Display merged results
with pd.option_context('display.max_rows', None):
    display(merged_df)


Data merged successfully: 152 rows


Unnamed: 0,Group,Batch,Admin Email,Incorporation Number,Company Name,Type,Migration Status,Migrated Date,Affiliated,Account ID,Filings Done,Last Filing Date
0,Early Adopters,Batch 1,vannotices@mcmillan.ca,BC1196188,METROPOINTE CAPITAL INC.,BC,Migrated,2025-04-29,Y,"378, 1009",,
1,Early Adopters,Batch 1,vannotices@mcmillan.ca,BC1245585,PFU CANADA INC.,BC,Migrated,2025-04-29,Y,378,,
2,Early Adopters,Batch 1,vannotices@mcmillan.ca,BC1302343,1302343 B.C. LTD.,BC,Migrated,2025-04-16,Y,378,,
3,Early Adopters,Batch 1,VanCorp@bennettjones.com,BC1201211,1661 OLIVE WAY GP INC.,BC,Migrated,2025-04-16,Y,378,,
4,Early Adopters,Batch 1,vannotices@mcmillan.ca,BC1249698,ATLANTIS BATTERY METALS CORP.,BC,Migrated,2025-04-16,Y,378,,
5,Early Adopters,Batch 1,vancorp@bennettjones.com,BC1105588,BALFOUR PACIFIC CAPITAL INC.,BC,Migrated,2025-04-16,Y,378,,
6,Early Adopters,Batch 1,vancorp@bennettjones.com,BC0754828,CLINSCAPE CONSULTING INC.,BC,Migrated,2025-04-16,Y,378,,
7,Early Adopters,Batch 1,VanCorp@bennettjones.com,BC1246637,LANE 4 ENTERTAINMENT LTD.,BC,Migrated,2025-04-16,Y,378,,
8,Early Adopters,Batch 1,VanCorp@bennettjones.com,BC0934782,LOW TIDE PROPERTIES TRUSTEE LTD.,BC,Migrated,2025-04-16,Y,378,,
9,Early Adopters,Batch 1,Vancorp@bennettjones.com,BC1475529,OAK 37 TENANT LIMITED,BC,Migrated,2025-04-16,Y,378,,


## Get Batch Summary Dataframe
Query the COLIN Extract database to compose the Batch Summary tab.
<br />Currently including 7 columns: Group, Batch, Requested Date, Migration Status, Migrated Date, Business Count, Batch Notes

In [21]:
# One row per batch in the selected migration groups.
# total_corps counts the distinct corporations assigned to the batch and
# completed_corps those with a COMPLETED corp_processing row in 'prod'.
# A batch is COMPLETED only when it has at least one corporation and all of them
# are completed. Without the "> 0" guard a batch with no corporations (possible
# because of the LEFT JOIN on mig_corp_batch) would compare 0 = 0 and be
# reported as COMPLETED.
# b.notes is listed in GROUP BY so the query does not depend on PostgreSQL
# inferring it from b.id.
batch_summary_query = f"""
WITH batch_status AS (
            SELECT 
                b.id as batch_id,
                g.display_name as group_display_name,
                b.display_name as batch_display_name,
                b.requested_date,
                b.migrated_date,
                b.notes,
                COUNT(DISTINCT mcb.corp_num) as total_corps,
                COUNT(DISTINCT CASE 
                    WHEN cp.processed_status = 'COMPLETED' AND cp.environment = 'prod' 
                    THEN mcb.corp_num 
                END) as completed_corps
            FROM mig_group g
            JOIN mig_batch b ON g.id = b.mig_group_id
            LEFT JOIN mig_corp_batch mcb ON b.id = mcb.mig_batch_id
            LEFT JOIN corp_processing cp ON mcb.corp_num = cp.corp_num
            WHERE g.id IN ({mig_group_ids})
            GROUP BY b.id, g.display_name, b.display_name, b.requested_date, b.migrated_date, b.notes
        )
        SELECT 
            group_display_name,
            batch_display_name,
            requested_date,
            migrated_date,
            total_corps,
            notes,
            CASE
                WHEN total_corps > 0 AND total_corps = completed_corps THEN 'COMPLETED'
                ELSE 'PENDING'
            END as batch_status
        FROM batch_status
        ORDER BY group_display_name, batch_display_name
"""
try:
    with engines['colin_extract'].connect() as conn:
        batch_summary_df = pd.read_sql(batch_summary_query, conn)

    if batch_summary_df.empty:
        raise ValueError("batch summary data query returned 0 result")

    print(f"Composed {len(batch_summary_df)} entries for batch summary")

    # formatting the dataframe with proper column order and column names
    column_order = ['group_display_name', 'batch_display_name', 'requested_date', 'batch_status', 'migrated_date', 'total_corps', 'notes']
    batch_summary_df = batch_summary_df[column_order]
    batch_summary_df = batch_summary_df.rename(columns=SUMMARY_COL_NAMES)
except Exception as e:
    # Log for the notebook reader, then re-raise so execution stops here
    print(f"Error fetching data to compose batch summary: {e}")
    raise

with pd.option_context('display.max_rows', None):
    display(batch_summary_df)


Composed 6 entries for batch summary


Unnamed: 0,Group,Batch,Requested Date,Migration Status,Migrated Date,Business Count,Batch Notes
0,Early Adopters,Batch 1,,COMPLETED,2025-04-29,17,EA Batch 1
1,Early Adopters,Batch 2,,COMPLETED,2025-05-20,6,EA Batch 2
2,Early Adopters,Batch 3,2025-06-06,COMPLETED,2025-06-09,23,EA Batch 3
3,Early Adopters,Batch 4,2025-06-11,COMPLETED,2025-06-13,18,EA Batch 4
4,Early Adopters,Batch 5,2025-06-26,COMPLETED,2025-07-04,79,EA Batch 5 - Bennett Jones & Norton Rose
5,Onboarding Group 1,Batch 1,,PENDING,,9,"Onboarding Group 1, Batch 1"


## Export to Excel

Generate a formatted Excel file with two tabs: the merged migration tracking data and the batch summary.

In [None]:
from openpyxl.styles import Font

def format_worksheet(worksheet) -> None:
    """Apply the export styling to an openpyxl worksheet, in place.

    - Every cell gets the configured font size; the first row is bold.
    - The first row is frozen so it stays visible while scrolling.
    - Each column is sized to its longest value plus padding, capped at the
      configured maximum width.

    Args:
        worksheet: the openpyxl worksheet to format.
    """
    export_cfg = CONFIG['excel_export']

    # Two shared Font objects instead of constructing a new one for every cell
    header_font = Font(size=export_cfg['font_size'], bold=True)
    body_font = Font(size=export_cfg['font_size'], bold=False)

    for row_num, row in enumerate(worksheet.iter_rows(), 1):
        row_font = header_font if row_num == 1 else body_font
        for cell in row:
            cell.font = row_font

    # Freeze header row
    worksheet.freeze_panes = 'A2'

    # Adjust column width
    for column in worksheet.columns:
        # "is not None" rather than truthiness, so values such as 0 or False
        # still count towards the width; empty cells contribute nothing.
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        adjusted_width = min(max_length + 5, export_cfg['max_column_width'])
        worksheet.column_dimensions[column[0].column_letter].width = adjusted_width


# Refuse to write a workbook with an empty tab
if merged_df.empty:
    raise ValueError("Migration Status dataframe is empty, cannot export")

if batch_summary_df.empty:
    raise ValueError("Batch Summary dataframe is empty, nothing to export")

# Create output directory
# output_dir is None when EXPORT_OUTPUT_DIR is unset; fail with a clear message
# instead of the opaque TypeError that os.makedirs(None) would raise.
output_dir = CONFIG['excel_export']['output_dir']
if not output_dir:
    raise ValueError("EXPORT_OUTPUT_DIR is not set, cannot determine where to write the Excel file")
os.makedirs(output_dir, exist_ok=True)

# Generate filename
# The timestamp gives every run its own file name
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
excel_filename = f"migration_status_{timestamp}.xlsx"
excel_filepath = os.path.join(output_dir, excel_filename)

# Export dataframes to two tabs in the Excel file
try:
    with pd.ExcelWriter(excel_filepath, engine='openpyxl') as writer:
        # Export Migration Status tab data
        merged_df.to_excel(writer, sheet_name=TAB_NAMES['status'], index=False)
        mig_status_worksheet = writer.sheets[TAB_NAMES['status']]
        format_worksheet(mig_status_worksheet)

        # Export Batch Summary tab data
        batch_summary_df.to_excel(writer, sheet_name=TAB_NAMES['summary'], index=False)
        b_sum_worksheet = writer.sheets[TAB_NAMES['summary']]
        format_worksheet(b_sum_worksheet)

    print(f"Excel export successful: {excel_filename}")

except Exception as e:
    # Log for the notebook reader, then re-raise so the failure is not missed
    print(f"Excel export failed: {e}")
    raise


Excel export successful: migration_status_20250716_094251.xlsx
