# BACKFILL Contact Email for BAR Corps

## Overview
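Backfill contact emails in Auth for BAR corps:
- Get the list of identifiers for the configured migration batch and environment
- Keep only the identifiers whose Auth entity has no contact email
- For each remaining identifier, look up its admin email and make a request to the Auth API to set the contact (the code below uses `PUT`)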

In [None]:
%pip install pandas requests
%pip install sqlalchemy>=2.0
%pip install psycopg2-binary
%pip install python-dotenv

# Load Configurations

In [None]:
import os
from datetime import datetime
from typing import Optional

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError, OperationalError
from sqlalchemy.engine import Engine
from dotenv import load_dotenv

# Load environment variables via python-dotenv so the os.getenv() calls below can see them.
# The message is printed unconditionally; load_dotenv()'s return value is not checked.
load_dotenv()
print("Environment variables loaded successfully.")

## Database Configuration

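Configure connections to:
- **colin_extract**: Source database for the `corp_processing` and `corporation` tables (batch identifiers and admin emails)
- **auth**: Auth database, queried for entities that have no contact email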

In [None]:
from urllib.parse import quote

# Raw connection settings per database, read from the environment.
# Any value is None when the corresponding variable is not set.
DATABASE_CONFIG = {
    'colin_extract': {
        'username': os.getenv("DATABASE_COLIN_EXTRACT_USERNAME"),
        'password': os.getenv("DATABASE_COLIN_EXTRACT_PASSWORD"),
        'host': os.getenv("DATABASE_COLIN_EXTRACT_HOST"),
        'port': os.getenv("DATABASE_COLIN_EXTRACT_PORT"),
        'name': os.getenv("DATABASE_COLIN_EXTRACT_NAME")
    },
    'auth': {
        'username': os.getenv("DATABASE_AUTH_USERNAME"),
        'password': os.getenv("DATABASE_AUTH_PASSWORD"),
        'host': os.getenv("DATABASE_AUTH_HOST"),
        'port': os.getenv("DATABASE_AUTH_PORT"),
        'name': os.getenv("DATABASE_AUTH_NAME")
    },
}

# Build connection URIs: each raw settings dict is replaced by {'uri': <PostgreSQL URI>}.
for db_key, db_config in DATABASE_CONFIG.items():
    # Report unset variables; the URI is still built (with "None" placeholders),
    # so the problem surfaces again when the engine is created.
    missing_keys = [k for k, v in db_config.items() if v is None]
    if missing_keys:
        print(f"{db_key.upper()}: Missing environment variables for: {missing_keys}")

    # Percent-encode the credentials so characters such as '@', ':' or '/' in a
    # username or password cannot be mistaken for URI delimiters.
    username = quote(str(db_config['username']), safe='')
    password = quote(str(db_config['password']), safe='')

    # Build PostgreSQL URI
    uri = f"postgresql://{username}:{password}@{db_config['host']}:{db_config['port']}/{db_config['name']}"
    DATABASE_CONFIG[db_key] = {'uri': uri}

# Printed once, after every configuration has been processed.
print("Database configurations built successfully.")

# Auth / account service endpoints and client credentials.
AUTH_SVC_URL = os.getenv("AUTH_SVC_URL")
ACCOUNT_SVC_AUTH_URL = os.getenv("ACCOUNT_SVC_AUTH_URL")
ACCOUNT_SVC_CLIENT_ID = os.getenv("ACCOUNT_SVC_CLIENT_ID")
ACCOUNT_SVC_CLIENT_SECRET = os.getenv("ACCOUNT_SVC_CLIENT_SECRET")
ACCOUNT_SVC_ENTITY_URL = os.getenv("ACCOUNT_SVC_ENTITY_URL")

# Batch selection. os.getenv returns str (or None), so MIG_BATCH_ID is not an int here.
MIG_BATCH_ID = os.getenv("MIG_BATCH_ID")
ENVIRONMENTS = os.getenv("ENVIRONMENTS")
print("Service URLs and credentials loaded successfully.")

## Get Identifiers for Batch and Environment

In [None]:
# Live SQLAlchemy engines, keyed by the same names as DATABASE_CONFIG.
engines = {}

for db_key, config in DATABASE_CONFIG.items():
    label = db_key.upper()
    try:
        print(f"Creating engine for {label}...")
        engine = create_engine(config['uri'])

        # Run a trivial statement so a bad URI or unreachable server fails in this
        # cell rather than at the first real query.
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))

        engines[db_key] = engine
        print(f"✓ {label} database engine created and tested successfully.")

    # Each failure category gets its own message and is then re-raised, so the
    # first database that cannot be set up stops the cell.
    except OperationalError as e:
        print(f"✗ {label} database connection failed: {e}")
        raise
    except SQLAlchemyError as e:
        print(f"✗ {label} database engine creation failed: {e}")
        raise
    except Exception as e:
        print(f"✗ {label} unexpected error: {e}")
        raise

banner = "=" * 50
print(banner)
print("All database engines ready for use.")
print(banner)

In [None]:
# Corps whose processing finished for one migration batch in one environment.
IDENTIFIERS_QUERY = """
SELECT corp_num
FROM corp_processing cp
WHERE processed_status = 'COMPLETED'
AND mig_batch_id = :mig_batch_id
AND environment = :environment
-- LIMIT 1
"""


def query_identifiers(engine: Engine, mig_batch_id: int, environment: str) -> pd.DataFrame:
    """Return the completed corp numbers for a migration batch and environment.

    Args:
        engine: Engine for the database that holds ``corp_processing``.
        mig_batch_id: Value bound to ``:mig_batch_id``.
        environment: Value bound to ``:environment``.

    Returns:
        DataFrame with the columns produced by ``IDENTIFIERS_QUERY`` (``corp_num``).

    Raises:
        SQLAlchemyError: Reported on stdout, then re-raised.
        Exception: Any other failure is reported on stdout, then re-raised.
    """
    bind_params = {"mig_batch_id": mig_batch_id, "environment": environment}
    try:
        with engine.connect() as conn:
            cursor = conn.execute(text(IDENTIFIERS_QUERY), bind_params)
            rows = cursor.fetchall()
            column_names = cursor.keys()
            identifiers_df = pd.DataFrame(rows, columns=column_names)
        row_count = len(identifiers_df)
        print(f"✓ Successfully queried identifiers. Total records: {row_count}")
        return identifiers_df
    except SQLAlchemyError as e:
        print(f"✗ Error querying identifiers: {e}")
        raise
    except Exception as e:
        print(f"✗ Unexpected error querying identifiers: {e}")
        raise

# Fetch the completed corps for the configured batch and environment from colin_extract.
# Despite its singular name, `identifier` is the DataFrame returned by the query.
# NOTE(review): MIG_BATCH_ID is read with os.getenv and is therefore a str (or None),
# while query_identifiers annotates mig_batch_id as int — confirm the comparison works as intended.
identifier = query_identifiers(engines['colin_extract'], MIG_BATCH_ID, ENVIRONMENTS)
print(identifier.head())
print(f"Total identifiers retrieved: {len(identifier)}")
# Flatten the corp_num column into a plain list; `identifiers` is the name later cells read.
identifiers_from_batch_mig = identifier['corp_num'].tolist()
identifiers = identifiers_from_batch_mig
print(f"Identifiers list created with {len(identifiers)} entries.")

In [None]:
# Entities from the given identifier list that have no linked contact with a non-null email.
AUTH_ENTRIES_WITHOUT_EMAIL_QUERY = """
SELECT e.business_identifier
FROM entities e
WHERE e.business_identifier = ANY(:identifiers)
AND NOT EXISTS (
    SELECT 1
    FROM contact_links cl
    JOIN contacts c ON cl.contact_id = c.id
    WHERE cl.entity_id = e.id
    AND c.email IS NOT NULL
    )
"""


def query_identifiers_without_email(engine: Engine, identifiers: list) -> pd.DataFrame:
    """Return the subset of ``identifiers`` whose entity has no contact email.

    Args:
        engine: Engine for the database that holds ``entities``, ``contact_links``
            and ``contacts``.
        identifiers: Business identifiers bound to ``:identifiers``.

    Returns:
        DataFrame with the columns produced by ``AUTH_ENTRIES_WITHOUT_EMAIL_QUERY``
        (``business_identifier``).

    Raises:
        SQLAlchemyError: Reported on stdout, then re-raised.
        Exception: Any other failure is reported on stdout, then re-raised.
    """
    bind_params = {"identifiers": identifiers}
    try:
        with engine.connect() as conn:
            cursor = conn.execute(text(AUTH_ENTRIES_WITHOUT_EMAIL_QUERY), bind_params)
            rows = cursor.fetchall()
            column_names = cursor.keys()
            no_email_df = pd.DataFrame(rows, columns=column_names)
        row_count = len(no_email_df)
        print(f"✓ Successfully queried identifiers without email. Total records: {row_count}")
        return no_email_df
    except SQLAlchemyError as e:
        print(f"✗ Error querying identifiers without email: {e}")
        raise
    except Exception as e:
        print(f"✗ Unexpected error querying identifiers without email: {e}")
        raise

# Remember the size of the full batch list so the summary can show how many were filtered out.
identifiers_before_filtering = len(identifiers)
# Ask the auth database which of the batch identifiers have no contact email.
identifiers_without_email_df = query_identifiers_without_email(engines['auth'], identifiers)
identifiers_without_email = identifiers_without_email_df['business_identifier'].tolist()
print(f"Number of Identifiers without email: {len(identifiers_without_email)} out of {identifiers_before_filtering}")
# Prints the complete list, not a sample.
print(f"Identifiers without email: {identifiers_without_email}")

In [None]:
# Admin email for each requested corp; aliased to the column names the update step reads.
EMAIL_LIST_QUERY = """
SELECT
c.corp_num as identifier,
c.admin_email as email
FROM corporation c
WHERE 1 = 1
and c.corp_num = ANY(:identifiers)
"""


def query_email_list(engine: Engine, identifiers: list) -> pd.DataFrame:
    """Return the admin email for each of the given corp numbers.

    Args:
        engine: Engine for the database that holds ``corporation``.
        identifiers: Corp numbers bound to ``:identifiers``. Only these are
            queried; the function does not read any module-level list.

    Returns:
        DataFrame with ``identifier`` and ``email`` columns.

    Raises:
        SQLAlchemyError: Reported on stdout, then re-raised.
        Exception: Any other failure is reported on stdout, then re-raised.
    """
    try:
        with engine.connect() as conn:
            # Bind the caller's list (previously the global identifiers_without_email
            # was bound here and the parameter was ignored).
            result = conn.execute(text(EMAIL_LIST_QUERY), {'identifiers': identifiers})
            email_list_df = pd.DataFrame(result.fetchall(), columns=result.keys())
        print(f"✓ Successfully queried email list. Total records: {len(email_list_df)}")
        return email_list_df
    except SQLAlchemyError as e:
        print(f"✗ Error querying email list: {e}")
        raise
    except Exception as e:
        print(f"✗ Unexpected error querying email list: {e}")
        raise


# Only look up emails for entities that currently have no contact email, so the
# update step never touches an entity that already has one.
email_list = query_email_list(engines['colin_extract'], identifiers_without_email)
print(email_list.head())

In [None]:
def get_auth_token() -> Optional[str]:
    """Request an access token using the client-credentials grant.

    Posts ``ACCOUNT_SVC_CLIENT_ID`` / ``ACCOUNT_SVC_CLIENT_SECRET`` to
    ``ACCOUNT_SVC_AUTH_URL`` and reads ``access_token`` from the JSON response.

    Returns:
        The access token, or None when the request fails, the response is not
        valid JSON, or it contains no ``access_token``. Failures are reported on
        stdout rather than raised.
    """
    import requests

    try:
        response = requests.post(
            ACCOUNT_SVC_AUTH_URL,
            data={
                'client_id': ACCOUNT_SVC_CLIENT_ID,
                'client_secret': ACCOUNT_SVC_CLIENT_SECRET,
                'grant_type': 'client_credentials'
            },
            # requests waits indefinitely by default; bound the call so the
            # notebook cannot hang on an unresponsive auth server.
            timeout=30,
        )
        response.raise_for_status()
        token = response.json().get('access_token')
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException.
        print(f"✗ Error obtaining auth token: {e}")
        return None

    if token:
        print("✓ Successfully obtained auth token.")
        return token
    print("✗ Auth token not found in response.")
    return None
auth_token = get_auth_token()
if not auth_token:
    # get_auth_token() returns None on failure. Stop with a clear message instead
    # of failing on the slice below or sending "Bearer None" with every update.
    raise RuntimeError("Could not obtain an auth token; check the ACCOUNT_SVC_* settings.")
# Show only a short prefix so the full token never lands in the notebook output.
print(f"Auth token: {auth_token[:10]}...")

In [None]:
def update_email_contact_auth(email_list: pd.DataFrame) -> tuple[int, int]:
    """Set the contact email for each identifier through the Auth API.

    Sends one PUT request per row to
    ``{ACCOUNT_SVC_ENTITY_URL}/{identifier}/contacts`` with the row's email as
    the JSON body, authenticated with the module-level ``auth_token``. A failed
    request is reported on stdout and counted; it does not stop the run.

    Args:
        email_list: DataFrame with 'identifier' and 'email' columns, one row per
            entity to update.

    Returns:
        tuple[int, int]: (successful updates, failed updates). Rows skipped
        because they have no email are counted as failures, so the two numbers
        always add up to ``len(email_list)``.
    """
    import requests

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {auth_token}"
    }

    success_count = failure_count = 0

    # One session for the whole batch so the connection to the API is reused.
    with requests.Session() as session:
        session.headers.update(headers)

        for identifier, email in zip(email_list['identifier'], email_list['email']):
            # Never send a null or blank email to the API.
            if pd.isna(email) or not str(email).strip():
                print(f"✗ Skipping {identifier}: no email available")
                failure_count += 1
                continue

            print(f"Updating contact email for identifier: {identifier} with email: {email}")
            try:
                # NOTE(review): the notebook overview describes this call as a POST;
                # confirm PUT is the right verb for entities that have no contact yet.
                response = session.put(
                    f"{ACCOUNT_SVC_ENTITY_URL}/{identifier}/contacts",
                    json={"email": email},
                    timeout=30,
                )
                response.raise_for_status()
                print(f"✓ Successfully updated contact email for {identifier}")
                success_count += 1
            except requests.exceptions.HTTPError as http_err:
                print(f"✗ HTTP error occurred while updating {identifier}: {http_err}")
                failure_count += 1
            except Exception as err:
                # Keep going: one unreachable or malformed row must not abort the batch.
                print(f"✗ Unexpected error occurred while updating {identifier}: {err}")
                failure_count += 1

    return success_count, failure_count


# Run the backfill for every row of email_list and report the totals.
success_count, failure_count = update_email_contact_auth(email_list)
print(f"Update results - Success: {success_count}, Failure: {failure_count}")