In [None]:
#| default_exp utils_sync

In [None]:
#| export

from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional

import polars as pl

from fh_saas.utils_graphql import GraphQLClient
from fh_saas.utils_polars_mapper import apply_schema, map_and_upsert

logger = logging.getLogger(__name__)

In [None]:
from nbdev.showdoc import show_doc

## Sync Orchestration

High-level pipeline combining API fetch, Polars transformation, and database upsert.

In [None]:
#| export

async def sync_external_data(
    client: GraphQLClient, # Initialized GraphQL client
    query_template: str, # GraphQL query with $cursor variable
    variables: dict, # Initial variables (e.g., {'cursor': None})
    items_path: list[str], # Path to data list in response
    cursor_path: list[str], # Path to next cursor
    table_name: str, # Target database table
    key_col: str, # Primary key for upsert
    db_uri: str, # Database connection string
    column_map: Optional[dict] = None, # Optional column renaming
    type_map: Optional[dict] = None, # Optional type conversions
    has_next_path: Optional[list[str]] = None, # Optional hasNextPage path
    batch_size: int = 5000 # Currently unused; kept for backward compatibility
) -> Dict[str, int]:
    """
    Sync external data from a GraphQL API into a database table.

    For every page yielded by `client.fetch_pages_generator`:
    1. Build a Polars DataFrame from the page.
    2. Apply `type_map` with `apply_schema` (only when `type_map` is truthy).
    3. Hand the frame to `map_and_upsert` together with `column_map`.
    4. Update the running record and batch counters.

    Each page is converted and upserted before the next one is requested from
    the generator, so this function holds one page's DataFrame at a time.

    Pages that contain no rows are counted in `batches` but are not passed to
    `apply_schema` or `map_and_upsert`: an empty page produces a DataFrame
    without columns, so there is nothing to convert or write.

    Args:
        client: GraphQL client instance
        query_template: GraphQL query with pagination
        variables: Query variables, forwarded unchanged to the client
        items_path: Path (list of keys) to the data items in the response
        cursor_path: Path (list of keys) to the next cursor in the response
        table_name: Target table name
        key_col: Primary key column used for the upsert
        db_uri: Database connection string
        column_map: Optional column renaming, forwarded to `map_and_upsert`
        type_map: Optional type conversions, applied with `apply_schema`
        has_next_path: Optional path (list of keys) to the hasNextPage flag
        batch_size: Not read by this function. The number of rows per batch is
            whatever each page yielded by the client contains (e.g. the
            `first:` argument in the query). Accepted only so existing callers
            keep working.

    Returns:
        Dict with sync stats: {'total_records': 10000, 'batches': 10}

    NOTE(review): `apply_schema` runs before `column_map` is handed to
    `map_and_upsert`, so `type_map` keys presumably have to match the column
    names as they arrive from the API rather than the renamed ones. The
    example below keys `type_map` by the renamed column; confirm against
    `apply_schema` / `map_and_upsert`.

    Example:
        ```python
        from fh_saas.utils_api import AsyncAPIClient, bearer_token_auth
        from fh_saas.utils_graphql import GraphQLClient
        import polars as pl
        
        async with AsyncAPIClient(
            'https://api.example.com/graphql',
            auth_headers=bearer_token_auth('TOKEN')
        ) as api_client:
            
            client = GraphQLClient(api_client)
            
            stats = await sync_external_data(
                client=client,
                query_template='''
                    query($cursor: String) {
                        users(after: $cursor, first: 1000) {
                            nodes { user_id_val name_str email_addr }
                            pageInfo { hasNextPage endCursor }
                        }
                    }
                ''',
                variables={'cursor': None},
                items_path=['data', 'users', 'nodes'],
                cursor_path=['data', 'users', 'pageInfo', 'endCursor'],
                has_next_path=['data', 'users', 'pageInfo', 'hasNextPage'],
                table_name='users',
                key_col='user_id',
                db_uri='sqlite:///app.db',
                column_map={
                    'user_id_val': 'user_id',
                    'name_str': 'name',
                    'email_addr': 'email'
                },
                type_map={
                    'user_id': pl.Int64
                }
            )
            
            print(f"Synced {stats['total_records']} records in {stats['batches']} batches")
        ```
    """
    total_records = 0
    batch_count = 0

    # Stream paginated data; one loop iteration handles exactly one page.
    async for batch in client.fetch_pages_generator(
        query_template=query_template,
        variables=variables,
        items_path=items_path,
        cursor_path=cursor_path,
        has_next_path=has_next_path,
    ):
        batch_count += 1

        # Convert to Polars DataFrame
        df = pl.DataFrame(batch)
        row_count = len(df)
        logger.info(
            "Batch %s: Converted %s records to DataFrame", batch_count, row_count
        )

        # An empty page yields a frame with no columns; skip the conversion
        # and the database round-trip instead of handing it downstream.
        if row_count == 0:
            logger.info("Batch %s: Empty page, skipping upsert", batch_count)
            continue

        # Apply type conversions if specified
        if type_map:
            df = apply_schema(df, type_map)
            logger.info("Batch %s: Applied type conversions", batch_count)

        # Upsert to database
        map_and_upsert(
            df=df,
            table_name=table_name,
            key_col=key_col,
            db_uri=db_uri,
            column_map=column_map,
        )

        total_records += row_count
        logger.info(
            "Batch %s: Upserted %s records (total: %s)",
            batch_count,
            row_count,
            total_records,
        )

    return {
        'total_records': total_records,
        'batches': batch_count,
    }

In [None]:
# Render the generated API documentation for sync_external_data in the notebook.
show_doc(sync_external_data)

---

[source](https://github.com/abhisheksreesaila/fh-saas/blob/main/fh_saas/utils_sync.py#L17){target="_blank" style="float:right; font-size:smaller"}

### sync_external_data

>      sync_external_data (client:fh_saas.utils_graphql.GraphQLClient,
>                          query_template:str, variables:dict,
>                          items_path:list[str], cursor_path:list[str],
>                          table_name:str, key_col:str, db_uri:str,
>                          column_map:dict=None, type_map:dict=None,
>                          has_next_path:list[str]=None, batch_size:int=5000)

*Sync external data from GraphQL API to database.*

End-to-end pipeline:
1. Fetch paginated data from GraphQL (streaming generator)
2. Transform each batch with Polars (rename, type conversion)
3. Upsert batch to database (staging table pattern)
4. Track progress (total records, batches)

Memory efficient: Processes one page at a time, never loads full dataset.

Args:
    client: GraphQL client instance
    query_template: GraphQL query with pagination
    variables: Query variables
    items_path: JSONPath to data items
    cursor_path: JSONPath to cursor
    table_name: Target table name
    key_col: Primary key column
    db_uri: Database connection string
    column_map: Optional column renaming
    type_map: Optional type conversions
    has_next_path: Optional hasNextPage path
    batch_size: Records per batch

Returns:
    Dict with sync stats: {'total_records': 10000, 'batches': 10}

Example:
    ```python
    from fh_saas.utils_api import AsyncAPIClient, bearer_token_auth
    from fh_saas.utils_graphql import GraphQLClient
    import polars as pl

    async with AsyncAPIClient(
        'https://api.example.com/graphql',
        auth_headers=bearer_token_auth('TOKEN')
    ) as api_client:

        client = GraphQLClient(api_client)

        stats = await sync_external_data(
            client=client,
            query_template='''
                query($cursor: String) {
                    users(after: $cursor, first: 1000) {
                        nodes { user_id_val name_str email_addr }
                        pageInfo { hasNextPage endCursor }
                    }
                }
            ''',
            variables={'cursor': None},
            items_path=['data', 'users', 'nodes'],
            cursor_path=['data', 'users', 'pageInfo', 'endCursor'],
            has_next_path=['data', 'users', 'pageInfo', 'hasNextPage'],
            table_name='users',
            key_col='user_id',
            db_uri='sqlite:///app.db',
            column_map={
                'user_id_val': 'user_id',
                'name_str': 'name',
                'email_addr': 'email'
            },
            type_map={
                'user_id': pl.Int64
            }
        )

        print(f"Synced {stats['total_records']} records in {stats['batches']} batches")
    ```

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| client | GraphQLClient |  | Initialized GraphQL client |
| query_template | str |  | GraphQL query with $cursor variable |
| variables | dict |  | Initial variables (e.g., {'cursor': None}) |
| items_path | list |  | Path to data list in response |
| cursor_path | list |  | Path to next cursor |
| table_name | str |  | Target database table |
| key_col | str |  | Primary key for upsert |
| db_uri | str |  | Database connection string |
| column_map | dict | None | Optional column renaming |
| type_map | dict | None | Optional type conversions |
| has_next_path | list | None | Optional hasNextPage path |
| batch_size | int | 5000 | Max rows per batch (pagination page size) |
| **Returns** | **Dict** |  |  |

## Incremental Sync Helper

Sync only records updated after a specific timestamp.

In [None]:
#| export

async def sync_incremental(
    client: GraphQLClient, # Initialized GraphQL client
    query_template: str, # GraphQL query with $cursor and $last_sync variables
    last_sync_time: str, # ISO timestamp (e.g., '2024-01-15T10:00:00Z')
    items_path: list[str], # Path to data list
    cursor_path: list[str], # Path to cursor
    table_name: str, # Target table
    key_col: str, # Primary key
    db_uri: str, # Database connection
    column_map: Optional[dict] = None, # Optional column map
    type_map: Optional[dict] = None, # Optional type map
    has_next_path: Optional[list[str]] = None # Optional hasNextPage path
) -> Dict[str, Any]:
    """
    Incremental sync: fetch only records updated after the last sync.

    Builds the query variables `{'cursor': None, 'last_sync': last_sync_time}`,
    delegates the fetch/transform/upsert work to `sync_external_data`, and adds
    a `last_sync_time` entry to the returned stats.

    The returned `last_sync_time` is the UTC time at which this sync *started*,
    captured before the first request is sent. Using the start time as the
    watermark means a record that changes while the sync is still running is
    picked up by the next incremental run; a watermark taken after completion
    could be later than such a change and skip it for good.

    Args:
        client: GraphQL client
        query_template: Query with $last_sync variable
        last_sync_time: ISO timestamp of last sync
        items_path: Path to data items
        cursor_path: Path to cursor
        table_name: Target table
        key_col: Primary key
        db_uri: Database URI
        column_map: Optional column mapping
        type_map: Optional type conversions
        has_next_path: Optional hasNextPage path

    Returns:
        Dict with stats: {'total_records': 100, 'batches': 2, 'last_sync_time': '2024-01-16T10:00:00Z'}
        Pass `last_sync_time` back in as the `last_sync_time` argument of the
        next run.

    Example:
        ```python
        # Query with $last_sync filter
        query = '''
            query($cursor: String, $last_sync: DateTime!) {
                users(after: $cursor, where: {updated_at: {_gt: $last_sync}}) {
                    nodes { id name updated_at }
                    pageInfo { hasNextPage endCursor }
                }
            }
        '''
        
        stats = await sync_incremental(
            client=client,
            query_template=query,
            last_sync_time='2024-01-15T10:00:00Z',
            items_path=['data', 'users', 'nodes'],
            cursor_path=['data', 'users', 'pageInfo', 'endCursor'],
            table_name='users',
            key_col='id',
            db_uri='sqlite:///app.db'
        )
        ```
    """
    from datetime import datetime, timezone

    # Capture the watermark BEFORE fetching so updates that land during the
    # sync fall after it. An aware UTC datetime replaces the deprecated
    # datetime.utcnow(); the '+00:00' offset is rewritten to the 'Z' suffix
    # so the string format stays the same as before.
    sync_started_at = (
        datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    )

    # Add last_sync to variables
    variables = {
        'cursor': None,
        'last_sync': last_sync_time,
    }

    # Execute sync
    stats = await sync_external_data(
        client=client,
        query_template=query_template,
        variables=variables,
        items_path=items_path,
        cursor_path=cursor_path,
        table_name=table_name,
        key_col=key_col,
        db_uri=db_uri,
        column_map=column_map,
        type_map=type_map,
        has_next_path=has_next_path,
    )

    # Report the start-of-sync timestamp as the next watermark
    stats['last_sync_time'] = sync_started_at

    return stats

In [None]:
# Render the generated API documentation for sync_incremental in the notebook.
show_doc(sync_incremental)

---

[source](https://github.com/abhisheksreesaila/fh-saas/blob/main/fh_saas/utils_sync.py#L142){target="_blank" style="float:right; font-size:smaller"}

### sync_incremental

>      sync_incremental (client:fh_saas.utils_graphql.GraphQLClient,
>                        query_template:str, last_sync_time:str,
>                        items_path:list[str], cursor_path:list[str],
>                        table_name:str, key_col:str, db_uri:str,
>                        column_map:dict=None, type_map:dict=None,
>                        has_next_path:list[str]=None)

*Incremental sync: fetch only records updated after last sync.*

Args:
    client: GraphQL client
    query_template: Query with $last_sync variable
    last_sync_time: ISO timestamp of last sync
    items_path: Path to data items
    cursor_path: Path to cursor
    table_name: Target table
    key_col: Primary key
    db_uri: Database URI
    column_map: Optional column mapping
    type_map: Optional type conversions
    has_next_path: Optional hasNextPage path

Returns:
    Dict with stats: {'total_records': 100, 'batches': 2, 'last_sync_time': '2024-01-16T10:00:00Z'}

Example:
    ```python
    # Query with $last_sync filter
    query = '''
        query($cursor: String, $last_sync: DateTime!) {
            users(after: $cursor, where: {updated_at: {_gt: $last_sync}}) {
                nodes { id name updated_at }
                pageInfo { hasNextPage endCursor }
            }
        }
    '''

    stats = await sync_incremental(
        client=client,
        query_template=query,
        last_sync_time='2024-01-15T10:00:00Z',
        items_path=['data', 'users', 'nodes'],
        cursor_path=['data', 'users', 'pageInfo', 'endCursor'],
        table_name='users',
        key_col='id',
        db_uri='sqlite:///app.db'
    )
    ```

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| client | GraphQLClient |  | Initialized GraphQL client |
| query_template | str |  | GraphQL query with $cursor and $last_sync variables |
| last_sync_time | str |  | ISO timestamp (e.g., '2024-01-15T10:00:00Z') |
| items_path | list |  | Path to data list |
| cursor_path | list |  | Path to cursor |
| table_name | str |  | Target table |
| key_col | str |  | Primary key |
| db_uri | str |  | Database connection |
| column_map | dict | None | Optional column map |
| type_map | dict | None | Optional type map |
| has_next_path | list | None | Optional hasNextPage path |
| **Returns** | **Dict** |  |  |

In [None]:
#| hide

# Run nbdev's export step; presumably this writes the `#| export` cells to the
# module named by `#| default_exp` (utils_sync) — confirm against nbdev docs.
import nbdev as nb
nb.nbdev_export()