In [None]:
#| default_exp utils_polars_mapper

In [None]:
#| export

from __future__ import annotations
import polars as pl
from sqlalchemy import create_engine, text
from typing import Dict, List, Optional
import uuid
import logging

logger = logging.getLogger(__name__)

In [None]:
from nbdev.showdoc import show_doc

## Polars Mapping & Database Write

High-performance JSON-to-database pipeline using Polars for vectorized transformations and staging tables for bulk upserts.

In [None]:
#| export

def map_and_upsert(
    df: pl.DataFrame, # The raw Polars DataFrame from JSON
    table_name: str, # Target database table name
    key_col: str, # Primary key column for conflict resolution
    db_uri: str, # SQLAlchemy connection string (e.g., 'sqlite:///db.db' or 'postgresql://...')
    column_map: dict = None, # Optional rename map {json_key: db_col}
    unnest_cols: list[str] = None # List of Struct columns to flatten
):
    """
    Map JSON data to database columns and upsert using staging table pattern.
    
    Performance Strategy (Staging Table Pattern):
    1. Write to temporary staging table (fast bulk insert)
    2. Execute one set-based SQL statement that copies staging rows into the
       target (INSERT OR REPLACE on SQLite, INSERT ... ON CONFLICT otherwise)
    3. Drop staging table (cleanup)
    
    This avoids issuing one statement per row for large datasets.
    
    Args:
        df: Polars DataFrame with JSON data
        table_name: Target table (must already exist)
        key_col: Primary key for ON CONFLICT resolution (not used by the
            SQLite branch, which relies on INSERT OR REPLACE)
        db_uri: Database connection string
        column_map: Optional column renaming {json_col: db_col}; keys that are
            not present in ``df`` are skipped with a warning
        unnest_cols: Optional list of nested columns to flatten; names that are
            not present in ``df`` are skipped
    
    Returns:
        None. Rows are written to ``table_name`` as a side effect.
    
    Raises:
        ValueError: If, after renaming and unnesting, ``df`` shares no column
            with the target table.
    
    Note:
        ``table_name``, ``key_col`` and the column names are interpolated into
        SQL text as-is, so they must come from trusted configuration, never
        from user input.
    
    Example:
        ```python
        import polars as pl
        
        # JSON from API
        json_data = [
            {'user_id_val': 1, 'ABC_1': 'Alice', 'extra': 'ignore'},
            {'user_id_val': 2, 'ABC_1': 'Bob', 'extra': 'ignore'}
        ]
        
        # Convert to DataFrame
        df = pl.DataFrame(json_data)
        
        # Map and upsert
        map_and_upsert(
            df=df,
            table_name='users',
            key_col='user_id',
            db_uri='sqlite:///app.db',
            column_map={'user_id_val': 'user_id', 'ABC_1': 'name'}
        )
        ```
    """
    # Step 1: Rename columns if mapping provided.
    # Only rename keys that actually exist: API payloads frequently omit
    # optional fields, and the other steps here also skip missing columns.
    if column_map:
        present_map = {src: dst for src, dst in column_map.items() if src in df.columns}
        missing_keys = [src for src in column_map if src not in present_map]
        if missing_keys:
            logger.warning(f"Columns {missing_keys} not found in DataFrame, skipping rename")
        if present_map:
            df = df.rename(present_map)
            logger.info(f"Renamed columns: {present_map}")
    
    # Step 2: Flatten nested columns if specified
    if unnest_cols:
        for col in unnest_cols:
            if col in df.columns:
                df = df.unnest(col)
                logger.info(f"Unnested column: {col}")
    
    # Step 3: Select only columns that exist in target table (drop extras)
    # This prevents errors from extra JSON fields
    engine = create_engine(db_uri)
    
    try:
        # Get target table columns (LIMIT 0 returns column metadata, no rows)
        with engine.connect() as conn:
            result = conn.execute(text(f"SELECT * FROM {table_name} LIMIT 0"))
            target_columns = list(result.keys())
        
        # Filter DataFrame to only target columns, keeping the table's order
        frame_cols = set(df.columns)
        available_cols = [col for col in target_columns if col in frame_cols]
        if not available_cols:
            # Without this guard the staging write / INSERT would fail later
            # with a far less helpful database error.
            raise ValueError(
                f"DataFrame has no columns in common with table {table_name}"
            )
        if key_col not in available_cols:
            logger.warning(
                f"Key column {key_col} not present in data for {table_name}"
            )
        df = df.select(available_cols)
        logger.info(f"Selected columns for {table_name}: {available_cols}")
        
        # Nothing to write: skip creating and dropping a staging table.
        if df.height == 0:
            logger.info(f"No rows to upsert into {table_name}")
            return
        
        # Step 4: Generate unique staging table name
        staging_table = f"staging_{uuid.uuid4().hex[:8]}"
        
        try:
            # Step 5: Write to staging table (fast bulk insert)
            df.write_database(
                table_name=staging_table,
                connection=db_uri,
                if_table_exists='replace'
            )
            logger.info(f"Wrote {len(df)} rows to staging table {staging_table}")
            
            # Step 6: Determine database type for dialect-specific SQL.
            # Ask the engine instead of substring-matching the URI, which
            # would misfire on e.g. a PostgreSQL database named "sqlite_copy".
            is_sqlite = engine.dialect.name == 'sqlite'
            
            # Step 7: Execute upsert from staging to target
            cols_str = ', '.join(available_cols)
            if is_sqlite:
                # SQLite: INSERT OR REPLACE
                # NOTE(review): REPLACE swaps out the whole conflicting row,
                # so target columns that are absent from the data do not keep
                # their old values. Confirm this is intended.
                upsert_sql = f"""
                    INSERT OR REPLACE INTO {table_name} ({cols_str})
                    SELECT {cols_str} FROM {staging_table}
                """
            else:
                # PostgreSQL: INSERT ... ON CONFLICT DO UPDATE
                update_set = ', '.join(
                    f"{col} = EXCLUDED.{col}"
                    for col in available_cols
                    if col != key_col
                )
                # An empty SET list is a syntax error; with only the key
                # column there is nothing to update on conflict.
                conflict_action = (
                    f"DO UPDATE SET {update_set}" if update_set else "DO NOTHING"
                )
                upsert_sql = f"""
                    INSERT INTO {table_name} ({cols_str})
                    SELECT {cols_str} FROM {staging_table}
                    ON CONFLICT ({key_col}) {conflict_action}
                """
            
            with engine.connect() as conn:
                conn.execute(text(upsert_sql))
                conn.commit()
                logger.info(f"Upserted {len(df)} rows into {table_name}")
        
        finally:
            # Step 8: Cleanup - drop staging table (runs even if the upsert failed)
            with engine.connect() as conn:
                conn.execute(text(f"DROP TABLE IF EXISTS {staging_table}"))
                conn.commit()
                logger.info(f"Dropped staging table {staging_table}")
    
    finally:
        # The engine is created per call; release its connection pool so
        # repeated calls do not accumulate open connections.
        engine.dispose()

In [None]:
show_doc(map_and_upsert)

---

[source](https://github.com/abhisheksreesaila/fh-saas/blob/main/fh_saas/utils_polars_mapper.py#L17){target="_blank" style="float:right; font-size:smaller"}

### map_and_upsert

>      map_and_upsert (df:polars.dataframe.frame.DataFrame, table_name:str,
>                      key_col:str, db_uri:str, column_map:dict=None,
>                      unnest_cols:list[str]=None)

*Map JSON data to database columns and upsert using staging table pattern.*

Performance Strategy (Staging Table Pattern):
1. Write to temporary staging table (fast bulk insert)
2. Execute SQL INSERT ... ON CONFLICT for upsert (database-native, vectorized)
3. Drop staging table (cleanup)

This is 10-100x faster than row-by-row upserts for large datasets.

Args:
    df: Polars DataFrame with JSON data
    table_name: Target table (must already exist)
    key_col: Primary key for ON CONFLICT resolution
    db_uri: Database connection string
    column_map: Optional column renaming {json_col: db_col}
    unnest_cols: Optional list of nested columns to flatten

Example:
    ```python
    import polars as pl

    # JSON from API
    json_data = [
        {'user_id_val': 1, 'ABC_1': 'Alice', 'extra': 'ignore'},
        {'user_id_val': 2, 'ABC_1': 'Bob', 'extra': 'ignore'}
    ]

    # Convert to DataFrame
    df = pl.DataFrame(json_data)

    # Map and upsert
    map_and_upsert(
        df=df,
        table_name='users',
        key_col='user_id',
        db_uri='sqlite:///app.db',
        column_map={'user_id_val': 'user_id', 'ABC_1': 'name'}
    )
    ```

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | DataFrame |  | The raw Polars DataFrame from JSON |
| table_name | str |  | Target database table name |
| key_col | str |  | Primary key column for conflict resolution |
| db_uri | str |  | SQLAlchemy connection string (e.g., 'sqlite:///db.db' or 'postgresql://...') |
| column_map | dict | None | Optional rename map {json_key: db_col} |
| unnest_cols | list | None | List of Struct columns to flatten |

## Helper: Apply Schema Transformations

Type conversions for common API data formats: ISO dates (`YYYY-MM-DD`), `"true"`/`"false"` boolean strings (case-insensitive), and numeric strings.

In [None]:
#| export

def apply_schema(
    df: pl.DataFrame, # Input DataFrame
    type_map: dict # Column name -> Polars dtype (e.g., {'created_at': pl.Date, 'is_active': pl.Boolean})
) -> pl.DataFrame:
    """
    Apply explicit type conversions to DataFrame columns.
    
    String columns are parsed according to the requested dtype:
    ``pl.Date`` expects ``%Y-%m-%d``, ``pl.Datetime`` lets Polars infer the
    format, and ``pl.Boolean`` is True only for the text "true" in any letter
    case. Every other combination, including columns that are not strings,
    goes through a plain ``cast``.
    
    Args:
        df: Polars DataFrame
        type_map: Dict mapping column names to Polars dtypes. Names missing
            from ``df`` are skipped with a warning. An empty or ``None`` map
            returns ``df`` unchanged.
    
    Returns:
        DataFrame with converted types
    
    Example:
        ```python
        df = pl.DataFrame({
            'created_at': ['2024-01-15', '2024-01-16'],
            'is_active': ['true', 'false'],
            'amount': ['123.45', '678.90']
        })
        
        df = apply_schema(df, {
            'created_at': pl.Date,
            'is_active': pl.Boolean,
            'amount': pl.Float64
        })
        ```
    """
    # Nothing requested: hand the frame back untouched.
    if not type_map:
        return df
    
    schema = df.schema
    conversions = []
    
    for col_name, dtype in type_map.items():
        if col_name not in schema:
            logger.warning(f"Column {col_name} not found in DataFrame, skipping")
            continue
        
        column = pl.col(col_name)
        # The `.str` parsers only work on string columns; a column that is
        # already typed (e.g. on a second pass) must be cast instead.
        is_text = schema[col_name] == pl.Utf8
        
        if is_text and dtype == pl.Date:
            converted = column.str.strptime(pl.Date, "%Y-%m-%d")
        elif is_text and dtype == pl.Datetime:
            # Pass the requested dtype through so a time unit / time zone
            # given as pl.Datetime(...) is not silently dropped.
            converted = column.str.strptime(dtype)
        elif is_text and dtype == pl.Boolean:
            # Handle "true"/"false" strings
            converted = column.str.to_lowercase().eq("true")
        else:
            # Cast to specified type (works for numeric types)
            converted = column.cast(dtype)
        
        conversions.append(converted.alias(col_name))
    
    if conversions:
        df = df.with_columns(conversions)
        # Count what was actually converted, not what was requested:
        # skipped columns must not inflate the number.
        logger.info(f"Applied schema conversions to {len(conversions)} columns")
    
    return df

In [None]:
show_doc(apply_schema)

---

[source](https://github.com/abhisheksreesaila/fh-saas/blob/main/fh_saas/utils_polars_mapper.py#L140){target="_blank" style="float:right; font-size:smaller"}

### apply_schema

>      apply_schema (df:polars.dataframe.frame.DataFrame, type_map:dict)

*Apply explicit type conversions to DataFrame columns.*

Args:
    df: Polars DataFrame
    type_map: Dict mapping column names to Polars dtypes

Returns:
    DataFrame with converted types

Example:
    ```python
    df = pl.DataFrame({
        'created_at': ['2024-01-15', '2024-01-16'],
        'is_active': ['true', 'false'],
        'amount': ['123.45', '678.90']
    })

    df = apply_schema(df, {
        'created_at': pl.Date,
        'is_active': pl.Boolean,
        'amount': pl.Float64
    })
    ```

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| df | DataFrame | Input DataFrame |
| type_map | dict | Column name -> Polars dtype (e.g., {'created_at': pl.Date, 'is_active': pl.Boolean}) |
| **Returns** | **DataFrame** |  |

In [None]:
#| hide

# Run nbdev's export step for this notebook project; presumably this writes
# the cells marked `#| export` to the module named by `#| default_exp`.
import nbdev as nb
nb.nbdev_export()