### Mainframe to Databricks - Ingest

In [39]:
import copybook
import pandas as pd
import os
from datetime import datetime

In [34]:
DATALAKE_COPYBOOK = """
              01  RAW-POS-RECORD.
    05 TXN-DATE            PIC X(10).
    05 FILLER              PIC X(1).

    05 TXN-TIME            PIC X(8).
    05 FILLER              PIC X(1).

    05 STORE-ID            PIC X(6).
    05 FILLER              PIC X(1).

    05 TERMINAL-ID         PIC X(4).
    05 FILLER              PIC X(1).

    05 TXN-ID              PIC X(12).
    05 FILLER              PIC X(1).

    05 CUST-ID             PIC X(10).
    05 FILLER              PIC X(1).

    05 PAYMENT-MODE        PIC X(10).
    05 FILLER              PIC X(1).

    05 PARTNER-BANK        PIC X(15).
    05 FILLER              PIC X(1).

    05 AMOUNT-PAID         PIC 9(7)V99.
    05 FILLER              PIC X(1).

    05 BANK-PAYABLE        PIC 9(7)V99.
    05 FILLER              PIC X(1).

    05 CUSTOMER-PAYABLE    PIC 9(7)V99.
    05 FILLER              PIC X(1).

    05 CURRENCY-CODE       PIC X(3).
    05 FILLER              PIC X(1).

    05 TXN-STATUS          PIC X(10).  

"""


RAW_POST_DATALAKE = r"D:\mainframe_to_analytics_dev\mainframe\data\datalake\RAW_POS_DATALAKE_20260104.TXT"

In [35]:
## For DataLake

## STEP 1 :  copybook -> get fields

# copybook also provides a parse_file method that receives a text filename
root = copybook.parse_string(DATALAKE_COPYBOOK)

# flatten returns a list of Fields and FieldGroups instead of traversing the tree
list_of_fields = root.flatten()

In [36]:
## STEP 2 : RAW MAINFRAME FILE -> RAW CSV FILE

parsed_rows = []

with open(RAW_POST_DATALAKE, "r") as f:
    for line in f:
        record = {}
        
        for field in list_of_fields:
            # only process Field objects, skip groups/fillers
            if isinstance(field, copybook.Field):
                str_field = line[field.start_pos : field.start_pos + field.get_total_length()]
                record[field.name.lower().replace("-", "_")] = field.parse(str_field)
        
        parsed_rows.append(record)

bronze_df = pd.DataFrame(parsed_rows)



In [37]:
bronze_df.head()

Unnamed: 0,txn_date,filler,txn_time,store_id,terminal_id,txn_id,cust_id,payment_mode,partner_bank,amount_paid,bank_payable,customer_payable,currency_code,txn_status
0,2026-01-04,,12:47:16,STR001,T01,TXN000001,CUST89581,CASH,,1988.85,2038.57,2018.68,ESS,
1,2026-01-04,,12:47:16,STR002,T01,TXN000002,CUST62506,UPI,HDFC,716.64,734.56,727.39,ED,
2,2026-01-04,,12:47:16,STR002,T01,TXN000003,CUST55832,CARD,HDFC,1379.19,1413.67,1399.88,ED,
3,2026-01-04,,12:47:16,STR001,T01,TXN000004,CUST61397,CARD,SBI,1812.28,1857.59,1839.46,ESS,
4,2026-01-04,,12:47:16,STR001,T01,TXN000005,CUST68294,CARD,HDFC,1135.01,1163.39,1152.04,ESS,


In [40]:
## STEP 3 : RAW CSV FILE -> CATALOG
BRONZE_DIR = r"D:\mainframe_to_analytics_dev\databricks\catalog\bronze"
run_date = datetime.now().strftime("%Y%m%d")
BRONZE_FILE = os.path.join(
            BRONZE_DIR, f"BRONZE{run_date}.CSV"
        )

os.makedirs(BRONZE_DIR, exist_ok=True)

bronze_df.to_csv(BRONZE_FILE)