In [1]:
from pymongo import MongoClient
import datetime as dt
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import time

In [None]:
# MongoDB connection
c_v1 = MongoClient("mongodb://3.20.189.28:2036/momoKashBd?readPreference=secondary&directConnection=true")
db_v1 = c_v1["momoKashBd"]
Refunds = db_v1["Refund"]
print("MongoDB connection established with success")

# Loop from Sept 2022 to April 2025
current_date = dt.datetime(2022, 9, 1)
final_date = dt.datetime(2025, 4, 30)

while current_date <= final_date:
    date_start = current_date
    date_end = date_start + relativedelta(months=1) - dt.timedelta(days=1)
    print(f"\nProcessing refunds from {date_start.strftime('%B %Y')}...")

    start = time.time()

    cursor = Refunds.find(
        { 
            "dateTime": {"$gte": date_start, "$lte": date_end},
            "status": "COMPLETED"
        },
        {
            "_id": 0,
            "clientID": 1,
            "loanID": 1,
            "amount": 1,
            "refundCapital": 1,
            "refundInterest": 1,
            "balance": 1,
            "refundPeriod": 1,
            "refundType": 1,
            "loanDate": 1,
            "date": 1,
            "status": 1,
            "excess": 1,
            "dateTime": 1
        }
    )

    refunds_df = pd.DataFrame(list(cursor))

    if refunds_df.empty:
        print("No completed refunds were made during this period.")
    else:
        for col in ["loanDate", "date"]:
            refunds_df[col] = pd.to_datetime(refunds_df[col], dayfirst=True, errors="coerce")

        refunds_df = refunds_df.sort_values("dateTime", ascending=True)

        date_cols = ["loanDate", "date"]
        for col in date_cols:
            refunds_df[col] = pd.to_datetime(refunds_df[col], dayfirst=True, errors="coerce")

        req_fields = ["excess", "balance"]
        for col in req_fields:
            if col not in refunds_df.columns:
                refunds_df[col] = 0
            else:
                refunds_df[col] = refunds_df[col].fillna(0)

        num_cols = ["amount", "refundCapital", "refundInterest", "balance", "excess"]
        for col in num_cols:
            if col in refunds_df.columns:
                refunds_df[col] = refunds_df[col].fillna(0).apply(lambda x: max(x, 0))

        text_cols = ["loanID", "clientID", "refundPeriod", "refundType", "status"]
        for col in text_cols:
            if col in refunds_df.columns:
                refunds_df[col] = refunds_df[col].astype(str).str.strip()

        refunds_df.index = np.arange(refunds_df.shape[0])
        refunds_df = refunds_df.sort_values("date", ascending=True)

        print("Sample of fully cleaned refunds:")
        print(refunds_df.head(3))

        filename = f"refunds_{date_start.strftime('%b_%Y').lower()}.csv"
        refunds_df.to_csv(filename, index=False, sep=",")
        print(f"CSV export {filename} complete")

    end = time.time()
    print(f"Fetching time: {round((end-start)/60, 2)} minutes")

    current_date += relativedelta(months=1)


MongoDB connection established with success

Processing refunds from January 2025...
Sample of fully cleaned refunds:
       clientID                                             loanID  amount  \
0     653562962  237653562962:XCAM07X01T20241202100662962M:1074...     3.0   
8348  675610010  237675610010:FIRSTTRUST033X01T2024102214440400...    22.0   
8349  676785744  237676785744:FIRSTTRUST033X01T2024102214393807...   651.0   

      balance  excess       date refundPeriod                    refundType  \
0     10398.0     0.0 2025-01-01  Reconducted                 AUTODEBIT_ECW   
8348   5592.0     0.0 2025-01-01  Reconducted  BankDebit_Reconciliation_24H   
8349  15657.0     0.0 2025-01-01  Reconducted  BankDebit_Reconciliation_24H   

       loanDate  refundInterest  refundCapital     status            dateTime  
0    2024-12-02            0.40           2.60  COMPLETED 2025-01-01 00:00:01  
8348 2024-10-22            2.96          19.04  COMPLETED 2025-01-01 16:59:03  
8349 2024-10