In [69]:

import pandas as pd
from collections import Counter
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import concurrent.futures
import pickle as pkl




In [70]:
data = pkl.load(open('../../processed_synth_dataset/synth_features_4_5.pkl', 'rb'))

In [71]:
comb_df = pd.concat([value for key, value in data], axis = 1, join='outer')
# for key, value in data:
#     print(key)
#     print(value.columns)

# Wire Transfers

In [60]:
frac = 1
wire = pd.read_csv('../../processed_synth_dataset/wire_s.csv', engine="pyarrow").sample(frac = frac)
ach = pd.read_csv('../../processed_synth_dataset/ach_s.csv', engine="pyarrow").sample(frac = frac)
cheque = pd.read_csv('../../processed_synth_dataset/cheque_s.csv', engine="pyarrow").sample(frac = frac)
card = pd.read_csv('../../processed_synth_dataset/card_s.csv', engine="pyarrow").sample(frac = frac)


In [61]:
print(len(wire)+len(ach)+len(cheque)+len(card)) 

5308695


In [62]:
print("# of Wire transactions: ", wire.shape[0])
print("# of ABM transactions: ", ach.shape[0])
print("# of Cheque transactions: ", cheque.shape[0])
print("# of Card transactions: ", card.shape[0])

# of Wire transactions:  228567
# of ABM transactions:  796581
# of Cheque transactions:  2503158
# of Card transactions:  1780389


In [63]:
# dfs = {'card': card, 'wire': wire, 'ach': ach, 'cheque': cheque}
dfs = {'wire': wire}
#Sorting the DFs by date and Time
for key in dfs.keys():
    dfs[key]['transaction_datetime'] = pd.to_datetime(dfs[key]['transaction_date'].astype(str) + ' ' + dfs[key]['transaction_time'].astype(str))
    dfs[key].sort_values( by = ['customer_id', 'transaction_datetime'] , ascending = [True, True], ignore_index=True, inplace = True)

In [64]:
# Function to divide the date range into weekly and monthly ranges
def get_date_ranges(start_date, end_date):
    # Convert the start and end date to datetime
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    start_date_week = start_date
    end_date_week = end_date
    
    start_date_month = start_date
    end_date_month = end_date
    
    # Adjust the start_date to the previous Monday if it's not already a Monday
    if start_date_week.weekday() != 0:
        start_date_week -= pd.Timedelta(days=start_date_week.weekday())
    
    # Adjust the end_date to the next Sunday if it's not already a Sunday
    if end_date_week.weekday() != 6:
        end_date_week += pd.Timedelta(days=(6 - end_date_week.weekday()))
    
    # Generate a range of dates from start_date to end_date with a frequency of 'W-MON' (weekly on Monday)
    week_starts = pd.date_range(start=start_date_week, end=end_date_week, freq='W-MON')
    
    # Create a list of tuples with start and end dates for each week
    weekly_ranges = []
    for start in week_starts:
        end = start + pd.Timedelta(days=6)
        weekly_ranges.append((start, end))
    
    # Adjust the start_date to the first day of the month
    start_date_month = start_date_month.replace(day=1)
    
    # Adjust the end_date to the last day of the month
    if end_date_month.days_in_month != end_date_month.day:
        end_date_month.replace(day=end_date_month.days_in_month)
        
    # end_date_month = (end_date_month + pd.offsets.MonthEnd(1)).normalize()
    
    # Generate a range of dates from start_date_month to end_date_month with a frequency of 'MS' (monthly start)
    month_starts = pd.date_range(start=start_date_month, end=end_date_month, freq='MS')
    
    # Create a list of tuples with start and end dates for each month
    monthly_ranges = []
    for start in month_starts:
        end = (start + pd.offsets.MonthEnd(1)).normalize()
        monthly_ranges.append((start, end))
    
    return weekly_ranges, monthly_ranges

In [65]:
def get_minute_day_ranges(start_date_time, end_date_time):
    # Convert inputs to pandas timestamps
    start_date_time = pd.to_datetime(start_date_time)
    end_date_time = pd.to_datetime(end_date_time)

    # ------------------------------------------
    # 60-minute intervals
    # ------------------------------------------
    # Align the start to the nearest 60-minute boundary (rounding down)
    start_60min = start_date_time.floor('60min')
    # Align the end to the nearest 60-minute boundary (rounding up)
    end_60min = end_date_time.ceil('60min')

    # Create 60-minute interval ranges.
    # We subtract 60 minutes from end_5min because we want full 60-minute blocks.
    five_min_intervals = []
    five_min_starts = pd.date_range(start=start_60min, 
                                    end=end_60min - pd.Timedelta(minutes=5), 
                                    freq='60min')
    for start in five_min_starts:
        # Define the end of the interval as exactly 5 minutes later,
        # subtracting a microsecond so that intervals don't overlap (if needed)
        end = start + pd.Timedelta(minutes=60) - pd.Timedelta(microseconds=1)
        five_min_intervals.append((start, end))

    # ------------------------------------------
    # Daily intervals
    # ------------------------------------------
    # Normalize the start to midnight
    start_day = start_date_time.normalize()
    # Normalize the end to midnight.
    # Note: if end_date_time is not exactly midnight, this represents the beginning
    #       of that day. That day will be included as a full day interval.
    end_day = end_date_time.normalize()

    daily_intervals = []
    # Create a date_range for each day
    day_starts = pd.date_range(start=start_day, end=end_day, freq='D')
    for day in day_starts:
        # Each interval spans the entire day.
        # We set the end to be one day later minus one microsecond.
        day_end = day + pd.Timedelta(days=1) - pd.Timedelta(microseconds=1)
        daily_intervals.append((day, day_end))

    return five_min_intervals, daily_intervals


In [66]:
start = pd.Timestamp("2025-01-01 12:03:15")
end = pd.Timestamp("2025-01-01 13:02:45")
five_min_ranges, daily_ranges = get_minute_day_ranges(start, end)
print("5-Minute Intervals:")
for interval in five_min_ranges:
    print(interval)
print("\nDaily Intervals:")
for interval in daily_ranges:
    print(interval)


5-Minute Intervals:
(Timestamp('2025-01-01 12:00:00'), Timestamp('2025-01-01 12:59:59.999999'))
(Timestamp('2025-01-01 13:00:00'), Timestamp('2025-01-01 13:59:59.999999'))

Daily Intervals:
(Timestamp('2025-01-01 00:00:00'), Timestamp('2025-01-01 23:59:59.999999'))


In [67]:
#Collecting all unique customer IDs

wire_customers = dfs['wire']['customer_id'].unique()
ach_customers = dfs['ach']['customer_id'].unique()
cheque_customers = dfs['cheque']['customer_id'].unique()
card_customers = dfs['card']['customer_id'].unique()

all_customers= list(set(np.concatenate((wire_customers, ach_customers, cheque_customers, card_customers), axis=0)))
print(len(all_customers))

KeyError: 'ach'

In [42]:
'''
# import random
# from datetime import datetime, timedelta

# # Define a two-day time window
# start_date = datetime(2023, 1, 1)
# end_date = datetime(2023, 1, 2, 23, 59, 59)

# def random_date(start, end):
#     """Return a random datetime between start and end."""
#     delta = end - start
#     random_seconds = random.randint(0, int(delta.total_seconds()))
#     return start + timedelta(seconds=random_seconds)

# # Create a list of 20 unique customers (alphanumeric IDs)

# customer_ids = [f"C{str(i).zfill(3)}" for i in random.choices(range(1, 3), k=20)]

# # Prepare a list to hold each transaction as a dictionary
# data = []

# # Generate 20 transactions (one per customer)
# for cid in customer_ids:
#     txn = {
#         'customer_id': cid,
#         'debit_credit': random.choice(['debit', 'credit']),
#         'amount_cad': random.randint(10, 50),
#         'transaction_datetime': random_date(start_date, end_date)
#     }
#     data.append(txn)

# # Enforce that two transactions occur within 5 minutes of each other.
# # For example, choose the first two transactions to be within 5 minutes.
# base_time = random_date(start_date, end_date)
# data[0]['transaction_datetime'] = base_time
# # For the second transaction, add a random delta from 0 to 299 seconds (i.e. less than 5 minutes)
# data[1]['transaction_datetime'] = base_time + timedelta(seconds=random.randint(0, 299))

# # Create the DataFrame. The columns are: customer_id, debit_credit, amount_cad, transaction_datetime.
# df = pd.DataFrame(data)

# df.sort_values(by=['customer_id', 'debit_credit','transaction_datetime'], ascending=[True, True, True], ignore_index=True, inplace=True)
# print(df)
'''

'\n# import random\n# from datetime import datetime, timedelta\n\n# # Define a two-day time window\n# start_date = datetime(2023, 1, 1)\n# end_date = datetime(2023, 1, 2, 23, 59, 59)\n\n# def random_date(start, end):\n#     """Return a random datetime between start and end."""\n#     delta = end - start\n#     random_seconds = random.randint(0, int(delta.total_seconds()))\n#     return start + timedelta(seconds=random_seconds)\n\n# # Create a list of 20 unique customers (alphanumeric IDs)\n\n# customer_ids = [f"C{str(i).zfill(3)}" for i in random.choices(range(1, 3), k=20)]\n\n# # Prepare a list to hold each transaction as a dictionary\n# data = []\n\n# # Generate 20 transactions (one per customer)\n# for cid in customer_ids:\n#     txn = {\n#         \'customer_id\': cid,\n#         \'debit_credit\': random.choice([\'debit\', \'credit\']),\n#         \'amount_cad\': random.randint(10, 50),\n#         \'transaction_datetime\': random_date(start_date, end_date)\n#     }\n#     data.appe

In [43]:
# Creating a DataFrame with all unique customer IDs to store the features
customer_stats = pd.DataFrame(index=sorted(all_customers))


In [44]:
customer_stats

10042B660
10042B6A8
10042B6F0
10042B738
10042B780
...
81C1EC560
81C1EC5B0
81C1EC600
81C1EC650
81C1EC6A0


In [68]:
for df_key in dfs.keys():
    df = dfs[df_key]
    df_customers = sorted(df['customer_id'].unique()) 
    
    start_idx = 0
    curr_idx = 0
    last_idx = None
        
    for customer in tqdm(df_customers):
        end_flag = False
        while (df.loc[curr_idx, 'customer_id'] == customer):
            # print(df.loc[curr_idx, 'customer_id'])
            curr_idx += 1
            # print(curr_idx)
            if curr_idx == len(df):
                end_flag = True
                break
        last_idx = curr_idx            
        
        if not end_flag:
            customer_df = df.iloc[start_idx:last_idx, :]
        else:
            customer_df = df.iloc[start_idx:, :]

        start_idx = curr_idx
        
        max_credit_minute_trx = 0 
        max_credit_daily_trx = 0
        # max_credit_weekly_trx = 0
        # max_credit_monthly_trx = 0
        max_credit_minute_trx_avg_val = 0
        max_credit_daily_trx_avg_val = 0
        # max_credit_weekly_trx_avg_val = 0
        # max_credit_monthly_trx_avg_val = 0
        
        max_debit_minute_trx = 0
        max_debit_daily_trx = 0
        # max_debit_weekly_trx = 0
        # max_debit_monthly_trx = 0
        max_debit_minute_trx_avg_val = 0
        max_debit_daily_trx_avg_val = 0
        # max_debit_weekly_trx_avg_val = 0
        # max_debit_monthly_trx_avg_val = 0
                
        start_date = df['transaction_datetime'].min()
        end_date = df['transaction_datetime'].max()
        # weekly_ranges, monthly_ranges = get_date_ranges(start_date, end_date)
        minute_ranges, day_ranges = get_minute_day_ranges(start_date, end_date)
 
        
        for i, (start, end) in enumerate(minute_ranges):
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & (customer_df['transaction_datetime'] <= end)]
            
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            if len(credit_period_df) > max_credit_minute_trx:
                max_credit_minute_trx = len(credit_period_df)
                max_credit_minute_trx_avg_val = credit_period_df['amount_cad'].mean()    
            if len(debit_period_df) > max_debit_minute_trx:
                max_debit_minute_trx = len(debit_period_df)
                max_debit_minute_trx_avg_val = debit_period_df['amount_cad'].mean()

        customer_stats.loc[customer, df_key+'_max_credit_minute_trx'] = max_credit_minute_trx
        customer_stats.loc[customer, df_key+'_max_debit_minute_trx_avg_val'] = max_debit_minute_trx_avg_val
        customer_stats.loc[customer, df_key+'_max_debit_minute_trx'] = max_debit_minute_trx
        customer_stats.loc[customer, df_key+'_max_credit_minute_trx_avg_val'] = max_credit_minute_trx_avg_val
        
        for i, (start, end) in enumerate(day_ranges):
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & (customer_df['transaction_datetime'] <= end)]
            
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            if len(credit_period_df) > max_credit_daily_trx:
                max_credit_daily_trx = len(credit_period_df)
                max_credit_daily_trx_avg_val = credit_period_df['amount_cad'].mean()
            
            if len(debit_period_df) > max_debit_daily_trx:
                max_debit_daily_trx = len(debit_period_df)
                max_debit_daily_trx_avg_val = debit_period_df['amount_cad'].mean()

        customer_stats.loc[customer, df_key+'_max_credit_daily_trx'] = max_credit_daily_trx
        customer_stats.loc[customer, df_key+'_max_credit_daily_trx_avg_val'] = max_credit_daily_trx_avg_val
        customer_stats.loc[customer, df_key+'_max_debit_daily_trx'] = max_debit_daily_trx
        customer_stats.loc[customer, df_key+'_max_debit_daily_trx_avg_val'] = max_debit_daily_trx_avg_val


  1%|          | 733/64587 [00:48<1:10:34, 15.08it/s]


KeyboardInterrupt: 

In [48]:
def process_single_df(item):
    df_key, df = item
    
    df_customers = sorted(df['customer_id'].unique()) 
    customer_stats_p = pd.DataFrame(index=df_customers)
    start_idx = 0
    curr_idx = 0
    last_idx = None
        
    for customer in tqdm(df_customers):
        end_flag = False
        while (df.loc[curr_idx, 'customer_id'] == customer):
            # print(df.loc[curr_idx, 'customer_id'])
            curr_idx += 1
            # print(curr_idx)
            if curr_idx == len(df):
                end_flag = True
                break
        last_idx = curr_idx            
        
        if not end_flag:
            customer_df = df.iloc[start_idx:last_idx, :]
        else:
            customer_df = df.iloc[start_idx:, :]

        start_idx = curr_idx
        
        max_credit_minute_trx = 0 
        max_credit_daily_trx = 0
        # max_credit_weekly_trx = 0
        # max_credit_monthly_trx = 0
        max_credit_minute_trx_avg_val = 0
        max_credit_daily_trx_avg_val = 0
        # max_credit_weekly_trx_avg_val = 0
        # max_credit_monthly_trx_avg_val = 0
        
        max_debit_minute_trx = 0
        max_debit_daily_trx = 0
        # max_debit_weekly_trx = 0
        # max_debit_monthly_trx = 0
        max_debit_minute_trx_avg_val = 0
        max_debit_daily_trx_avg_val = 0
        # max_debit_weekly_trx_avg_val = 0
        # max_debit_monthly_trx_avg_val = 0
                
        start_date = df['transaction_datetime'].min()
        end_date = df['transaction_datetime'].max()
        # weekly_ranges, monthly_ranges = get_date_ranges(start_date, end_date)
        minute_ranges, day_ranges = get_minute_day_ranges(start_date, end_date)
 
        
        for i, (start, end) in enumerate(minute_ranges):
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & (customer_df['transaction_datetime'] <= end)]
            
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            if len(credit_period_df) > max_credit_minute_trx:
                max_credit_minute_trx = len(credit_period_df)
                max_credit_minute_trx_avg_val = credit_period_df['amount_cad'].mean()    
            if len(debit_period_df) > max_debit_minute_trx:
                max_debit_minute_trx = len(debit_period_df)
                max_debit_minute_trx_avg_val = debit_period_df['amount_cad'].mean()

        customer_stats_p.loc[customer, df_key+'_max_credit_minute_trx'] = max_credit_minute_trx
        customer_stats_p.loc[customer, df_key+'_max_debit_minute_trx_avg_val'] = max_debit_minute_trx_avg_val
        customer_stats_p.loc[customer, df_key+'_max_debit_minute_trx'] = max_debit_minute_trx
        customer_stats_p.loc[customer, df_key+'_max_credit_minute_trx_avg_val'] = max_credit_minute_trx_avg_val
        
        for i, (start, end) in enumerate(day_ranges):
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & (customer_df['transaction_datetime'] <= end)]
            
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            if len(credit_period_df) > max_credit_daily_trx:
                max_credit_daily_trx = len(credit_period_df)
                max_credit_daily_trx_avg_val = credit_period_df['amount_cad'].mean()
            
            if len(debit_period_df) > max_debit_daily_trx:
                max_debit_daily_trx = len(debit_period_df)
                max_debit_daily_trx_avg_val = debit_period_df['amount_cad'].mean()

        customer_stats_p.loc[customer, df_key+'_max_credit_daily_trx'] = max_credit_daily_trx
        customer_stats_p.loc[customer, df_key+'_max_credit_daily_trx_avg_val'] = max_credit_daily_trx_avg_val
        customer_stats_p.loc[customer, df_key+'_max_debit_daily_trx'] = max_debit_daily_trx
        customer_stats_p.loc[customer, df_key+'_max_debit_daily_trx_avg_val'] = max_debit_daily_trx_avg_val
        
    return (df_key, customer_stats_p)

with concurrent.futures.ProcessPoolExecutor() as executor:
    # dfs.items() produces (df_key, df) tuples.
    results = list(tqdm(executor.map(process_single_df, dfs.items()), total=len(dfs)))

  0%|          | 0/4 [00:00<?, ?it/s]Process SpawnProcess-1:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/concurrent/futures/process.py", line 251, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  0%|          | 0/4 [00:00<?, ?it/s]ter'>)>_single_df' 

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [27]:
# import pandas as pd
# from tqdm import tqdm

# Suppose customer_stats DataFrame is already created with customers as the index.
# And dfs is a dictionary of DataFrames keyed by something (e.g., transaction type).

for df_key in dfs.keys():
    df = dfs[df_key]
    df_customers = sorted(df['customer_id'].unique()) 
    
    start_idx = 0
    curr_idx = 0
    last_idx = None
        
    for customer in tqdm(df_customers):
        end_flag = False
        while (df.loc[curr_idx, 'customer_id'] == customer):
            curr_idx += 1
            if curr_idx == len(df):
                end_flag = True
                break
        last_idx = curr_idx            
        
        if not end_flag:
            customer_df = df.iloc[start_idx:last_idx, :]
        else:
            customer_df = df.iloc[start_idx:, :]

        start_idx = curr_idx
        
        # Prepare lists to hold (count, avg) tuples for top 5 metrics.
        credit_minute_list = []
        debit_minute_list = []
        credit_daily_list = []
        debit_daily_list = []
                
        start_date = df['transaction_datetime'].min()
        end_date = df['transaction_datetime'].max()
        minute_ranges, day_ranges = get_minute_day_ranges(start_date, end_date)
        
        # Process minute ranges
        for (start, end) in minute_ranges:
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & (customer_df['transaction_datetime'] <= end)]
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            # For credit
            credit_count = len(credit_period_df)
            credit_avg = credit_period_df['amount_cad'].mean() if credit_count > 0 else 0
            credit_minute_list.append((credit_count, credit_avg))
            
            # For debit
            debit_count = len(debit_period_df)
            debit_avg = debit_period_df['amount_cad'].mean() if debit_count > 0 else 0
            debit_minute_list.append((debit_count, debit_avg))
        
        # Process day ranges
        for (start, end) in day_ranges:
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & (customer_df['transaction_datetime'] <= end)]
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            credit_count = len(credit_period_df)
            credit_avg = credit_period_df['amount_cad'].mean() if credit_count > 0 else 0
            credit_daily_list.append((credit_count, credit_avg))
            
            debit_count = len(debit_period_df)
            debit_avg = debit_period_df['amount_cad'].mean() if debit_count > 0 else 0
            debit_daily_list.append((debit_count, debit_avg))
        
        # Sort and get top 5 for each category
        top5_credit_minute = sorted(credit_minute_list, key=lambda x: x[0], reverse=True)[:2]
        top5_debit_minute = sorted(debit_minute_list, key=lambda x: x[0], reverse=True)[:2]
        top5_credit_daily = sorted(credit_daily_list, key=lambda x: x[0], reverse=True)[:2]
        top5_debit_daily = sorted(debit_daily_list, key=lambda x: x[0], reverse=True)[:2]
        
        for i in range(2):
            customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_minute_trx_count'] = top5_credit_minute[i][0]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_minute_trx_avg_val'] = top5_credit_minute[i][1]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_minute_trx_count'] = top5_debit_minute[i][0]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_minute_trx_avg_val'] = top5_debit_minute[i][1]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_daily_trx_count'] = top5_credit_daily[i][0]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_daily_trx_avg_val'] = top5_credit_daily[i][1]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_daily_trx_count'] = top5_debit_daily[i][0]
            customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_daily_trx_avg_val'] = top5_debit_daily[i][1]
        
        # # You can store these as lists in the DataFrame.
        # customer_stats.loc[customer, df_key+'_top5_credit_minute'] = str(top5_credit_minute)
        # customer_stats.loc[customer, df_key+'_top5_debit_minute'] = str(top5_debit_minute)
        # customer_stats.loc[customer, df_key+'_top5_credit_daily'] = str(top5_credit_daily)
        # customer_stats.loc[customer, df_key+'_top5_debit_daily'] = str(top5_debit_daily)

100%|██████████| 168/168 [00:11<00:00, 15.13it/s]
100%|██████████| 23/23 [00:01<00:00, 15.48it/s]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_daily_trx_count'] = top5_credit_daily[i][0]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_daily_trx_avg_val'] = top5_credit_daily[i][1]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_daily_trx_count'] = top5_debit_daily[i][0]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_daily_trx_avg_val'] = top5_debit_daily[i][1]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_minute_trx_count'] = top5_credit_minute[i][0]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_minute_trx_avg_val'] = top5_credit_minute[i][1]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_minute_trx_count'] = top5_debit_minute[i][0]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_debit_minute_trx_avg_val'] = top5_debit_minute[i][1]
  customer_stats.loc[customer, df_key+f'_top{str(i)}_credit_da

In [78]:
for df_key in dfs.keys():
    df = dfs[df_key]
    df_customers = sorted(df['customer_id'].unique()) 
    
    start_idx = 0
    curr_idx = 0
    last_idx = None
        
    for customer in tqdm(df_customers):
        end_flag = False
        while (df.loc[curr_idx, 'customer_id'] == customer):
            curr_idx += 1
            if curr_idx == len(df):
                end_flag = True
                break
        last_idx = curr_idx            
        
        if not end_flag:
            customer_df = df.iloc[start_idx:last_idx, :]
        else:
            customer_df = df.iloc[start_idx:, :]

        start_idx = curr_idx
        
        # Prepare lists to collect (count, avg) tuples per period.
        credit_minute_list = []
        debit_minute_list = []
                
        start_date = df['transaction_datetime'].min()
        end_date = df['transaction_datetime'].max()
        minute_ranges, day_ranges = get_minute_day_ranges(start_date, end_date)
        
        # Process minute ranges
        for (start, end) in minute_ranges:
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & 
                                    (customer_df['transaction_datetime'] <= end)]
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            credit_count = len(credit_period_df)
            credit_avg = credit_period_df['amount_cad'].mean() if credit_count > 0 else 0
            credit_minute_list.append((credit_count, credit_avg))
            
            debit_count = len(debit_period_df)
            debit_avg = debit_period_df['amount_cad'].mean() if debit_count > 0 else 0
            debit_minute_list.append((debit_count, debit_avg))
        
        # Sort and pick top 5 (based on count) for credit and debit
        top5_credit_minute = sorted(credit_minute_list, key=lambda x: x[0], reverse=True)[:5]
        top5_debit_minute = sorted(debit_minute_list, key=lambda x: x[0], reverse=True)[:5]
        
        # Store the top 5 values in 5 different columns each.
        for i in range(5):
            if i < len(top5_credit_minute):
                credit_count, credit_avg = top5_credit_minute[i]
            else:
                credit_count, credit_avg = None, None
            if i < len(top5_debit_minute):
                debit_count, debit_avg = top5_debit_minute[i]
            else:
                debit_count, debit_avg = None, None
            
            customer_stats.loc[customer, f'{df_key}_credit_minute_top{i+1}_count'] = credit_count
            customer_stats.loc[customer, f'{df_key}_credit_minute_top{i+1}_avg'] = credit_avg
            
            customer_stats.loc[customer, f'{df_key}_debit_minute_top{i+1}_count'] = debit_count
            customer_stats.loc[customer, f'{df_key}_debit_minute_top{i+1}_avg'] = debit_avg
        
        # Similarly, to do this for daily ranges, you can follow the same pattern:
        credit_daily_list = []
        debit_daily_list = []
        for (start, end) in day_ranges:
            period_df = customer_df[(customer_df['transaction_datetime'] >= start) & 
                                    (customer_df['transaction_datetime'] <= end)]
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            credit_count = len(credit_period_df)
            credit_avg = credit_period_df['amount_cad'].mean() if credit_count > 0 else 0
            credit_daily_list.append((credit_count, credit_avg))
            
            debit_count = len(debit_period_df)
            debit_avg = debit_period_df['amount_cad'].mean() if debit_count > 0 else 0
            debit_daily_list.append((debit_count, debit_avg))
        
        top5_credit_daily = sorted(credit_daily_list, key=lambda x: x[0], reverse=True)[:5]
        top5_debit_daily = sorted(debit_daily_list, key=lambda x: x[0], reverse=True)[:5]

        for i in range(5):
            if i < len(top5_credit_daily):
                credit_count, credit_avg = top5_credit_daily[i]
            else:
                credit_count, credit_avg = None, None
            if i < len(top5_debit_daily):
                debit_count, debit_avg = top5_debit_daily[i]
            else:
                debit_count, debit_avg = None, None
            
            customer_stats.loc[customer, f'{df_key}_credit_daily_top{i+1}_count'] = credit_count
            customer_stats.loc[customer, f'{df_key}_credit_daily_top{i+1}_avg'] = credit_avg
            
            customer_stats.loc[customer, f'{df_key}_debit_daily_top{i+1}_count'] = debit_count
            customer_stats.loc[customer, f'{df_key}_debit_daily_top{i+1}_avg'] = debit_avg

  0%|          | 18/191927 [00:16<49:52:01,  1.07it/s]


KeyboardInterrupt: 

In [None]:
# customer_stats = pd.DataFrame({'customer_id': all_customers})

In [66]:
# df_customers = {'card': card_customers, 'wire': wire_customers, 'ach': ach_customers, 'cheque': cheque_customers}

# for df_key in dfs.keys():
#     df = dfs[df_key]
#     df_customers = sorted(df['customer_id'].unique()) 
    
#     start_idx = 0
#     curr_idx = 0
#     last_idx = None
        
#     for customer in tqdm(df_customers):
        
#         end_flag = False
#         while (df.loc[curr_idx, 'customer_id'] == customer):
#             # print(df.loc[curr_idx, 'customer_id'])
#             curr_idx += 1
#             # print(curr_idx)
#             if curr_idx == len(df):
#                 end_flag = True
#                 break
#         last_idx = curr_idx            
        
#         if not end_flag:
#             customer_df = df.iloc[start_idx:last_idx, :]
#         else:
#             customer_df = df.iloc[start_idx:, :]
#         l = l + len(customer_df)
#         start_idx = curr_idx
        
#         max_credit_weekly_trx = 0
#         max_credit_monthly_trx = 0
#         max_credit_weekly_trx_avg_val = 0
#         max_credit_monthly_trx_avg_val = 0
        
#         max_debit_weekly_trx = 0
#         max_debit_monthly_trx = 0
#         max_debit_weekly_trx_avg_val = 0
#         max_debit_monthly_trx_avg_val = 0
                
#         start_date = df['transaction_date'].min()
#         end_date = df['transaction_date'].max()
#         weekly_ranges, monthly_ranges = get_date_ranges(start_date, end_date)
        
#         for i, (start, end) in enumerate(weekly_ranges):
#             period_df = customer_df[(customer_df['transaction_date'] >= pd.to_datetime(start).date()) & (customer_df['transaction_date'] <= pd.to_datetime(end).date())]
            
#             credit_period_df = period_df[period_df['debit_credit'] == 'credit']
#             debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
#             if len(credit_period_df) > max_credit_weekly_trx:
#                 max_credit_weekly_trx = len(credit_period_df)
#                 max_credit_weekly_trx_avg_val = credit_period_df['amount_cad'].mean()
            
#             if len(debit_period_df) > max_debit_weekly_trx:
#                 max_debit_weekly_trx = len(debit_period_df)
#                 max_debit_weekly_trx_avg_val = debit_period_df['amount_cad'].mean()


#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_weekly_trx'] = max_credit_weekly_trx
#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_weekely_trx_avg_val'] = max_credit_weekly_trx_avg_val
        
#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_weekly_trx'] = max_debit_weekly_trx
#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_weekely_trx_avg_val'] = max_debit_weekly_trx_avg_val
    
        
#         # Doing the monthly calculations
#         for i,(start, end) in enumerate(monthly_ranges):
#             period_df = customer_df[(customer_df['transaction_date'] >= pd.to_datetime(start).date()) & (customer_df['transaction_date'] <= pd.to_datetime(end).date())]
            
#             credit_period_df = period_df[period_df['debit_credit'] == 'credit']
#             debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
#             # Calculate the maximum number of transactions in a month and the average transaction value
#             if len(credit_period_df) > max_credit_monthly_trx:
#                 max_credit_monthly_trx = len(credit_period_df)
#                 max_credit_monthly_trx_avg_val = credit_period_df['amount_cad'].mean()
            
#             if len(debit_period_df) > max_debit_monthly_trx:
#                 max_debit_monthly_trx = len(debit_period_df)
#                 max_debit_monthly_trx_avg_val = debit_period_df['amount_cad'].mean()
            
#             # if len(period_df) > max_monthly_trx:
#             #     max_monthly_trx = len(period_df)
#             #     max_monthly_trx_avg_val = period_df['amount_cad'].mean()

        
#         # Store the maximum number of transactions and average transaction value for the month       
#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_monthly_trx'] = max_credit_monthly_trx
#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_monthly_trx_avg_val'] = max_credit_monthly_trx_avg_val

#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_monthly_trx'] = max_debit_monthly_trx
#         customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_monthly_trx_avg_val'] = max_debit_monthly_trx_avg_val
        
        


  0%|          | 0/2240 [00:00<?, ?it/s]

 33%|███▎      | 750/2240 [00:16<00:32, 45.77it/s]


KeyboardInterrupt: 

In [20]:
ll = 0
for df_key in dfs.keys():
    ll = ll + len(dfs[df_key])
print(ll)

530870


In [13]:
df_customers = {'wire': wire_customers, 'ach': ach_customers, 'cheque': cheque_customers, 'card': card_customers}
all_customers = sorted(all_customers)
for df_key in dfs.keys():
    df = dfs[df_key]
    
    start_idx = 0
    curr_idx = 0
    last_idx = None
        
    for customer in tqdm(all_customers):
        # Find the eariest and latest transaction date for the customer in this transaction type
       
        # if customer == 'SYNCID0000006876':
        #     print("SYNCID0000006876")
       
        # Setting the features to 0 if the customer is not present in the transactions       
        if customer != df_customers[df_key][start_idx]:
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_weekly_trx'] = 0
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_monthly_trx'] = 0
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_weekely_trx_avg_val'] = 0
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_monthly_trx_avg_val'] = 0
            
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_weekly_trx'] = 0
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_monthly_trx'] = 0
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_weekely_trx_avg_val'] = 0
            customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_monthly_trx_avg_val'] = 0
            continue
        
        # cust_df = pd.DataFrame()
        end_flag = False
        while df.loc[curr_idx, 'customer_id'] == customer:
            curr_idx += 1
            if curr_idx == len(df):
                end_flag = True
        last_idx = curr_idx 
        
        if not end_flag:
            customer_df = df.iloc[start_idx:last_idx, :]
        else:
            customer_df = df.iloc[start_idx:, :]
            
        start_idx = curr_idx
            
        
        max_credit_weekly_trx = 0
        max_credit_monthly_trx = 0
        max_credit_weekly_trx_avg_val = 0
        max_credit_monthly_trx_avg_val = 0
        
        max_debit_weekly_trx = 0
        max_debit_monthly_trx = 0
        max_debit_weekly_trx_avg_val = 0
        max_debit_monthly_trx_avg_val = 0
                
        start_date = df['transaction_date'].min()
        end_date = df['transaction_date'].max()
        weekly_ranges, monthly_ranges = get_date_ranges(start_date, end_date)
        
        for i, (start, end) in enumerate(weekly_ranges):
            period_df = customer_df[(customer_df['transaction_date'] >= pd.to_datetime(start).date()) & (customer_df['transaction_date'] <= pd.to_datetime(end).date())]
            
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            if len(credit_period_df) > max_credit_weekly_trx:
                max_credit_weekly_trx = len(credit_period_df)
                max_credit_weekly_trx_avg_val = credit_period_df['amount_cad'].mean()
            
            if len(debit_period_df) > max_debit_weekly_trx:
                max_debit_weekly_trx = len(debit_period_df)
                max_debit_weekly_trx_avg_val = debit_period_df['amount_cad'].mean()


        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_weekly_trx'] = max_credit_weekly_trx
        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_weekely_trx_avg_val'] = max_credit_weekly_trx_avg_val
        
        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_weekly_trx'] = max_debit_weekly_trx
        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_weekely_trx_avg_val'] = max_debit_weekly_trx_avg_val
    
        
        # Doing the monthly calculations
        for i,(start, end) in enumerate(monthly_ranges):
            period_df = customer_df[(customer_df['transaction_date'] >= pd.to_datetime(start).date()) & (customer_df['transaction_date'] <= pd.to_datetime(end).date())]
            
            credit_period_df = period_df[period_df['debit_credit'] == 'credit']
            debit_period_df = period_df[period_df['debit_credit'] == 'debit']
            
            # Calculate the maximum number of transactions in a month and the average transaction value
            if len(credit_period_df) > max_credit_monthly_trx:
                max_credit_monthly_trx = len(credit_period_df)
                max_credit_monthly_trx_avg_val = credit_period_df['amount_cad'].mean()
            
            if len(debit_period_df) > max_debit_monthly_trx:
                max_debit_monthly_trx = len(debit_period_df)
                max_debit_monthly_trx_avg_val = debit_period_df['amount_cad'].mean()

        
        # Store the maximum number of transactions and average transaction value for the month       
        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_monthly_trx'] = max_credit_monthly_trx
        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_credit_monthly_trx_avg_val'] = max_credit_monthly_trx_avg_val

        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_monthly_trx'] = max_debit_monthly_trx
        customer_stats.loc[customer_stats['customer_id'] == customer, df_key+'_max_debit_monthly_trx_avg_val'] = max_debit_monthly_trx_avg_val
        
#save the customer stats to a csv file
customer_stats.to_csv('../../processed_synth_dataset/features_4_5.csv', index=False)

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
  0%|          | 0/214958 [00:38<?, ?it/s]


KeyboardInterrupt: 

In [57]:
customer_stats = pd.read_csv('../../processed_synth_dataset/features_4_5.csv', engine="pyarrow").sample(frac = frac)