In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [2]:
# Load the applications and transactions sample CSVs into two DataFrames.
# Both paths are relative to the working directory the notebook is run from.
df_apps = pd.read_csv('data/apps_sample.csv')
df_transactions = pd.read_csv('data/txs_sample.csv')

In [3]:
# Initial cleaning: turn every date-like column into plain `datetime.date`
# values. Entries that cannot be parsed are coerced to NaT instead of raising.
df_transactions['date'] = pd.to_datetime(df_transactions['date'], errors='coerce').dt.date
for _date_col in ('issuanceDate', 'applicationDate', 'repaidDate'):
    df_apps[_date_col] = pd.to_datetime(df_apps[_date_col], errors='coerce').dt.date
# `nextPaycheck` is a string: everything from the first 'T' onwards is
# stripped before the remaining text is parsed as a date.
df_apps['nextPaycheck'] = pd.to_datetime(
    df_apps['nextPaycheck'].str.replace('T.*', '', regex=True),
    errors='coerce',
).dt.date
def parse_tags(x):
    """Parse one raw ``tags`` cell into a Python list.

    Args:
        x: The raw cell value. Expected to be the string form of a list
            literal (e.g. ``"['a', 'b']"``), but may also be NaN/None or a
            list that has already been parsed.

    Returns:
        list: The parsed list. An empty list is returned when the value is
        missing, is not a string, cannot be parsed as a Python literal, or
        parses to something that is not a list.
    """
    # Already-parsed values pass straight through so that re-running the
    # cleaning cell is harmless. (Calling pd.isna on a list yields an array,
    # whose truth value is ambiguous and raises for 2+ elements.)
    if isinstance(x, list):
        return x
    # NaN, None and any other non-string value carry no tags.
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)  # Convert string to a Python literal
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return []
    # Only a list is a valid tags value; other literals count as "no tags".
    return parsed if isinstance(parsed, list) else []
# Parse the raw tag cells, then replace anything that is still not a list
# with an empty list so every row holds a list.
df_transactions['tags'] = (
    df_transactions['tags']
    .apply(parse_tags)
    .apply(lambda tags: tags if isinstance(tags, list) else [])
)
# Cast the repayment flag to bool.
# NOTE(review): astype(bool) maps NaN and any non-empty string to True —
# confirm the column has no missing or textual values.
df_apps["paidByUser"] = df_apps["paidByUser"].astype(bool)

In [7]:
def extract_loan_transactions(email: str) -> list:
    """Pair each of a user's loan applications with its repayment transaction.

    Applications are visited in ``applicationDate`` order and transactions in
    ``date`` order.

    Args:
        email: Value matched against the ``email`` column of both frames.

    Returns:
        list: ``(application_row, transaction_row)`` tuples.
        * ``'Repaid'`` applications are paired with the first transaction
          whose ``amount`` equals ``paidAmount`` and whose ``date`` equals
          ``repaidDate``; if no such transaction exists the application is
          left out.
        * ``'AutoRejected'`` applications are paired with an empty list.
        * Applications with any other status are left out.
    """
    applications = df_apps[df_apps['email'] == email].sort_values(by='applicationDate')
    transactions = df_transactions[df_transactions['email'] == email].sort_values(by='date')

    loans = []
    for _, application in applications.iterrows():
        status = application['status']
        if status == 'Repaid':
            # First transaction that matches both the paid amount and the day.
            repayment = next(
                (
                    tx
                    for _, tx in transactions.iterrows()
                    if tx['amount'] == application['paidAmount']
                    and tx['date'] == application['repaidDate']
                ),
                None,
            )
            if repayment is not None:
                loans.append((application, repayment))
        elif status == 'AutoRejected':
            loans.append((application, []))
    return loans

In [80]:
# Get all the transactions for a specific email within a specific (inclusive) date range
def get_transactions_for_dates(email: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Return the transactions of ``email`` dated within ``[start_date, end_date]``.

    The cleaning cell stores ``df_transactions['date']`` as ``datetime.date``
    objects, which cannot be ordered against strings. Bounds given as strings
    (as the signature advertises) are therefore parsed to dates first; bounds
    of any other type, such as ``datetime.date``, are used unchanged.

    Args:
        email: Value matched against the ``email`` column.
        start_date: Inclusive lower bound, a date string or a date object.
        end_date: Inclusive upper bound, a date string or a date object.

    Returns:
        pd.DataFrame: The matching rows of ``df_transactions`` in their
        existing row order.

    Raises:
        ValueError: If a string bound cannot be parsed as a date.
    """
    def _as_date(bound):
        # Only strings are converted; everything else is compared as given.
        return pd.to_datetime(bound).date() if isinstance(bound, str) else bound

    lower, upper = _as_date(start_date), _as_date(end_date)
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email]
    # Filter the transactions by date (both bounds inclusive)
    in_range = (user_txs['date'] >= lower) & (user_txs['date'] <= upper)
    return user_txs[in_range]

def get_initial_balance(email: str, loan_row) -> float:
    """Return the user's most recent balance on or before the application date.

    Args:
        email: Value matched against the ``email`` column.
        loan_row: Application row; only ``loan_row['applicationDate']`` is read.

    Returns:
        The ``balance`` of the latest transaction dated on or before the
        application date, or ``0`` when the user has no such transaction.
    """
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email]
    # Keep only those dated on or before the application date
    prior_txs = user_txs[user_txs['date'] <= loan_row['applicationDate']]
    if prior_txs.empty:
        return 0
    # df_transactions is not guaranteed to be in chronological order, so sort
    # before taking the last row. A stable sort keeps the file order among
    # transactions that share a date.
    latest_tx = prior_txs.sort_values(by='date', kind='mergesort').iloc[-1]
    return latest_tx['balance']

In [113]:
def data_for_timestep(email: str, loans: list, currIdx: int):
    """Build the full 18-value feature list for the loan at ``loans[currIdx]``.

    The first 8 values describe the situation when the loan is requested; the
    last 10 describe what happened between application and repayment.

    Args:
        email: Value used to look up the user's transactions.
        loans: ``(application_row, transaction_row)`` pairs as produced by
            ``extract_loan_transactions``.
        currIdx: Index into ``loans`` of the loan to describe.

    Returns:
        ``None`` when the application's status is ``'AutoRejected'``;
        otherwise a list, in this order:
        amount requested, initial balance, mean ``balance`` and mean
        ``amount`` of transactions from 2024-01-01 up to the application
        date, mean and std of ``amount`` over the 30 days before issuance,
        days since the previous entry's application (-1 for the first entry),
        whether the previous entry was auto-rejected, transactions per day
        during the loan, mean/std of ``balance`` during the loan, mean/std of
        ``amount`` during the loan, count of during-loan transactions with a
        negative balance, ``balance`` and ``amount`` of the repayment
        transaction, the ``paidByUser`` flag, and the absolute number of days
        between ``repaidDate`` and ``nextPaycheck``.
    """
    loan_row, tx_row = loans[currIdx]
    if loan_row['status'] == 'AutoRejected':
        return None

    history_start = pd.to_datetime('2024-01-01').date()
    applied_on = loan_row['applicationDate']
    repaid_on = loan_row['repaidDate']
    issued_on = loan_row['issuanceDate']

    # ---- Features available when the loan is requested ----
    history_txs = get_transactions_for_dates(email, history_start, applied_on)
    if len(history_txs) > 0:
        avg_spending = np.mean(history_txs['amount'])
        avg_balance = np.mean(history_txs['balance'])
    else:
        avg_spending = 0
        avg_balance = 0

    # 30-day window ending at issuance, never reaching before history_start.
    issued_txs = get_transactions_for_dates(email, history_start, issued_on)
    window_start = max(issued_on - pd.Timedelta(days=30), history_start)
    recent_txs = issued_txs[issued_txs['date'] >= window_start]
    if recent_txs.empty:
        recent_mean = 0
        recent_std = 0
    else:
        recent_mean = np.mean(recent_txs['amount'])
        recent_std = np.std(recent_txs['amount'])

    if currIdx > 0:
        previous_loan = loans[currIdx - 1][0]
        days_since_previous = (applied_on - previous_loan['applicationDate']).days
        previous_rejected = previous_loan['status'] == 'AutoRejected'
    else:
        days_since_previous = -1
        previous_rejected = False

    request_features = [
        loan_row['amount'],
        get_initial_balance(email, loan_row),
        avg_balance,
        avg_spending,
        recent_mean,
        recent_std,
        days_since_previous,
        previous_rejected,
    ]

    # ---- Features observed between application and repayment ----
    loan_txs = get_transactions_for_dates(email, applied_on, repaid_on)
    if len(loan_txs) > 0:
        loan_days = (repaid_on - applied_on).days + 1  # both end days count
        txs_per_day = len(loan_txs) / loan_days
        balance_mean = np.mean(loan_txs['balance'])
        balance_std = np.std(loan_txs['balance'])
        amount_mean = np.mean(loan_txs['amount'])
        amount_std = np.std(loan_txs['amount'])
    else:
        txs_per_day = 0
        balance_mean = 0
        balance_std = 0
        amount_mean = 0
        amount_std = 0
    overdrawn_mask = loan_txs['balance'] < 0
    overdrawn_count = len(loan_txs[overdrawn_mask])

    if tx_row.empty:
        balance_after_repayment = 0
        amount_paid = 0
    else:
        balance_after_repayment = tx_row['balance']
        amount_paid = tx_row['amount']

    if loan_row.empty:
        voluntary_repayment = False
        how_early_repayment = 0
    else:
        voluntary_repayment = loan_row['paidByUser']
        days_from_paycheck = (loan_row['repaidDate'] - loan_row['nextPaycheck']).days
        how_early_repayment = max(abs(days_from_paycheck), 0)

    return request_features + [
        txs_per_day,
        balance_mean,
        balance_std,
        amount_mean,
        amount_std,
        overdrawn_count,
        balance_after_repayment,
        amount_paid,
        voluntary_repayment,
        how_early_repayment,
    ]


In [114]:
def data_for_input_timestep(email: str, loans: list, currIdx: int):
    """Build the request-time feature list for the loan at ``loans[currIdx]``.

    Only the 8 values available when the loan is requested are filled in;
    they are followed by 10 zeros so the list has 18 entries.

    Args:
        email: Value used to look up the user's transactions.
        loans: ``(application_row, transaction_row)`` pairs as produced by
            ``extract_loan_transactions``.
        currIdx: Index into ``loans`` of the loan to describe.

    Returns:
        ``None`` when the application's status is ``'AutoRejected'``;
        otherwise a list of: amount requested, initial balance, mean
        ``balance`` and mean ``amount`` of transactions from 2024-01-01 up to
        the application date, mean and std of ``amount`` over the 30 days
        before issuance, days since the previous entry's application (-1 for
        the first entry), whether the previous entry was auto-rejected, and
        then ten ``0`` placeholders.
    """
    loan_row, tx_row = loans[currIdx]
    if loan_row['status'] == 'AutoRejected':
        return None

    history_start = pd.to_datetime('2024-01-01').date()
    applied_on = loan_row['applicationDate']
    issued_on = loan_row['issuanceDate']

    # Everything from history_start up to the application date.
    history_txs = get_transactions_for_dates(email, history_start, applied_on)
    if len(history_txs) > 0:
        avg_spending = np.mean(history_txs['amount'])
        avg_balance = np.mean(history_txs['balance'])
    else:
        avg_spending = 0
        avg_balance = 0

    # 30-day window ending at issuance, never reaching before history_start.
    issued_txs = get_transactions_for_dates(email, history_start, issued_on)
    window_start = max(issued_on - pd.Timedelta(days=30), history_start)
    recent_txs = issued_txs[issued_txs['date'] >= window_start]
    if recent_txs.empty:
        recent_mean = 0
        recent_std = 0
    else:
        recent_mean = np.mean(recent_txs['amount'])
        recent_std = np.std(recent_txs['amount'])

    if currIdx > 0:
        previous_loan = loans[currIdx - 1][0]
        days_since_previous = (applied_on - previous_loan['applicationDate']).days
        previous_rejected = previous_loan['status'] == 'AutoRejected'
    else:
        days_since_previous = -1
        previous_rejected = False

    request_features = [
        loan_row['amount'],
        get_initial_balance(email, loan_row),
        avg_balance,
        avg_spending,
        recent_mean,
        recent_std,
        days_since_previous,
        previous_rejected,
    ]
    # Zero placeholders for the ten values that are only known after repayment.
    return request_features + [0] * 10

In [115]:
def format_one_data_point(email: str):
    """Turn one user's loan history into growing sequences and their labels.

    For the k-th loan that is not auto-rejected, the sequence is the full
    feature lists of the k earlier such loans followed by the request-time
    feature list of the k-th loan. Its label is the ``balance`` of that
    loan's repayment transaction.

    Args:
        email: Value used to look up the user's applications and transactions.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is a list of sequences (each
        a list of feature lists) and ``labels`` is a list of balances.
    """
    loans = extract_loan_transactions(email)
    full_steps = []
    input_steps = []
    labels = []
    for idx, (_, repayment_tx) in enumerate(loans):
        full_features = data_for_timestep(email, loans, idx)
        if full_features is not None:
            full_steps.append(full_features)
        input_features = data_for_input_timestep(email, loans, idx)
        if input_features is not None:
            input_steps.append(input_features)
            labels.append(repayment_tx['balance'])
    # Sequence k: k fully observed past loans, then the current loan's inputs.
    data = [full_steps[:k] + [input_steps[k]] for k in range(len(full_steps))]
    return data, labels

In [116]:
# Try the pipeline on a single user: `data` holds that user's sequences and
# `labels` the matching repayment balances.
res = format_one_data_point('61be9c6909d91d74fe46a2c9f6d830d69f7015e8fa2ac4a4129cbb3edd18ae21')
data = res[0]
labels = res[1]

In [121]:
# Sequence k contains k past timesteps plus the current input timestep, so
# the sequence at index 5 should have 6 entries.
len(data[5])

6

In [122]:
def get_data(max_users=50, progress_every=10):
    """Build sequences and labels for the first ``max_users`` users.

    Users are taken in order of first appearance in
    ``df_transactions['email']``.

    Args:
        max_users: Maximum number of users to process, or ``None`` to process
            every user. Must not be negative. Defaults to 50.
        progress_every: Print the running user count each time this many
            users have been processed; ``0`` or ``None`` disables the
            printing. Defaults to 10.

    Returns:
        tuple: ``(data, labels)`` — the concatenation, over all processed
        users, of the two lists returned by ``format_one_data_point``.
    """
    data = []
    labels = []
    emails = df_transactions['email'].unique().tolist()
    if max_users is not None:
        emails = emails[:max_users]
    for count, email in enumerate(emails, start=1):
        user_data, user_labels = format_one_data_point(email)
        data.extend(user_data)
        labels.extend(user_labels)
        if progress_every and count % progress_every == 0:
            print(count)
    return data, labels

In [123]:
# Build the multi-user dataset; this rebinds the `data` and `labels` globals
# that previously held the single-user result.
data, labels = get_data()

10
20
30
40
50


In [152]:
# `get_data` returns plain nested lists, which have no `.shape` attribute.
# np.shape reports the same (timesteps, features) tuple for nested lists and
# for ndarrays, so this cell also works on a fresh kernel.
for i, arr in enumerate(data):
    print(f"Shape of array {i}: {np.shape(arr)}")

Shape of array 0: (1, 18)
Shape of array 1: (2, 18)
Shape of array 2: (3, 18)
Shape of array 3: (4, 18)
Shape of array 4: (5, 18)
Shape of array 5: (6, 18)
Shape of array 6: (7, 18)
Shape of array 7: (8, 18)
Shape of array 8: (9, 18)
Shape of array 9: (10, 18)
Shape of array 10: (11, 18)
Shape of array 11: (12, 18)
Shape of array 12: (13, 18)
Shape of array 13: (14, 18)
Shape of array 14: (15, 18)
Shape of array 15: (16, 18)
Shape of array 16: (17, 18)
Shape of array 17: (18, 18)
Shape of array 18: (19, 18)
Shape of array 19: (20, 18)
Shape of array 20: (21, 18)
Shape of array 21: (1, 18)
Shape of array 22: (2, 18)
Shape of array 23: (3, 18)
Shape of array 24: (4, 18)
Shape of array 25: (5, 18)
Shape of array 26: (6, 18)
Shape of array 27: (7, 18)
Shape of array 28: (8, 18)
Shape of array 29: (9, 18)
Shape of array 30: (10, 18)
Shape of array 31: (11, 18)
Shape of array 32: (12, 18)
Shape of array 33: (13, 18)
Shape of array 34: (14, 18)
Shape of array 35: (15, 18)
Shape of array 36: (

In [157]:
def pad_nested_lists(lst, pad_value=0):
    """Pad 2-D sequences along their first axis and stack them into one array.

    Args:
        lst: Iterable of 2-D sequences (nested lists or ndarrays) of shape
            ``(timesteps, features)``. The number of timesteps may differ
            between items; the number of features must be the same.
        pad_value: Value used for the rows appended to shorter sequences.

    Returns:
        np.ndarray: Array of shape ``(len(lst), max_timesteps, features)``.

    Raises:
        ValueError: If ``lst`` contains no sequences.
    """
    # Coerce up front: the sequences built by this notebook are nested lists,
    # which have no `.shape`. Materialising also allows one-shot iterables.
    sequences = [np.asarray(sublist) for sublist in lst]
    if not sequences:
        raise ValueError("pad_nested_lists() requires at least one sequence")
    max_len = max(seq.shape[0] for seq in sequences)
    # Pad only at the end of axis 0 (timesteps); axis 1 (features) is untouched.
    padded_data = np.array([
        np.pad(seq, ((0, max_len - seq.shape[0]), (0, 0)), mode='constant', constant_values=pad_value)
        for seq in sequences
    ])
    return padded_data

# Pad every sequence in `data` to the length of the longest one and stack
# them into a single array.
padded_data = pad_nested_lists(data)

In [162]:
# Replace NaN feature values with 0 and save the feature array to disk.
# Note that np.nan_to_num also replaces +/-inf with very large finite numbers.
array = np.nan_to_num(padded_data, nan=0)
np.save('50_data_pts_features_sequential.npy', array)

In [163]:
# Convert the label list to an array (this rebinds `labels`), replace NaN
# labels with 0, and save it. `array` is reused here and now holds the labels.
labels = np.array(labels)
array = np.nan_to_num(labels, nan=0)
np.save('50_data_pts_labels_sequential.npy', array)