In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [2]:
# Load the CSV file into a pandas DataFrame
df_apps = pd.read_csv('data/apps_sample.csv')
df_transactions = pd.read_csv('data/txs_sample.csv')

In [3]:
# Drop rows that don't have any data in the specified columns
columns_to_check = ['balance', 'amount'] 
df_transactions = df_transactions.dropna(subset=columns_to_check)

In [4]:
# Convert the date columns to datetime objects + other initial data cleaning
df_transactions['date'] = pd.to_datetime(df_transactions['date'], errors='coerce').dt.date
df_apps['issuanceDate'] = pd.to_datetime(df_apps['issuanceDate'], errors='coerce').dt.date
df_apps['applicationDate'] = pd.to_datetime(df_apps['applicationDate'], errors='coerce').dt.date
df_apps['repaidDate'] = pd.to_datetime(df_apps['repaidDate'], errors='coerce').dt.date
df_apps['nextPaycheck'] = df_apps['nextPaycheck'].str.replace('T.*', '', regex=True)
df_apps['nextPaycheck'] = pd.to_datetime(df_apps['nextPaycheck'], errors='coerce').dt.date
def parse_tags(x):
    if pd.isna(x):  # Handle NaN
        return []
    try:
        return ast.literal_eval(x)  # Convert string to list
    except (ValueError, SyntaxError):  # Handle invalid formats
        return []
df_transactions['tags'] = df_transactions['tags'].apply(parse_tags)
df_transactions['tags'] = df_transactions['tags'].apply(lambda x: x if isinstance(x, list) else [])
df_apps["paidByUser"] = df_apps["paidByUser"].astype(bool)

In [5]:
# For each email, extract all transactions that are loans, which are indicated by df_apps
def extract_loan_transactions(email: str) -> list:
    # Get the user's loan applications
    user_apps = df_apps[df_apps['email'] == email].sort_values(by='applicationDate')
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email].sort_values(by='date')
    # Get the user's loan transactions
    loans = []
    for apps_index, apps_row in user_apps.iterrows():
        if apps_row['status'] == 'Repaid':
            for txs_index, txs_row in user_txs.iterrows():
                if txs_row['amount'] == apps_row['paidAmount'] and txs_row['date'] == apps_row['repaidDate']:
                    loans.append((apps_row, txs_row))
                    break
    return loans

In [6]:
# Get all the transactions for a specific email and a in a specific date range
def get_transactions_for_dates(email: str, start_date: str, end_date: str) -> pd.DataFrame:
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email]
    # Filter the transactions by date
    user_txs = user_txs[(user_txs['date'] >= start_date) & (user_txs['date'] <= end_date)]
    return user_txs

def get_initial_balance(email: str, loan_row) -> float:
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email]
    # Filter the transactions by date
    user_txs = user_txs[user_txs['date'] <= loan_row['applicationDate']]
    return user_txs.iloc[-1]['balance'] if not user_txs.empty else 0 # Take the previous balance for the intial amount, since not all data points have the loan money deposit into account (see data_exploration.ipynb) (safe assumption)

In [7]:
def get_loans_history(loans: list, currIdx: int) -> list:
    loan = loans[currIdx]
    current_amount_requested = loan[0]['amount']
    initial_balance = get_initial_balance(loan[0]["email"], loan[0])
    avg_amount_requested = np.mean([loan[0]['amount'] for loan in loans[:currIdx]]) if currIdx > 0 else 0
    avg_amount_repaid = np.mean([loan[0]['amount'] for loan in loans[:currIdx]]) if currIdx > 0 else 0
    avg_payback_time = np.mean([(loan[0]['repaidDate'] - loan[0]['issuanceDate']).days for loan in loans[:currIdx] if pd.notna(loan[0]['repaidDate']) and pd.notna(loan[0]['issuanceDate'])]) if currIdx > 0 else 0
    num_of_paidByUser = len([loan for loan in loans[:currIdx] if loan[0]['paidByUser']])
    num_of_not_paidByUser = len([loan for loan in loans[:currIdx] if not loan[0]['paidByUser']])
    ratio_paidByUser = num_of_paidByUser / (num_of_paidByUser + num_of_not_paidByUser) if (num_of_paidByUser + num_of_not_paidByUser) > 0 else 0
    ratio_loans_approved = (currIdx + 1) / df_apps[(df_apps['email'] == loan[0]['email']) & (df_apps['applicationDate'] < loan[0]['applicationDate'])].shape[0] if df_apps[(df_apps['email'] == loan[0]['email']) & (df_apps['applicationDate'] < loan[0]['applicationDate'])].shape[0] > 0 else 0
    num_days_since_last_loan = (loan[0]['applicationDate'] - loans[currIdx - 1][0]['applicationDate']).days if currIdx > 0 and pd.notna(loan[0]['applicationDate']) else -1
    num_days_loan_was_paid_late = sum(max(int((loan[0]['repaidDate'] - loan[0]['nextPaycheck']).days), 0) for loan in loans[:currIdx] if pd.notna(loan[0]['repaidDate']) and pd.notna(loan[0]['nextPaycheck']))
    ratio_loans_were_paid_on_time = len([loan for loan in loans[:currIdx] if pd.notna(loan[0]['repaidDate']) and loan[0]['repaidDate'] <= loan[0]['nextPaycheck']]) / (currIdx + 1) if (currIdx + 1) > 0 else 0
    
    return [
        current_amount_requested,
        initial_balance,
        avg_amount_requested,
        avg_amount_repaid,
        avg_payback_time,
        num_of_paidByUser,
        num_of_not_paidByUser,
        ratio_paidByUser,
        ratio_loans_approved,
        num_days_since_last_loan,
        num_days_loan_was_paid_late,
        ratio_loans_were_paid_on_time
    ]

def get_transactions_history(email: str, loans: list, currIdx: int) -> list:
    loan = loans[currIdx]
    txs = get_transactions_for_dates(email, pd.to_datetime('2024-01-01').date(), loan[0]['issuanceDate'])
    avg_amount_spent = np.mean(txs['amount']) if not txs.empty else 0
    std_amount_spent = np.std(txs['amount']) if not txs.empty else 0
    avg_balance = np.mean(txs['balance']) if not txs.empty else 0
    recent_txs = txs[txs['date'] >= max(loan[0]['issuanceDate'] - pd.Timedelta(days=30), pd.Timestamp('2024-01-01').date())]
    avg_amount_spent_last_30_days = np.mean(recent_txs['amount']) if not recent_txs.empty else 0
    std_amount_spent_last_30_days = np.std(recent_txs['amount']) if not recent_txs.empty else 0
    avg_num_daily_transactions = txs.groupby('date').size().mean() if not txs.empty else 0
    avg_daily_balance = txs.groupby('date')['balance'].mean().mean() if not txs.empty else 0
    std_daily_balance = txs.groupby('date')['balance'].mean().std() if not txs.empty else 0
    num_times_balance_dropped_below_zero = len(txs[(txs['balance'] < 0) & (txs['date'] <= loan[0]['issuanceDate'])]) if not txs.empty else 0
    lowest_balance = txs['balance'].min() if not txs.empty else 0
    highest_balance = txs['balance'].max() if not txs.empty else 0
    
    return [
        avg_amount_spent,
        std_amount_spent,
        avg_balance,
        avg_amount_spent_last_30_days,
        std_amount_spent_last_30_days,
        avg_num_daily_transactions,
        avg_daily_balance,
        std_daily_balance,
        num_times_balance_dropped_below_zero,
        lowest_balance,
        highest_balance
    ]


In [8]:
def format_one_data_point(email: str) -> list:
    loans = extract_loan_transactions(email)
    features = []
    labels = []
    for i in range(len(loans)):
        loan = loans[i]
        data_point = []
        data_point += get_loans_history(loans, i)
        data_point += get_transactions_history(email, loans, i)
        features.append(data_point)
        labels.append(loan[1]['balance'])
    return features, labels

In [9]:
format_one_data_point("61be9c6909d91d74fe46a2c9f6d830d69f7015e8fa2ac4a4129cbb3edd18ae21")

([[50.0,
   np.float64(-9936.53),
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   -1,
   0,
   0.0,
   np.float64(1.3010183639398998),
   np.float64(275.97412779065786),
   np.float64(-272.31021702838063),
   np.float64(4.350000000000001),
   np.float64(251.6131762178076),
   np.float64(9.661290322580646),
   np.float64(-184.49583363858366),
   np.float64(713.3154739432085),
   190,
   np.float64(-10138.53),
   np.float64(1300.85)],
  [70.0,
   np.float64(-9936.53),
   np.float64(50.0),
   np.float64(50.0),
   np.float64(8.0),
   1,
   0,
   1.0,
   2.0,
   12,
   0,
   0.5,
   np.float64(0.5889779874213846),
   np.float64(275.0870944250902),
   np.float64(-265.07286163522014),
   np.float64(-1.1612831858407084),
   np.float64(277.58102034439077),
   np.float64(9.9375),
   np.float64(-179.7871365524317),
   np.float64(703.0428989667078),
   199,
   np.float64(-10138.53),
   np.float64(1300.85)],
  [125.0,
   np.float64(-0.3),
   np.float64(60.0),
   np.float64(60.0),
   np.float64(11.0),


In [None]:
def get_data():
    data = []
    labels = []
    unique_emails = df_transactions['email'].unique()
    unique_emails_list = unique_emails.tolist()
    count = 0
    for email in unique_emails_list:
        data_point = format_one_data_point(email)
        data.extend(data_point[0])
        labels.extend(data_point[1])
        count += 1
        if count % 100 == 0:
            print(count)
        if count == 200:
            break
    return data, labels

data, labels = get_data()

In [171]:
arr = np.array(data)
array = np.nan_to_num(arr, nan=0)
np.save('500_data_pts_features.npy', array)

In [172]:
labels = np.array(labels)
np.save('500_data_pts_labels.npy', labels)