In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [2]:
# Read the applications sample and the transactions sample into two DataFrames
df_apps, df_transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('data/apps_sample.csv', 'data/txs_sample.csv')
)

In [3]:
# Keep only the transactions where every column listed below is present
# (a row is dropped as soon as any one of them is missing)
columns_to_check = ['balance', 'amount']
df_transactions = df_transactions.dropna(axis=0, how='any', subset=columns_to_check)

In [10]:
# Turn the date-like columns into plain datetime.date values; entries that
# cannot be parsed are coerced to NaT instead of raising.
# NOTE(review): 'applicationDate' is not converted here although it is sorted,
# compared and subtracted further down — confirm whether it needs the same treatment.
df_transactions['date'] = pd.to_datetime(df_transactions['date'], errors='coerce').dt.date
df_apps[['issuanceDate', 'repaidDate', 'nextPaycheck']] = df_apps[
    ['issuanceDate', 'repaidDate', 'nextPaycheck']
].apply(lambda column: pd.to_datetime(column, errors='coerce').dt.date)
def parse_tags(x):
    """Convert one raw ``tags`` cell into a Python list.

    Parameters
    ----------
    x : object
        The cell value: normally the string form of a list (e.g. "['a', 'b']"),
        a missing value, or an already-parsed list.

    Returns
    -------
    list
        The parsed list. An empty list is returned for missing values,
        non-string values, strings that are not valid Python literals, and
        literals that evaluate to something other than a list.
    """
    # Already-parsed values pass straight through so the cleaning cell can be
    # re-run safely (pd.isna on a multi-element list returns an array, which
    # cannot be used in an `if`).
    if isinstance(x, list):
        return x
    # NaN / None / any other non-string value carries no tag information.
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)  # Convert string to a Python literal
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals such as "{[1]: 2}" (unhashable dict key).
        return []
    # Only genuine lists are valid tag collections.
    return parsed if isinstance(parsed, list) else []
# Parse every raw tags cell, then replace anything that did not end up as a list with []
parsed_tags = df_transactions['tags'].apply(parse_tags)
df_transactions['tags'] = parsed_tags.apply(lambda tags: tags if isinstance(tags, list) else [])
# NOTE(review): astype(bool) turns missing values (NaN) into True — confirm the column has no gaps
df_apps["paidByUser"] = df_apps["paidByUser"].astype(bool)

In [None]:
# Pair a user's repaid loan applications (from df_apps) with their matching transactions
def extract_loan_transactions(email: str) -> list:
    """Match each repaid application of ``email`` to a transaction.

    Applications are visited in ``applicationDate`` order and transactions in
    ``date`` order. For every application whose ``status`` is 'Repaid', the
    first transaction whose ``amount`` equals the application's ``paidAmount``
    and whose ``date`` equals its ``repaidDate`` is selected. Applications
    without such a transaction are skipped.

    Returns a list of ``(application_row, transaction_row)`` tuples.
    """
    apps_for_user = df_apps[df_apps['email'] == email].sort_values(by='applicationDate')
    txs_for_user = df_transactions[df_transactions['email'] == email].sort_values(by='date')

    matched_pairs = []
    for _, application in apps_for_user.iterrows():
        # Only repaid applications can be matched to a repayment transaction
        if application['status'] != 'Repaid':
            continue
        repayment = next(
            (
                transaction
                for _, transaction in txs_for_user.iterrows()
                if transaction['amount'] == application['paidAmount']
                and transaction['date'] == application['repaidDate']
            ),
            None,
        )
        if repayment is not None:
            matched_pairs.append((application, repayment))
    return matched_pairs

In [None]:
# Get all the transactions for a specific email in a specific date range
def get_transactions_for_dates(email: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Return the transactions of ``email`` dated within ``[start_date, end_date]``.

    Parameters
    ----------
    email : str
        User whose transactions are selected.
    start_date, end_date
        Inclusive bounds. Anything ``pd.to_datetime`` understands is accepted
        (date strings, ``datetime.date``, ``pd.Timestamp``).

    Returns
    -------
    pd.DataFrame
        The matching rows of ``df_transactions``.
    """
    # The cleaning step stores 'date' as datetime.date objects, which cannot be
    # compared with strings, so both bounds are normalised to datetime.date first.
    range_start = pd.to_datetime(start_date).date()
    range_end = pd.to_datetime(end_date).date()
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email]
    # Filter the transactions by date (both ends inclusive)
    in_range = (user_txs['date'] >= range_start) & (user_txs['date'] <= range_end)
    return user_txs[in_range]

def get_initial_balance(email: str, loan_row) -> float:
    """Return the user's balance recorded just before ``loan_row['date']``.

    Parameters
    ----------
    email : str
        User whose transactions are inspected.
    loan_row
        Row-like object exposing a ``'date'`` entry
        (presumably the matched transaction row — verify against the caller).

    Returns
    -------
    float
        The ``balance`` of the latest transaction dated strictly before
        ``loan_row['date']``, or NaN when the user has no earlier transaction.
    """
    # Get the user's transactions
    user_txs = df_transactions[df_transactions['email'] == email]
    # Keep only the transactions strictly before the loan
    earlier_txs = user_txs[user_txs['date'] < loan_row['date']]
    if earlier_txs.empty:
        # No history before the loan: there is no balance to report
        return np.nan
    # Sort explicitly instead of relying on file order; mergesort is stable, so
    # transactions sharing a date keep their original relative order.
    earlier_txs = earlier_txs.sort_values(by='date', kind='mergesort')
    return earlier_txs['balance'].iloc[-1]

In [None]:
def get_loans_history(loans: list, currIdx: int) -> list:
    """Summarise the loans that precede ``loans[currIdx]``.

    Parameters
    ----------
    loans : list
        ``(application_row, transaction_row)`` tuples, e.g. as returned by
        ``extract_loan_transactions``.
    currIdx : int
        Position of the current loan; only ``loans[:currIdx]`` count as history.

    Returns
    -------
    list
        ``[avg_amount_requested, avg_amount_repaid, avg_payback_time,
        num_of_paidByUser, num_of_not_paidByUser, ratio_paidByUser,
        ratio_loans_approved, num_days_since_last_loan,
        num_days_loan_was_paid_late, ratio_loans_were_paid_on_time]``.
        Averages and ratios are NaN when there is nothing to average or
        divide by; ``num_days_since_last_loan`` is -1 for the first loan.
    """
    current_app = loans[currIdx][0]
    past_apps = [past_loan[0] for past_loan in loans[:currIdx]]
    num_past_loans = len(past_apps)

    def _mean(values):
        # np.mean([]) would also give NaN but emits a RuntimeWarning
        return np.mean(values) if values else np.nan

    def _ratio(numerator, denominator):
        # Undefined ratios (empty history) become NaN instead of raising ZeroDivisionError
        return numerator / denominator if denominator else np.nan

    avg_amount_requested = _mean([app['amount'] for app in past_apps])
    # The repaid amount lives in 'paidAmount'; 'amount' is the requested amount
    avg_amount_repaid = _mean([app['paidAmount'] for app in past_apps])
    avg_payback_time = _mean([(app['repaidDate'] - app['issuanceDate']).days for app in past_apps])

    num_of_paidByUser = sum(1 for app in past_apps if app['paidByUser'])
    num_of_not_paidByUser = num_past_loans - num_of_paidByUser
    ratio_paidByUser = _ratio(num_of_paidByUser, num_past_loans)

    # Both sides of this ratio look strictly before the current application:
    # previous loans over previous applications.
    num_prior_applications = df_apps[
        (df_apps['email'] == current_app['email'])
        & (df_apps['applicationDate'] < current_app['applicationDate'])
    ].shape[0]
    ratio_loans_approved = _ratio(num_past_loans, num_prior_applications)

    if currIdx > 0:
        previous_app = loans[currIdx - 1][0]
        # pd.to_datetime makes the subtraction work whether 'applicationDate'
        # holds strings, datetime.date objects or Timestamps.
        gap = pd.to_datetime(current_app['applicationDate']) - pd.to_datetime(previous_app['applicationDate'])
        num_days_since_last_loan = gap.days
    else:
        num_days_since_last_loan = -1

    # Early repayments contribute 0 days rather than a negative number
    num_days_loan_was_paid_late = sum(
        max((app['repaidDate'] - app['nextPaycheck']).days, 0) for app in past_apps
    )
    num_paid_on_time = sum(1 for app in past_apps if app['repaidDate'] <= app['nextPaycheck'])
    ratio_loans_were_paid_on_time = _ratio(num_paid_on_time, num_past_loans)

    return [avg_amount_requested, avg_amount_repaid, avg_payback_time, num_of_paidByUser, num_of_not_paidByUser, ratio_paidByUser, ratio_loans_approved, num_days_since_last_loan, num_days_loan_was_paid_late, ratio_loans_were_paid_on_time]

def get_transactions_history(email: str, loans: list, currIdx: int, history_start="2024-01-01") -> list:
    """Average the user's transaction amount and balance up to a loan's issuance.

    Parameters
    ----------
    email : str
        User whose transactions are summarised.
    loans : list
        ``(application_row, transaction_row)`` tuples.
    currIdx : int
        Position in ``loans`` of the loan whose ``issuanceDate`` closes the window.
    history_start
        Inclusive start of the window; anything ``pd.to_datetime`` accepts.
        Defaults to 1 January 2024.

    Returns
    -------
    list
        ``[avg_amount_spent, avg_balance]``: the means of the ``amount`` and
        ``balance`` columns over the window.
    """
    loan = loans[currIdx]
    # The 'date' column holds datetime.date objects, so the lower bound must be
    # a datetime.date as well rather than a free-form string.
    window_start = pd.to_datetime(history_start).date()
    txs = get_transactions_for_dates(email, window_start, loan[0]['issuanceDate'])
    avg_amount_spent = np.mean(txs['amount'])
    avg_balance = np.mean(txs['balance'])
    return [avg_amount_spent, avg_balance]
