## Navigation
1. [Start Here](hey.ipynb)
1. [Load Data and Clean](/eda.ipynb)
1. [To Clean, or Not To Clean?](eval_v1.ipynb)
1. Generate Datasets
    1. [Faker Naive](faker_naive.ipynb)
    1. [Faker Plus](faker_plus.ipynb)
    1. [SDV Naive](sdv_v1.ipynb)
    1. [SDV More Better](sdv_v2.ipynb)
    1. [SDV TVAE]()
1. Compare and Evaluate Performance
    1. [First impressions](eval_v2.ipynb)
    1. [Loan financial models](eval_v3.ipynb)
    1. [Predicting default risk](eval_v4.ipynb)
    1. [How hackable]()

# Analysis: Predicting Loan Default Risk
#### More advanced evaluation of performance using a predictive model and visualizations

This notebook evaluates how closely a synthetic dataset matches the real dataset in a basic consumer loan financial model. For both datasets it computes the proportion of fully paid loans and the log-scaled total interest paid, checks whether the differences are statistically significant, includes error handling, and generates visualizations saved as images.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Function to convert loan term from string to years
def convert_loan_term(term_str):
    """Convert a loan-term label such as ``"36 months"`` into years.

    The label is split on whitespace, so leading/trailing or repeated spaces
    (e.g. ``" 36 months"``) are tolerated, and any positive whole number of
    months is accepted rather than only 36 and 60.

    Args:
        term_str: Term label of the form ``"<N> months"``.

    Returns:
        The term in years as a float (``"60 months"`` -> ``5.0``), or ``0``
        when ``term_str`` is not a string of that form. The ``0`` fallback is
        what unrecognized labels already produced, so callers see no change
        for inputs that were previously unsupported and remain unparseable.
    """
    if not isinstance(term_str, str):
        return 0

    parts = term_str.split()
    if len(parts) != 2 or not parts[1].lower().startswith("month"):
        return 0

    try:
        month_count = int(parts[0])
    except ValueError:
        return 0

    # A non-positive term is not a meaningful loan length; treat it as
    # unrecognized instead of returning a zero or negative number of years.
    if month_count <= 0:
        return 0
    return month_count / 12

# Function to calculate the total interest paid
def calculate_total_interest(data):
    """Add a ``TotalInterest`` column to ``data`` and report its total.

    The column is computed as ``loan_amnt * (int_rate / 100) * term`` and is
    written into ``data`` in place.

    Args:
        data: Table providing ``loan_amnt``, ``int_rate`` and ``term``.

    Returns:
        ``(total, per_loan)`` where ``per_loan`` is the new column and
        ``total`` is its sum. If anything raises, the error is printed and
        ``(None, None)`` is returned.
    """
    try:
        principal = data['loan_amnt']
        rate_fraction = data['int_rate'] / 100
        years = data['term']
        data['TotalInterest'] = (principal * rate_fraction) * years
        per_loan_interest = data['TotalInterest']
        return per_loan_interest.sum(), per_loan_interest
    except Exception as err:
        print(f"Error in calculating total interest: {err}")
        return None, None

# Function to calculate the proportion of loans fully paid
def calculate_loan_status(data):
    """Count fully paid and defaulted loans and the fully-paid share.

    Args:
        data: Table with a ``loan_status`` column.

    Returns:
        ``(proportion_fully_paid, fully_paid_count, default_count)``. The
        proportion is ``0`` for an empty table. If anything raises, the error
        is printed and ``(None, None, None)`` is returned.
    """
    def count_with_status(label):
        # Number of rows whose loan_status equals the given label.
        return len(data[data['loan_status'] == label])

    try:
        loan_count = len(data)
        paid_count = count_with_status('Fully Paid')
        defaulted_count = count_with_status('Default')

        if loan_count > 0:
            paid_share = paid_count / loan_count
        else:
            paid_share = 0
        return paid_share, paid_count, defaulted_count
    except Exception as err:
        print(f"Error in calculating loan status: {err}")
        return None, None, None

# Load real and synthetic datasets
def load_data(real_data_path, synthetic_data_path):
    """Read the real and synthetic CSV files and convert ``term`` to years.

    Args:
        real_data_path: Path of the real dataset CSV.
        synthetic_data_path: Path of the synthetic dataset CSV.

    Returns:
        ``(real_data, synthetic_data)`` with each ``term`` column passed
        through ``convert_loan_term``. If anything raises, the error is
        printed and ``(None, None)`` is returned.
    """
    try:
        # Read both files before touching either, so a missing file is
        # reported ahead of any column problem.
        frames = [pd.read_csv(real_data_path), pd.read_csv(synthetic_data_path)]

        for frame in frames:
            frame['term'] = frame['term'].apply(convert_loan_term)

        real_frame, synthetic_frame = frames
        return real_frame, synthetic_frame
    except Exception as err:
        print(f"Error loading data: {err}")
        return None, None

# Evaluate loan performance
def evaluate_loans(data, dataset_name):
    """Summarize loan outcomes and log-scaled interest for one dataset.

    Side effects: adds ``TotalInterest`` and ``LogTotalInterest`` columns to
    ``data`` in place and writes the frame to
    ``'{dataset_name}_loan_results.csv'`` in the current working directory.

    Args:
        data: DataFrame with the ``loan_status``, ``loan_amnt``, ``int_rate``
            and ``term`` columns read by ``calculate_loan_status`` and
            ``calculate_total_interest``.
        dataset_name: Label stored under ``'Dataset'`` and used as the CSV
            file-name prefix.

    Returns:
        dict with keys ``'Dataset'``, ``'ProportionFullyPaid'``,
        ``'FullyPaidLoans'``, ``'DefaultLoans'``, ``'MeanLogTotalInterest'``
        and ``'LogTotalInterestSamples'``. The last one is a NumPy array of
        the per-loan ``log1p`` interest values; a two-sample test needs these
        samples because a single mean per dataset carries no variance.

    Raises:
        ValueError: if the total interest could not be calculated.
    """
    proportion_fully_paid, fully_paid_loans, default_loans = calculate_loan_status(data)
    _, total_interest_series = calculate_total_interest(data)
    if total_interest_series is None:
        # Fail with a clear message instead of an opaque KeyError on the
        # missing 'TotalInterest' column further down.
        raise ValueError(
            f"Cannot evaluate '{dataset_name}': total interest could not be calculated."
        )

    # Clean a working copy before log1p: missing interest counts as zero and
    # the values must be floats. The raw column itself is left untouched.
    cleaned_interest = total_interest_series.fillna(0).astype(float)

    # log1p handles the log(0) case.
    data['LogTotalInterest'] = np.log1p(cleaned_interest)
    mean_log_interest = data['LogTotalInterest'].mean()

    results = {
        'Dataset': dataset_name,
        'ProportionFullyPaid': proportion_fully_paid,
        'FullyPaidLoans': fully_paid_loans,
        'DefaultLoans': default_loans,
        'MeanLogTotalInterest': mean_log_interest,
        'LogTotalInterestSamples': data['LogTotalInterest'].to_numpy(dtype=float, copy=True),
    }

    # Save the raw (un-filled) total interest series for further analysis.
    data['TotalInterest'] = total_interest_series
    data.to_csv(f'{dataset_name}_loan_results.csv', index=False)

    return results

# Calculate percentage difference with error handling
def calculate_percentage_difference(real_mean, synthetic_mean):
    """Return how far the synthetic mean is from the real mean, in percent.

    The value is ``(real_mean - synthetic_mean) / real_mean * 100``.

    Args:
        real_mean: Reference mean; used as the denominator.
        synthetic_mean: Mean being compared against the reference.

    Returns:
        The percentage difference, or ``None`` (after printing the reason)
        when either mean is NaN, when ``real_mean`` is zero, or when the
        computation raises.
    """
    try:
        # Guard clauses: NaN inputs first, then the zero denominator.
        if any(np.isnan(value) for value in (real_mean, synthetic_mean)):
            print("One of the means is NaN, cannot calculate percentage difference.")
            return None
        if real_mean == 0:
            print("Real mean is zero, cannot calculate percentage difference.")
            return None

        gap = real_mean - synthetic_mean
        return (gap / real_mean) * 100
    except Exception as err:
        print(f"Error in calculating percentage difference: {err}")
        return None

# Statistical significance test
def check_statistical_significance(real_results, synthetic_results):
    """Test whether the real and synthetic summaries differ significantly.

    Two independent tests are run, and a failure in one does not discard the
    result of the other:

    * a chi-squared test on the 2x2 table of ``'FullyPaidLoans'`` and
      ``'DefaultLoans'`` counts;
    * a two-sample t-test on per-loan log total interest values, read from
      the optional ``'LogTotalInterestSamples'`` entry of each results dict.
      A t-test needs samples, not two scalar means, so it is skipped when
      either dict lacks that entry.

    Args:
        real_results: Results dict for the real dataset.
        synthetic_results: Results dict for the synthetic dataset.

    Returns:
        ``(chi2, p_value, t_stat, p_value_interest)``. Each pair is
        ``(None, None)`` when its test could not be performed.
    """
    chi2 = p_value = None
    t_stat = p_value_interest = None

    try:
        # 2x2 contingency table: rows are datasets, columns are loan status.
        contingency_table = np.array([
            [real_results['FullyPaidLoans'], real_results['DefaultLoans']],
            [synthetic_results['FullyPaidLoans'], synthetic_results['DefaultLoans']],
        ])
        # chi2_contingency returns (statistic, p-value, dof, expected);
        # only the first two are needed, so index rather than unpack.
        chi2_result = stats.chi2_contingency(contingency_table)
        chi2, p_value = float(chi2_result[0]), float(chi2_result[1])
    except Exception as e:
        print(f"Error in statistical significance test: {e}")
        chi2 = p_value = None

    real_samples = real_results.get('LogTotalInterestSamples')
    synthetic_samples = synthetic_results.get('LogTotalInterestSamples')
    if real_samples is None or synthetic_samples is None:
        print("Per-loan log total interest samples are not available; skipping t-test.")
        return chi2, p_value, t_stat, p_value_interest

    try:
        real_values = np.asarray(real_samples, dtype=float).ravel()
        synthetic_values = np.asarray(synthetic_samples, dtype=float).ravel()
        # Drop NaNs so a few undefined log values do not turn the whole
        # test result into NaN.
        real_values = real_values[~np.isnan(real_values)]
        synthetic_values = synthetic_values[~np.isnan(synthetic_values)]
        if real_values.size < 2 or synthetic_values.size < 2:
            print("Not enough samples for a t-test; skipping.")
        else:
            t_result = stats.ttest_ind(real_values, synthetic_values)
            t_stat, p_value_interest = float(t_result[0]), float(t_result[1])
    except Exception as e:
        print(f"Error in statistical significance test: {e}")
        t_stat = p_value_interest = None

    return chi2, p_value, t_stat, p_value_interest

# Visualization function
def create_visualizations(real_results, synthetic_results, name):
    """Save two bar charts comparing the real and synthetic result summaries.

    The charts show the proportion of fully paid loans and the mean log total
    interest paid. Each is written straight into the image directory as
    ``'{name}_<chart>.png'``. Writing there directly replaces a
    save-then-``!mv`` sequence that relied on IPython shell syntax and passed
    a literal ``f'...'`` prefix to the shell, so the move never matched the
    saved file.

    Args:
        real_results: Results dict with ``'Dataset'``,
            ``'ProportionFullyPaid'`` and ``'MeanLogTotalInterest'``.
        synthetic_results: Results dict with the same keys.
        name: Prefix for the image file names; any directory part is ignored
            because the images always land in the image directory.
    """
    import os  # local import so this function does not rely on a later cell

    img_dir = './datasets/ahn1-google-drive/lendingclub/qad/img/'
    os.makedirs(img_dir, exist_ok=True)

    datasets = [real_results['Dataset'], synthetic_results['Dataset']]

    def save_bar_chart(values, title, ylabel, filename, ylim=None):
        # Draw one bar per dataset and write the figure to the image directory.
        plt.figure(figsize=(8, 5))
        try:
            sns.barplot(x=datasets, y=values, palette='viridis')
            plt.title(title)
            plt.ylabel(ylabel)
            if ylim is not None:
                plt.ylim(*ylim)
            plt.savefig(os.path.join(img_dir, os.path.basename(filename)))
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close()

    # Bar plot for proportions of Fully Paid loans
    save_bar_chart(
        [real_results['ProportionFullyPaid'], synthetic_results['ProportionFullyPaid']],
        'Proportion of Fully Paid Loans',
        'Proportion of Fully Paid Loans',
        f'{name}_proportion_fully_paid_loans.png',
        ylim=(0, 1),
    )

    # Bar plot for mean log total interest paid
    save_bar_chart(
        [real_results['MeanLogTotalInterest'], synthetic_results['MeanLogTotalInterest']],
        'Mean Log Total Interest Paid Comparison',
        'Mean Log Total Interest Paid',
        f'{name}_mean_log_total_interest_paid_comparison.png',
    )

# Main function to run the evaluation
def main(real_data_path, synthetic_data_path, name):
    """Run the full real-versus-synthetic comparison for one dataset pair.

    Loads both files, evaluates each, prints the percentage difference in
    mean log total interest and the outcome of the significance tests, then
    creates the visualizations.

    Args:
        real_data_path: Path of the real dataset CSV.
        synthetic_data_path: Path of the synthetic dataset CSV.
        name: Prefix handed to ``create_visualizations`` for image names.
    """
    real_data, synthetic_data = load_data(real_data_path, synthetic_data_path)

    # Nothing can be evaluated unless both datasets loaded.
    if real_data is None or synthetic_data is None:
        print("Data loading failed.")
        return

    real_results = evaluate_loans(real_data, 'Real Data')
    synthetic_results = evaluate_loans(synthetic_data, 'Synthetic Data')

    percentage_difference = calculate_percentage_difference(
        real_results['MeanLogTotalInterest'],
        synthetic_results['MeanLogTotalInterest'],
    )
    if percentage_difference is not None:
        print(f"Percentage Difference in Mean Log Total Interest: {percentage_difference:.2f}%")

    chi2, p_value, t_stat, p_value_interest = check_statistical_significance(real_results, synthetic_results)

    if not (chi2 is None or p_value is None):
        print(f"Chi-squared statistic: {chi2}, P-value for loan status: {p_value}")
        print(
            "The difference in loan status proportions is statistically significant."
            if p_value < 0.05
            else "The difference in loan status proportions is not statistically significant."
        )

    if not (t_stat is None or p_value_interest is None):
        print(f"T-statistic for mean log total interest: {t_stat}, P-value for mean log total interest: {p_value_interest}")
        print(
            "The difference in mean log total interest paid is statistically significant."
            if p_value_interest < 0.05
            else "The difference in mean log total interest paid is not statistically significant."
        )

    create_visualizations(real_results, synthetic_results, name)

In [None]:
import os

# File names of the synthetic datasets come from environment variables and
# are resolved against the PATH_START directory.
paths = [os.environ["SYNTH_N"], os.environ["SYNTH_G"],
         os.environ["FAKER"], os.environ["FAKER_P"]]
plist = [os.path.join(os.environ["PATH_START"], p) for p in paths]

if __name__ == "__main__":
    for p in plist:
        # Use the file name up to '.csv' as the output prefix. Taking the
        # part AFTER '.csv' (index -1) gave '' or '.gz' for every dataset,
        # so each run overwrote the previous run's images.
        dataset_name = os.path.basename(p).split('.csv')[0]
        main('/datasets/ahn1-google-drive/lendingclub/qad/clean_5yr.csv.gz', p, dataset_name)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f25f0418-9119-475c-bcf9-d4049319fe7d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>