## Navigation
1. [Start Here](hey.ipynb)
1. [Load Data and Clean](eda.ipynb)
1. [To Clean, or Not To Clean?](eval_v1.ipynb)
1. Generate Datasets
    1. [Faker Naive](faker_naive.ipynb)
    1. [Faker Plus](faker_plus.ipynb)
    1. [SDV Naive](sdv_v1.ipynb)
    1. [SDV More Better](sdv_v2.ipynb)
    1. [SDV TVAE]()
1. Compare and Evaluate Performance
    1. [First impressions](eval_v2.ipynb)
    1. [Loan financial models](eval_v3.ipynb)
    1. [Predicting default risk](eval_v4.ipynb)
    1. [How hackable]()

### Logistic regression model for predicting loan default risk

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings

# Silence UserWarning and FutureWarning for the rest of the session.
# UserWarning is registered first and FutureWarning second, so the
# FutureWarning filter ends up at the front of the filter list.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

def load_and_prepare_data(df):
    """
    Prepare the DataFrame for modeling.

    Keeps only the columns used by the model, parses 'issue_d' as a datetime
    and adds an 'issue_year' column holding the year of that date. The work is
    done on a copy, so the frame passed in is left untouched.

    Parameters:
    - df: DataFrame, the input data containing loan data.

    Returns:
    - df: DataFrame, prepared data for modeling.

    Raises:
    - ValueError: if any of the required columns is missing from the input.
    """
    relevant_columns = [
        'loan_amnt', 'unified_dti', 'unified_annual_inc',
        'fico_range_avg', 'home_ownership', 'sub_grade',
        'loan_status', 'issue_d'
    ]

    try:
        # Ensure all relevant columns exist in the DataFrame
        missing_cols = set(relevant_columns) - set(df.columns.tolist())
        if missing_cols:
            raise ValueError(f"Missing columns in the dataset: {missing_cols}")

        # Select relevant columns; .copy() avoids setting on a slice
        df = df[relevant_columns].copy()

        # Remember which dates were already empty so that only values lost
        # to errors='coerce' are reported below.
        empty_before = df['issue_d'].isna()

        # Convert 'issue_d' to datetime and extract the year
        df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')
        df['issue_year'] = df['issue_d'].dt.year

        coerced = int((df['issue_d'].isna() & ~empty_before).sum())
        if coerced:
            print(f"Warning: {coerced} 'issue_d' value(s) could not be parsed and were set to NaT.")

    except Exception as e:
        print(f"An error occurred while preparing data: {e}")
        raise

    # Report success only once validation and conversion have actually passed
    print("Data loaded successfully.")
    return df

def encode_features(df):
    """
    Turn the target and the categorical predictors into integer codes.

    'loan_status' becomes 1 for 'Default' and 0 otherwise. 'home_ownership'
    and 'sub_grade' are cast to str and label-encoded. The columns are
    overwritten on the frame that is passed in, which is also returned.

    Parameters:
    - df: DataFrame, the input data containing categorical features.

    Returns:
    - df: DataFrame, the input data with encoded features.

    Raises:
    - ValueError: if 'loan_status' is missing, contains nulls, or holds a
      value other than 'Default' / 'Fully Paid'.
    """
    target = 'loan_status'

    # The target must be present and complete before anything is rewritten
    if target not in df or df[target].isnull().any():
        raise ValueError("The 'loan_status' column is missing or contains null values.")

    expected_labels = ['Default', 'Fully Paid']
    unique_labels = df[target].unique()

    # Any label outside the expected pair is rejected
    if set(unique_labels) - set(expected_labels):
        raise ValueError(f"Unexpected values in 'loan_status': {unique_labels}. Expected: {expected_labels}")

    # 'Default' -> 1, everything else ('Fully Paid') -> 0
    df[target] = df[target].eq('Default').astype('int64')

    # Sanity check on the encoded target
    if df[target].isnull().any():
        raise ValueError("Encoding of 'loan_status' resulted in null values.")

    # Label-encode each categorical predictor with its own encoder
    for column in ('home_ownership', 'sub_grade'):
        df[column] = LabelEncoder().fit_transform(df[column].astype(str))

    return df

def train_model(X_train, y_train):
    """
    Fit a logistic regression classifier on the training data.

    The model uses the 'lbfgs' solver with at most 500 iterations.

    Parameters:
    - X_train: DataFrame, features for training.
    - y_train: Series, target variable for training.

    Returns:
    - model: LogisticRegression, trained model.

    Raises:
    - ValueError: if y_train does not have a numeric dtype.
    """
    # Only the dtype is checked here; whether the labels are binary is left
    # to the caller.
    if pd.api.types.is_numeric_dtype(y_train):
        # NOTE(review): n_jobs=-1 is kept from the original; confirm against
        # the scikit-learn docs whether it has any effect for this solver.
        classifier = LogisticRegression(solver='lbfgs', max_iter=500, n_jobs=-1)
        return classifier.fit(X_train, y_train)

    raise ValueError("y_train must be numeric.")

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on the held-out data.

    Parameters:
    - model: LogisticRegression, the trained model.
    - X_test: DataFrame, features for testing.
    - y_test: Series, true labels for testing.

    Returns:
    - results: dict with keys 'accuracy', 'f1_score' and
      'classification_report' (the report in dict form).
    """
    y_pred = model.predict(X_test)

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, output_dict=True),
    }

def print_model_formula(model, feature_names):
    """
    Print the fitted model as a logit equation.

    The first row of model.coef_ and the first entry of model.intercept_ are
    used; each coefficient is paired with the feature name at the same
    position and shown with four decimals.

    Parameters:
    - model: LogisticRegression, the trained model.
    - feature_names: list, names of the features.
    """
    weights = model.coef_[0]
    bias = model.intercept_[0]

    # One "<coefficient> * <name>" term per feature, joined by " + "
    formula = f"Logit(P(Default)) = {bias:.4f} + " + " + ".join(
        f"{weight:.4f} * {feature}" for weight, feature in zip(weights, feature_names)
    )

    print("\nModel Formula:", formula, sep="\n")

def save_results(results, output_file):
    """
    Write the evaluation results to a CSV file.

    The classification report is written first (one row per report entry),
    then a two-row Metric/Value table with accuracy and F1 score is appended
    to the same file under its own header line.

    Parameters:
    - results: dict, evaluation results to save.
    - output_file: str, the path to the output CSV file.
    """
    # Report entries become rows once the frame is transposed
    report_table = pd.DataFrame(results['classification_report']).T
    report_table.to_csv(output_file, index=True)

    # Headline metrics go into a second, differently shaped table
    summary_table = pd.DataFrame({
        'Metric': ['Accuracy', 'F1 Score'],
        'Value': [results['accuracy'], results['f1_score']],
    })

    # mode='a' appends below the report instead of overwriting it
    summary_table.to_csv(output_file, mode='a', index=False)

    print(f"Results saved to {output_file}")

def main(df, output_file='evaluation_results.csv'):
    """
    Run the logistic-regression default-risk pipeline on one dataset.

    Steps: prepare and encode the data, split it 80/20 with stratification,
    standardize the features, fit the model, print its formula, evaluate it
    on the test split and save the metrics.

    Parameters:
    - df: DataFrame, the input data containing loan data.
    - output_file: str, the path to save evaluation results.

    Raises:
    - ValueError: if the encoded target contains nulls or values other than
      0 and 1.
    """
    prepared = encode_features(load_and_prepare_data(df))

    # Features are everything except the target and the raw datetime column
    X = prepared.drop(columns=['loan_status', 'issue_d'])
    y = prepared['loan_status']

    # The target must be complete and strictly binary
    if y.isnull().any() or set(y.unique()) - {0, 1}:
        raise ValueError("The target variable 'loan_status' must be binary (0 or 1) and cannot contain null values.")

    # Stratified split keeps the class balance the same in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The scaler is fitted on the training split only and reused for the test split
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = train_model(X_train_scaled, y_train)

    # Coefficients in the printed formula refer to the standardized features
    print_model_formula(model, X.columns)

    results = evaluate_model(model, X_test_scaled, y_test)
    save_results(results, output_file)

In [None]:
# Build the full path of every dataset to evaluate. File names are read from
# the SYNTH_N, SYNTH_G, FAKER and FAKER_P environment variables and joined
# onto the directory given by PATH_START.
import os

paths = [os.environ[name] for name in ("SYNTH_N", "SYNTH_G", "FAKER", "FAKER_P")]
plist = [os.path.join(os.environ["PATH_START"], p) for p in paths]

plist

In [None]:
# Evaluate every dataset in plist and write one results file per dataset
for p in plist:
    # Name the output after the input file without its extension. splitext
    # is used instead of slicing off four characters so that a name that
    # does not end in '.csv' is not truncated.
    output_basename = os.path.splitext(os.path.basename(p))[0]
    output_basename = output_basename + '_evaluation_results.csv'
    print(output_basename)
    df = pd.read_csv(p, low_memory=False)
    main(df, output_file=output_basename)
    print('\n\n')

### Decision Tree

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, f1_score, classification_report
import graphviz
import warnings

# Ignore UserWarning and FutureWarning from here on. The categories are
# registered in this order, matching the two separate calls used before.
for _ignored_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
del _ignored_category

def load_and_prepare_data(df):
    """
    Prepare the DataFrame for modeling.

    Selects the columns the model needs, converts 'issue_d' to datetime and
    derives 'issue_year' from it. A copy is returned; the input frame is not
    modified.

    Parameters:
    - df: DataFrame, the input data containing loan data.

    Returns:
    - df: DataFrame, prepared data for modeling.

    Raises:
    - ValueError: if any of the required columns is missing from the input.
    """
    required = [
        'loan_amnt', 'unified_dti', 'unified_annual_inc',
        'fico_range_avg', 'home_ownership', 'sub_grade',
        'loan_status', 'issue_d'
    ]

    try:
        absent = set(required).difference(df.columns)
        if absent:
            raise ValueError(f"Missing columns in the dataset: {absent}")

        prepared = df.loc[:, required].copy()

        raw_dates = prepared['issue_d']
        parsed_dates = pd.to_datetime(raw_dates, errors='coerce')

        # Values that were present but came back as NaT were dropped by
        # errors='coerce'; count them so the loss is visible.
        lost_dates = int((parsed_dates.isna() & raw_dates.notna()).sum())

        prepared['issue_d'] = parsed_dates
        prepared['issue_year'] = parsed_dates.dt.year

    except Exception as e:
        print(f"An error occurred while preparing data: {e}")
        raise

    if lost_dates:
        print(f"Warning: {lost_dates} 'issue_d' value(s) could not be parsed and were set to NaT.")

    # The success message is printed only after everything above has passed
    print("Data loaded successfully.")
    return prepared

def encode_features(df):
    """
    Encode the target and the categorical columns as integers.

    'loan_status' is mapped to 1 for 'Default' and 0 otherwise;
    'home_ownership' and 'sub_grade' are cast to str and label-encoded.
    The frame passed in is modified and returned.

    Parameters:
    - df: DataFrame, the prepared loan data.

    Returns:
    - df: DataFrame, the same data with encoded columns.

    Raises:
    - ValueError: if 'loan_status' is missing, has nulls, or contains a
      value other than 'Default' / 'Fully Paid'.
    """
    has_target = 'loan_status' in df
    if not has_target or df['loan_status'].isnull().any():
        raise ValueError("The 'loan_status' column is missing or contains null values.")

    expected_labels = ['Default', 'Fully Paid']
    unique_labels = df['loan_status'].unique()
    unexpected = set(unique_labels).difference(expected_labels)
    if unexpected:
        raise ValueError(f"Unexpected values in 'loan_status': {unique_labels}. Expected: {expected_labels}")

    # int(True) == 1 and int(False) == 0, so 'Default' maps to 1
    df['loan_status'] = df['loan_status'].map(lambda status: int(status == 'Default'))

    if df['loan_status'].isnull().any():
        raise ValueError("Encoding of 'loan_status' resulted in null values.")

    # A fresh encoder per column keeps the code spaces independent
    for column in ['home_ownership', 'sub_grade']:
        encoder = LabelEncoder()
        as_text = df[column].astype(str)
        df[column] = encoder.fit_transform(as_text)

    return df

def train_model(X_train, y_train):
    """
    Fit a decision tree classifier on the training data.

    Parameters:
    - X_train: array-like, features for training.
    - y_train: array-like, target variable for training.

    Returns:
    - model: DecisionTreeClassifier, trained model (random_state fixed at 42).
    """
    # fit() returns the estimator itself, so the fitted tree is returned directly
    return DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """
    Evaluate a fitted model on the test split.

    Parameters:
    - model: the trained model; only its predict() method is used.
    - X_test: array-like, features for testing.
    - y_test: array-like, true labels for testing.

    Returns:
    - results: dict with 'accuracy', 'f1_score' and 'classification_report'
      (the report as a dict).
    """
    predicted = model.predict(X_test)

    results = dict(
        accuracy=accuracy_score(y_test, predicted),
        f1_score=f1_score(y_test, predicted),
        classification_report=classification_report(y_test, predicted, output_dict=True),
    )
    return results

def save_results(results, output_file):
    """
    Save the evaluation results to a CSV file.

    Writes the classification report (transposed so each report entry is a
    row) and then appends a Metric/Value table holding accuracy and F1 score
    to the same file.

    Parameters:
    - results: dict with 'classification_report', 'accuracy' and 'f1_score'.
    - output_file: str, the path to the output CSV file.
    """
    per_class = pd.DataFrame(results['classification_report']).transpose()
    per_class.to_csv(output_file, index=True)

    # One (metric, value) row per headline number
    summary_rows = [
        ('Accuracy', results['accuracy']),
        ('F1 Score', results['f1_score']),
    ]
    headline = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

    # Appended under the report, with its own header line
    headline.to_csv(output_file, mode='a', index=False)
    print(f"Results saved to {output_file}")

def visualize_tree(model, feature_names, output_name="decision_tree"):
    """
    Render a fitted decision tree to a PDF file and open it in a viewer.

    Parameters:
    - model: the fitted decision tree to draw.
    - feature_names: iterable of str, names of the features the model was
      trained on, in column order.
    - output_name: str, path of the PDF to write, with or without the
      '.pdf' suffix. Defaults to 'decision_tree' (i.e. 'decision_tree.pdf').
    """
    print("Visualizing decision tree...")

    # graphviz appends '.<format>' to the name passed to render(). Passing a
    # name that already ends in '.pdf' would store the DOT source under that
    # name and put the real PDF in '<name>.pdf.pdf', so strip the suffix.
    if output_name.lower().endswith(".pdf"):
        output_name = output_name[:-len(".pdf")]

    dot_data = export_graphviz(
        model,
        out_file=None,
        feature_names=list(feature_names),  # accept an Index as well as a list
        class_names=['Fully Paid', 'Default'],  # class 0, class 1
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = graphviz.Source(dot_data, format="pdf")

    # A single render both writes '<output_name>.pdf' and opens it;
    # cleanup=True removes the intermediate DOT source file afterwards.
    graph.render(output_name, view=True, cleanup=True)

def main(df, output_file='evaluation_results.csv'):
    """
    Run the decision-tree default-risk pipeline on one dataset.

    Steps: prepare and encode the data, split it 80/20 with stratification,
    standardize the features, fit a tree on all features, keep the three
    features with the highest importance, refit on those, evaluate on the
    test split, save the metrics and draw the reduced tree.

    Parameters:
    - df: DataFrame, the input data containing loan data.
    - output_file: str, the path to save evaluation results.

    Raises:
    - ValueError: if the encoded target contains nulls or values other than
      0 and 1.
    """
    df = encode_features(load_and_prepare_data(df))

    # Features are everything except the target and the raw datetime column
    X = df.drop(columns=['loan_status', 'issue_d'])
    y = df['loan_status']

    # The target must be complete and strictly binary
    if y.isnull().any() or set(y.unique()) - {0, 1}:
        raise ValueError("The target variable 'loan_status' must be binary (0 or 1) and cannot contain null values.")

    # Stratified split keeps the class balance the same in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The scaler is fitted on the training split only
    print("Scaling features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # First pass: tree on all features, used only to rank them
    print("Training model...")
    full_model = train_model(X_train, y_train)

    importances = full_model.feature_importances_
    print("Feature importances:", importances)

    # argsort is ascending, so the last three positions are the top three
    top_idx = np.argsort(importances)[-3:]
    top_features = X.columns[top_idx]

    # The scaled data are plain arrays, hence positional column selection
    print("Selecting top features...")
    X_train_top, X_test_top = X_train[:, top_idx], X_test[:, top_idx]

    # Second pass: tree restricted to the selected features
    print("Training model with only the top features...")
    model_top = train_model(X_train_top, y_train)

    print("Evaluating model with only the top features...")
    results = evaluate_model(model_top, X_test_top, y_test)

    save_results(results, output_file)
    visualize_tree(model_top, top_features)

In [None]:
import os

# File names of the datasets to evaluate, read from the environment in this order
paths = []
for env_key in ("SYNTH_N", "SYNTH_G", "FAKER", "FAKER_P", "CLEAN"):
    paths.append(os.environ[env_key])

# Each file name is joined onto the directory given by PATH_START
plist = [os.path.join(os.environ["PATH_START"], file_name) for file_name in paths]

plist

In [None]:
# Run the decision-tree evaluation on every dataset in plist
for p in plist:
    # Use the input file name without its extension as the output prefix.
    # splitext replaces the former four-character slice, which truncated any
    # name that did not end in '.csv'.
    stem = os.path.splitext(os.path.basename(p))[0]
    output_basename = stem + '_d-tree_evaluation_results.csv'
    print(output_basename)
    df = pd.read_csv(p, low_memory=False)
    main(df, output_file=output_basename)