## Navigation
1. [Start Here](hey.ipynb)
1. [Load Data and Clean](eda.ipynb)
1. [To Clean, or Not To Clean?](eval_v1.ipynb)
1. Generate Datasets
    1. [Faker Naive](faker_naive.ipynb)
    1. [Faker Plus](faker_plus.ipynb)
    1. [SDV Naive](sdv_v1.ipynb)
    1. [SDV More Better](sdv_v2.ipynb)
    1. SDV TVAE (notebook not yet linked)
1. Compare and Evaluate Performance
    1. [First impressions](eval_v2.ipynb)
    1. [Loan financial models](eval_v3.ipynb)
    1. [Predicting default risk](eval_v4.ipynb)
    1. How hackable (notebook not yet linked)

# Initial Performance Evaluation
#### Raw vs. Cleaned vs. Synthetic on Raw vs. Synthetic on Cleaned

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Configure pandas display in one call: floats rendered with two decimals,
# up to 50 columns and up to 200 rows shown before truncation.
pd.set_option(
    'display.float', '{:.2f}'.format,
    'display.max_columns', 50,
    'display.max_rows', 200,
)

In [None]:
# Import the raw data, loading only the three columns this notebook uses
# (loan_amnt, loan_status, dti) so the other columns are never read into
# memory. Column order follows the file; later cells select by name.
raw_data = pd.read_csv(
    "FILEPATH",
    usecols=['loan_amnt', 'loan_status', 'dti'],
    low_memory=False,
    compression='gzip',
)

In [None]:
# Count the unusable rows: a row is unusable when ANY of the three modelling
# columns is NaN. (.isna().sum() on its own gives per-column NaN counts,
# which is not a row count and double-counts rows missing several values.)
required_cols = ['loan_amnt', 'loan_status', 'dti']
print('Number of unusable rows in raw data:', raw_data[required_cols].isna().any(axis=1).sum())
raw_data = raw_data.dropna(subset=required_cols)
raw_data = raw_data[required_cols]
# Drop rows where loan_amnt is under $1000
print('Number of too small loan amount rows in raw data:', raw_data['loan_amnt'].lt(1000).sum())
raw_data = raw_data[raw_data['loan_amnt'] >= 1000]
raw_data.info(verbose=True)

In [None]:
# Row count of the filtered raw data; the synthetic sample generated from the
# raw data is drawn at this same size.
n_rows = raw_data.shape[0]
n_rows

In [None]:
# Build an SDV synthesizer for the raw data
import sdv
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

# Detect the table metadata from the dataframe and write a copy to disk
metadata = Metadata.detect_from_dataframe(raw_data)
metadata.save_to_json('raw_synth_metadata_v1.json')

# Step 1: Create the synthesizer
synthesizer = GaussianCopulaSynthesizer(metadata=metadata)

In [6]:
# Step 2: Train the synthesizer
# raw_data here is the filtered frame from the earlier cell: NaN rows dropped,
# only loan_amnt / loan_status / dti kept, and loan_amnt >= 1000.
synthesizer.fit(raw_data)

In [7]:
# Step 3: Generate synthetic data
# n_rows was set to len(raw_data), so the synthetic table has as many rows as
# the filtered raw table.
synthesized_from_raw = synthesizer.sample(num_rows=n_rows)

In [8]:
# Import my cleaned data
cleaned_data = pd.read_csv("FILEPATH", low_memory=False, compression='gzip')
# Count the unusable rows: a row is unusable when ANY of the three modelling
# columns is NaN. (.isna().sum() on its own gives per-column NaN counts,
# which is not a row count and double-counts rows missing several values.)
required_cols = ['loan_amnt', 'loan_status', 'dti']
print('Number of unusable rows in cleaned data:', cleaned_data[required_cols].isna().any(axis=1).sum())
cleaned_data = cleaned_data.dropna(subset=required_cols)
cleaned_data = cleaned_data[required_cols]
# Drop rows where loan_amnt is under $1000
print('Number of too small loan amount rows in cleaned data:', cleaned_data['loan_amnt'].lt(1000).sum())
cleaned_data = cleaned_data[cleaned_data['loan_amnt'] >= 1000]
cleaned_data.info(verbose=True)

In [9]:
# Default synthesized from clean
# Synthesize the cleaned data
import sdv
from sdv.metadata import Metadata

# Detect the metadata from cleaned_data, the table this synthesizer is fitted
# on below, so the JSON saved as "clean" metadata actually describes it.
metadata = Metadata.detect_from_dataframe(data=cleaned_data)
metadata.save_to_json(filepath='clean_synth_metadata_v1.json')

from sdv.single_table import GaussianCopulaSynthesizer

# Step 1: Create the synthesizer
synthesizer = GaussianCopulaSynthesizer(metadata)

# Step 2: Train the synthesizer
synthesizer.fit(cleaned_data)

# Step 3: Generate synthetic data (same number of rows as the cleaned table)
synthesized_from_clean = synthesizer.sample(num_rows=len(cleaned_data))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.exceptions import ConvergenceWarning
import warnings
# Globally silence ConvergenceWarning so the LogisticRegression fits below do
# not flood the cell output.
# NOTE(review): this also hides a genuinely non-converged model; presumably
# the unscaled features are the cause - consider scaling them instead.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def _train_and_evaluate(X, y):
    """Fit a logistic regression on an 80/20 split of (X, y) and score it.

    Returns:
        (accuracy, f1) measured on the held-out 20% of the same dataset.
    """
    # Fixed seed so every dataset is split reproducibly
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=500)  # Increased max_iter to 500
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)

    # Choose the F1 averaging from the number of classes in the full target
    unique_classes = np.unique(y)
    if len(unique_classes) == 2:
        # average='binary' needs a positive label. sklearn's default (1)
        # raises when the two classes do not include 1 (e.g. string
        # statuses), so keep 1 when present and otherwise fall back to the
        # last class in sorted order.
        class_list = unique_classes.tolist()
        pos_label = 1 if 1 in class_list else class_list[-1]
        f1 = f1_score(y_test, predictions, average='binary', pos_label=pos_label)
    else:
        # Macro-average over every class seen in the full target, including
        # classes that happen to be absent from the test split
        f1 = f1_score(y_test, predictions, average='macro', labels=unique_classes)
    return accuracy, f1


def evaluate_models(raw_data, clean_data, synthesized_from_raw, synthesized_from_clean,
                    target_col='loan_status', results_path='results.csv'):
    """Train and score one logistic regression per dataset and tabulate the results.

    Each dataset is evaluated independently: it is split 80/20, a model is
    trained on the larger part and scored on the held-out part of that same
    dataset.

    Args:
        raw_data: DataFrame of real, uncleaned data.
        clean_data: DataFrame of real, cleaned data.
        synthesized_from_raw: DataFrame of synthetic data generated from raw_data.
        synthesized_from_clean: DataFrame of synthetic data generated from clean_data.
        target_col: Name of the label column; all other columns are features.
        results_path: Where the results table is written as CSV.

    Returns:
        DataFrame with columns 'Dataset', 'Accuracy' and 'F1 Score', one row
        per dataset. The same table is printed and written to results_path.
    """
    # Row order matches the order the results are reported and plotted in
    datasets = [
        ('Raw Real Data', raw_data),
        ('Synthesized from Raw Data', synthesized_from_raw),
        ('Clean Real Data', clean_data),
        ('Synthesized from Clean Data', synthesized_from_clean),
    ]

    rows = []
    for label, frame in datasets:
        accuracy, f1 = _train_and_evaluate(frame.drop(target_col, axis=1), frame[target_col])
        rows.append({'Dataset': label, 'Accuracy': accuracy, 'F1 Score': f1})

    results_df = pd.DataFrame(rows, columns=['Dataset', 'Accuracy', 'F1 Score'])

    # Print the results DataFrame and persist it for the plotting cell
    print(results_df)
    results_df.to_csv(results_path, index=False)
    return results_df

# Evaluate the models (bind the result so the notebook does not echo the
# returned frame a second time after the print above)
results_df = evaluate_models(raw_data, cleaned_data, synthesized_from_raw, synthesized_from_clean)

In [None]:
"""
VISUALIZE THE RESULTS
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the table written by the evaluation cell
results_df = pd.read_csv('results.csv')

# One row, two panels: accuracy on the left, F1 score on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    ('Accuracy', 'Model Accuracy by Dataset'),
    ('F1 Score', 'F1 Score by Dataset'),
]
for ax, (metric, title) in zip(axes, panels):
    sns.barplot(x='Dataset', y=metric, data=results_df, palette='viridis', ax=ax)
    ax.set_title(title)
    ax.set_ylim(0, 1)  # both metrics live in [0, 1]; share the scale across panels
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', labelrotation=15)

fig.tight_layout()
# Save BEFORE showing: once plt.show() has displayed (and, in notebooks,
# closed) the figure, a later savefig writes an empty image.
fig.savefig('clean_vs_raw_results.png')
plt.show()