## Navigation
1. [Start Here](hey.ipynb)
1. [Load Data and Clean](/eda.ipynb)
1. [To Clean, or Not To Clean?](eval_v1.ipynb)
1. Generate Datasets
    1. [Faker Naive](faker_naive.ipynb)
    1. [Faker Plus](faker_plus.ipynb)
    1. [SDV Naive](sdv_v1.ipynb)
    1. [SDV More Better](sdv_v2.ipynb)
    1. [SDV TVAE]()
1. Compare and Evaluate Performance
    1. [First impressions](eval_v2.ipynb)
    1. [Loan financial models](eval_v3.ipynb)
    1. [Predicting default risk](eval_v4.ipynb)
    1. [How hackable]()

# Naive Synthetic Data Generation using Faker()

In [1]:
# Import libraries I use in this notebook
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from random import randint

# "magic" command to make plots show up in the notebook
%matplotlib inline
sns.set_style('whitegrid')

# Display all the things
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)

# Initialize the Faker library
from faker import Faker
fake = Faker()

In [2]:
# Import the accepted dataset (post cleansing) into a Pandas DataFrame.
# The file is gzip-compressed; low_memory=False has pandas parse the file in
# one pass instead of in chunks, which avoids mixed-type inference per column.
accepted_df = pd.read_csv('FILEPATH', compression='gzip', low_memory=False)

In [None]:
# Do all columns in accepted_df have the same count of non-null items?
accepted_df.count()

In [4]:
# Remember how many rows the source data has so that every synthetic
# column can be generated with exactly the same length
n_rows = accepted_df.shape[0]

Instead of generating completely random data, let's try to be slightly less naive and attempt to preserve some of the statistical properties of the original. That means we first need to establish the min and max of the float columns, the range of the integer columns, and the labels of the categorical variables.

In [None]:
# Find the min and max for all numeric columns
# Save to a csv and write that metadata to a dictionary I can use later with Faker

def process_df(df):
    """
    Summarise every numeric column of a DataFrame by its smallest and largest value.

    Non-numeric columns are skipped (a message is printed for each one), and a
    column whose min/max computation raises is reported and left out.

    Args:
        df (pd.DataFrame): The input DataFrame to summarise.

    Returns:
        pd.DataFrame: One row per summarised column with the fields
            'Col', 'Min' and 'Max'. If no column could be summarised, an
            empty DataFrame with those three columns is returned.
    """
    summary_rows = []

    for name in df.columns:
        series = df[name]

        # Guard clause: only numeric dtypes have a meaningful min/max here
        if not pd.api.types.is_numeric_dtype(series):
            print(f"Skipping non-numeric column: '{name}'")
            continue

        try:
            lowest = series.min()
            highest = series.max()
        except Exception as e:
            # Report the failure and carry on with the remaining columns
            print(f"Error processing column '{name}': {e}")
            continue

        summary_rows.append({'Col': name, 'Min': lowest, 'Max': highest})

    # Nothing summarised: hand back an empty frame that still has the expected columns
    if not summary_rows:
        print("No numeric columns found to process.")
        return pd.DataFrame(columns=['Col', 'Min', 'Max'])

    # The names of the summarised columns, in the order they were encountered
    handled = [row['Col'] for row in summary_rows]
    summary = pd.DataFrame(summary_rows)

    print(f"Total columns processed successfully: {len(handled)}")
    print("Processed columns:", handled)

    return summary

# Run the min/max summary on the loaded data and keep the results around
if __name__ == "__main__":
    # Build the per-column min/max table
    result_df = process_df(accepted_df)

    # Persist the table (without the row index) so it can be inspected later
    result_df.to_csv('processed_data.csv', index=False)

    # Re-key the table by column name, giving {column: {'Min': ..., 'Max': ...}},
    # which is the lookup structure used when generating the synthetic columns
    keyed_by_col = result_df.set_index('Col')
    result_dict = keyed_by_col.T.to_dict()

    # Show both representations
    print("Resulting dictionary:", result_dict)
    print(result_df)

In [6]:
# Create an empty DataFrame; the synthetic columns are added to it one by one in the cells below
faker_dfa = pd.DataFrame()

In [None]:
# Generate rows of data for the dates
def random_dates(start, end, n, freq, seed=None):
    """
    Draw n random dates (with replacement) from a regular date range, sorted ascending.

    Args:
        start: First date of the range (anything pd.date_range accepts).
        end: Last date of the range (anything pd.date_range accepts).
        n: Number of dates to draw.
        freq: Frequency string passed to pd.date_range (e.g. 'B').
        seed: Optional seed. When given, NumPy's global random state is
            re-seeded with it before sampling.

    Returns:
        pd.DatetimeIndex: The sampled dates in ascending order.
    """
    if seed is not None:
        np.random.seed(seed)

    # All candidate dates between start and end at the requested frequency
    candidates = pd.date_range(start, end, freq=freq)
    # Sample with replacement, then order the draws chronologically
    drawn = np.random.choice(candidates, n)
    ordered = np.sort(drawn)
    return pd.to_datetime(ordered)

# Draw one random business day ('B' frequency) per row between 2013-01-01 and 2017-12-31
faker_dfa['issue_d'] = random_dates('2013-01-01', '2017-12-31', n_rows, 'B')
# Keep only the four-digit year, stored as a string
faker_dfa['issue_d'] = faker_dfa['issue_d'].dt.strftime('%Y')
# Show how many rows landed in each year
faker_dfa['issue_d'].value_counts()

In [None]:
# Inspect the distinct earliest_cr_line values and how often each occurs,
# to get a feel for the range the column covers
accepted_df['earliest_cr_line'].value_counts()

In [None]:
# Find the range of years in earliest_cr_line, then draw a random year per row
import datetime
# Parse the raw values as datetimes (this overwrites the column in accepted_df)
accepted_df['earliest_cr_line'] = pd.to_datetime(accepted_df['earliest_cr_line'])
# Cast the bounds to plain int: .dt.year gives NumPy values, and these become
# floats when the column contains missing dates, which random.randint does not
# accept as bounds on newer Python versions
mini = int(accepted_df['earliest_cr_line'].dt.year.min())
maxi = int(accepted_df['earliest_cr_line'].dt.year.max())
# One uniformly random year (bounds inclusive) per row
faker_dfa['earliest_cr_line'] = [randint(mini, maxi) for _ in range(n_rows)]
# Summary statistics, formatted as fixed-point numbers instead of scientific notation
faker_dfa['earliest_cr_line'].describe().apply(lambda x: format(x, 'f'))

In [10]:
# Generate rows of data for each column in the result dictionary:
# one uniformly random integer per row between that column's observed Min and Max
# (both bounds inclusive).
# NOTE: int() truncates the bounds, so float columns are filled with whole numbers.
for col, bounds in result_dict.items():
    mini = int(bounds['Min'])
    maxi = int(bounds['Max'])
    faker_dfa[col] = [randint(mini, maxi) for _ in range(n_rows)]

In [11]:
# Use Faker to generate a random state abbreviation per row for addr_state
# (no zip column is generated in this cell)
faker_dfa['addr_state'] = [fake.state_abbr() for _ in range(n_rows)]

In [None]:
# Which columns are in accepted_df but not yet in our faker_dfa? Print the names of the columns
missing_cols = accepted_df.columns.difference(faker_dfa.columns)
print(missing_cols)

In [13]:
# Treat every column that is still missing as categorical:
# fill it by sampling, with replacement, from the distinct values
# that the same column takes in accepted_df
for col in missing_cols:
    faker_dfa[col] = np.random.choice(accepted_df[col].unique(), size=n_rows)

In [None]:
# Take a quick look at our new dataframe; verbose=True prints the full
# per-column summary rather than a shortened one
faker_dfa.info(verbose=True)

In [None]:
# Find any difference between the column names of the original and the faker created dataframe.
# Both prints show an empty Index when the two column sets match.
print(faker_dfa.columns.difference(accepted_df.columns))  # columns only in faker_dfa
print(accepted_df.columns.difference(faker_dfa.columns))  # columns only in accepted_df

In [None]:
# Remove the old dataframe
del accepted_df
# Take a look at our memory usage per column (index excluded).
# deep=True inspects the contents of object columns, so compute it once
# and reuse the result for both the CSV export and the cell output.
memory_by_column = faker_dfa.memory_usage(index=False, deep=True)
memory_by_column.to_csv('FILEPATH')
memory_by_column

In [18]:
# Save our new faker created dataframe to CSV, leaving out the row index
faker_dfa.to_csv('FILEPATH', index=False)