## Navigation
1. [Start Here](hey.ipynb)
1. [Load Data and Clean](eda.ipynb)
1. [To Clean, or Not To Clean?](eval_v1.ipynb)
1. Generate Datasets
    1. [Faker Naive](faker_naive.ipynb)
    1. [Faker Plus](faker_plus.ipynb)
    1. [SDV Naive](sdv_v1.ipynb)
    1. [SDV More Better](sdv_v2.ipynb)
    1. [SDV TVAE]()
1. Compare and Evaluate Performance
    1. [First impressions](eval_v2.ipynb)
    1. [Loan financial models](eval_v3.ipynb)
    1. [Predicting default risk](eval_v4.ipynb)
    1. [How hackable]()

# Synthetic Data Vault v 2.0
> #### Game mode: Making an effort
Training on cleaned raw data with enhanced preprocessing transformers, calculated numerical distributions, and metadata constraints

### Generate the metadata for SDV

In [None]:
# Show which SDV release this notebook is running against.
import sdv
print(sdv.__version__)

In [15]:
# Get started by creating a blank SingleTableMetadata
# NOTE(review): `metadata` is rebound by Metadata.detect_from_dataframe in a
# later cell, so nothing configured on this empty object is carried forward —
# confirm this cell is still needed.
from sdv.metadata import SingleTableMetadata
import sdv.metadata
metadata = SingleTableMetadata()

In [29]:
# Load the cleaned, gzip-compressed CSV into a dataframe
import pandas as pd

# Raise the pandas display limits (120 columns, 500 rows) for inspection
for display_option, display_limit in (
    ('display.max_columns', 120),
    ('display.max_rows', 500),
):
    pd.set_option(display_option, display_limit)

real_data = pd.read_csv('FILEPATH', compression='gzip')

In [30]:
# Reduce issue_d and earliest_cr_line to their calendar year
import datetime

for date_column in ('issue_d', 'earliest_cr_line'):
    # Parse to datetime first, then keep only the year component
    parsed_dates = pd.to_datetime(real_data[date_column])
    real_data[date_column] = parsed_dates.dt.year

In [31]:
# Automatically detect the metadata based on the actual data
import os  # used by the overwrite guard below; no earlier cell imports it

from sdv.metadata import Metadata

metadata = Metadata.detect_from_dataframe(
    data=real_data)

# Save the metadata, removing any previous copy at the same path first.
# NOTE(review): `filename` is not assigned in any cell above this one —
# confirm it is set to the metadata JSON path before this cell runs.
if os.path.exists(filename):
    os.remove(filename)
metadata.save_to_json(filename)

In [None]:
from sdv.metadata.errors import InvalidMetadataError

print(metadata)
# Validate the metadata.
# validate() reports a problem by raising instead of returning a value, so
# the "not valid" message has to live in an exception handler; an if/else on
# the return value can never reach it.
try:
    metadata.validate()
except InvalidMetadataError as validation_error:
    print('Metadata is not valid!')
    # Surface the details so the offending columns can be fixed.
    print(validation_error)
else:
    print('Metadata is valid!')

In [33]:
# Update metadata: override the auto-detected sdtype of selected columns.
# Each column appears exactly once (the earliest_cr_line update used to be
# issued twice with identical arguments).
column_overrides = {
    # Both date columns were reduced to a bare year earlier in the notebook.
    'issue_d': {'sdtype': 'datetime', 'datetime_format': '%Y'},
    'earliest_cr_line': {'sdtype': 'datetime', 'datetime_format': '%Y'},
    'addr_state': {'sdtype': 'state_abbr'},
    'settlement_flag': {'sdtype': 'boolean'},
}
for column_name, column_settings in column_overrides.items():
    metadata.update_column(column_name=column_name, **column_settings)

# Save the metadata, removing any previous copy at the same path first.
if os.path.exists(filename):
    os.remove(filename)
metadata.save_to_json(filename)

### Gaussian + constraints

In [34]:
import pandas as pd
import numpy as np
from scipy import stats

def _fit_params(name, distribution, values):
    """Fit one candidate scipy distribution and return its parameter tuple.

    The returned tuple is in scipy order (shapes..., loc, scale) and can be
    passed straight to ``distribution.logpdf(values, *params)``.

    Raises:
        ValueError: if ``values`` has no spread, which makes the beta and
            truncnorm parameterisations undefined.
    """
    if name == 'beta':
        data_min, data_max = np.min(values), np.max(values)
        data_range = data_max - data_min
        if not data_range > 0:
            raise ValueError('data has zero range')
        # scipy's beta.fit rejects observations that sit exactly on the edge
        # of a fixed support, and the sample min/max always do. Widen the
        # support by a tiny margin so every point is strictly inside it.
        margin = 1e-6 * data_range
        return distribution.fit(
            values,
            floc=data_min - margin,
            fscale=data_range + 2 * margin,
        )

    if name == 'truncnorm':
        mean, std = np.mean(values), np.std(values)
        if not std > 0:
            raise ValueError('data has zero standard deviation')
        # Truncation points expressed in standard-deviation units.
        a = (np.min(values) - mean) / std
        b = (np.max(values) - mean) / std
        # Shape parameters are held fixed with fa/fb; plain a=/b= keywords
        # are not accepted by fit(). loc/scale start from the sample moments
        # so the initial support is the observed [min, max].
        return distribution.fit(values, fa=a, fb=b, loc=mean, scale=std)

    return distribution.fit(values)


def fit_distribution(data):
    """Pick the candidate distribution with the lowest AIC for a column.

    Candidates are norm, beta, gamma, uniform and truncnorm from scipy.stats.

    Args:
        data: pandas Series of numeric values; NaNs are dropped before fitting.

    Returns:
        The name of the best-fitting distribution (one of the candidate keys),
        or None when fewer than two non-null values are available or no
        candidate could be fitted.
    """
    # Remove NaN values and check if there's enough data
    values = np.asarray(data.dropna(), dtype=float)
    if len(values) < 2:  # Need at least two points to fit a distribution
        return None

    # Define the distributions to test
    distributions = {
        'norm': stats.norm,
        'beta': stats.beta,
        'gamma': stats.gamma,
        'uniform': stats.uniform,
        'truncnorm': stats.truncnorm
    }

    # Fit each distribution to the data and calculate the AIC
    aic_results = {}
    for name, distribution in distributions.items():
        try:
            params = _fit_params(name, distribution, values)
            # Parameters and data are on the same (original) scale, so the
            # likelihoods of different candidates are directly comparable.
            log_likelihood = np.sum(distribution.logpdf(values, *params))
        except Exception as e:
            # scipy raises several unrelated exception types on a failed fit;
            # a candidate that cannot be fitted is reported and skipped.
            print(f"Could not fit distribution {name}: {e}")
            continue

        # Every entry of params was derived from the data (by MLE or from
        # sample statistics), so all of them count towards the penalty.
        k = len(params)
        aic = 2 * k - 2 * log_likelihood
        # A NaN/inf score must never win: NaN makes min() order-dependent.
        aic_results[name] = aic if np.isfinite(aic) else np.inf

    # Find the distribution with the lowest AIC
    if not aic_results:
        return None
    return min(aic_results, key=aic_results.get)

def analyze_dataframe(df):
    """Map each numeric column of ``df`` to its best-fitting distribution name.

    Args:
        df: pandas DataFrame; only columns with a numeric dtype are examined.

    Returns:
        dict of column name -> distribution name as returned by
        fit_distribution. Columns for which no distribution name was produced
        are left out of the dict.
    """
    numerical_distributions = {}
    for column in df.select_dtypes(include=[np.number]).columns:
        dist_name = fit_distribution(df[column])
        # Anything other than a name means no fit was available; storing it
        # would hand a non-name value to whoever consumes this mapping.
        if isinstance(dist_name, str):
            numerical_distributions[column] = dist_name
    return numerical_distributions

In [None]:
# Analyze the DataFrame: pick a distribution name for every numeric column
numerical_dist = analyze_dataframe(real_data)
# Bare expression so the notebook displays the resulting mapping
numerical_dist

In [36]:
# Build the Gaussian copula synthesizer on top of the detected metadata
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer_settings = {
    'enforce_min_max_values': True,
    'enforce_rounding': True,
    # Per-column distribution names produced by analyze_dataframe
    'numerical_distributions': numerical_dist,
    'default_distribution': 'gamma',
}
synthesizer = GaussianCopulaSynthesizer(
    metadata,  # required
    **synthesizer_settings,
)

In [37]:
# Build constraints
my_constraints = {
    'constraint_class': 'FixedNullCombinations',
    'constraint_parameters': {
        'column_names': ['settlement_flag', 'settlement_amount']
    },
    'constraint_class': 'Positive',
    'constraint_parameters': {
        'column_name': 'loan_amnt'
    }
}

In [38]:
# To learn a machine learning model based on your real data, use the fit method.
synthesizer.add_constraints(constraints=[my_constraints])

In [39]:
# Fit the data: train the synthesizer on the real dataframe
synthesizer.fit(real_data)

In [40]:
# Export the distributions learned during fitting
# Start from a clean path: drop any earlier export first
if os.path.exists(filename):
    os.remove(filename)

# After fitting, the synthesizer exposes the learned distribution per column
learned_dist = synthesizer.get_learned_distributions()
learned_dist_frame = pd.DataFrame(learned_dist)
learned_dist_frame.to_csv(filename, index=False)

In [41]:
# Saving the trained synthesizer as a Python pickle file for future use
# NOTE(review): `filename` is the same variable the learned-distribution CSV
# was written to in the previous cell — confirm it is rebound to a separate
# path before this runs, otherwise that CSV is overwritten.
synthesizer.save(filepath=filename)

### Sampling data with the synthesizer

In [None]:
# How many rows do we need? Match the size of the real dataset.
n_rows = len(real_data)
print('Number of rows:', n_rows)
# Sample in roughly 20 batches. round() yields 0 for 10 rows or fewer, so
# clamp to 1 to keep batch_size valid on tiny inputs.
batch = max(1, round(n_rows / 20))

# Sample data
# Remove a previous output file at the same path before sampling into it.
# NOTE(review): `filename` is also the path the synthesizer was saved to in
# an earlier cell — confirm it is rebound to the sample output path first.
if os.path.exists(filename):
    os.remove(filename)

synthetic_data = synthesizer.sample(
    num_rows=n_rows,
    batch_size=batch,
    output_file_path=filename
)

### Diagnostics and evaluation

In [None]:
from sdv.evaluation.single_table import run_diagnostic

# Compare the synthetic sample against the real data under the same metadata
diagnostic_inputs = {
    'real_data': real_data,
    'synthetic_data': synthetic_data,
    'metadata': metadata,
}
diagnostic_report = run_diagnostic(**diagnostic_inputs)

# Persist the diagnostic report object to disk
diagnostic_report.save(filepath='FILEPATH')

### Quality check

In [45]:
from sdmetrics.visualization import get_column_plot

%matplotlib inline

import kaleido

for col in real_data.columns:
    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name=col
    )
    filename = 'sdv_g_eval_' + str(col) + '.jpg'
    try:
        fig.to_image('jpg', scale=1.5)
        fig.write_image(file=filename, format='jpg')
    except Exception as e:
        print(e)

In [None]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    verbose=True)

# Save the per-column 'Column Shapes' details to a csv
cols = quality_report.get_details(property_name='Column Shapes')
cols.to_csv('FILEPATH', index=False)

# Save the quality report object itself
quality_report.save(filepath='FILEPATH')

# Create column quality visualizations
fig = quality_report.get_visualization(property_name='Column Shapes')

# Use a dedicated output name. Reusing `filename` here would pick up the
# value left behind by the per-column plotting loop and overwrite the last
# column's plot.
column_shapes_plot = 'sdv_g_eval_column_shapes.jpg'
try:
    # Single render at 1.5x scale; no discarded to_image() pass.
    fig.write_image(file=column_shapes_plot, format='jpg', scale=1.5)
except Exception as e:
    # Image export is best-effort: report the failure without aborting.
    print(e)