## Navigation
1. [Start Here](hey.ipynb)
1. [Load Data and Clean](eda.ipynb)
1. [To Clean, or Not To Clean?](eval_v1.ipynb)
1. Generate Datasets
    1. [Faker Naive](faker_naive.ipynb)
    1. [Faker Plus](faker_plus.ipynb)
    1. [SDV Naive](sdv_v1.ipynb)
    1. [SDV More Better](sdv_v2.ipynb)
    1. [SDV TVAE]()
1. Compare and Evaluate Performance
    1. [First impressions](eval_v2.ipynb)
    1. [Loan financial models](eval_v3.ipynb)
    1. [Predicting default risk](eval_v4.ipynb)
    1. [How hackable]()

# Analyze Results: First Impressions
### Comparing Basic and Descriptive Statistics

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

class DataEvaluator:
    """Compute, export, plot and compare per-column statistics of a DataFrame.

    Numeric columns are summarised by mean, variance, skewness and kurtosis.
    Categorical, object and string columns are summarised by their number of
    unique values, first mode and value counts. Any other dtype is reported
    as the string ``'Unsupported data type'``.
    """

    # Metrics compared column-by-column in ``compare_statistics``; the order
    # here fixes the column order of the comparison table.
    _COMPARED_METRICS = ('mean', 'variance', 'skewness', 'kurtosis')

    def __init__(self, dataframe):
        """Store the DataFrame to evaluate (it is not copied)."""
        self.dataframe = dataframe

    @staticmethod
    def _is_categorical_like(series):
        """Return True if ``series`` has a categorical, object or string dtype."""
        dtype = series.dtype
        # isinstance(..., CategoricalDtype) replaces the deprecated
        # pd.api.types.is_categorical_dtype; is_string_dtype additionally
        # accepts pandas' dedicated string dtype.
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    def calculate_statistics(self):
        """Return a dict mapping each column name to its statistics.

        Returns:
            dict: for numeric columns a dict with keys ``mean``, ``variance``,
            ``skewness`` and ``kurtosis``; for categorical-like columns a dict
            with keys ``unique_values``, ``top_value`` (first mode, or ``None``
            when the column has no non-null value) and ``frequency`` (value
            counts as a dict); for every other dtype the string
            ``'Unsupported data type'``.

        A warning is printed for each column that contains null values.
        """
        stats = {}
        for column in self.dataframe.columns:
            series = self.dataframe[column]
            if series.isnull().any():
                print(f"Warning: Column '{column}' contains null values. These will be ignored in calculations.")

            if pd.api.types.is_numeric_dtype(series):
                # scipy's skew/kurtosis do not skip NaN by default, so nulls
                # are dropped explicitly; pandas' mean/var skip them already.
                non_null = series.dropna()
                stats[column] = {
                    'mean': series.mean(),
                    'variance': series.var(),
                    'skewness': skew(non_null),
                    'kurtosis': kurtosis(non_null),
                }
            elif self._is_categorical_like(series):
                # mode() returns a Series (possibly several ties, possibly
                # empty); keep only the first value so the result is a scalar.
                modes = series.mode()
                stats[column] = {
                    'unique_values': series.nunique(),
                    'top_value': modes.iloc[0] if not modes.empty else None,
                    'frequency': series.value_counts().to_dict(),
                }
            else:
                stats[column] = 'Unsupported data type'

        return stats

    def save_statistics_to_csv(self, filename):
        """Write the statistics table (one row per column) to ``filename``."""
        stats = self.calculate_statistics()
        # DataFrame.from_dict(orient='index') needs every value to be a
        # mapping; wrap the plain-string marker used for unsupported dtypes.
        rows = {
            column: entry if isinstance(entry, dict) else {'note': entry}
            for column, entry in stats.items()
        }
        stats_df = pd.DataFrame.from_dict(rows, orient='index')
        stats_df.to_csv(filename)

    def create_visualizations(self, output_dir):
        """Save one plot per column into ``output_dir``.

        Numeric columns get a 30-bin histogram (``<column>_histogram.png``),
        categorical-like columns a bar plot of value counts
        (``<column>_bar_plot.png``); other dtypes are skipped. ``output_dir``
        is created if missing. A failure on one column is reported and does
        not stop the remaining columns.
        """
        import os

        os.makedirs(output_dir, exist_ok=True)

        for column in self.dataframe.columns:
            fig = None
            try:
                series = self.dataframe[column]
                if pd.api.types.is_numeric_dtype(series):
                    fig = plt.figure(figsize=(10, 6))
                    plt.hist(series.dropna(), bins=30, color='blue', alpha=0.7)
                    plt.title(f'Histogram of {column}')
                    plt.xlabel(column)
                    plt.ylabel('Frequency')
                    plt.grid(axis='y', alpha=0.75)
                    plt.savefig(f'{output_dir}/{column}_histogram.png')

                elif self._is_categorical_like(series):
                    fig = plt.figure(figsize=(10, 6))
                    series.value_counts().plot(kind='bar', color='orange', alpha=0.7)
                    plt.title(f'Bar Plot of {column}')
                    plt.xlabel(column)
                    plt.ylabel('Frequency')
                    plt.xticks(rotation=45)
                    plt.grid(axis='y', alpha=0.75)
                    plt.savefig(f'{output_dir}/{column}_bar_plot.png')
            except Exception as e:
                # Deliberate best-effort: one bad column must not abort the run.
                print(f"Error processing column '{column}': {e}. Skipping this column.")
            finally:
                # Close the figure even when plotting failed, so figures do
                # not accumulate across columns.
                if fig is not None:
                    plt.close(fig)

    def compare_statistics(self, other_evaluator):
        """Compare this evaluator's statistics with another evaluator's.

        Args:
            other_evaluator: another ``DataEvaluator``.

        Returns:
            pandas.DataFrame: one row per column present in both evaluators,
            with ``self_<metric>``, ``other_<metric>`` and
            ``<metric>_difference`` (self minus other) for mean, variance,
            skewness and kurtosis. A value is ``None`` when the metric is not
            available for that column on either side (non-numeric or
            unsupported columns).
        """
        stats_self = self.calculate_statistics()
        stats_other = other_evaluator.calculate_statistics()

        ordered_columns = ['column']
        for metric in self._COMPARED_METRICS:
            ordered_columns += [f'self_{metric}', f'other_{metric}', f'{metric}_difference']

        comparison_results = []
        for column, own_entry in stats_self.items():
            if column not in stats_other:
                continue
            other_entry = stats_other[column]
            # Unsupported dtypes are stored as a plain string, which has no
            # .get(); treat them as "no metrics available".
            own = own_entry if isinstance(own_entry, dict) else {}
            other = other_entry if isinstance(other_entry, dict) else {}

            row = {'column': column}
            for metric in self._COMPARED_METRICS:
                own_value = own.get(metric)
                other_value = other.get(metric)
                row[f'self_{metric}'] = own_value
                row[f'other_{metric}'] = other_value
                row[f'{metric}_difference'] = (
                    own_value - other_value
                    if own_value is not None and other_value is not None
                    else None
                )
            comparison_results.append(row)

        # Explicit columns keep the table well-formed even with zero rows.
        return pd.DataFrame(comparison_results, columns=ordered_columns)

    def save_comparison_to_csv(self, other_evaluator, filename):
        """Write the result of ``compare_statistics`` to ``filename`` (no index)."""
        comparison_df = self.compare_statistics(other_evaluator)
        comparison_df.to_csv(filename, index=False)

    def visualize_comparison(self, other_evaluator, output_dir):
        """Plot the percentage difference between column means.

        WARNING: ``output_dir`` is deleted recursively if it already exists
        and is then recreated empty. The bar chart is saved there as
        ``percentage_difference.png``. The percentage is
        ``(self_mean - other_mean) / other_mean * 100``; columns without a
        mean on both sides, or whose percentage is not finite (``other_mean``
        of zero), are left out of the chart.
        """
        # Imported here because the module-level imports of this file do not
        # provide os/shutil at the point where the class is defined.
        import os
        import shutil

        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)

        comparison_df = self.compare_statistics(other_evaluator)
        comparison_df = comparison_df[['column', 'self_mean', 'other_mean']].dropna().copy()

        # The mean columns are object-typed when some rows held None; cast to
        # float so the arithmetic below is vectorised and a zero denominator
        # yields inf/NaN instead of raising.
        self_mean = comparison_df['self_mean'].astype(float)
        other_mean = comparison_df['other_mean'].astype(float)
        comparison_df['percentage_difference'] = ((self_mean - other_mean) / other_mean) * 100
        comparison_df = comparison_df[np.isfinite(comparison_df['percentage_difference'])]

        plt.figure(figsize=(12, 6))
        plt.bar(comparison_df['column'], comparison_df['percentage_difference'], color='purple', alpha=0.7)

        plt.xlabel('Columns')
        plt.ylabel('Percentage Difference (%)')
        plt.title('Percentage Difference between Means of Evaluators')
        plt.xticks(rotation=45)
        plt.axhline(0, color='red', linewidth=0.8, linestyle='--')  # Line at 0 for reference
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'percentage_difference.png'))
        plt.close()

In [None]:
# Build the list of synthetic dataset paths. The file names are read from the
# SYNTH_N, SYNTH_G, FAKER and FAKER_P environment variables and each one is
# joined onto the PATH_START environment variable.
import os

paths = [os.environ[key] for key in ("SYNTH_N", "SYNTH_G", "FAKER", "FAKER_P")]
plist = [os.path.join(os.environ["PATH_START"], name) for name in paths]

# Bare expression so the notebook displays the resolved paths.
plist

In [None]:
# Compare the real (cleaned) dataset against every synthetic dataset in `plist`.
path_start = os.environ["PATH_START"]
real = os.path.join(path_start, os.environ["CLEAN"])
df1 = pd.read_csv(real)

# The real dataset is the same for every comparison, so its evaluator and its
# statistics file are built once instead of once per synthetic dataset.
# os.path.join is used (as for `real` above) so the result does not depend on
# PATH_START ending with a path separator.
evaluator1 = DataEvaluator(df1)
evaluator1.save_statistics_to_csv(os.path.join(path_start, 'real_stats.csv'))

# Per-column plots are slow and produce many files; switch this to True to
# generate them.
MAKE_COLUMN_PLOTS = False
if MAKE_COLUMN_PLOTS:
    real_plot_dir = os.path.join(path_start, 'visualizations_real')
    os.makedirs(real_plot_dir, exist_ok=True)
    evaluator1.create_visualizations(real_plot_dir)

for p in plist:
    # os.path.splitext removes only the file extension. str.strip(".csv")
    # would instead strip any of the characters '.', 'c', 's', 'v' from BOTH
    # ends of the path and corrupt names that start or end with those letters.
    stem = os.path.splitext(p)[0]

    df2 = pd.read_csv(p)
    evaluator2 = DataEvaluator(df2)
    evaluator2.save_statistics_to_csv(f'{stem}_stats.csv')

    if MAKE_COLUMN_PLOTS:
        plot_dir = os.path.join(os.path.dirname(stem), f'visualizations_{os.path.basename(stem)}')
        os.makedirs(plot_dir, exist_ok=True)
        evaluator2.create_visualizations(plot_dir)

    # Compute the comparison once, then both print it and save it; this
    # writes the same file as evaluator1.save_comparison_to_csv(...) without
    # recomputing every statistic.
    comparison_results = evaluator1.compare_statistics(evaluator2)
    print(comparison_results)
    comparison_results.to_csv(f'{stem}_comparison_results.csv', index=False)

    # NOTE: visualize_comparison deletes and recreates its output directory,
    # here the dataset path without its extension.
    evaluator1.visualize_comparison(evaluator2, stem)