In [1]:
!pip install dask_cuda



In [2]:
import os
import processor
import numpy as np
import tensorflow as tf
import random
import time
import sys

import synthetic_data_generator
import evaluation

from dask_cuda import LocalCUDACluster
from dask.distributed import Client

In [3]:
# If running in Google Colab: make modules that live in the current working
# directory importable.
# `cwd` has to be assigned here because nothing earlier in the notebook defines it.
cwd = os.getcwd()
# Re-running this cell must not keep growing sys.path with duplicates.
if cwd not in sys.path:
    sys.path.append(cwd)

In [4]:
# Start a local dask_cuda cluster, then attach a distributed client to it.
# Both objects are kept as notebook-level names.
cluster = LocalCUDACluster()
client = Client(address=cluster)

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.diskutils:Found stale lock file and directory '/tmp/dask-scratch-space/scheduler-75dq5jsl', purging
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:37793
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37357'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:35293', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:35293
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:58770
INFO:distributed.scheduler:Receive client connection: Client-0153b84a-aef3-11ef-ae43-0242ac1c000c
INFO:distributed.core:Starting 

In [5]:
# Make sure the three working folders used by the notebook exist.
# exist_ok=True keeps the cell safe to re-run.
for folder_name in ("actual_data", "transformed_data", "result_data"):
    os.makedirs(folder_name, exist_ok=True)

In [6]:
def set_global_seed(seed):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's `random` module, NumPy's legacy global generator and
    TensorFlow's global generator with the same value.

    Args:
        seed (int): Seed value passed to each generator.

    Returns:
        None
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Fix the seed once, before any stochastic step runs.
set_global_seed(42)

In [7]:
start_time = time.time()

# Inputs for the synthetic data generator.
existing_data_path = 'Credit.csv'
column_name = 'Amount'
output_dir = 'synthetic_data_1B'
target_count = 1_000_000_000

# Create the output directory if it is missing. exist_ok=True avoids the
# check-then-create race and matches how the other working folders are made.
os.makedirs(output_dir, exist_ok=True)

# Look for at least one entry in the directory; the context manager closes
# the scandir iterator so no directory handle is left open.
with os.scandir(output_dir) as dir_entries:
    output_dir_is_empty = not any(dir_entries)

# Generation is a one-time step: it only runs when the directory is empty.
if output_dir_is_empty:
    print("Output directory is empty. Generating synthetic data...")
    synthetic_data_generator.generate_synthetic_data(existing_data_path,
                                                     column_name, output_dir, target_count)
    print(f"Synthetic data generated and stored in: {output_dir}")
else:
    print(f"Output directory '{output_dir}' is not empty. Skipping synthetic data generation.")

# Report the wall-clock time of this cell in (decimal) minutes.
elapsed_time = (time.time() - start_time) / 60
print(f"Time taken: {elapsed_time:.2f} minutes")

Output directory is empty. Generating synthetic data...
Generating chunk 1...
Generating chunk 2...
Generating chunk 3...
Generating chunk 4...
Generating chunk 5...
Generating chunk 6...
Generating chunk 7...
Generating chunk 8...
Generating chunk 9...
Generating chunk 10...
Generating chunk 11...
Generating chunk 12...
Generating chunk 13...
Generating chunk 14...
Generating chunk 15...
Generating chunk 16...
Generating chunk 17...
Generating chunk 18...
Generating chunk 19...
Generating chunk 20...
Generating chunk 21...
Generating chunk 22...
Generating chunk 23...
Generating chunk 24...
Generating chunk 25...
Generating chunk 26...
Generating chunk 27...
Generating chunk 28...
Generating chunk 29...
Generating chunk 30...
Generating chunk 31...
Generating chunk 32...
Generating chunk 33...
Generating chunk 34...
Generating chunk 35...
Generating chunk 36...
Generating chunk 37...
Generating chunk 38...
Generating chunk 39...
Generating chunk 40...
Generating chunk 41...
Generating

In [9]:
# Read the generated files from `output_dir` through the project's reader,
# passing the column of interest; the result is kept in `ddf`.
ddf = processor.read_input_data(output_dir, column_name)

In [12]:
start_time = time.time()

# Walk the partitions of `ddf` one at a time: materialise each delayed
# partition, then hand it to the processor together with its position.
for i, partition in enumerate(ddf.to_delayed()):
    partition_data = partition.compute()
    processor.process_partition(partition_data, i)

# Report the wall-clock time of this cell in (decimal) minutes.
elapsed_time = (time.time() - start_time) / 60
print(f"Time taken: {elapsed_time:.2f} minutes")

Time taken: 6.52 minutes


In [14]:
start_time = time.time()

# Folders compared by the evaluation step.
actual_folder, result_folder = "actual_data", "result_data"

# Collect the errors using the "rmse" metric and average them.
errors = evaluation.calculate_errors(actual_folder, result_folder, metric="rmse")
mean_error = np.mean(errors)
print("Mean Error : ", mean_error)

# Report the wall-clock time of this cell in (decimal) minutes.
elapsed_time = (time.time() - start_time) / 60
print(f"Time taken: {elapsed_time:.2f} minutes")

Mean Error :  123.2737509199933
Time taken: 0.40 minutes


**Time Taken** :


*   Generating data with 1 B records : 1 minute 38 seconds (one-time activity)
*   Data transformation and inverse transformation using GMM : ~6 minutes 31 seconds (reported as 6.52 minutes)
*   Computing error metrics : ~24 seconds (reported as 0.40 minutes)


**Total execution time**:

*   First time : ~8.5 minutes
*   Thereafter :  ~7 minutes









In [12]:
import shutil
import os

def delete_folder(folder_path):
    """
    Remove a directory tree from disk and print the outcome.

    Args:
        folder_path (str): Location of the directory to remove.

    Returns:
        None
    """
    # isdir() is False for missing paths as well as for non-directories,
    # so one check covers both "does not exist" and "is not a directory".
    if not os.path.isdir(folder_path):
        print(f"Folder '{folder_path}' does not exist or is not a directory.")
        return

    shutil.rmtree(folder_path)
    print(f"Folder '{folder_path}' deleted successfully.")

In [16]:
# Clean up the results written by this run.
# Use the same relative folder name the notebook creates and evaluates
# ("result_data") instead of a Colab-only absolute path, so the cleanup
# targets the right folder regardless of the working directory.
folder_path = 'result_data'
try:
    # delete_folder reports a missing folder instead of raising.
    delete_folder(folder_path)
except OSError as e:
    # Best-effort cleanup: report filesystem failures without stopping the notebook.
    print(f"Error deleting folder: {e}")