In [None]:
import dask.dataframe as dd
from dask.distributed import Client
from sklearn.preprocessing import StandardScaler
import pandas as pd
from google.colab import drive

def scale_columns(df, columns, means, stds):
    """Standardize ``columns`` of one partition using precomputed statistics.

    Args:
        df: A pandas DataFrame holding a single Dask partition.
        columns: List of column labels to standardize.
        means: pandas Series of per-column means, indexed by column label.
        stds: pandas Series of per-column standard deviations (already
            guarded against zeros), indexed by column label.

    Returns:
        A new DataFrame in which ``columns`` hold ``(x - mean) / std``; all
        other columns are passed through unchanged.
    """
    # Work on a copy: functions given to map_partitions should not mutate
    # their input partition.
    scaled = df.copy()
    scaled[columns] = (scaled[columns] - means) / stds
    return scaled


# Initialize the Dask client
client = Client()

try:
    # Mount Google Drive
    drive.mount('/content/drive')

    # Folder path on Google Drive
    folder_path = '/content/drive/My Drive/BB_Normalized_Monthly_Final/'

    # File names to process
    file_names = [
        'combined_data_minus_183_part_A.parquet',
        'combined_data_minus_183_part_B.parquet',
        'combined_data_minus_183_part_C.parquet',
    ]

    # Process each file sequentially
    # NOTE(review): every file is standardized with its own mean/std. If
    # parts A/B/C are slices of one dataset that must share a scale, the
    # statistics should be computed once over all files -- confirm intent.
    for file_name in file_names:
        # Load the data lazily using Dask
        data = dd.read_parquet(folder_path + file_name)

        # Identify numerical columns
        numerical_cols = list(data.select_dtypes(include=['number']).columns)

        # Skip scaling when there is nothing numeric to scale.
        if numerical_cols:
            numeric = data[numerical_cols]

            # Compute the statistics over the WHOLE file in a single pass.
            # Fitting a scaler inside each partition would standardize every
            # partition with different parameters and make the result depend
            # on how the file happens to be partitioned.
            # ddof=0 matches StandardScaler's population standard deviation.
            means, stds = client.compute(
                [numeric.mean(), numeric.std(ddof=0)], sync=True
            )

            # Constant columns have std == 0; divide by 1 instead so they
            # become 0 rather than NaN/inf (same convention as StandardScaler).
            stds = stds.mask(stds == 0, 1.0)

            # Apply the shared statistics partition by partition
            data = data.map_partitions(
                scale_columns, numerical_cols, means, stds
            )

        # One-hot encoding is handled similarly if needed, using dask_ml.preprocessing or similar approach

        # Compute the final DataFrame and save the results as a single
        # parquet file (materializes the whole file in local memory).
        vectorized_data = data.compute()
        output_file_name = file_name.replace('.parquet', '_vectorized.parquet')
        vectorized_data.to_parquet(folder_path + output_file_name)
        print(f"Vectorized data saved to {output_file_name}")

        # Clean up to free memory before loading the next file
        del data, vectorized_data
finally:
    # Always shut the client (and its local cluster) down, even when a file
    # fails, so workers are not leaked into the next notebook run.
    client.close()


Perhaps you already have a cluster running?
Hosting the HTTP server on port 38597 instead
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:34293
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:38597/status
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:45933'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43787'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41487'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:44909'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:40619', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:40619
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:49476
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:41921', name: 3, status: init, memory: 0, processing: 0>
INFO:distributed.schedul

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Vectorized data saved to combined_data_minus_183_part_A_vectorized.parquet




Vectorized data saved to combined_data_minus_183_part_B_vectorized.parquet


INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:45933'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:43787'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:41487'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:44909'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:49490; closing.
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:49476; closing.
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:49508; closing.
INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:44193', name: 0, status: closing, memory: 0, pro

Vectorized data saved to combined_data_minus_183_part_C_vectorized.parquet


INFO:distributed.scheduler:Scheduler closing due to unknown reason...
INFO:distributed.scheduler:Scheduler closing all comms
