In [None]:
#####################################################################
#
# generate tabular classification datasets
#
#####################################################################

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html

In [4]:
from datetime import datetime
from sklearn.datasets import make_classification
import numpy as np
import os

In [5]:
# specify parameters
# Project/region/bucket configuration used by the upload helpers further down.
# The IPython `!` shell capture returns the command output as a list of lines;
# the first line is taken as the project ID.
# NOTE(review): assumes gcloud is installed and a default project is configured;
# otherwise P may be empty (IndexError on P[0]) - confirm before a fresh run.
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
REGION = "us-central1"
# bucket name is derived from the project ID; BUCKET_PATH is its gs:// form
BUCKET_NAME = f"bkt-{PROJECT_ID}-central1-data"
BUCKET_PATH = f"gs://{BUCKET_NAME}"
# timestamp of this run, formatted as YYYYmmddHHMMSS
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

In [6]:
#####################################################################
#
# generate data
#
#####################################################################

In [18]:
# settings handed to sklearn's make_classification
n_rows, n_cols = 1000, 8                            # samples, sklearn feature columns
n_informative, n_redundant, n_repeated = 6, 2, 0    # how the n_cols features are split
seed = 3816                                         # used as random_state

# number of extra random columns appended next to the sklearn features
n_random_cols = 2

In [19]:
# Generate the sklearn features.
# shuffle = False keeps sklearn's documented column order (informative, then
# redundant, then repeated, then noise) so that the header written below labels
# every column correctly. With shuffle = True sklearn also permutes the feature
# columns, which makes the x_inf_ / x_red_ names meaningless.
ds = make_classification(  n_samples = n_rows
                         , n_features = n_cols
                         , n_informative = n_informative
                         , n_redundant = n_redundant
                         , n_repeated = n_repeated
                         , n_classes = 2
                         , n_clusters_per_class = 2
                         , weights = None
                         , flip_y = 0.01
                         , shift = 0.0
                         , scale = 1.0
                         , random_state = seed
                         , shuffle = False
                        )

# seeded generator: row order and the random columns are reproducible from `seed`
rng = np.random.default_rng(seed)

# shuffle the rows only (features and labels with the same permutation),
# since the samples are no longer shuffled by sklearn
row_order = rng.permutation(n_rows)
inputs_informative = ds[0][row_order]
labels = ds[1][row_order].reshape((n_rows, 1))

# random (uniform in [0, 1)), carries no information about the label
inputs_random = rng.random((n_rows, n_random_cols))

# combine: label first, then sklearn features, then the random columns
inputs = np.append(inputs_informative, inputs_random, axis = 1)
data = np.append(labels, inputs, axis = 1)

# headers write
# Built as a list so that an empty group (e.g. n_redundant = 0) does not leave
# a dangling comma, and so repeated / noise columns get a name as well.
n_noise = n_cols - n_informative - n_redundant - n_repeated
column_names = (
      ["label"]
    + [f"x_inf_{i}" for i in range(1, n_informative + 1)]
    + [f"x_red_{i}" for i in range(1, n_redundant + 1)]
    + [f"x_rep_{i}" for i in range(1, n_repeated + 1)]
    + [f"x_noise_{i}" for i in range(1, n_noise + 1)]
    + [f"x_ran_{i}" for i in range(1, n_random_cols + 1)]
)
assert len(column_names) == data.shape[1], "header does not match number of data columns"
header = ",".join(column_names)

data_filename = "data.csv"
np.savetxt(data_filename, data, delimiter=',', header = header, comments='')

In [20]:
#####################################################################
#
# upload to bigquery and GCS
#
#####################################################################

In [21]:
# specify parameters
dataset_id = "ds_central1" # bq ds ID

# The table name encodes what the data is. The input and row counts are derived
# from the generation parameters so the name cannot drift from the actual data.
data_type = "tab" # tab = tabular
problem_type = "class"
input_total = str(n_cols + n_random_cols) # sklearn features + random columns
# whole thousands are abbreviated ("1k", "25k"); anything else is written out
rows_total = f"{n_rows // 1000}k" if n_rows >= 1000 and n_rows % 1000 == 0 else str(n_rows)
split = "oos" # tra tes val oos
random_seed = seed
table_id = f"{data_type}_{problem_type}_{input_total}inps_{rows_total}rows_{split}_{random_seed}"

# run the functions below
# (bq_write / write_gcs are defined in the helper cell at the bottom of the
# notebook; that cell has to be executed before this one)
bq_write(PROJECT_ID, REGION, dataset_id, table_id, data_filename)
write_gcs(PROJECT_ID, BUCKET_NAME, data_filename)

Loaded 1000 rows into ds_central1:/projects/ap-alto-ml-1000/datasets/ds_central1/tables/tab_class_10inps_1krows_oos_3816.


In [1]:
############################################################################################################
#      helper functions (run this cell first: the upload cell above calls them)
############################################################################################################
def bq_write(PROJECT_ID, REGION, dataset_id, table_id, data_filename):
    """Load a local CSV file into a BigQuery table and print a summary line.

    The load job is configured to create the table if needed, overwrite any
    existing contents (WRITE_TRUNCATE), skip the first (header) row and
    autodetect the schema.

    Args:
        PROJECT_ID: project the BigQuery client is created for.
        REGION: location used for the client and for the load job.
        dataset_id: ID of the dataset that holds the target table.
        table_id: ID of the table to load into.
        data_filename: path of the local CSV file to upload.
    """
    from google.cloud import bigquery

    bq_client = bigquery.Client(location = REGION, project = PROJECT_ID)
    target_table = bq_client.get_dataset(dataset_id).table(table_id)

    load_options = bigquery.LoadJobConfig(
        source_format = bigquery.SourceFormat.CSV,
        create_disposition = "CREATE_IF_NEEDED",
        skip_leading_rows = 1,
        write_disposition = "WRITE_TRUNCATE",  # WRITE_TRUNCATE WRITE_APPEND
        autodetect = True,
    )

    # the file is opened in binary mode and handed to the client as a stream
    with open(data_filename, 'rb') as csv_stream:
        load_job = bq_client.load_table_from_file(
            csv_stream,
            target_table,
            location = REGION,
            job_config = load_options,
        )

    # wait for the load job before reporting the row count
    load_job.result()

    summary = 'Loaded {} rows into {}:{}.'.format(
        load_job.output_rows, dataset_id, target_table.path)
    print(summary)
    
def write_gcs(PROJECT_ID, BUCKET_NAME, data_filename, blob_name = None):
    """Upload a local file to a Cloud Storage bucket.

    Args:
        PROJECT_ID: project the storage client is created for.
        BUCKET_NAME: name of the destination bucket (without the gs:// prefix).
        data_filename: path of the local file to upload.
        blob_name: object name inside the bucket. When omitted, the notebook
            global `table_id` plus ".csv" is used, which is what this helper
            always did; pass it explicitly to avoid relying on that global.

    Raises:
        NameError: if blob_name is omitted and the global `table_id` has not
            been defined yet.
    """
    from google.cloud import storage

    if blob_name is None:
        # backward-compatible default: object is named after the BigQuery table
        blob_name = table_id + ".csv"

    storage_client = storage.Client(project = PROJECT_ID)
    bucket = storage_client.get_bucket(BUCKET_NAME)
    # Address the object through the bucket rather than assembling a gs:// URI
    # with os.path.join, which uses the OS path separator and therefore builds
    # an invalid URI on Windows.
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(data_filename)