# Setup HIVE-style Partitioned Data Generator

In [None]:
import string
import random
import os
import duckdb
duckdb.__version__

In [None]:
# --- Local generation settings ---------------------------------------------
MAX_MEMORY = "25GB" # DuckDB memory limit; raise it to roughly the memory available to Python minus 25%
DUCKDB_FILE = "data/gendata.duckdb"  # DuckDB database file that holds the generated table
TMP_DIR = "data/"  # passed to DuckDB as its temp_directory
DATA_FOLDER = "data/exp5" # root folder under which the parquet output is written

# --- S3 uploads (used by upload_s3) ----------------------------------------
# Fill in locally; do not commit real credentials.
AWS_ACCESS_KEY=''
AWS_SECRET_ACCESS_KEY=''
BUCKET = "ayushman-hops"  # target bucket name

# --- HDFS uploads (used by copy_to_hdfs) -----------------------------------
# Fill in locally; do not commit a real API key.
HOPS_HOST=''
HOPS_API_KEY=''
HDFS_PATH = "/Projects/testproj/Resources/"  # prefix prepended to the local source path


In [None]:
# Make sure the temp and output folders exist before anything is written
# (IPython shell magics; {NAME} is substituted from the notebook namespace).
!mkdir -p {TMP_DIR}
!mkdir -p {DATA_FOLDER}
# !ls -lR data/

In [None]:
def gendata(ROWS, PARTITIONS):
    """(Re)create the ``gendata`` table in the DuckDB file with generated data.

    The table consists of four column groups of ``round(COLS/4)`` columns
    each (``COLS`` is read from module scope):

    * ``id{i}``       -- ``floor(random()*card)`` cast to INT. The first
      ``len(PARTITIONS)`` columns take their cardinality from ``PARTITIONS``;
      the list is padded with a cardinality of 10 up to ``round(COLS/4)``.
    * ``rand{i}``     -- ``random()`` values.
    * ``str_hash{i}`` -- md5 of a 20-letter string that is picked once in
      Python, so the value is the same for every row of that column.
    * ``dt{i}``       -- ``to_timestamp`` of a 9-digit number that is picked
      once in Python, so the value is the same for every row of that column.

    Args:
        ROWS: number of rows to generate.
        PARTITIONS: list with the cardinality of each partition-key column.

    Returns:
        A DataFrame holding the first 10 rows of the new table, handy for a
        quick visual check in the notebook.
    """
    # Every column group has the same width; compute it once.
    cols_per_type = round(COLS/4)
    # If PARTITIONS is longer than the group width the padding list is empty
    # and every entry of PARTITIONS still becomes an id column.
    cardinalities = PARTITIONS + [10]*(cols_per_type - len(PARTITIONS))

    with duckdb.connect(DUCKDB_FILE, config={'memory_limit': MAX_MEMORY,
                                              'temp_directory': TMP_DIR}) as con:
        con.execute("DROP TABLE IF EXISTS gendata")

        id_cols = ', '.join([f"CAST(floor(random()*{card}) AS INT) as id{i}"
                             for (i, card) in enumerate(cardinalities)])
        float_cols = ', '.join([f"random() as rand{i}" for i in range(cols_per_type)])
        # The random literal is generated here, not in SQL: one constant per column.
        string_cols = ', '.join([f"md5('{''.join(random.sample(string.ascii_letters,20))}') as str_hash{i}"
                                 for i in range(cols_per_type)])
        # Same for the timestamps: one constant epoch value per column.
        dt_cols = ', '.join([f"to_timestamp({''.join(random.sample(string.digits,9))}) as dt{i}"
                             for i in range(cols_per_type)])

        con.execute(f"""CREATE TABLE gendata AS (SELECT {id_cols}, 
                                                        {float_cols},
                                                        {string_cols},
                                                        {dt_cols}
                                                 FROM range({ROWS}) tbl(x));""")

        # Fetch a small sample so the caller can inspect the result.
        preview = con.execute("SELECT * FROM gendata LIMIT 10").fetchdf()
    return preview

#gendata(ROWS, PARTITIONS)

In [None]:
# Scratch cell: preview the id-column SQL fragment that gendata() builds.
# NOTE: this rebinds the notebook-level COLS and PARTITIONS.
COLS=20
PARTITIONS=[2]
# Partition cardinalities first, then padded with 10s up to round(COLS/4) id columns.
id_cols = ', '.join([f"CAST(floor(random()*{card}) AS INT) as id{i}" 
                             for (i, card) in enumerate(PARTITIONS+[10]*(round(COLS/4)-len(PARTITIONS)))])
print(id_cols)

In [None]:
def write_parquet(ROW_SHORT, export=False):
    """Write the ``gendata`` table from the DuckDB file to parquet.

    Output goes below ``{DATA_FOLDER}/{ROW_SHORT}/genpart{len(PARTITIONS)}/``.
    Besides its arguments the function reads the notebook-level settings
    ``FILES_PER_PARTITION``, ``PARTITIONS``, ``PARTITION`` and
    ``ROW_GROUP_SIZE``.

    Args:
        ROW_SHORT: name of the run's sub-folder below ``DATA_FOLDER``.
        export: when true, run ``EXPORT DATABASE`` into the output folder
            instead of ``COPY``.

    Modes when ``export`` is false:
        * ``PARTITION`` truthy -- one ``COPY`` with ``PARTITION_BY`` over the
          columns ``id0 .. id{len(PARTITIONS)-1}``.
        * otherwise -- ``FILES_PER_PARTITION`` separate ``COPY`` statements to
          ``data_{i}.parquet``.
          NOTE(review): each of those files receives the full table, not a
          slice of it -- confirm this duplication is intended.
    """
    db_config = {'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR}
    with duckdb.connect(DUCKDB_FILE, config=db_config) as con:
        # Session settings: writer threads, unordered output, memory cap.
        con.execute(f"SET threads='{FILES_PER_PARTITION}';")
        con.execute("SET preserve_insertion_order=false;")
        con.execute(f"PRAGMA memory_limit='{MAX_MEMORY}';")
        #con.register_filesystem(hopsfs_fsspec)

        out_dir = f"{DATA_FOLDER}/{ROW_SHORT}/genpart{len(PARTITIONS)}/"
        print(out_dir)

        if export:
            con.execute(f"""EXPORT DATABASE '{out_dir}' (FORMAT PARQUET);""")
            return

        if PARTITION:
            # Partition keys are the leading id columns, one per PARTITIONS entry.
            key_cols = ', '.join(f"id{idx}" for idx in range(len(PARTITIONS)))
            con.execute(f"""COPY (SELECT * FROM gendata) TO '{out_dir}' 
                        (FORMAT PARQUET, 
                        PARTITION_BY ({key_cols}), 
                        ROW_GROUP_SIZE {ROW_GROUP_SIZE},
                        ALLOW_OVERWRITE TRUE)""")
            return

        for file_no in range(FILES_PER_PARTITION):
            file_path = f"{out_dir}data_{file_no}.parquet"
            con.execute(f"""COPY (SELECT * FROM gendata) TO '{file_path}' 
                            (FORMAT PARQUET, 
                            ROW_GROUP_SIZE {ROW_GROUP_SIZE},
                            ALLOW_OVERWRITE TRUE)""")
#write_parquet(ROW_SHORT)

In [None]:
# print(WRITE_PATH)
# !ls -l {WRITE_PATH}
# #!ls -la /tmp/gendata/rowsize20m/genpart3/id0=0/id1=0/id2=0

In [None]:
import boto3

def upload_s3(path):
    """Upload every file below ``path`` to the S3 bucket ``BUCKET``.

    The directory tree is walked recursively and each file is stored under
    an object key equal to its local path (with ``/`` as separator), so the
    folder layout is mirrored in the bucket.

    Args:
        path: local directory to upload.
    """
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    # Create the client from the session: the module-level boto3.client()
    # uses the default session and would ignore the keys configured above.
    s3 = session.client('s3')
    print(f'Uploading {path} to S3')
    for root, _dirs, files in os.walk(path):
        for file in files:
            local_file = os.path.join(root, file)
            # S3 keys use forward slashes regardless of the local OS separator.
            s3.upload_file(local_file, BUCKET, local_file.replace(os.sep, '/'))
    print('Finished uploading to S3...')


In [None]:
import os
import requests
import hopsworks
from hops import hdfs

def copy_to_hdfs(src_path):
    """Copy the local folder ``src_path`` to HDFS.

    Logs in to Hopsworks with ``HOPS_HOST`` / ``HOPS_API_KEY``, makes sure
    the destination directory exists and then copies ``src_path`` into it,
    overwriting existing content.

    Args:
        src_path: local path to upload; it is also appended to ``HDFS_PATH``
            to form the destination.
    """
    # The login call is required; the project handle it returns is not used.
    hopsworks.login(host=HOPS_HOST, port=443, api_key_value=HOPS_API_KEY)

    # Target the parent of the mirrored path, otherwise the copy would nest
    # the source folder inside a folder of the same name.
    destination = ''.join((HDFS_PATH, src_path, '/../'))
    destination_missing = not hdfs.exists(destination)
    if destination_missing:
        hdfs.mkdir(destination)

    print(f'Uploading {src_path} to HDFS...')
    hdfs.copy_to_hdfs(src_path, destination, overwrite=True)
    print(f'Finished uploading to HDFS...')


## Generate Many Partitions in Many Folders

In [None]:
###########################
#    Start of generator   #
###########################

ROWS = 42000000  # number of rows passed to gendata()
ROW_SHORT = "16parts"  # sub-folder name below DATA_FOLDER for this run
PARTITIONS = [16]  # cardinality of each partition-key column (id0, id1, ...)
PARTITION = True # Set to False to avoid hive-style partitioning
ROW_GROUP_SIZE = 1000000  # parquet ROW_GROUP_SIZE used by write_parquet()
S3 = True  # S3 Uploads
HOPSFS = True  # HDFS Uploads
COLS = 20 # total number of columns: 25% each of integer id, float, md5 string and timestamp
FILES_PER_PARTITION = 1 # DuckDB writer threads; also the number of files written when PARTITION is False
EXPORT = False # True: write_parquet() runs EXPORT DATABASE instead of COPY

# gendata() creates round(COLS/4) id columns, and the partition keys are taken
# from those, so more keys than that would push the table beyond COLS columns.
if len(PARTITIONS) > round(COLS/4):
    print(f"Only a quarter of the columns are reserved for id columns, please reduce the number of partition keys to <= {round(COLS/4)}")

In [None]:
!mkdir -p {TMP_DIR}
!mkdir -p {DATA_FOLDER}


for i in range(len(PARTITIONS)):
    if not os.path.exists(f"{DATA_FOLDER}/{ROW_SHORT}/"):
        os.mkdir(f"{DATA_FOLDER}/{ROW_SHORT}/")
    if not os.path.exists(f"{DATA_FOLDER}/{ROW_SHORT}/genpart{i+1}/"):
        os.mkdir(f"{DATA_FOLDER}/{ROW_SHORT}/genpart{i+1}/")

WRITE_PATH = f"{DATA_FOLDER}/{ROW_SHORT}/genpart{len(PARTITIONS)}"
print(WRITE_PATH)

In [None]:
# Inspect the output root. The second listing hard-codes the path, so it only
# matches DATA_FOLDER while that is set to "data/exp5".
!ls -l {DATA_FOLDER}/
!ls -lh data/exp5/

In [None]:
# Run the pipeline: build the table in DuckDB, then write it out as parquet.
gendata(ROWS, PARTITIONS)
write_parquet(ROW_SHORT, EXPORT)

# Optional uploads of the freshly written folder, controlled by the S3 and
# HOPSFS switches in the configuration cell.
if S3:
    upload_s3(WRITE_PATH)
if HOPSFS:
    copy_to_hdfs(WRITE_PATH)

In [None]:
# Spot-check the output: list the folder of the first partition value (id0=0).
# This folder only exists after a partitioned write (PARTITION = True).
print(WRITE_PATH)
!ls -lh {WRITE_PATH}/id0=0/


# Generate Many Partitions in a Single Folder

In [None]:
import duckdb
import os
import shutil
from concurrent.futures import ThreadPoolExecutor
from hops import hdfs
#from pydoop import hdfs
#from fsspec.implementations.arrow import HadoopFileSystem

In [None]:
# Settings for the "many files in one folder" experiment.
NUMBER_OF_FILES = 100_000  # how many empty placeholder files to create
GEN_PATH = 'data/gen100k'  # local folder that receives the files
HDFS_PATH = "/Projects/testproj/Resources/"  # HDFS prefix used for the upload

In [None]:
# Create the target folder (equivalent to `mkdir -p`, without needing a shell).
os.makedirs(GEN_PATH, exist_ok=True)

# Create NUMBER_OF_FILES empty placeholder files; files that already exist
# are left untouched so the cell can be re-run cheaply.
for i in range(NUMBER_OF_FILES):
    part_file = f"{GEN_PATH}/part_{i}.test"
    if not os.path.exists(part_file):
        # Opening in write mode creates the empty file; the context manager
        # closes it, so no explicit write/close is needed.
        with open(part_file, mode="w"):
            pass

#upload_s3(GEN_PATH)
copy_to_hdfs(GEN_PATH)

In [None]:
# Inline version of the HDFS upload for GEN_PATH.
# NOTE(review): these statements repeat the body of copy_to_hdfs() line for
# line, and the previous cell already calls copy_to_hdfs(GEN_PATH) -- confirm
# whether this cell is still needed. Unlike the function it leaves src_path,
# project and target_path defined at notebook level.
src_path = GEN_PATH
project = hopsworks.login(host=HOPS_HOST,
                              port=443,
                              api_key_value=HOPS_API_KEY)
target_path = HDFS_PATH + src_path + '/../' # step back one directory so the source folder is not nested inside a folder of the same name
# Create the destination directory if it is not there yet.
if(not hdfs.exists(target_path)):
    hdfs.mkdir(target_path)
print(f'Uploading {src_path} to HDFS...')
hdfs.copy_to_hdfs(src_path, target_path, overwrite=True)
print(f'Finished uploading to HDFS...')