# Processing TAQ Data
Example of how to process a collection of compressed csv files that are arranged in a bespoke directory structure. The structure is **ROOT**/**YEAR**/**DATE**/**A-Z First Letter of Ticker**/**TICKER**.csv.gz. All files come from an S3 bucket.

## Architecture and Data Flow
![Architecture](images/csv_arch.png "Architecture")

To process the csv.gz files, first the provided tarball with data [algoseek-marketdata.tar.gz](algoseek-marketdata.tar.gz) is untarred and copied to an S3 bucket FinSpace can read from. To process the data files, the files are copied into the general purpose cluster, processed into memory by the cluster. The memory contents are then first saved to disk and then added to the managed database. Once the database has been updated the dataview of the database is updated to present the latest database state (with the added data), and the clusters using the database are also updated to use the latest state of the view. Finally the historical database (HDB) is queried to show the new data available for query from the HDB. 

## Algoseek LLC Data
Trade and Quote data has been provided by [AlgoSeek LLC](https://www.algoseek.com/), you can learn more about their data offerings from their home page.


In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import datetime
import json
import os
import subprocess

import boto3

from env import *
import pandas as pd
import pykx as kx
import awswrangler as wr

from managed_kx import *

In [2]:
# ----------------------------------------------------------------
# Names of the FinSpace resources this notebook works with
DB_NAME="DEMO_DB"
DBVIEW_NAME=f"{DB_NAME}_VIEW"
SCALING_GROUP_NAME="DEMO_SCALING_GROUP"
VOLUME_NAME="DEMO_SHARED_VOLUME"
CODEBASE="demo"
CLUSTER_NAME="demo_csv_cluster"

HDB_CLUSTER_NAME="demo_hdb_cluster"

# S3 Destinations
S3_CODE_PATH="code"
S3_DATA_PATH="data"
SOURCE_DATA_DIR="demo"
# ----------------------------------------------------------------
# Directory on the cluster where S3 objects are copied before being parsed
WORKING_DIR=f"/opt/kx/app/shared/{VOLUME_NAME}/{CLUSTER_NAME}"

# Local directory the sample data is synced to S3 from
LOCAL_DATA_HOME="algoseek-marketdata/us-equity-taq-faang"

## YOU MUST CHANGE THIS TO A DIRECTORY THAT WORKS FOR YOUR ACCOUNT
S3_DATA_HOME=f"s3://kdb-demo-{ACCOUNT_ID}-kms/algoseek-marketdata/us-equity-taq-faang"

# days supplied in Tarball
start = datetime.date(2021,1,4)
end = datetime.date(2021,1,5)

# business days between start and end (inclusive)
dlist = pd.date_range(start, end, freq='B')

# convert all to date
ALL_DATES = [stamp.date() for stamp in dlist]

# ----------------------------------------------------------------

# set pykx local q console width and height
# raw string: "\c" is not a valid Python escape sequence and triggers a
# SyntaxWarning on recent Python versions when written in a normal string
kx.q(r"\c 500 500")

pykx.Identity(pykx.q('::'))

In [3]:
# Build the boto3 session used for every FinSpace call in this notebook
if AWS_ACCESS_KEY_ID is not None:
    # explicit credentials were provided: hand them to the session directly
    print("Using variables ...")
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_session_token=AWS_SESSION_TOKEN,
    )
else:
    # no explicit credentials: create the session with boto3's defaults
    print("Using Defaults ...")
    session = boto3.Session()

# FinSpace service client bound to that session
client = session.client(service_name='finspace', endpoint_url=ENDPOINT_URL)

Using Defaults ...


# Stage TAQ Data to S3
Copy the supplied sample TAQ data to an S3 bucket. Then the data will be processed by GP cluster from the S3 location. 

In [4]:
# Remove any previously extracted copy so the tarball is unpacked into a clean directory
!rm -rf algoseek-marketdata

In [5]:
# Unpack the supplied sample data tarball into the current directory
# (expected to produce the algoseek-marketdata directory used as LOCAL_DATA_HOME)
!tar xzf algoseek-marketdata.tar.gz

In [6]:
# Stage TAQ data to S3
#
# The AWS CLI is run with argument lists (no shell), so paths need no quoting,
# and credentials are passed through the child environment rather than being
# written into a shell script string.
aws_env = os.environ.copy()
if AWS_ACCESS_KEY_ID is not None:
    explicit_credentials = {
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
        "AWS_SESSION_TOKEN": AWS_SESSION_TOKEN,
    }
    # skip unset values: exporting the literal text "None" (e.g. for a missing
    # session token) would make the CLI send an invalid credential
    aws_env.update({name: value for name, value in explicit_credentials.items() if value is not None})

# clear the destination, upload the local sample data, then list what was staged
stage_commands = [
    ["aws", "s3", "rm", "--recursive", S3_DATA_HOME, "--quiet"],
    ["aws", "s3", "sync", "--exclude", ".DS_Store", LOCAL_DATA_HOME, S3_DATA_HOME, "--quiet"],
    ["aws", "s3", "ls", "--recursive", f"{S3_DATA_HOME}/"],
]

# execute the S3 copy; every command runs even if an earlier one fails
exit_code = 0
for command in stage_commands:
    exit_code = subprocess.run(command, env=aws_env).returncode
    if exit_code != 0:
        print(f"WARNING: '{' '.join(command)}' exited with status {exit_code}")

# exit status of the final command (the listing); 0 means success
exit_code

2024-08-13 16:25:55   15021638 algoseek-marketdata/us-equity-taq-faang/2021/20210104/A/AMZN.csv.gz
2024-08-13 16:25:55   19321692 algoseek-marketdata/us-equity-taq-faang/2021/20210104/F/FB.csv.gz
2024-08-13 16:25:55   14132728 algoseek-marketdata/us-equity-taq-faang/2021/20210104/G/GOOG.csv.gz
2024-08-13 16:25:55   10610310 algoseek-marketdata/us-equity-taq-faang/2021/20210104/N/NFLX.csv.gz
2024-08-13 16:25:55   11285284 algoseek-marketdata/us-equity-taq-faang/2021/20210105/A/AMZN.csv.gz
2024-08-13 16:25:55   11544605 algoseek-marketdata/us-equity-taq-faang/2021/20210105/F/FB.csv.gz
2024-08-13 16:25:55    9736261 algoseek-marketdata/us-equity-taq-faang/2021/20210105/G/GOOG.csv.gz
2024-08-13 16:25:55    8014976 algoseek-marketdata/us-equity-taq-faang/2021/20210105/N/NFLX.csv.gz


0

# Before State of HDB
This is the state/contents of the HDB before we process the CSV files as new data. There will be a table (taq) with no data.

In [7]:
# Connect to the HDB cluster and capture its state before any data is added
hdb = get_pykx_connection(
    client,
    environmentId=ENV_ID,
    clusterName=HDB_CLUSTER_NAME,
    userName=KDB_USERNAME,
    boto_session=session,
)

tables = hdb("tables[]").py()

heavy_rule = '=' * 80
light_rule = '-' * 80

# inventory of tables in the database and rows in each
print(heavy_rule)
print("Tables and Counts")
display(hdb("tables[]!count each value each tables[]"))

# Queries shown for every table, in order: schema, rows per date,
# rows per date and ticker, first rows of the latest date
per_table_queries = (
    "meta {t}",
    "select rows:count i by date from {t}",
    "select rows:count i by date,Ticker from {t}",
    "select from {t} where date = max date, i<3",
)

for table_name in tables:
    print(heavy_rule)
    print(f'Table: {table_name}')
    print(light_rule)
    for query in per_table_queries:
        display(hdb(query.format(t=table_name)))


Tables and Counts


Table: taq
--------------------------------------------------------------------------------


Unnamed: 0_level_0,t,f,a
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
date,"""d""",,
Ticker,"""s""",,p
Timestamp,"""n""",,
EventType,"""s""",,
Price,"""f""",,
Quantity,"""j""",,
Exchange,"""s""",,
Conditions,"""s""",,


Unnamed: 0_level_0,rows
date,Unnamed: 1_level_1


Unnamed: 0_level_0,Unnamed: 1_level_0,rows
date,Ticker,Unnamed: 2_level_1


Unnamed: 0,date,Ticker,Timestamp,EventType,Price,Quantity,Exchange,Conditions


# Process from Cluster
Connect to the GP cluster and use it to process the list of external S3 files (csv.gz format). The list of S3 files to process will be given to the cluster and processed with the process_s3_csvgz function in q provided by this notebook.

## parse_csvgz
```
parse_csvgz:{[schema;file] (schema;enlist csv) 0: .Q.gz "c"$read1 hsym `$ string file};
```
This function will process a local file (csv.gz) into an in-memory table, taking a local file and schema as arguments. The file string is turned into a file handle using [hsym](https://code.kx.com/q/ref/hsym/), then the file contents are read using [read1](https://code.kx.com/q/ref/read1/) and uncompressed using [.Q.gz](https://code.kx.com/q/ref/dotq/#gz-gzip). The resulting csv stream is then parsed into a table using [enlist](https://code.kx.com/q/ref/enlist/) and the provided schema.

Arguments:   
- schema: string for the table [q reference](https://code.kx.com/q/ref/file-text/#load-csv)   
- file: file (csv.gz format) to parse   

## process_s3_csvgz
```
process_s3_csvgz:{[schema;work_dir;s3_object]
    r:.aws.s3.get_object[s3_object;work_dir];
    raze {t:parse_csvgz[x;`$y]; hdel hsym`$y; t}[schema] each r`containerFileDestinationPath
};
```
This function will process an S3 based file (csv.gz) into an in-memory table, leveraging the parse_csvgz function. The function will copy the S3 file to a working directory (an argument of the function), process it with parse_csvgz and clean up after itself by deleting the copied file.

The function copies the S3 object to the local working directory with [.aws.s3.get_object](https://docs.aws.amazon.com/finspace/latest/userguide/interacting-with-kdb-q-apis.html). Then gives the copied file location to the already defined function **parse_csvgz** to turn the file contents into a table (t) using the given schema, the file is then deleted with [hdel](https://code.kx.com/q/ref/hdel/) and the table t is returned. The [each](https://code.kx.com/q/ref/each/) function is used because the containerFileDestinationPath returned by [.aws.s3.get_object](https://docs.aws.amazon.com/finspace/latest/userguide/interacting-with-kdb-q-apis.html) is a column of a table returned by the get_object function, a list of files copied. 

Arguments:   
- schema: string for the table [q reference](https://code.kx.com/q/ref/file-text/#load-csv)   
- work_dir: path on local machine for where S3 object (files) will be copied to before   
- s3_object: S3 object (.csv.gz file) that is the file to read into a table             

# Parsing List of Files
Parse the list (slist) of S3 objects into one table

```
taq:raze{.Q.gc[]; process_s3_csvgz [x; y; z]}["DNSSFJSS";wd] peach slist;
```
The function uses [peach](https://code.kx.com/q/ref/each/) to process each S3 object in slist to a table using already defined function **process_s3_csvgz** creating a list of tables, then that list of tables is converted into one table using [raze](https://code.kx.com/q/ref/raze/).

For memory efficiency, there is a call to free up memory using [.Q.gc](https://code.kx.com/q/ref/dotq/#gc-garbage-collect).

# Add to Database: Create the Changeset
Creates a changeset for the database that contains the in-memory table (taq). The table is splayed to disk as a new date partition for the date being processed, and those files are then added to the database as a changeset.

Code run on the cluster saves in-memory tables to disk and then adds those files to the managed database as a changeset.

### diR
```
diR:{$[11h=type d:key x;raze x,.z.s each` sv/:x,/:d;d]};
```
This function will recursively list the contents of a given directory x

### nuke
```
nuke:hdel each desc diR@;
```
This function will delete all contents of the given directory and the given directory as well.

### pdpft
```
pdpft:{[d;p;f;t] 
    i:iasc t f; 
    tab:.Q.en[d;`. t]; 
    .[{[d;t;i;c;a]@[d;c;:;a t[c]i]}[d:.Q.par[d;p;t];tab;i;;]]peach flip(c;)(::;`p#)f=c:cols t; 
    @[d;`.d;:;f,c where not f=c]; t 
};
```
This is a parallel version of [.Q.dpft](https://code.kx.com/q/ref/dotq/#dpft-save-table); it uses the available secondary threads to save the splayed table columns in parallel.

### saveTables
```
saveTables:{[db;path;d]
    .aws.get_latest_sym_file[db;path];
    t:tables`.;
    t@:where `g=attr each t@\:`Ticker;
    {pdpft[hsym`$x;y;`Ticker;z]}[path;d] each tables`.;
/    {.Q.dpft[hsym`$x;y;`Ticker;z]}[path;d] each tables`.;

    dt:string d;
    dict:flip`input_path`database_path`change_type!(
        (`$path,dt;`$path,"sym");
        (`$"/",dt,"/";`$"/");`PUT`PUT);
    cid:.aws.create_changeset[db;dict];
    
    nuke hsym`$path,string[d];
    hdel hsym`$path,"sym";

    @[;`Ticker;`g#] each t;
    .Q.gc[];

    cid
};
```

This function will save all in-memory tables in the global namespace with a grouped (g) [attribute](https://code.kx.com/q/ref/attr/). The function prepares for saving tables by first copying the database's (**db**) most recent sym file using [get_latest_sym_file](https://docs.aws.amazon.com/finspace/latest/userguide/interacting-with-kdb-q-apis.html) to **path**. A list of tables in the global namespace with the grouped attribute is constructed and the tables are saved using **pdpft** (the parallel version of [.Q.dpft](https://code.kx.com/q/ref/dotq/#dpft-save-table) defined above) to **path**, which will also update the copied database sym file in **path**. To add the changeset, an inventory of files for the changeset is put into the dictionary **dict** and the changeset created (added) to the database with the [.aws.create_changeset](https://docs.aws.amazon.com/finspace/latest/userguide/interacting-with-kdb-q-apis.html) function. To clean up, the sym file and date directory are then deleted and finally memory is cleaned up with [.Q.gc](https://code.kx.com/q/ref/dotq/#gc-garbage-collect).

Arguments:
- db: name of database to update   
- path: file path for saving tables   
- d: date partition to create when saving   

In [8]:
# Ask FinSpace for the general purpose cluster's full connection string ...
conn_str = get_kx_connection_string(
    client,
    environmentId=ENV_ID,
    clusterName=CLUSTER_NAME,
    userName=KDB_USERNAME,
    boto_session=session,
)

# ... and split it into the pieces passed to the %%q magic
host, port, username, password = parse_connection_string(conn_str)

## Define Functions on Cluster
Use the q magic to open an IPC connection to the cluster and send the functions used for processing the data files.

### Functions Defined
- parse_csvgz   
- process_s3_csvgz    
- diR   
- nuke   
- pdpft   
- saveTables   

In [9]:
%%q --host $host --port $port --user $username --pass $password
\c 500 500 

/ parse_csvgz[schema;file]: parse one local gzip-compressed csv file into an in-memory table
/   schema - column type string handed to 0: (one character per column)
/   file   - local file to parse; it is stringed, turned into a file handle with hsym,
/            read with read1, uncompressed with .Q.gz and parsed as csv with a header row
parse_csvgz:{[schema;file] (schema;enlist csv) 0: .Q.gz "c"$read1 hsym `$ string file};

/ process_s3_csvgz[schema;work_dir;s3_object]: load one S3 csv.gz object into an in-memory table
/   schema    - column type string passed through to parse_csvgz
/   work_dir  - directory the S3 object is copied to by .aws.s3.get_object
/   s3_object - S3 object to fetch
/ The first path in the containerFileDestinationPath column of the get_object result is
/ parsed with parse_csvgz, that local copy is removed with hdel, and the table is returned.
/ NOTE(review): only the first returned path is processed; the markdown description
/ of this function shows an each over all returned paths - confirm which is intended.
process_s3_csvgz:{[schema;work_dir;s3_object]
    r:.aws.s3.get_object[s3_object;work_dir];
    {t:parse_csvgz[x;`$y]; hdel hsym`$y; t}[schema] first r`containerFileDestinationPath
    };

/ diR[x]: recursively list the contents of directory x
/ when key x is a symbol list (type 11h) the result is x followed by the recursive
/ listing of each child path; otherwise the result of key x is returned as is
diR:{$[11h=type d:key x;raze x,.z.s each` sv/:x,/:d;d]};
/ nuke[x]: hdel every path returned by diR for x, visiting them in descending order
/ (presumably so that directory contents are removed before the directory itself)
nuke:hdel each desc diR@;

/ pdpft[d;p;f;t]: save table t as a splayed partition, writing its columns in parallel with peach
/   d - database directory; also used as the location for enumeration by .Q.en
/   p - partition value passed to .Q.par
/   f - column the rows are sorted by (iasc) and that receives the `p# attribute
/   t - name of the table (looked up in the root namespace)
/ Each column is written to the directory returned by .Q.par[d;p;t] in sorted order,
/ then the .d file is written with f as the first column. Returns t.
/ NOTE(review): argument order mirrors .Q.dpft - confirm the expected types of d and p with callers.
pdpft:{[d;p;f;t] 
    i:iasc t f; 
    tab:.Q.en[d;`. t]; 
    .[{[d;t;i;c;a]@[d;c;:;a t[c]i]}[d:.Q.par[d;p;t];tab;i;;]]peach flip(c;)(::;`p#)f=c:cols t; 
    @[d;`.d;:;f,c where not f=c]; t 
    };

/ saveTables[db;path;d]: save in-memory tables to disk and add them to database db as a changeset
/   db   - name of the database to update
/   path - directory (string) the sym file and date partition are written under
/   d    - date partition to create
/ Steps: fetch the latest sym file of db into path, save the tables with pdpft keyed on
/ Ticker, create a changeset that PUTs the date directory and the sym file, delete the
/ local date directory and sym file, re-apply `g# on Ticker for the tables in t, and run
/ .Q.gc. Returns the result of .aws.create_changeset.
/ NOTE(review): t is filtered to tables whose Ticker column carries the `g attribute, but
/ the save step iterates over tables`. rather than t - confirm this is intended.
saveTables:{[db;path;d]
    .aws.get_latest_sym_file[db;path];
    t:tables`.;
    t@:where `g=attr each t@\:`Ticker;
    {pdpft[hsym`$x;y;`Ticker;z]}[path;d] each tables`.;
/    {.Q.dpft[hsym`$x;y;`Ticker;z]}[path;d] each tables`.;

    dt:string d;
    dict:flip`input_path`database_path`change_type!(
        (`$path,dt;`$path,"sym");
        (`$"/",dt,"/";`$"/");`PUT`PUT);
    cid:.aws.create_changeset[db;dict];
    
    nuke hsym`$path,string[d];
    hdel hsym`$path,"sym";

    @[;`Ticker;`g#] each t;
    .Q.gc[];

    cid
    };

In [10]:
# Connection to the general purpose cluster that does the parsing and saving
gp = get_pykx_connection(client, 
                        environmentId=ENV_ID, clusterName=CLUSTER_NAME, 
                        userName=KDB_USERNAME, boto_session=session)

# Number of files for a date to process at a time, will still process all files
# Allows for monitoring process more easily when there are thousands of files for a date
LEN_CHUNK=500

# q expression that parses every S3 object in the cluster-side list `slist` into one table.
# "DNSSFJSS" is the column type string handed to parse_csvgz; `wd` is the working directory.
PARSE_SLIST_Q = 'raze{process_s3_csvgz [x; y; z]}["DNSSFJSS";wd] peach slist'

for d in ALL_DATES:
    print(f'Date: {d}')
    started = datetime.datetime.now()

    # search for S3 objects
    s3_search=f'{S3_DATA_HOME}/{d.year}/{d.strftime("%Y%m%d")}/*/*.csv.gz'

    # Get list of S3 objects that will be processed into the table
    slist = wr.s3.list_objects(s3_search)

    # Number of files to be processed
    print(f"{d} S3 Objects: {len(slist)}")
    display(slist[:1])
    print('...')
    display(slist[-1:])
    
    num_files = len(slist)

    # Continue if list is empty
    if num_files == 0:
        print(f'List is empty for: {d}')
        continue
          
    # send working directory to cluster, this is where each S3 object will be copied to before processing into a table
    gp['wd'] = WORKING_DIR

    # process in chunks; slist itself is left untouched
    for offset in range(0, num_files, LEN_CHUNK):
        # send chunk of slist to cluster
        this_chunk = slist[offset:offset + LEN_CHUNK]
        gp['slist'] = this_chunk

        print(f"{datetime.datetime.now()}: Processing chunk: {len(this_chunk)} remaining: {num_files - offset}", flush=True)

        if offset == 0:
            # first chunk creates the table
            gp('taq:' + PARSE_SLIST_Q)
        else:
            # later chunks are parsed into a scratch table, appended, then dropped
            gp('more_taq:' + PARSE_SLIST_Q)
            gp('`taq insert more_taq')
            gp('delete more_taq from `.')

        # new length of taq
        cnt = gp('count taq').py()
        print(f"     taq: {cnt:,}")

        # garbage collect on the cluster and in the local q process
        gp('.Q.gc[]')
        kx.q('.Q.gc[]')

    print()

    # display workspace after ingest
    display("After Ingest....")
    display( gp(".Q.w[]") )

    # Delete the Date column (this will be the partition as date in schema)
    gp('delete Date from `taq')
    display( gp('.Q.gc[]').py() )

    # Group attribute on Ticker
    gp('update `g#Ticker from `taq')
    display( gp('.Q.gc[]').py() )

    # tables we have now
    print("Tables and Counts", flush=True)
    display( gp("tables[]!count each value each tables[]") )

    # display workspace before saving
    display("Before Saving....")
    display( gp(".Q.w[]") )

    # send date to cluster
    print("Saving changeset")
    gp["dt"] = d

    # save tables and collect the changeset ID
    cmd = f'cid:saveTables["{DB_NAME}";"{WORKING_DIR}/";dt]'
    gp(cmd)

    print( gp('cid').py() ) 
    
    # Newly created changeset ID
    changeset_id = str(gp("cid`id"))
    display( f'New Changeset: {changeset_id}' )

    # Wait for the changeset to ingest
    # uses the session-bound client created earlier, like every other FinSpace call in this notebook
    wait_for_changeset_status(client, environmentId=ENV_ID, databaseName=DB_NAME, changesetId=changeset_id, show_wait=True)
    print("**Done**")
    
    finished = datetime.datetime.now()
    print(f"Elapsed Time for {d}: {finished - started}")

    # display workspace after saving
    display("After Saving....")
    display( gp(".Q.w[]") )

    # CLEAN UP ---------------------------------
    # delete table
    gp('delete taq from `.')
    gp('.Q.gc[]')
    kx.q('.Q.gc[]')
    


Date: 2021-01-04
2021-01-04 S3 Objects: 4


['s3://kdb-demo-829845998889-kms/algoseek-marketdata/us-equity-taq-faang/2021/20210104/A/AMZN.csv.gz']

...


['s3://kdb-demo-829845998889-kms/algoseek-marketdata/us-equity-taq-faang/2021/20210104/N/NFLX.csv.gz']

2024-08-13 16:25:59.581899: Processing chunk: 4 remaining: 4
     taq: 8,970,726



'After Ingest....'

2885681152

402653184

Tables and Counts


'Before Saving....'

Saving changeset
{'id': b'9MimR9zTlknSQvrQWoe3jg', 'status': b'PENDING'}


'New Changeset: 9MimR9zTlknSQvrQWoe3jg'

Status is IN_PROGRESS, total wait 0:00:00, waiting 10 sec ...
Status is IN_PROGRESS, total wait 0:00:10, waiting 10 sec ...
**Done**
Elapsed Time for 2021-01-04: 0:00:35.250433


'After Saving....'

Date: 2021-01-05
2021-01-05 S3 Objects: 4


['s3://kdb-demo-829845998889-kms/algoseek-marketdata/us-equity-taq-faang/2021/20210105/A/AMZN.csv.gz']

...


['s3://kdb-demo-829845998889-kms/algoseek-marketdata/us-equity-taq-faang/2021/20210105/N/NFLX.csv.gz']

2024-08-13 16:26:34.772127: Processing chunk: 4 remaining: 4
     taq: 6,174,244



'After Ingest....'

1342177280

201326592

Tables and Counts


'Before Saving....'

Saving changeset
{'id': b'qsimSBjoiigHEVs3XptVXw', 'status': b'PENDING'}


'New Changeset: qsimSBjoiigHEVs3XptVXw'

Status is IN_PROGRESS, total wait 0:00:00, waiting 10 sec ...
**Done**
Elapsed Time for 2021-01-05: 0:00:20.144780


'After Saving....'

# Update the HDB
Now that the database has been populated with new data, update the HDB's view to reflect the latest changeset_id and query its contents to confirm the data from the CSVs are now in the tables of the database HDB is serving up.

In [11]:
# Fetch every changeset recorded for the database
c_set_list = list_kx_changesets(client, environmentId=ENV_ID, databaseName=DB_NAME)

if len(c_set_list) == 0:
    # no changesets, do NOT create view
    print(f"No changeset in database: {DB_NAME}, Dataview {DBVIEW_NAME} not created")        
else:
    # order by creation time; the newest changeset is then the last entry
    c_set_list = sorted(c_set_list, key=lambda changeset: changeset['createdTimestamp'])
    latest_changeset = c_set_list[-1]['changesetId']

    # cache all of the database on the shared volume
    # (an "onDemand" flag could be added here to cache on read instead)
    whole_db_segments = [
        {
            'volumeName': VOLUME_NAME,
            'dbPaths': ['/*'],
        }
    ]

    # Look up the current dataview, if there is one
    resp = get_kx_dataview(client=client, environmentId=ENV_ID, databaseName=DB_NAME, dataviewName=DBVIEW_NAME)

    if resp is None:
        # no dataview yet: create it on the latest changeset
        resp = client.create_kx_dataview(
            environmentId=ENV_ID,
            databaseName=DB_NAME,
            dataviewName=DBVIEW_NAME,
            azMode='SINGLE',
            availabilityZoneId=AZ_ID,
            changesetId=latest_changeset,
            segmentConfigurations=whole_db_segments,
            autoUpdate=False,
            description='Dataview of database',
        )
    elif resp['changesetId'] == latest_changeset:
        # dataview is already on the latest changeset: nothing to do
        print(f"Dataview {DBVIEW_NAME} exists with current changeset: {latest_changeset}")
    else:
        # dataview exists on an older changeset: move it forward
        print(f"Dataview {DBVIEW_NAME} exists but needs updating, updating...")
        resp = client.update_kx_dataview(
            environmentId=ENV_ID,
            databaseName=DB_NAME,
            dataviewName=DBVIEW_NAME,
            changesetId=latest_changeset,
            segmentConfigurations=whole_db_segments,
        )


Dataview DEMO_DB_VIEW exists but needs updating, updating...


In [12]:
# Wait for the dataview to be ready, printing progress while waiting;
# the value returned by the helper is shown as the cell output
wait_for_dataview_status(
    client=client,
    environmentId=ENV_ID,
    databaseName=DB_NAME,
    dataviewName=DBVIEW_NAME,
    show_wait=True,
)

Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:00:00, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:00:30, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:01:00, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:01:30, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:02:00, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:02:30, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:03:00, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:03:30, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:04:00, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:04:30, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:05:00, waiting 30 sec ...
Dataview: DEMO_DB_VIEW status is UPDATING, total wait 0:05:30, waiting 30 sec ...
Dataview: DEMO_D

{'databaseName': 'DEMO_DB',
 'dataviewName': 'DEMO_DB_VIEW',
 'azMode': 'SINGLE',
 'availabilityZoneId': 'use1-az6',
 'changesetId': 'qsimSBjoiigHEVs3XptVXw',
 'segmentConfigurations': [{'dbPaths': ['/*'],
   'volumeName': 'DEMO_SHARED_VOLUME',
   'onDemand': False}],
 'activeVersions': [{'changesetId': 'qsimSBjoiigHEVs3XptVXw',
   'segmentConfigurations': [{'dbPaths': ['/*'],
     'volumeName': 'DEMO_SHARED_VOLUME',
     'onDemand': False}],
   'attachedClusters': [],
   'createdTimestamp': datetime.datetime(2024, 8, 13, 16, 26, 56, 661000, tzinfo=tzlocal()),
   'versionId': '2simSDGKpYgeMZHWdb7nBg'},
  {'changesetId': 'bsimNHCWawQBAubqAbhyBA',
   'segmentConfigurations': [{'dbPaths': ['/*'],
     'volumeName': 'DEMO_SHARED_VOLUME',
     'onDemand': False}],
   'attachedClusters': ['demo_csv_cluster', 'demo_hdb_cluster'],
   'createdTimestamp': datetime.datetime(2024, 8, 13, 15, 53, 10, 488000, tzinfo=tzlocal()),
   'versionId': 'fMimOLwsU0PXXjOjWnq3yg'}],
 'description': 'Dataview of

In [13]:
# Point the HDB cluster at the updated dataview of the database,
# rolling the change out across the cluster
hdb_databases = [{'databaseName': DB_NAME, 'dataviewName': DBVIEW_NAME}]

resp = client.update_kx_cluster_databases(
    environmentId=ENV_ID,
    clusterName=HDB_CLUSTER_NAME,
    databases=hdb_databases,
    deploymentConfiguration={'deploymentStrategy': 'ROLLING'},
)

In [14]:
# Wait for the HDB cluster update to complete, printing progress while waiting;
# the value returned by the helper is shown as the cell output
wait_for_cluster_status(
    client,
    environmentId=ENV_ID,
    clusterName=HDB_CLUSTER_NAME,
    show_wait=True,
)

Cluster: demo_hdb_cluster status is UPDATING, total wait 0:00:00, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:00:30, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:01:00, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:01:30, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:02:00, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:02:30, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:03:00, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:03:30, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:04:00, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:04:30, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:05:00, waiting 30 sec ...
Cluster: demo_hdb_cluster status is UPDATING, total wait 0:05:30,

{'status': 'RUNNING',
 'clusterName': 'demo_hdb_cluster',
 'clusterType': 'HDB',
 'volumes': [{'volumeName': 'DEMO_SHARED_VOLUME', 'volumeType': 'NAS_1'}],
 'databases': [{'databaseName': 'DEMO_DB',
   'dataviewConfiguration': {'dataviewName': 'DEMO_DB_VIEW',
    'dataviewVersionId': '2simSDGKpYgeMZHWdb7nBg',
    'changesetId': 'qsimSBjoiigHEVs3XptVXw',
    'segmentConfigurations': [{'dbPaths': ['/*'],
      'volumeName': 'DEMO_SHARED_VOLUME',
      'onDemand': False}]}}],
 'clusterDescription': 'Created with create_all notebook',
 'releaseLabel': '1.0',
 'vpcConfiguration': {'vpcId': 'vpc-0fe2b9c50f3ad382f',
  'securityGroupIds': ['sg-0c99f1cfb9c3c7fd9'],
  'subnetIds': ['subnet-04052219ec25b062b'],
  'ipAddressType': 'IP_V4'},
 'commandLineArguments': [{'key': 's', 'value': '2'}],
 'executionRole': 'arn:aws:iam::829845998889:role/kdb-all-user',
 'lastModifiedTimestamp': datetime.datetime(2024, 8, 13, 16, 40, 34, 471000, tzinfo=tzlocal()),
 'azMode': 'SINGLE',
 'availabilityZoneId': '

# Query the HDB 
Show the new data by querying the tables in the HDB. The contents now reflect the data that was just added to the database.

In [15]:
# Reconnect to the HDB cluster and show its state now that the data has been added
hdb = get_pykx_connection(
    client,
    environmentId=ENV_ID,
    clusterName=HDB_CLUSTER_NAME,
    userName=KDB_USERNAME,
    boto_session=session,
)

tables = hdb("tables[]").py()

heavy_rule = '=' * 80
light_rule = '-' * 80

# inventory of tables in the database and rows in each
print(heavy_rule)
print("Tables and Counts")
display(hdb("tables[]!count each value each tables[]"))

# Queries shown for every table, in order: schema, first rows of the earliest
# and latest dates, rows per date, rows per date and ticker
per_table_queries = (
    "meta {t}",
    "select from {t} where date = min date, i<5",
    "select from {t} where date = max date, i<5",
    "select rows:count i by date from {t}",
    "select rows:count i by date,Ticker from {t}",
)

for table_name in tables:
    print(heavy_rule)
    print(f'Table: {table_name}')
    print(light_rule)
    for query in per_table_queries:
        display(hdb(query.format(t=table_name)))


Tables and Counts


Table: taq
--------------------------------------------------------------------------------


Unnamed: 0_level_0,t,f,a
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
date,"""d""",,
Ticker,"""s""",,p
Timestamp,"""n""",,
EventType,"""s""",,
Price,"""f""",,
Quantity,"""j""",,
Exchange,"""s""",,
Conditions,"""s""",,


Unnamed: 0,date,Ticker,Timestamp,EventType,Price,Quantity,Exchange,Conditions


Unnamed: 0,date,Ticker,Timestamp,EventType,Price,Quantity,Exchange,Conditions
,,,,,,,,
0.0,2021.01.05,AMZN,0D04:00:00.021680902,TRADE,3190.01,63.0,ARCA,80000401.0
1.0,2021.01.05,AMZN,0D04:00:00.023083159,QUOTE BID,2000f,400.0,ARCA,1.0
2.0,2021.01.05,AMZN,0D04:00:00.023083159,QUOTE ASK,0f,0.0,ARCA,1.0
3.0,2021.01.05,AMZN,0D04:00:00.023083159,QUOTE BID NB,2000f,400.0,ARCA,1.0
4.0,2021.01.05,AMZN,0D04:00:00.023121746,QUOTE BID,2600f,100.0,ARCA,1.0


Unnamed: 0_level_0,rows
date,Unnamed: 1_level_1
2021.01.04,8970726
2021.01.05,6174244


Unnamed: 0_level_0,Unnamed: 1_level_0,rows
date,Ticker,Unnamed: 2_level_1
2021.01.04,AMZN,2221268
2021.01.04,FB,2913516
2021.01.04,GOOG,2245605
2021.01.04,NFLX,1590337
2021.01.05,AMZN,1685421
2021.01.05,FB,1720887
2021.01.05,GOOG,1550965
2021.01.05,NFLX,1216971


In [16]:
# Record when the notebook was last executed
last_run = datetime.datetime.now()
print(f"Last Run: {last_run}")

Last Run: 2024-08-13 16:40:43.019225
