# TorQ: Create Everything
This notebook will use the AWS boto3 APIs to create the needed resources for a TorQ based application. The notebook will first clone the relevant GitHub code (TorQ and TorQ Amazon FinSpace Starter Pack) then proceed to create the necessary AWS resources.

Once you have created all clusters, you can see how to query for data through the gateway in the [pykx_query_all](pykx_query_all.ipynb) notebook.

To cleanup (delete) all resources, run the [delete_all](delete_all.ipynb) notebook.

## AWS Resources Created
- Database   
- Changeset to add data to database   
- Scaling Group that will contain all clusters   
- Shared Volume   
- Dataview of database on the shared volume   
- Clusters

This notebook is based on the TorQ Amazon FinSpace starter pack but uses Scaling Groups and Shared Volumes for cost savings.

[TorQ Amazon FinSpace Starter Pack](https://dataintellecttech.github.io/TorQ-Amazon-FinSpace-Starter-Pack/)

### Branches

**TorQ-Amazon-FinSpace-Starter-Pack**: v1.0.2   
**TorQ**: v5.0.3

**Note**: For other branches, be sure to update the git clone statements below.

In [None]:
# Remove any previously built torq_app*.zip archive so the later zip step starts from a clean file
!rm -rf torq_app*.zip 

In [None]:
# Delete earlier checkouts of both repositories so the clones below start fresh
!rm -rf TorQ TorQ-Amazon-FinSpace-Starter-Pack

In [None]:
# Shallow clone (--depth 1) of the starter pack at v1.0.2;
# advice.detachedHead=false silences git's detached-HEAD hint
!git -c advice.detachedHead=false clone --depth 1 --branch  v1.0.2 https://github.com/DataIntellectTech/TorQ-Amazon-FinSpace-Starter-Pack.git 

In [None]:
# Shallow clone (--depth 1) of TorQ at v5.0.3;
# advice.detachedHead=false silences git's detached-HEAD hint
!git -c advice.detachedHead=false clone --depth 1 --branch v5.0.3 https://github.com/DataIntellectTech/TorQ.git 

In [None]:
# Create a symbolic link named finspace_torq.q inside the starter pack checkout
# (-s symbolic, -f replace an existing entry). The relative target ../finspace_torq.q
# is resolved from the checkout directory, so it points at finspace_torq.q in the
# current working directory.
!ln -sf ../finspace_torq.q TorQ-Amazon-FinSpace-Starter-Pack
# this is the one modification over what is in the starter-pack on github

In [None]:
import datetime
import json
import os
import subprocess

import boto3
import pandas as pd
import pykx as kx

from managed_kx import *
from env import *

from clusters import *

# ----------------------------------------------------------------

# Directory names of the two cloned code bases
TORQ_CODEBASE = "TorQ"
TORQ_FINSPACE_CODEBASE = "TorQ-Amazon-FinSpace-Starter-Pack"

# Local directory whose contents are staged to S3 as the database's data
SOURCE_DATA_DIR = f"{TORQ_FINSPACE_CODEBASE}/hdb"

# Base name of the zip archive that carries the application code
CODEBASE = "torq_app"

# Key prefixes used under the S3 bucket
S3_CODE_PATH = "code"
S3_DATA_PATH = "data"

# Host type requested for the scaling group
NODE_TYPE = "kx.sg.4xlarge"

# Database / dataview pairing passed to every cluster
DATABASE_CONFIG = [
    {
        'databaseName': DB_NAME,
        'dataviewName': DBVIEW_NAME,
    },
]

# Where the zipped code lives in S3
CODE_CONFIG = {
    's3Bucket': S3_BUCKET,
    's3Key': f'{S3_CODE_PATH}/{CODEBASE}.zip',
}

# NAS_1 settings used when the shared volume is created
NAS1_CONFIG = {
    'type': 'SSD_250',
    'size': 1200,
}

In [None]:
# Using credentials and create service client
# boto3 session built with no explicit arguments
session = boto3.Session()

# FinSpace service client; every Managed kdb Insights call below goes through it
client = session.client('finspace')

# Create the Database
Create a database from the supplied data in hdb.tar.gz.  

## Stage HDB Data on S3
Using AWS cli, copy hdb to staging bucket

In [None]:
S3_DEST=f"s3://{S3_BUCKET}/{S3_DATA_PATH}/{SOURCE_DATA_DIR}/"

# Hand any explicit credentials to the AWS CLI through the child process
# environment instead of pasting them into shell text: the secrets never pass
# through a shell, and unusual characters in a key cannot break the command.
aws_env = os.environ.copy()
if AWS_ACCESS_KEY_ID is not None:
    explicit_creds = {
        'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,
        'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,
        'AWS_SESSION_TOKEN': AWS_SESSION_TOKEN,
    }
    # Skip unset values (e.g. no session token with long-lived keys) rather than
    # exporting the literal string "None"
    aws_env.update({k: str(v) for k, v in explicit_creds.items() if v is not None})

# Copy the local HDB directory to the staging location; .DS_Store files are left behind
sync_result = subprocess.run(
    ["aws", "s3", "sync", "--quiet", "--exclude", ".DS_Store", SOURCE_DATA_DIR, S3_DEST],
    env=aws_env, capture_output=True, text=True,
)
print(sync_result.stdout, end="")
if sync_result.returncode != 0:
    # Stop here: the changeset created later reads from S3_DEST and would fail
    # in a far less obvious way if the data never arrived
    raise RuntimeError(
        f"aws s3 sync to {S3_DEST} failed (exit {sync_result.returncode}): {sync_result.stderr.strip()}"
    )

# Show what is now staged (informational only, so a failure is reported but not fatal)
ls_result = subprocess.run(
    ["aws", "s3", "ls", S3_DEST],
    env=aws_env, capture_output=True, text=True,
)
print(ls_result.stdout, end="")
if ls_result.returncode != 0:
    print(f"aws s3 ls {S3_DEST} exited with {ls_result.returncode}: {ls_result.stderr.strip()}")

## Create Managed Database
Using the AWS APIs, create a managed database in Managed kdb Insights.

In [None]:
# Look the database up first so that re-running the notebook reuses it
# instead of attempting a second create.
try:
    resp = client.get_kx_database(environmentId=ENV_ID, databaseName=DB_NAME)
    create_db = False
except client.exceptions.ResourceNotFoundException:
    # Only "not found" means the database has to be created; any other failure
    # (credentials, throttling, validation) is allowed to surface unchanged
    create_db = True

if create_db:
    print(f"CREATING Database: {DB_NAME}")
    resp = client.create_kx_database(environmentId=ENV_ID, databaseName=DB_NAME, description="Basictick kdb database")
    print(f"CREATED Database: {DB_NAME}")

# Drop the HTTP bookkeeping so only the database description is printed
resp.pop('ResponseMetadata', None)

print(json.dumps(resp,sort_keys=True,indent=4,default=str))

## Add HDB Data to Database
Add the data in the local hdb directory to the managed database using the changeset mechanism. The Data will be copied to S3 then ingested with the create-kx-changeset API.

In [None]:
# Number of changesets currently recorded for the database
c_set_list = list_kx_changesets(
    client,
    environmentId=ENV_ID,
    databaseName=DB_NAME,
)
len(c_set_list)

In [None]:
# Add the staged data as a changeset only when the database has none yet;
# otherwise reuse the most recently created changeset.
c_set_list = list_kx_changesets(client, environmentId=ENV_ID, databaseName=DB_NAME)

if len(c_set_list) == 0:
    changes=[]

    # One PUT request per top-level entry of the local HDB directory;
    # sorted() keeps the request order stable between runs
    for f in sorted(os.listdir(SOURCE_DATA_DIR)):
        # .DS_Store is excluded from the S3 sync, so there is no staged object
        # for it and a PUT request would point at a missing S3 path
        if f == ".DS_Store":
            continue

        if os.path.isdir(f"{SOURCE_DATA_DIR}/{f}"):
            # directories map to a same-named directory in the database
            changes.append( { 'changeType': 'PUT', 's3Path': f"{S3_DEST}{f}/", 'dbPath': f"/{f}/" } )
        else:
            # plain files go to the database root
            changes.append( { 'changeType': 'PUT', 's3Path': f"{S3_DEST}{f}", 'dbPath': "/" } )

    resp = client.create_kx_changeset(environmentId=ENV_ID, databaseName=DB_NAME, 
        changeRequests=changes)

    resp.pop('ResponseMetadata', None)
    changeset_id = resp['changesetId']

    print("Changeset...")
    print(json.dumps(resp,sort_keys=True,indent=4,default=str))
else:
    # changesets already exist: set changeset_id to the last created one
    c_set_list=sorted(c_set_list, key=lambda d: d['createdTimestamp']) 
    changeset_id=c_set_list[-1]['changesetId']
    print(f"Using Last changeset: {changeset_id}")


In [None]:
# Wait on the changeset selected above before moving on
wait_for_changeset_status(
    client,
    environmentId=ENV_ID,
    databaseName=DB_NAME,
    changesetId=changeset_id,
    show_wait=True,
)
print("**Done**")

In [None]:
# Report every changeset of the database, oldest first, with its change requests.
c_set_list = list_kx_changesets(client, environmentId=ENV_ID, databaseName=DB_NAME)

# An empty list is flagged in the banner
note_str = "<<Could not get changesets>>" if len(c_set_list) == 0 else ""

print(100*"=")
print(f"Database: {DB_NAME}, Changesets: {len(c_set_list)} {note_str}")
print(100*"=")

# sort by create time
c_set_list = sorted(c_set_list, key=lambda d: d['createdTimestamp']) 

for c in c_set_list:
    c_set_id = c['changesetId']
    print(f"  Changeset: {c_set_id}: Created: {c['createdTimestamp']} ({c['status']})")

    # The change requests are read from the per-changeset lookup
    c_rqs = client.get_kx_changeset(environmentId=ENV_ID, databaseName=DB_NAME, changesetId=c_set_id)['changeRequests']

    # One table row per change request, rendered without the index column
    chs_pdf = pd.DataFrame(c_rqs).style.hide(axis='index')
    display(chs_pdf)

# Create Scaling Group
The scaling group represents the total compute available to the application. All clusters will be placed into the scaling group and share the compute and memory of the scaling group.

In [None]:
# Look the scaling group up first; it is created only when the lookup returns nothing
resp = get_kx_scaling_group(
    client=client,
    environmentId=ENV_ID,
    scalingGroupName=SCALING_GROUP_NAME,
)

if resp is not None:
    print(f"Scaling Group {SCALING_GROUP_NAME} exists")
else:
    resp = client.create_kx_scaling_group(
        environmentId=ENV_ID,
        scalingGroupName=SCALING_GROUP_NAME,
        hostType=NODE_TYPE,
        availabilityZoneId=AZ_ID,
    )

In [None]:
# Display the scaling group result: the lookup value if it already existed, otherwise the create response
resp

# Create Shared Volume
The shared volume is a common storage device for the application. Every cluster using the shared volume will have a writable directory named after the cluster, and can read the directories named after other clusters in the application using the volume.

In [None]:
# Look the volume up first; it is created only when the lookup returns nothing
resp = get_kx_volume(
    client=client,
    environmentId=ENV_ID,
    volumeName=VOLUME_NAME,
)

if resp is not None:
    print(f"Volume {VOLUME_NAME} exists")
else:
    resp = client.create_kx_volume(
        environmentId=ENV_ID,
        volumeName=VOLUME_NAME,
        volumeType='NAS_1',
        nas1Configuration=NAS1_CONFIG,
        description='Shared volume between TP and RDB',
        azMode='SINGLE',
        availabilityZoneIds=[AZ_ID],
    )

In [None]:
# Display the volume result: the lookup value if it already existed, otherwise the create response
resp

# Wait for Volume and Scaling Group
Before proceeding to use Volumes and Scaling groups, wait for their creation to complete.

Volume will be used by the dataview.    
Dataview and Scaling Group will be used by the clusters


In [None]:
# Scaling group first ...
wait_for_scaling_group_status(
    client=client,
    environmentId=ENV_ID,
    scalingGroupName=SCALING_GROUP_NAME,
    show_wait=True,
)
print("** DONE **")

# ... then the shared volume
wait_for_volume_status(
    client=client,
    environmentId=ENV_ID,
    volumeName=VOLUME_NAME,
    show_wait=True,
)
print("** DONE **")

# Create Dataview
Create a dataview, for a specific (static) version of the database and have all of its data cached using the shared volume.

In [None]:
# Create the dataview only when it does not exist yet.
# NOTE: an existing dataview is reused as-is; its changeset is not compared
# against the latest one.
resp = get_kx_dataview(client=client, environmentId=ENV_ID, databaseName=DB_NAME, dataviewName=DBVIEW_NAME)

if resp is None:
    # Fetch the changesets here instead of relying on a list left behind by an
    # earlier cell, which may be stale or empty depending on which cells were run
    c_set_list = list_kx_changesets(client, environmentId=ENV_ID, databaseName=DB_NAME)

    if len(c_set_list) == 0:
        raise RuntimeError(
            f"Database {DB_NAME} has no changesets; add data before creating dataview {DBVIEW_NAME}"
        )

    # sort changeset list by create time so the last entry is the newest
    c_set_list = sorted(c_set_list, key=lambda d: d['createdTimestamp']) 

    resp = client.create_kx_dataview(
        environmentId = ENV_ID, 
        databaseName=DB_NAME, 
        dataviewName=DBVIEW_NAME,
        azMode='SINGLE',
        availabilityZoneId=AZ_ID,
        # pin the view to the newest changeset; autoUpdate=False keeps it there
        changesetId=c_set_list[-1]['changesetId'],
        segmentConfigurations=[
            { 
                # every database path is placed on the shared volume
                'dbPaths': ['/*'],
                'volumeName': VOLUME_NAME
            }
        ],
        autoUpdate=False,
        description = 'Dataview of database'
    )
else:
    print(f"Dataview {DBVIEW_NAME} exists")        

In [None]:
# wait for the view to create
wait_for_dataview_status(client=client, environmentId=ENV_ID, databaseName=DB_NAME, dataviewName=DBVIEW_NAME, show_wait=True)
print("** DONE **")

# Create Clusters
With foundation resources now completed, create the needed clusters for the application.

## Stage Code to S3
Code to be used in this application must be staged to an S3 bucket the service can read from, that code will then be deployed to the clusters as part of their creation workflow.

In [None]:
# zip both code bases into one archive, leaving out data, VCS metadata, tests, docs and tooling directories
os.system(f"zip -q -r {CODEBASE}.zip {TORQ_CODEBASE}/ {TORQ_FINSPACE_CODEBASE}/ -x '*.ipynb_checkpoints*' -x '*/hdb/*' -x '*.git*' -x '*/tests/*' -x '*/terraform-deployment/*' -x '*/docs/*' -x '*/lib/*' -x '*/html/*' -x '*/datadog/*'  -x '*/monit/*'")

# Upload to the S3_CODE_PATH prefix, the same prefix CODE_CONFIG is built from,
# so the clusters are pointed at exactly the object written here
code_prefix = f"s3://{S3_BUCKET}/{S3_CODE_PATH}/"
code_dest = f"{code_prefix}{CODEBASE}.zip"

# Hand any explicit credentials to the AWS CLI through the child process
# environment instead of pasting them into shell text
aws_env = os.environ.copy()
if AWS_ACCESS_KEY_ID is not None:
    explicit_creds = {
        'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,
        'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,
        'AWS_SESSION_TOKEN': AWS_SESSION_TOKEN,
    }
    # Skip unset values rather than exporting the literal string "None"
    aws_env.update({k: str(v) for k, v in explicit_creds.items() if v is not None})

# copy code to S3
cp_result = subprocess.run(
    ["aws", "s3", "cp", "--exclude", ".DS_Store", f"{CODEBASE}.zip", code_dest],
    env=aws_env, capture_output=True, text=True,
)
print(cp_result.stdout, end="")
if cp_result.returncode != 0:
    # Without the archive in S3 every cluster creation that follows would fail
    raise RuntimeError(
        f"aws s3 cp to {code_dest} failed (exit {cp_result.returncode}): {cp_result.stderr.strip()}"
    )

# Show what is now staged (informational only, so a failure is reported but not fatal)
ls_result = subprocess.run(
    ["aws", "s3", "ls", code_prefix],
    env=aws_env, capture_output=True, text=True,
)
print(ls_result.stdout, end="")
if ls_result.returncode != 0:
    print(f"aws s3 ls {code_prefix} exited with {ls_result.returncode}: {ls_result.stderr.strip()}")

In [None]:
# Walk the cluster definitions in order, creating each cluster that is missing.
for c in clusters:
    # A "WAIT" entry creates nothing; it waits on the named cluster before the loop moves on
    if c['type'] == "WAIT":
        wait_for_cluster_status(client, environmentId=ENV_ID, clusterName=c['name'], show_wait=True)
        continue

    cluster_name, cluster_type = c['name'], c['type']
    cluster_init, cluster_args = c['init'], c['args']

    # Leave clusters that are already present untouched
    resp = get_kx_cluster(client, environmentId=ENV_ID, clusterName=cluster_name)
    if resp is not None:
        print(f"Cluster: {cluster_name} already exists")
        continue

    print(f"Creating: {cluster_name}")

    # Assemble the request first so the API call itself stays short
    # (tickerplantLogConfiguration = { 'tickerplantLogVolumes': [ VOLUME_NAME ] } is deliberately not sent)
    request = dict(
        environmentId=ENV_ID,
        clusterName=cluster_name,
        clusterType=cluster_type,
        clusterDescription="Created with create_all notebook",
        releaseLabel='1.0',
        executionRole=EXECUTION_ROLE,
        databases=DATABASE_CONFIG,
        code=CODE_CONFIG,
        initializationScript=cluster_init,
        commandLineArguments=cluster_args,
        scalingGroupConfiguration={
            'scalingGroupName': SCALING_GROUP_NAME,
            'memoryReservation': 6,
            'nodeCount': 1,
        },
        savedownStorageConfiguration={'volumeName': VOLUME_NAME},
        azMode=AZ_MODE,
        availabilityZoneId=AZ_ID,
        vpcConfiguration={
            'vpcId': VPC_ID,
            'securityGroupIds': SECURITY_GROUPS,
            'subnetIds': SUBNET_IDS,
            'ipAddressType': 'IP_V4',
        },
    )

    resp = client.create_kx_cluster(**request)

    display(resp)

## Wait for all clusters to finish creating

In [None]:
# Wait on every entry of the cluster list, one after another
for cluster_name in (c['name'] for c in clusters):
    wait_for_cluster_status(
        client,
        environmentId=ENV_ID,
        clusterName=cluster_name,
        show_wait=True,
    )

print("** ALL DONE **")

# List Clusters

In [None]:
# Names this notebook is responsible for (entries without a 'name' key are ignored)
all_clusters = [entry['name'] for entry in clusters if 'name' in entry]

cdf = get_clusters(client, environmentId=ENV_ID)

# Narrow the listing to this application's clusters when a listing came back
if cdf is not None:
    belongs_to_app = cdf['clusterName'].isin(all_clusters)
    cdf = cdf[belongs_to_app]

display(cdf)

# All Processes Running

In [None]:
# Stamp the notebook with the local time of this run
finished_at = datetime.datetime.now()
print(f"Last Run: {finished_at}")