# Welcome Notebook
This notebook walks through the process of creating and populating your first database with FinSpace Managed KX.

## Before you start
Before you start this notebook, it is assumed you have the following:
- FinSpace Managed KX environment created in AWS account
- S3 staging bucket for data and code
  - Both this notebook's boto3 profile and the Managed KX environment can access the bucket
- Setup in ~/.aws directory
  - config is set (json and region)
  - default credentials are set (aws_access_key_id, aws_secret_access_key, aws_session_token)

## Steps
1. Untar hdb.tar.gz for the hdb data
2. Upload hdb to staging S3 bucket
3. Create database
4. Add HDB data to database
5. Create a Cluster
6. Get the connectionString
7. Query Cluster using PyKX

## Managed kdb Insights Architecture
<img src="Managed kdb Insights-HDB Migration.png"  width="50%">


In [None]:
import datetime
import json
import os
import shutil
import subprocess
import sys
import time

import boto3
import pandas as pd

from managed_kx import *
from env_kdb_1 import *


In [None]:
# Local directory that holds the extracted HDB
SOURCE_DATA_DIR = "hdb"

# Staging locations in S3 for the HDB data and the zipped code
CODEBASE = "code"
S3_DEST = f"s3://{S3_BUCKET}/data/{SOURCE_DATA_DIR}/"
CODE_PATH = f"code/{CODEBASE}.zip"

# Cluster sizing
NODE_COUNT = 1
CACHE_SIZE = 1200

# When True, the notebook creates a uniquely named (timestamped) database and
# cluster and removes both in the clean-up section; when False it uses the
# fixed "welcomedb" database and leaves everything in place.
create_delete = True

if create_delete:
    TODAY = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    DB_NAME = f"create_delete_db_{TODAY}"
else:
    DB_NAME = "welcomedb"

DELETE_CLUSTER = create_delete
DELETE_DATABASE = create_delete

# The cluster name is always derived from the database name
CLUSTER_NAME = f"cluster_{DB_NAME}"


In [None]:
# Build the boto3 session and the FinSpace client used by the rest of the notebook.
session = None

# Best-effort credential refresh through the `ada` CLI. The refresh is only
# attempted when `ada` is actually on PATH; a failed refresh is reported but
# does not stop the notebook, because the credentials below may still be valid.
if shutil.which("ada") is not None:
    ada_refresh = subprocess.run(
        [
            "ada", "credentials", "update",
            f"--account={ACCOUNT_ID}",
            "--provider=isengard",
            "--role=Admin",
            "--once",
        ],
        check=False,
    )
    if ada_refresh.returncode != 0:
        print(f"ada credentials update exited with status {ada_refresh.returncode}; continuing")

if AWS_ACCESS_KEY_ID is None:
    print("Using Defaults ...")
    # No explicit keys configured: let boto3 resolve credentials itself
    session = boto3.Session()
else:
    print("Using variables ...")
    # Explicit keys configured: build the session from the access variables
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_session_token=AWS_SESSION_TOKEN
    )

# create finspace client
client = session.client(service_name='finspace', endpoint_url=ENDPOINT_URL)

# 0. Environment Check
Be sure the infrastructure ID has been entitled to the bucket you will be staging the HDB to. The environment will also need access to the KMS key used when creating the environment.

## Permission Templates

### S3 Permission
Example of code and data access to the same S3 bucket.

```
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "finspace.amazonaws.com"
            },
            "Action": [
                "s3:GetObject",
                "s3:GetObjectTagging",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::S3_BUCKET/*",
                "arn:aws:s3:::S3_BUCKET"
            ],
            "Condition": {
                "StringEquals": {
                    "aws:SourceAccount": "ACCOUNT_ID"
                },
                "ArnEquals": {
                    "aws:SourceArn": "arn:aws:finspace:us-east-1:ACCOUNT_ID:kxEnvironment/ENV_ID/*"
                }
            }
        }
    ]
}

```

### KMS Key
Be sure the environment has access to use the KMS key given in environment creation.

```
"Statement": [
        {
            "Sid": "Enable Managed kdb Insights Access",
            "Effect": "Allow",
            "Principal": {
                "Service": "finspace.amazonaws.com"
            },
            "Action": [
                "kms:Encrypt",
                "kms:Decrypt",
                "kms:GenerateDataKey"
            ],
            "Resource": "arn:aws:kms:us-east-1:ACCOUNT_ID:key/KEY_ID",
            "Condition": {
                "StringEquals": {
                    "aws:SourceAccount": "ACCOUNT_ID"
                },
                "ArnLike": {
                    "aws:SourceArn": "arn:aws:finspace:us-east-1:ACCOUNT_ID:kxEnvironment/ENV_ID/*"
                }
            }
        }
   ]
```


In [None]:
# Look up the Managed KX environment and pretty-print its details
resp = get_kx_environment(client)

print("Environment Information")
environment_json = json.dumps(resp, sort_keys=True, indent=4, default=str)
print(environment_json)


## 1. Untar hdb.tar.gz
The hdb database will be extracted into the `hdb` directory.

In [None]:
# Extract the bundled sample database archive; later cells read its contents from ./hdb
!tar -xf hdb.tar.gz

In [None]:
# List the hdb directory to confirm the untar step produced it
!ls -la hdb

# 2. Upload hdb data
Using the AWS CLI, copy the hdb directory to the staging bucket.

In [None]:
# Sync the local HDB directory to the staging bucket with the AWS CLI, then
# list the destination so the uploaded objects are visible in the output.
#
# Credentials are handed to the CLI through the child process environment
# instead of being written into a shell script: this keeps the secrets out of
# the process list and avoids shell quoting problems.
aws_env = os.environ.copy()
if AWS_ACCESS_KEY_ID is not None:
    explicit_credentials = {
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
        "AWS_SESSION_TOKEN": AWS_SESSION_TOKEN,
    }
    # Skip unset values (e.g. no session token) rather than exporting "None"
    aws_env.update({k: v for k, v in explicit_credentials.items() if v is not None})

upload_commands = [
    ["aws", "s3", "sync", "--exclude", ".DS_Store", SOURCE_DATA_DIR, S3_DEST],
    ["aws", "s3", "ls", S3_DEST],
]

# check=True stops the notebook here if the upload fails, instead of letting
# the changeset step fail later on missing S3 objects.
for aws_command in upload_commands:
    subprocess.run(aws_command, env=aws_env, check=True)

## 3. Create database

In [None]:
# Reuse the database when it already exists; create it only when the service
# reports it as missing. Any other failure (permissions, throttling, network)
# is allowed to propagate rather than being mistaken for "does not exist".
try:
    resp = client.get_kx_database(environmentId=ENV_ID, databaseName=DB_NAME)
    create_db = False
except client.exceptions.ResourceNotFoundException:
    # does not exist, will create
    create_db = True

if create_db:
    print(f"CREATING Database: {DB_NAME}")
    resp = client.create_kx_database(environmentId=ENV_ID, databaseName=DB_NAME, description="Welcome kdb database")
    print(f"CREATED Database: {DB_NAME}")

# Drop the HTTP bookkeeping so only the database details are printed
resp.pop('ResponseMetadata', None)

print(json.dumps(resp,sort_keys=True,indent=4,default=str))

## 4. Add HDB data to database

In [None]:
# Build one PUT change request per top-level entry of the local HDB directory:
# directories map to a database path of the same name, plain files go to the
# database root.
changes = []

for entry in sorted(os.listdir(SOURCE_DATA_DIR)):
    # The upload step excludes .DS_Store, so it never reaches S3; skip it here
    # too, otherwise the changeset would point at a missing object.
    if entry == ".DS_Store":
        continue

    if os.path.isdir(os.path.join(SOURCE_DATA_DIR, entry)):
        changes.append( { 'changeType': 'PUT', 's3Path': f"{S3_DEST}{entry}/", 'dbPath': f"/{entry}/" } )
    else:
        changes.append( { 'changeType': 'PUT', 's3Path': f"{S3_DEST}{entry}", 'dbPath': "/" } )

if not changes:
    # Fail early with a clear message instead of sending an empty request
    raise ValueError(f"No data found in '{SOURCE_DATA_DIR}' to add to database {DB_NAME}")

resp = client.create_kx_changeset(environmentId=ENV_ID, databaseName=DB_NAME, changeRequests=changes)

resp.pop('ResponseMetadata', None)
# Kept for the wait cell that follows
changeset_id = resp['changesetId']

print("Changeset...")
print(json.dumps(resp,sort_keys=True,indent=4,default=str))

In [None]:
# Wait on the changeset created above, showing progress while waiting
wait_for_changeset_status(
    client,
    environmentId=ENV_ID,
    databaseName=DB_NAME,
    changesetId=changeset_id,
    show_wait=True,
)

In [None]:
# Summarise every changeset of the database, oldest first, and show the
# change requests each one contains.
c_set_list = list_kx_changesets(client, environmentId=ENV_ID, databaseName=DB_NAME)

# Flag an empty listing in the summary line
note_str = "<<Could not get changesets>>" if len(c_set_list) == 0 else ""

banner = 100*"="
print(banner)
print(f"Database: {DB_NAME}, Changesets: {len(c_set_list)} {note_str}")
print(banner)

# sort by create time
c_set_list = sorted(c_set_list, key=lambda d: d['createdTimestamp'])

for changeset in c_set_list:
    changeset_ref = changeset['changesetId']
    print(f"  Changeset: {changeset_ref}: Created: {changeset['createdTimestamp']} ({changeset['status']})")

    # The listing does not carry the change requests; fetch the full changeset
    changeset_detail = client.get_kx_changeset(
        environmentId=ENV_ID,
        databaseName=DB_NAME,
        changesetId=changeset_ref,
    )
    requests_frame = pd.DataFrame.from_dict(changeset_detail['changeRequests'])
    display(requests_frame.style.hide(axis='index'))

## 5. Create a Cluster for the database

In [None]:
# zip the code
#os.system(f"zip -r -X {CODEBASE}.zip {CODEBASE} -x '*.ipynb_checkpoints*'")
os.system(f"cd {CODEBASE}; zip -r -X ../{CODEBASE}.zip . -x '*.ipynb_checkpoints*';")

# copy code to S3

if AWS_ACCESS_KEY_ID is not None:
    cp = f"""
export AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID}
export AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY}
export AWS_SESSION_TOKEN={AWS_SESSION_TOKEN}

aws s3 cp  --exclude .DS_Store {CODEBASE}.zip s3://{S3_BUCKET}/code/{CODEBASE}.zip
aws s3 ls s3://{S3_BUCKET}/code/
"""
else:
    cp = f"""
aws s3 cp  --exclude .DS_Store {CODEBASE}.zip s3://{S3_BUCKET}/code/{CODEBASE}.zip
aws s3 ls s3://{S3_BUCKET}/code/
"""
    
# execute the S3 copy
os.system(cp)

In [None]:
print(f"Creating: {CLUSTER_NAME}")

# Database attachment: the whole database ('/') is placed on the CACHE_1000 cache
cluster_databases = [{
    'databaseName': DB_NAME,
    'cacheConfigurations': [
        {'dbPaths': ['/'], 'cacheType': 'CACHE_1000'}
    ]
}]

# Networking for the cluster, taken from the environment settings
cluster_vpc = {
    'vpcId': VPC_ID,
    'securityGroupIds': SECURITY_GROUPS,
    'subnetIds': SUBNET_IDS,
    'ipAddressType': 'IP_V4'
}

# Command line arguments passed to the cluster; dbname carries the database name
cluster_args = [
    {'key': 's', 'value': '4'},
    {'key': 'dbname', 'value': DB_NAME},
]

# Create an HDB cluster that runs init.q from the uploaded code archive
resp = client.create_kx_cluster(
    environmentId=ENV_ID,
    clusterName=CLUSTER_NAME,
    clusterDescription=f"Demo Cluster for database {DB_NAME}",
    clusterType='HDB',
    releaseLabel='1.0',
    capacityConfiguration={"nodeType": "kx.s.xlarge", "nodeCount": NODE_COUNT},
    databases=cluster_databases,
    cacheStorageConfigurations=[{'type': 'CACHE_1000', 'size': CACHE_SIZE}],
    azMode=AZ_MODE,
    availabilityZoneId=AZ_ID,
    vpcConfiguration=cluster_vpc,
    code={'s3Bucket': S3_BUCKET, 's3Key': CODE_PATH},
    initializationScript="init.q",
    commandLineArguments=cluster_args,
)

In [None]:
# Display the create_kx_cluster response assigned to resp in the previous cell
resp

In [None]:
# Wait on the new cluster before continuing; progress display is turned off
wait_for_cluster_status(
    client,
    environmentId=ENV_ID,
    clusterName=CLUSTER_NAME,
    show_wait=False,
)

print()
print("** DONE **")

## 6. Get the connectionString
This assumes that the IAM role exists and the user (KDB_USERNAME) has already been added as well.

In [None]:
# Fetch the details of the cluster created above and print them.
try:
    resp = client.get_kx_cluster(environmentId=ENV_ID, clusterName=CLUSTER_NAME)
except client.exceptions.ResourceNotFoundException:
    print(f"Cluster: {CLUSTER_NAME} did not create")
    # Stop here: continuing would print a stale resp from an earlier cell as
    # if it described this cluster.
    raise

if resp['ResponseMetadata']['HTTPStatusCode'] != 200:
    sys.stderr.write(f"Error:\n {resp}")
else:
    resp.pop('ResponseMetadata', None)

kx_cluster = resp

print("Cluster: "+("-"*80))
print(json.dumps(kx_cluster, sort_keys=True, indent=4, default=str))


In [None]:
# Give permissions time to propagate after cluster creation before
# requesting a connection string.
PERMISSION_PROPAGATION_SECONDS = 60
time.sleep(PERMISSION_PROPAGATION_SECONDS)


In [None]:
# Request a connection string for KDB_USERNAME on the new cluster
conn_str = get_kx_connection_string(
    client,
    environmentId=ENV_ID,
    clusterName=CLUSTER_NAME,
    userName=KDB_USERNAME,
    boto_session=session,
)

# Print it as a q assignment that can be pasted into a q session
print("")
print("Copy into q: " + "-" * 80)
print(f"""
/ Cluster: {CLUSTER_NAME}
hdb_conn:"{conn_str}"
""")

## 7. Query Cluster using PyKX

In [None]:
# Query the HDB
hdb = get_pykx_connection(client, 
                          environmentId=ENV_ID, clusterName=CLUSTER_NAME, 
                          userName=KDB_USERNAME, boto_session=session)

In [None]:
# Names of the tables the cluster exposes, converted to a Python object
tables = hdb("tables[]").py()
table_count = len(tables)
print(f"Tables ({table_count}): {tables}")

# Column metadata of the `example` table as a pandas DataFrame
schema_pdf = hdb("meta `example").pd()
display(schema_pdf)


In [None]:
# Per-date aggregation over the `example` table.
# NOTE(review): `sq` is presumably defined by the code library loaded on the
# cluster (init.q) - confirm.
daily_stats_query = "select counts:count i, avg_num: avg number, avg_sq_num: avg sq number by date from example"
res_table = hdb(daily_stats_query).pd()
display(res_table)

# Total number of rows in the table, printed with thousands separators
rows = hdb("count example").py()
print(f"Rows: {rows:,}")

# Clean Up

In [None]:
# Cluster Deletion
# ------------------------------------------------------------
# Show the databases that currently exist, then (when DELETE_CLUSTER is set)
# delete the cluster and wait for the deletion to finish. `cluster_deleted`
# tells the database clean-up cell whether it is safe to proceed.
db_list = list_kx_databases(client, environmentId=ENV_ID)

db_pdf = pd.DataFrame.from_dict(db_list).style.hide(axis='index')
display(db_pdf)
print("")

cluster_deleted = False

if DELETE_CLUSTER:
    # Only wait for DELETED when a deletion is actually in progress
    deletion_in_progress = False

    try:
        # Look up the cluster to see whether it is already being deleted
        resp = client.get_kx_cluster(environmentId=ENV_ID, clusterName=CLUSTER_NAME)

        if resp['ResponseMetadata']['HTTPStatusCode'] != 200:
            sys.stderr.write(f"Error:\n {resp}")
        else:
            resp.pop('ResponseMetadata', None)

        if resp['status'] != 'DELETING':
            resp = client.delete_kx_cluster(environmentId=ENV_ID, clusterName=CLUSTER_NAME)
            if resp['ResponseMetadata']['HTTPStatusCode'] != 200:
                sys.stderr.write(f"Error:\n {resp}")
            else:
                resp.pop('ResponseMetadata', None)

        deletion_in_progress = True
    except client.exceptions.ResourceNotFoundException:
        # The cluster is already gone; nothing left to delete
        cluster_deleted = True
    except Exception as e:
        # Leave cluster_deleted False so the database is not removed while
        # the cluster may still exist
        sys.stderr.write(f"Error deleting cluster: {CLUSTER_NAME}\n{e}")

    if deletion_in_progress:
        try:
            wait_for_cluster_status(client, environmentId=ENV_ID, clusterName=CLUSTER_NAME, status='DELETED', show_wait=False)
            print()
            print("** DONE **")

            cluster_deleted = True
        except client.exceptions.ResourceNotFoundException:
            # The cluster disappeared while waiting, which also means deleted
            cluster_deleted = True
else:
    print(f"DELETE_CLUSTER: {DELETE_CLUSTER}")

In [None]:
# Database Deletion
# Requires cluster to have been deleted
if not DELETE_DATABASE:
    print(f"DELETE_DATABASE: {DELETE_DATABASE}")
elif not cluster_deleted:
    print(f"Cluster deleted? {cluster_deleted}, will not delete database if cluster not deleted")
elif not has_database(client, environmentId=ENV_ID, databaseName=DB_NAME):
    print(f"Database already deleted: {DB_NAME} ")
else:
    # The database exists and its cluster is gone: delete it
    try:
        resp = client.delete_kx_database(environmentId=ENV_ID, databaseName=DB_NAME)
        if resp['ResponseMetadata']['HTTPStatusCode'] != 200:
            sys.stderr.write(f"Error:\n {resp}")
        else:
            resp.pop('ResponseMetadata', None)
            print(f"Deleted database: {DB_NAME}")
    except Exception as e:
        sys.stderr.write(f"Error: \n{e}")

In [None]:
# Show the databases that remain in the environment, ordered by name
db_list = sorted(
    list_kx_databases(client, environmentId=ENV_ID),
    key=lambda d: d['databaseName'],
)

db_frame = pd.DataFrame.from_dict(db_list)
db_pdf = db_frame.style.hide(axis='index')
display(db_pdf)

In [None]:
# Show the clusters that remain in the environment
cdf = get_clusters(client, environmentId=ENV_ID)
display(cdf)

# Record when this notebook run finished
finished_at = datetime.datetime.now()
print(f"Last Run: {finished_at}")