# DynamoDB Import/Export Notebook

Author: Everett

A temporary solution for backing up (and, if needed, restoring) DynamoDB tables. Change the settings in the next cell, then run the entire notebook.

Useful documentation for boto3, the Python library for the AWS APIs:
* [boto3.resource](http://boto3.readthedocs.io/en/latest/guide/dynamodb.html) (which you should use whenever possible)
* [boto3.client](http://boto3.readthedocs.io/en/latest/reference/services/dynamodb.html) (a more complete, but lower level API)

## Setup: Configure this block, then run the whole notebook

In [12]:
# --- Export (Task 1) ---
# When True, every table is dumped into EXPORT_FILENAME as a pickle.
RUN_EXPORT = True
EXPORT_FILENAME = 'dynamodb.pickle'

# --- Import (Task 2) ---
# When True, tables are recreated from IMPORT_FILENAME. A table that already
# exists under the target name is deleted before it is recreated.
RUN_IMPORT = False
IMPORT_FILENAME = 'dynamodb.pickle'
# Prepended to every table (and index) name on import; empty keeps the names.
IMPORT_TABLES_ADD_PREFIX = ''

# --- AWS keys and DynamoDB region ---
# Any value left blank is looked up in the environment variable of the
# same name.
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION = ""

# Set to True to target a dynamodb-local instance running on port 8000.
# NOTE(review): confirm that the connection cell actually reads this flag.
USE_DYNAMODB_LOCAL = False

In [13]:
import boto3
import copy
import datetime
import json
import os
import pickle

## Helper functions

In [14]:
global client, resource


def set_local_db():
    """Point the shared ``client`` and ``resource`` at a local DynamoDB.

    Both module-level handles are rebound to boto3 objects that talk to
    the endpoint http://localhost:8000.
    """
    global client, resource
    local_endpoint = 'http://localhost:8000'
    client = boto3.client('dynamodb', endpoint_url=local_endpoint)
    resource = boto3.resource('dynamodb', endpoint_url=local_endpoint)

def set_client(access=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY, region=AWS_REGION):
    """Bind the module-level ``client`` and ``resource`` to AWS DynamoDB.

    Args:
        access: AWS access key id. Blank means "not configured here".
        secret: AWS secret access key. Blank means "not configured here".
        region: AWS region name. When blank, the ``AWS_REGION`` and then
            the ``AWS_DEFAULT_REGION`` environment variables are consulted.

    When neither key is given, no credentials are passed to boto3 at all,
    so its own credential lookup applies (environment variables including
    ``AWS_SESSION_TOKEN``, shared credentials file, instance role).
    Passing environment keys explicitly would silently drop the session
    token and break temporary credentials. Missing environment variables
    no longer raise ``KeyError``; boto3 reports what it cannot resolve.
    """
    global client, resource

    options = {}

    region = (region
              or os.environ.get('AWS_REGION')
              or os.environ.get('AWS_DEFAULT_REGION'))
    if region:
        options['region_name'] = region

    if access or secret:
        # One half was configured explicitly: complete the pair from the
        # environment, as before.
        options['aws_access_key_id'] = access or os.environ.get('AWS_ACCESS_KEY_ID')
        options['aws_secret_access_key'] = secret or os.environ.get('AWS_SECRET_ACCESS_KEY')

    # Same options for both handles so they always target the same account.
    client = boto3.client('dynamodb', **options)
    resource = boto3.resource('dynamodb', **options)

In [15]:
def scan_all_items(table_name, index_name=None):
    """Scan a table (or one of its indexes) and return every item.

    Args:
        table_name: Name of the table to scan via the shared ``resource``.
        index_name: Optional index to scan instead of the base table.

    Returns:
        A list with the ``Items`` of every scan page, in page order.

    The scan is repeated with ``ExclusiveStartKey`` for as long as the
    response carries a ``LastEvaluatedKey``; a progress line is printed
    after each page that is not the last.
    """
    table = resource.Table(table_name)
    collected = []
    cursor = None
    while True:
        scan_kwargs = {}
        if index_name:
            scan_kwargs['IndexName'] = index_name
        if cursor:
            scan_kwargs['ExclusiveStartKey'] = cursor
        page = table.scan(**scan_kwargs)
        collected += page['Items']
        cursor = page.get('LastEvaluatedKey')
        if cursor is None:
            return collected
        print("Paginated, %d items so far" % len(collected))

In [16]:
def clean_up_table_schema(schema, rename_prefix=None):
    """Reduce a described table to the fields used to create it again.

    Args:
        schema: The ``Table`` dict of a ``describe_table`` response. It is
            deep-copied and never modified.
        rename_prefix: Optional string prepended to the table name and to
            the name of every global/local secondary index.

    Returns:
        A new dict holding only ``AttributeDefinitions``, ``KeySchema``,
        ``ProvisionedThroughput``, ``TableName`` and the two index lists.
        Each index keeps only ``IndexName``, ``KeySchema``, ``Projection``
        and ``ProvisionedThroughput``. Every ``ProvisionedThroughput`` is
        reduced to ``ReadCapacityUnits`` and ``WriteCapacityUnits``.

    Raises:
        KeyError: If ``rename_prefix`` is given and a name is missing.
    """
    index_names = ('GlobalSecondaryIndexes', 'LocalSecondaryIndexes')
    table_keys = ('AttributeDefinitions', 'KeySchema', 'ProvisionedThroughput',
                  'TableName') + index_names
    index_keys = ('IndexName', 'KeySchema', 'Projection', 'ProvisionedThroughput')
    # Whitelist instead of deleting known extras: described throughput may
    # also carry NumberOfDecreasesToday, LastIncreaseDateTime and
    # LastDecreaseDateTime, and any of them may be absent.
    throughput_keys = ('ReadCapacityUnits', 'WriteCapacityUnits')

    def only(mapping, allowed):
        return {k: v for k, v in mapping.items() if k in allowed}

    def with_clean_throughput(definition):
        if 'ProvisionedThroughput' in definition:
            definition['ProvisionedThroughput'] = only(
                definition['ProvisionedThroughput'], throughput_keys)
        return definition

    cleaned = with_clean_throughput(only(copy.deepcopy(schema), table_keys))
    for name in index_names:
        if name in cleaned:
            cleaned[name] = [with_clean_throughput(only(index, index_keys))
                             for index in cleaned[name]]

    if rename_prefix:
        cleaned['TableName'] = rename_prefix + cleaned['TableName']
        for name in index_names:
            for index in cleaned.get(name, []):
                index['IndexName'] = rename_prefix + index['IndexName']

    # NOTE(review): tables using on-demand billing are not handled specially
    # here; confirm whether they need BillingMode instead of throughput.
    return cleaned

In [17]:
def get_table_data(name):
    """Collect the items and the description of one table.

    Args:
        name: Name of the table to read.

    Returns:
        A dict with ``'Items'`` (the result of ``scan_all_items``) and
        ``'Table'`` (the ``Table`` part of ``client.describe_table``).
    """
    items = scan_all_items(name)
    description = client.describe_table(TableName=name)
    return {'Items': items, 'Table': description['Table']}

In [18]:
def get_all_table_data(table_prefix=''):
    """Export every table whose name starts with ``table_prefix``.

    Args:
        table_prefix: Only tables whose name starts with this string are
            exported. The default empty string matches every table.

    Returns:
        A dict mapping each table name to the dict produced by
        ``get_table_data`` for that table.

    A one-line summary is printed for each table as soon as it is read.
    """
    data = {}
    table_names = [t.name for t in resource.tables.all()
                   if t.name.startswith(table_prefix)]
    for name in table_names:
        data[name] = get_table_data(name)
        # Pass the data just fetched; there is no ``table_data`` variable
        # in this function.
        describe_table_data(data[name])
    return data

In [24]:
def describe_table_data(table_data, name=None):
    """Print a one-line summary of one exported table.

    Args:
        table_data: Dict with ``'Items'`` (a sized collection) and
            ``'Table'`` (a dict providing ``TableSizeBytes`` and, unless
            ``name`` is given, ``TableName``).
        name: Label to print. Defaults to the table's own ``TableName``,
            so the function no longer depends on a variable called
            ``name`` existing in the caller's scope.

    The line shows the name, the number of items and the size in kB.
    """
    description = table_data['Table']
    if name is None:
        name = description['TableName']
    print("%40s:%8d items %8.0f kB" % (
        name, len(table_data['Items']), description['TableSizeBytes'] / 1024))

## Task 0: Establish a connection and make sure it works

In [19]:
# Honour the USE_DYNAMODB_LOCAL setting: connect to the local endpoint when
# it is set, and to AWS otherwise.
if USE_DYNAMODB_LOCAL:
    set_local_db()
else:
    set_client()

# Listing the tables doubles as a smoke test for the connection.
for t in resource.tables.all():
    print("Found table", t.name)

## Task 1: Export an entire database

In [20]:
if RUN_EXPORT:
    # An empty prefix matches every table, so the whole database is read.
    data = get_all_table_data(table_prefix='')

    # Everything goes into a single pickle file.
    with open(EXPORT_FILENAME, "wb") as f:
        pickle.dump(data, f)

    # Report the size of the file that was just written.
    exported_bytes = os.path.getsize(EXPORT_FILENAME)
    print("Saved %d bytes to %s" % (exported_bytes, EXPORT_FILENAME))

## Task 2: Import from the pickled export produced by Task 1

In [23]:
if RUN_IMPORT:
    # The values must be interpolated into the format string; passed as
    # separate print() arguments they leave a literal "%d" in the output.
    print("Loading %d bytes of pickled data from %s" % (
        os.stat(IMPORT_FILENAME).st_size, IMPORT_FILENAME))

    # SECURITY: unpickling can execute arbitrary code. Only import files
    # that this notebook produced and that you trust.
    with open(IMPORT_FILENAME, "rb") as f:
        data = pickle.load(f)

    print("Loaded data for %d tables" % len(data))
    for table_name, table_data in data.items():
        describe_table_data(table_data)

    print("Importing...")
    existing_table_names = {t.name for t in resource.tables.all()}
    for table_name, info in data.items():
        new_name = IMPORT_TABLES_ADD_PREFIX + table_name
        print("** Importing into table %s **" % new_name)

        # Delete table if it already exists
        if new_name in existing_table_names:
            print(new_name, "already exists, deleting it first and starting afresh.")
            client.delete_table(TableName=new_name)
            print("Delete operation sent. Waiting until table no longer exists.")
            resource.Table(new_name).wait_until_not_exists()

        # Create a new table from the cleaned-up description
        schema = clean_up_table_schema(info["Table"], rename_prefix=IMPORT_TABLES_ADD_PREFIX)
        items = info["Items"]
        client.create_table(**schema)
        print("Create operation sent. Waiting until table exists.")
        table = resource.Table(new_name)
        table.wait_until_exists()
        # The attribute is empty/None for tables without global indexes.
        for index in table.global_secondary_indexes or []:
            print(index["IndexStatus"], "index:", index["KeySchema"][0]["AttributeName"])

        # Populate
        print("Populating %s with %d items" % (new_name, len(items)))
        with table.batch_writer() as batch:
            for item in items:
                batch.put_item(Item=item)
        print("Done.")

    print("---- Data import complete")

Loading pickled data from dynamodb.pickle
** Importing into table demo_ursus_involved_civilians **
Create operation sent. Waiting until table exists.
Populating demo_ursus_involved_civilians with 10 items
Done.
** Importing into table demo_ursus_visits **
Create operation sent. Waiting until table exists.
Populating demo_ursus_visits with 0 items
Done.
** Importing into table demo_ursus_users **
Create operation sent. Waiting until table exists.
ACTIVE index: ori
Populating demo_ursus_users with 2 items
Done.
** Importing into table demo_ursus_auditentries **
Create operation sent. Waiting until table exists.
Populating demo_ursus_auditentries with 0 items
Done.
** Importing into table demo_ursus_events **
Create operation sent. Waiting until table exists.
Populating demo_ursus_events with 5 items
Done.
** Importing into table demo_ursus_incidents **
Create operation sent. Waiting until table exists.
ACTIVE index: incident_id_str
Populating demo_ursus_incidents with 10 items
Done.
** I