In [1]:
import glob
import json
import logging
import os
import requests
import shutil
import time

from requests.auth import HTTPBasicAuth

import pandas as pd
import altair as alt
import ipywidgets as widgets

from snowflake.snowpark import Session

from services import ImportService

In [2]:
def x(l, k, v):
    """Assign ``v`` to ``l[k]`` in place; returns None.

    Function form of item assignment so it can be used inside expressions.
    """
    l[k] = v

def get_config():
    """Parse ``config.json`` from the current working directory and return its content."""
    logging.getLogger().debug("get config")

    with open('config.json', "r") as config_file:
        return json.load(config_file)

In [3]:
def get_collibra(config):
    """Assemble the Collibra connection settings described by ``config``.

    Reads ``collibra_host``, ``collibra_username``, ``collibra_password`` and
    ``collibra_api_endpoint`` from ``config``.

    Returns:
        dict with the keys ``host`` (``https://`` + configured host),
        ``username``, ``password``, ``endpoint`` (host + API endpoint path) and
        ``session`` (a ``requests.Session`` whose ``auth`` is HTTP basic auth
        built from the username and password).
    """
    logging.getLogger().debug("get collibra")

    host = f"https://{config['collibra_host']}"
    username = config['collibra_username']
    password = config['collibra_password']
    endpoint = f"{host}{config['collibra_api_endpoint']}"

    session = requests.Session()
    session.auth = HTTPBasicAuth(username, password)

    return {
        "host": host,
        "username": username,
        "password": password,
        "endpoint": endpoint,
        "session": session,
    }

In [4]:
def get_token(config):
    """Log in to the Cyera API and store the returned JWT in ``config``.

    Posts ``cyera_client_id`` / ``cyera_client_secret`` as JSON to
    ``{cyera_api_endpoint_url}/v1/login`` and saves the ``jwt`` field of the
    response under ``config['cyera_token']``.

    Args:
        config: dict holding ``cyera_api_endpoint_url``, ``cyera_client_id``
            and ``cyera_client_secret``. An optional ``request_timeout``
            (seconds) overrides the default of 30.

    Returns:
        The same ``config`` object, updated in place with ``cyera_token``.

    Raises:
        Exception: when the response status is not 200 or the body has no
            ``jwt`` value.
    """
    logging.getLogger().debug("get token")

    response = requests.post(
        f"{config['cyera_api_endpoint_url']}/v1/login",
        headers={'accept': 'application/json', 'Content-Type': 'application/json'},
        json={'clientId': config['cyera_client_id'], 'secret': config['cyera_client_secret']},
        # Without a timeout requests waits indefinitely on an unresponsive server.
        timeout=config.get('request_timeout', 30),
    )

    if response.status_code != requests.codes.ok:
        raise Exception(f'Error: {response.text}')

    # Decode the body once instead of re-parsing it for every field access.
    payload = response.json()
    token = payload.get('jwt')

    if not token:
        raise Exception(f'Error: {payload.get("message")}')

    config['cyera_token'] = token

    return config

In [5]:
def send_request(method, url, data, limit, config):
    """Fetch every page of a paginated endpoint and return the combined results.

    Pages are requested with ``&offset=<n>&limit=<limit>`` appended verbatim to
    ``url``, so ``url`` must already end with ``?`` or with a query string.
    Paging stops at the first response whose ``results`` list is empty.

    Args:
        method: HTTP method passed to ``requests.request``.
        url: endpoint URL ending in ``?`` or an existing query string.
        data: request body forwarded unchanged (``None`` for no body).
        limit: page size; must be positive.
        config: dict holding ``cyera_token`` (sent as a Bearer token). An
            optional ``request_timeout`` (seconds) overrides the default of 30.

    Returns:
        list with the items of all pages, in the order received.

    Raises:
        ValueError: if ``limit`` is not positive (the offset would never
            advance, so the loop could not terminate on a non-empty page).
        Exception: if any response has a status other than 200.
    """
    logging.getLogger().debug("send request")

    if limit <= 0:
        raise ValueError(f'limit must be a positive page size, got {limit!r}')

    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f"Bearer {config['cyera_token']}",
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    timeout = config.get('request_timeout', 30)

    results = []
    offset = 0

    while True:
        response = requests.request(
            method=method,
            url=f"{url}&offset={offset}&limit={limit}",
            headers=headers,
            data=data,
            timeout=timeout,
        )

        if response.status_code != requests.codes.ok:
            raise Exception(f'Error: {response.text}')

        # Decode each page once; the original parsed the body twice per page.
        page = response.json()['results']

        if not page:
            break

        # extend() appends in place instead of copying the accumulated list per page.
        results.extend(page)

        offset += limit

    return results

In [6]:
def get_classifications(config):
    """Return all items of ``/v1/classifications``, fetched through ``send_request`` in pages of 10."""
    logging.getLogger().debug("get classifications")

    endpoint = f"{config['cyera_api_endpoint_url']}/v1/classifications?"

    return send_request('GET', endpoint, None, 10, config)

In [7]:
def get_datastores(provider, platform, config):
    """Return all ``/v2/datastores`` items for a provider and in-platform identifier.

    ``provider`` and ``platform`` are interpolated into the query string as-is
    (``provider=…&inPlatformIdentifier=…``); results are fetched through
    ``send_request`` in pages of 10.
    """
    logging.getLogger().debug("get datastores")

    base = config['cyera_api_endpoint_url']
    endpoint = f"{base}/v2/datastores?provider={provider}&inPlatformIdentifier={platform}"

    return send_request('GET', endpoint, None, 10, config)

In [8]:
def get_datastore_classifications(datastore, config):
    """Return the classifications of one datastore, each tagged with its datastore uid.

    Fetches ``/v1/datastores/{datastore}/classifications`` through
    ``send_request`` (pages of 10) and sets ``datastoreUid`` to ``datastore``
    on every returned record.
    """
    logging.getLogger().debug("get datastore classifications")

    endpoint = f"{config['cyera_api_endpoint_url']}/v1/datastores/{datastore}/classifications?"

    records = send_request('GET', endpoint, None, 10, config)

    # Stamp each record so it can be joined back to its datastore later.
    for record in records:
        record.update({'datastoreUid': datastore})

    return records

In [9]:
def get_datastores_classifications(datastores_df, config):
    """Return one list of classifications per value in ``datastores_df['uid']``.

    The result is a list of lists, in the row order of the ``uid`` column; each
    inner list comes from ``get_datastore_classifications``.
    """
    logging.getLogger().debug("get datastores classifications")

    return [get_datastore_classifications(uid, config) for uid in datastores_df['uid']]

In [10]:
def get_datastore_objects(datastore, config):
    """Return the objects of one datastore, each tagged with its datastore uid.

    Fetches ``/v1/datastores/{datastore}/objects`` through ``send_request``
    (pages of 10) and sets ``datastoreUid`` to ``datastore`` on every returned
    record.
    """
    logging.getLogger().debug("get datastore objects")

    endpoint = f"{config['cyera_api_endpoint_url']}/v1/datastores/{datastore}/objects?"

    records = send_request('GET', endpoint, None, 10, config)

    # Stamp each record so it can be joined back to its datastore later.
    for record in records:
        record.update({'datastoreUid': datastore})

    return records

In [11]:
def get_datastores_objects(datastores_df, config):
    """Return one list of objects per value in ``datastores_df['uid']``.

    The result is a list of lists, in the row order of the ``uid`` column; each
    inner list comes from ``get_datastore_objects``.
    """
    logging.getLogger().debug("get datastores objects")

    return [get_datastore_objects(uid, config) for uid in datastores_df['uid']]

In [21]:
def get_data_findings(config):
    """Load the findings tables from Snowflake as pandas DataFrames.

    Opens a Snowpark session with the ``cyera`` connection and reads the
    ``CLASSIFICATIONS``, ``DATASTORES`` and ``DATASTORES_CLASSIFICATIONS``
    tables. The datastores frame gets a ``createdYYMM`` column (first seven
    characters of ``createdDate``), has the JSON text in ``regions`` and
    ``classificationGroups`` decoded, and is exploded on ``regions``.

    The commented-out statements are the (disabled) code that refreshes the
    Snowflake tables from the API; ``config`` is only used by that code.

    Args:
        config: configuration dict (unused while the refresh code is disabled).

    Returns:
        Tuple ``(classifications_df, datastores_df, datastores_exploded_df,
        datastores_classifications_df, session)`` where ``datastores_exploded_df``
        is ``datastores_df`` additionally exploded on ``classificationGroups``.
    """
    logging.getLogger().debug("get all findings")

    def _decode_json_cells(column):
        """Decode JSON text cells of ``column``; leave every other cell untouched."""
        def _decode(cell):
            # Cells that are already decoded (lists) or missing are kept as they are.
            if not isinstance(cell, str):
                return cell
            try:
                return json.loads(cell)
            except ValueError:
                # Best effort: keep the raw text, but report it instead of hiding it.
                logging.getLogger().warning("could not decode JSON cell in %s: %r", column.name, cell)
                return cell

        return column.apply(_decode)

    session = Session.builder.config("connection_name", "cyera").create()

    # classifications_df = pd.DataFrame(get_classifications(config))

    # session.write_pandas(classifications_df, "CLASSIFICATIONS", auto_create_table=True, overwrite=True)

    classifications_df = session.table("CLASSIFICATIONS").to_pandas()

    # datastores_df = pd.DataFrame(get_datastores('AWS', '482457381153', config))

    # session.write_pandas(datastores_df, "DATASTORES", auto_create_table=True, overwrite=True)

    datastores_df = session.table("DATASTORES").to_pandas()

    datastores_df['createdYYMM'] = datastores_df['createdDate'].str[0:7]

    # Decode each column on its own so a problem in one column (for example
    # values that are already lists) no longer leaves the other one undecoded.
    for column in ('regions', 'classificationGroups'):
        datastores_df[column] = _decode_json_cells(datastores_df[column])

    datastores_df = datastores_df.explode('regions', ignore_index=True) # only 1 region

    datastores_exploded_df = datastores_df.explode('classificationGroups', ignore_index=True)

    # session.write_pandas(datastores_df, "DATASTORES_EXPLODED", auto_create_table=True, overwrite=True)

    # datastores_classifications_df = pd.DataFrame([vv for k,v in enumerate(get_datastores_classifications(datastores_df, config)) for kk,vv in enumerate(v)])

    # datastores_classifications_df = datastores_classifications_df.join(datastores_df.set_index('uid'), on='datastoreUid', rsuffix='_d')

    # datastores_classifications_df = datastores_df.join(datastores_classifications_df.set_index('datastoreUid'), on='uid', lsuffix='_d')

    # session.write_pandas(datastores_classifications_df, "DATASTORES_CLASSIFICATIONS", auto_create_table=True, overwrite=True)

    datastores_classifications_df = session.table("DATASTORES_CLASSIFICATIONS").to_pandas()

    return classifications_df, datastores_df, datastores_exploded_df, datastores_classifications_df, session

In [13]:
def do_classifications(x, entries, importService, config):
    """Register one classification row as a data category and a data concept.

    ``entries[0]`` caches data categories by ``classificationGroup`` and
    ``entries[1]`` caches data concepts by ``name``. For the concept, the
    relation to the category and the ``Severity`` attribute are each sent once;
    the ``relations`` / ``attributes`` lists record what was already sent.
    ``config`` is not used.
    """
    # data category
    group = x['classificationGroup']
    categories = entries[0]
    if group not in categories:
        categories[group] = {
            "entry": importService.get_asset("Privacy and Risk community", "Data categories", "Data Category", group, group)
        }

    # data concept
    name = x['name']
    concepts = entries[1]
    if name not in concepts:
        concepts[name] = {
            "entry": importService.get_asset("Data Architects community", "Business Data Models", "Data Concept", name, name),
            "relations": [],
            "attributes": []
        }
    concept = concepts[name]

    if group not in concept['relations']:
        concept['relations'].append(group)
        importService.add_relations(concept['entry'], "c0e00000-0000-0000-0000-000000007316", "SOURCE", "Data categories", "Privacy and Risk community", group)

    # Deduplicated on the raw sensitivity value, sent as its display name.
    sensitivity = x['sensitivity']
    if sensitivity not in concept['attributes']:
        concept['attributes'].append(sensitivity)
        importService.add_attributes(concept['entry'], 'Severity', x['sensitivityDisplayName'], 'string')

In [14]:
def do_datastores(x, entries, importService, config):
    """Register the account and the datastore described by one datastores row.

    The account id is read from the JSON text in ``x['account']``
    (``inPlatformIdentifier``). Per account, a domain (``entries[2]``) and a
    ``System`` asset (``entries[3]``) are created once. Rows of type ``S3``
    add an ``S3 Bucket`` asset (``entries[4]``); rows of type ``DYNAMO_DB``,
    ``REDSHIFT`` or ``RDS`` add a ``System`` asset (``entries[5]``). Every
    relation and attribute is sent once per asset: the ``relations`` and
    ``attributes`` lists of each cached record remember the values already
    sent.
    """
    account = json.loads(x['account'])['inPlatformIdentifier']

    def attach_attribute(record, label, value):
        # Send the attribute only if this value was not sent for the record before.
        if value not in record['attributes']:
            record['attributes'].append(value)
            importService.add_attributes(record['entry'], label, value, 'string')

    def link_to_account(record, marker):
        # ``marker`` is only the de-duplication token stored in ``relations``.
        if marker not in record['relations']:
            record['relations'].append(marker)
            importService.add_relations(record['entry'], "00000000-0000-0000-0000-000000007054", "SOURCE", account, config['community_to_query'], account)

    # domain
    domains = entries[2]
    if account not in domains:
        domains[account] = {
            "entry": importService.get_domain(config['community_to_query'], "Technology Asset Domain", account),
        }

    # system
    systems = entries[3]
    if account not in systems:
        systems[account] = {
            "entry": importService.get_asset(config['community_to_query'], account, "System", account, account),
            "attributes": []
        }
    system = systems[account]

    attach_attribute(system, 'Platform', x['provider'])
    attach_attribute(system, 'Account Name', account)

    # if buckets
    if x['type'] == 'S3':
        # storage container
        name = x['name']
        buckets = entries[4]
        if name not in buckets:
            buckets[name] = {
                "entry": importService.get_asset(config['community_to_query'], account, "S3 Bucket", f"s3://{name}/", f"s3://{name}/"),
                "relations": [],
                "attributes": []
            }
        bucket = buckets[name]

        link_to_account(bucket, name)

        attach_attribute(bucket, 'Platform', x['provider'])
        attach_attribute(bucket, 'Account Name', account)
        attach_attribute(bucket, 'Region', x['regions'])
        attach_attribute(bucket, 'Created At', x['createdDate'])

    # if database
    if x['type'] in ('DYNAMO_DB', 'REDSHIFT', 'RDS'):
        name = x['name']
        databases = entries[5]
        if name not in databases:
            databases[name] = {
                "entry": importService.get_asset(config['community_to_query'], account, "System", name, name),
                "relations": [],
                "attributes": []
            }
        database = databases[name]

        link_to_account(database, account)

        attach_attribute(database, 'Platform', x['provider'])
        attach_attribute(database, 'Account Name', account)
        attach_attribute(database, 'Region', x['regions'])
        attach_attribute(database, 'Created At', x['createdDate'])
        attach_attribute(database, 'Principal Identifier', x['arn'])

In [15]:
def do_datastores_classifications(x, entries, importService, config):
    """Register the findings of one datastore/classification row.

    ``x['name_d']`` is the datastore name and ``x['name']`` the classification
    name. For ``S3`` rows the datastore is an ``S3 Bucket`` asset cached in
    ``entries[4]``; for ``DYNAMO_DB`` / ``REDSHIFT`` / ``RDS`` rows it is a
    ``System`` asset cached in ``entries[5]``. The datastore is related (once)
    to the data category and the data concept, then a measure (``entries[6]``),
    a dimension (``entries[7]``, created once per classification), a rule and a
    metric (both ``entries[8]``) are created; measure, rule and metric are
    re-created on every call.
    """
    account = json.loads(x['account'])['inPlatformIdentifier']

    def ensure_datastore(registry, key, asset_type, full_name):
        # Create the cached datastore record on first sight, then reuse it.
        if key not in registry:
            registry[key] = {
                "entry": importService.get_asset(config['community_to_query'], account, asset_type, full_name, full_name),
                "relations": [],
                "attributes": []
            }
        return registry[key]

    def link_once(record, target, relation_type, domain, community):
        # ``relations`` remembers the targets already linked from this record.
        if target not in record['relations']:
            record['relations'].append(target)
            importService.add_relations(record['entry'], relation_type, "TARGET", domain, community, target)

    def publish_findings(datastore, measure_relation_type, measure_target, rule_prefix, rule_target):
        concept = x['name']
        label = f"{concept} Total Matches"

        # measure
        measure_name = f"{datastore}:{concept}:Total Matches"
        measure = importService.get_asset("Governance council", "New Data Findings Metrics", "Measure", measure_name, label)
        entries[6][measure_name] = {"entry": measure}

        importService.add_attributes(measure, 'Count', x['recordCountInDatastore'], 'string')

        importService.add_relations(measure, "01930b23-1a84-7d44-b817-275206442bf6", "TARGET",  "Business Data Models", "Data Architects community",  concept)

        importService.add_relations(measure, measure_relation_type, "SOURCE",  account, config['community_to_query'], measure_target)

        # dimension
        if concept not in entries[7]:
            entries[7][concept] = {
                "entry": importService.get_asset("Governance council", "Data Findings Dimensions", "Data Findings Dimension", concept, concept)
            }

        # rule
        rule_name = f"{rule_prefix}:{concept}:Total Matches"
        rule = importService.get_asset("Governance council", "Data Findings Rules", "Data Findings Rule", rule_name, label)
        entries[8][f"{rule_name}:Rule"] = {"entry": rule}

        importService.add_relations(rule, "00000000-0000-0000-0000-000000007018", "SOURCE",  account, config['community_to_query'], rule_target)

        # metric
        metric = importService.get_asset("Governance council", "Data Findings Metrics", "Data Findings Metric", rule_name, label)
        entries[8][f"{rule_name}:Metric"] = {"entry": metric}

        importService.add_attributes(metric, 'Passing Fraction', x['recordCountInDatastore'], 'string')

        importService.add_relations(metric, "01931f87-3dca-7b65-a03c-dce0146ade76", "TARGET",  "Data Findings Dimensions", "Governance council", concept)

        importService.add_relations(metric, "01931feb-4b9a-7b6b-a456-e1a2759ceca4", "SOURCE",  "Data Findings Rules", "Governance council", rule_name)

    # if buckets
    if x['type'] == 'S3':
        # storage container
        datastore = x['name_d']
        bucket_uri = f"s3://{datastore}/"
        bucket = ensure_datastore(entries[4], datastore, "S3 Bucket", bucket_uri)

        link_once(bucket, x['classificationGroup'], "01930192-86fb-77b0-8baf-30a80dccb864", "Data categories", "Privacy and Risk community")
        link_once(bucket, x['name'], "01930192-f332-70fc-8572-9f7283c4cfd4", "Business Data Models", "Data Architects community")

        # Rule and metric names carry the bucket URI; the measure name does not.
        publish_findings(datastore, "01930b24-2617-722b-9502-8c30d4b3818c", bucket_uri, bucket_uri, bucket_uri)

    # if database
    if x['type'] in ('DYNAMO_DB', 'REDSHIFT', 'RDS'):
        datastore = x['name_d']
        database = ensure_datastore(entries[5], datastore, "System", datastore)

        link_once(database, x['classificationGroup'], "019465e7-438a-7115-8158-68545ff8d12d", "Data categories", "Privacy and Risk community")
        link_once(database, x['name'], "019465e8-5d94-76a6-a34b-68a3f8d7c74c", "Business Data Models", "Data Architects community")

        publish_findings(datastore, "019465e9-0c5a-7293-863b-adad740124cc", datastore, f"{datastore}", f"{datastore}")

In [16]:
def do_datastores_objects(x, entries, importService, config):
    """Register the object described by one datastore-objects row.

    ``S3`` rows create a ``File`` asset (``entries[9]``, re-created on every
    call) related to its bucket. ``DYNAMO_DB`` / ``REDSHIFT`` / ``RDS`` rows
    create, once each, a ``Database`` (``entries[10]``), a ``Schema``
    (``entries[11]``) and a ``Table`` (``entries[12]``) asset, each related to
    its parent. Full names are built as ``datastore>database>schema>table``.
    """
    account = json.loads(x['account'])['inPlatformIdentifier']

    def link_parent(record, parent, relation_type):
        # ``relations`` remembers the parents already linked from this record.
        if parent not in record['relations']:
            record['relations'].append(parent)
            importService.add_relations(record['entry'], relation_type, "SOURCE", account, config['community_to_query'], parent)

    # if bucket
    if x['type'] == 'S3':

        file = f"s3://{x['name_d']}/{x['relativePath']}"

        file_entry = importService.get_asset(config['community_to_query'], account, "File", file, x['relativePath'])
        entries[9][file] = {
            "entry": file_entry,
            "relations": []
        }

        importService.add_relations(file_entry, "00000000-0000-0000-0000-000000007060", "SOURCE", account, config['community_to_query'], f"s3://{x['name_d']}/")

        # importService.add_relations(entries[9][file]['entry'], "01943678-0ab4-7015-ba1f-0f9a168a6ade", "TARGET", "Data categories", "Privacy and Risk community", x['Category'])

        # importService.add_relations(entries[9][file]['entry'], "01943678-ebf1-7cd5-bc9c-c78b2d115f3c", "TARGET",  "Business Data Models", "Data Architects community", x['Classifier'])

    # if database
    if x['type'] in ('DYNAMO_DB', 'REDSHIFT', 'RDS'):
        # database
        database = x['dbName']
        database_key = f"{x['name_d']}>{database}"
        if database_key not in entries[10]:
            entries[10][database_key] = {
                "entry": importService.get_asset(config['community_to_query'], account, "Database", database_key, database),
                "relations": []
            }

        link_parent(entries[10][database_key], x['name_d'], "00000000-0000-0000-0000-000000007054")

        # schema
        # TODO: get schema name
        schema = 'pending'
        schema_key = f"{database_key}>{schema}"
        if schema_key not in entries[11]:
            entries[11][schema_key] = {
                "entry": importService.get_asset(config['community_to_query'], account, "Schema", schema_key, schema),
                "relations": [],
                "attributes": []
            }

        link_parent(entries[11][schema_key], database_key, "00000000-0000-0000-0000-000000007024")

        # table
        table = x['name']
        table_key = f"{schema_key}>{table}"
        if table_key not in entries[12]:
            entries[12][table_key] = {
                "entry": importService.get_asset(config['community_to_query'], account, "Table", table_key, table),
                "relations": [],
                "attributes": []
            }

        link_parent(entries[12][table_key], schema_key, "00000000-0000-0000-0000-000000007043")

            

In [17]:
def do_all_findings(classifications_df, datastores_df, datastores_exploded_df, datastores_classifications_df, session, config):
    """Build the import entries for all findings, save them per step and harvest them.

    The run id is today's date (``YYYYMMDD``). Any ``./runs/<run id>``
    directory and ``./runs/<run id>.json.*`` files from an earlier run are
    removed first. Rows of the three frames are fed to ``do_classifications``,
    ``do_datastores`` and ``do_datastores_classifications``; when
    ``config['if_datastores_objects']`` is truthy, the ``DATASTORES_OBJECTS``
    table is read from ``session``, joined to the datastores and fed to
    ``do_datastores_objects``. ``datastores_exploded_df`` is not used.

    Returns:
        Whatever ``importService.harvest`` returns.
    """
    logging.getLogger().debug("do all findings")

    run_id = time.strftime("%Y%m%d")

    # Start from a clean slate for this run id.
    shutil.rmtree(f'./runs/{run_id}', ignore_errors=True)

    for stale_file in glob.glob(f'./runs/{run_id}.json.*'):
        os.remove(stale_file)

    import_service = ImportService(run_id, 1, 150000)

    # One registry per step; the do_* functions index into this list by position.
    entries = [{} for _ in range(13)]

    classifications_df.apply(
        lambda row: do_classifications(row, entries, import_service, config), axis=1
    )

    datastores_df.apply(
        lambda row: do_datastores(row, entries, import_service, config), axis=1
    )

    datastores_classifications_df.apply(
        lambda row: do_datastores_classifications(row, entries, import_service, config), axis=1
    )

    if config['if_datastores_objects']:
        # datastores_objects_df = pd.DataFrame([vv for k,v in enumerate(get_datastores_objects(datastores_df.query('scanningState=="Scanned"'), config)) for kk,vv in enumerate(v)])

        # session.write_pandas(datastores_objects_df, "DATASTORES_OBJECTS", auto_create_table=True, overwrite=True)

        datastores_objects_df = session.table("DATASTORES_OBJECTS").to_pandas()

        datastores_objects_df = datastores_objects_df.join(datastores_df.set_index('uid'), on='datastoreUid', rsuffix='_d')

        datastores_objects_df.apply(
            lambda row: do_datastores_objects(row, entries, import_service, config), axis=1
        )

    # Collect the bare entries of each step and save every step separately.
    all_entries = [[record['entry'] for record in step.values()] for step in entries]

    for step_index, step_entries in enumerate(all_entries):
        import_service.save(step_entries, "./runs", run_id, step_index, True)

    return import_service.harvest(get_collibra(config), config, "./runs", run_id)


## Select 

the community in which you want to find your buckets

In [18]:
# Index every community returned by the Collibra API by its name, then offer
# the names (sorted) in a selection widget.
communities = {}

collibra = get_collibra(get_config())

response = collibra.get("session").get(f"{collibra.get('endpoint')}/communities")

communities.update((item.get("name"), item) for item in response.json()["results"])

community = widgets.Select(
    options=sorted(f"{name}" for name in communities),
    description='Communities',
    layout=widgets.Layout(width='40%'),
)

display(community)



Select(description='Communities', layout=Layout(width='40%'), options=('Airflow', 'Amazon', 'Asia', 'Asset cha…

## Choose 

whether you want to register all finding examples

In [19]:
# Opt-in switch, unticked by default; main() copies its value into
# config['if_datastores_objects'].
if_datastores_objects = widgets.Checkbox(
    value=False,
    description='Check to register your datastores objects',
    indent=False,
)

display(if_datastores_objects)

Checkbox(value=False, description='Check to register your datastores objects', indent=False)

In [22]:
def main():
    """Run the findings import end to end and print the results as JSON.

    Loads the configuration, obtains a Cyera token, copies the current values
    of the `community` and `if_datastores_objects` widgets into the
    configuration, fetches the findings data and passes it to
    `do_all_findings`. The returned results are printed as indented JSON.
    """
    logging.getLogger().setLevel(logging.DEBUG)

    config = get_token(get_config())

    # Widget selections made in the cells above drive this run.
    config.update({
        'community_to_query': community.value,
        'if_datastores_objects': if_datastores_objects.value,
    })

    (
        classifications_df,
        datastores_df,
        datastores_exploded_df,
        datastores_classifications_df,
        session,
    ) = get_data_findings(config)

    results = do_all_findings(
        classifications_df,
        datastores_df,
        datastores_exploded_df,
        datastores_classifications_df,
        session,
        config,
    )

    print(json.dumps(results, indent=2))


if __name__ == '__main__':
    main()
    

In [18]:
# Obtain a Cyera token, then load the frames that the dashboard cells below read.
config = get_token(get_config())

(
    classifications_df,
    datastores_df,
    datastores_exploded_df,
    datastores_classifications_df,
    session,
) = get_data_findings(config)


## General Dashboard

In [19]:
# Number of distinct datastores (by uid) for each cloud provider.
datastores_per_cloud_platform = (
    datastores_df[['provider', 'uid']]
    .drop_duplicates()
    .groupby(by=['provider'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Number of distinct datastores (by uid) for each data type.
datastores_per_datatype = (
    datastores_df[['dataType', 'uid']]
    .drop_duplicates()
    .groupby(by=['dataType'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Label every figure with the group value it was actually computed from.
# Hard-coded labels on positional rows would mislabel the numbers as soon as
# the providers / data types differ, and would raise IndexError when fewer
# groups than expected exist.
summary_parts = [
    f"{label} {count}"
    for frame, column in (
        (datastores_per_cloud_platform, 'provider'),
        (datastores_per_datatype, 'dataType'),
    )
    for label, count in zip(frame[column], frame['count'])
]

display(" ".join(summary_parts))

'AWS 95 Structured 45 Unstructured 50'

## Resources Summary


In [20]:
# Number of distinct datastores (by uid) for each createdYYMM value.
datastores_per_creation_date = (
    datastores_df[['createdYYMM', 'uid']]
    .drop_duplicates()
    .groupby(by=['createdYYMM'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Shared encoding; the bar and label layers below are both derived from it.
c = (
    alt.Chart(datastores_per_creation_date)
    .encode(
        x=alt.X('createdYYMM:O', axis=alt.Axis(labels=True, labelAngle=0)).timeUnit("yearmonth").title('Created date'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Datastores'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text('count'),
        tooltip=["createdYYMM:T", "count"],
    )
    .properties(title='Number of datastores per date', width=1330, height=200)
)

# Bars with the count printed above each bar; grid and view border hidden.
(
    (c.mark_bar() + c.mark_text(align='center', dy=-10))
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

The analysis provides a breakdown of the **datastores** identified across different **regions** and their **types**. 

As illustrated in the graphs below, more than **70%** of the resources with data findings are located in the **us-east-1** region, nearly **56%** are categorized as **buckets**, while **44%** are classified as **databases**.

In [21]:
# Number of distinct datastores (by uid) for each region value.
datastores_per_regions = (
    datastores_df[['regions', 'uid']]
    .drop_duplicates()
    .groupby(by=['regions'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Left-hand chart of the region/type panel rendered in a later cell.
c1 = (
    alt.Chart(datastores_per_regions)
    .encode(
        x=alt.X('regions', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore region'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Datastores'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text('count'),
        tooltip=["regions", "count"],
    )
    .properties(title='Number of datastores per region', width=640, height=200)
)

In [22]:
# Number of distinct datastores (by uid) for each datastore type.
datastores_per_type = (
    datastores_df[['type', 'uid']]
    .drop_duplicates()
    .groupby(by=['type'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Right-hand chart of the region/type panel rendered in the next cell.
c2 = (
    alt.Chart(datastores_per_type)
    .encode(
        x=alt.X('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore type'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Datastores'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text('count'),
        tooltip=["type", "count"],
    )
    .properties(title='Number of datastores per type', width=640, height=200)
)

In [23]:
# Side-by-side panel: each chart is a bar layer plus a text layer showing the count.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

The analysis offers a comprehensive overview of the identified **datastores**, highlighting their **sensitivity** and **classifications**. 

As shown in the graphs below, **44%** of the resources exhibit significant findings, categorized as **sensitive** and **very sensitive** data with **Personal**, **Financial**, and **Health** being in the top 5 categories.

In [24]:
# Number of distinct datastores (by uid) for each classification group.
datastores_per_category = (
    datastores_exploded_df[['classificationGroups', 'uid']]
    .drop_duplicates()
    .groupby(by=['classificationGroups'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Left-hand chart of the classification/sensitivity panel rendered in a later cell.
c1 = (
    alt.Chart(datastores_per_category)
    .encode(
        x=alt.X('classificationGroups', axis=alt.Axis(labels=True, labelAngle=0)).title('Classification groups'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Datastores'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text('count'),
        tooltip=["classificationGroups", "count"],
    )
    .properties(title='Number of datastores per classification group', width=640, height=200)
)


In [25]:
# Number of distinct datastores (by uid) for each sensitivity level.
datastores_per_sensitivity = (
    datastores_df[['sensitivity', 'uid']]
    .drop_duplicates()
    .groupby(by=['sensitivity'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Right-hand chart of the classification/sensitivity panel rendered in the next cell.
c2 = (
    alt.Chart(datastores_per_sensitivity)
    .encode(
        x=alt.X('sensitivity', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore sensitivity'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Datastores'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text('count'),
        tooltip=["sensitivity", "count"],
    )
    .properties(title='Number of datastores per sensitivity', width=640, height=200)
)


In [26]:
# Side-by-side panel: each chart is a bar layer plus a text layer showing the count.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

In [27]:
# Number of distinct datastores (by uid) for each scanning state.
datastores_per_state = (
    datastores_exploded_df[['scanningState', 'uid']]
    .drop_duplicates()
    .groupby(by=['scanningState'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Left-hand chart of the panel rendered in a later cell.
c1 = (
    alt.Chart(datastores_per_state)
    .encode(
        x=alt.X('scanningState', axis=alt.Axis(labels=True, labelAngle=0)).title('Scanning state'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Datastores'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text('count'),
        tooltip=["scanningState", "count"],
    )
    .properties(title='Number of datastores per scanning state', width=640, height=200)
)


In [28]:
# Expand each datastore's recordCountBySensitivity JSON object into one
# (uid, sensitivity, count) row per key of that object.
df = pd.DataFrame(
    [
        (uid, sensitivity, count)
        for uid, counts_json in datastores_df[['uid', 'recordCountBySensitivity']].itertuples(index=False)
        for sensitivity, count in json.loads(counts_json).items()
    ],
    columns=['uid', 'sensitivity', 'count'],
)

# Total record count per sensitivity level. The summed column is already
# called "count", so no rename is needed (the selection has no "uid" column).
datastores_per_sensitivity = (
    df[['sensitivity', 'count']]
    .groupby(by=['sensitivity'])
    .sum()
    .reset_index()
)

# Right-hand chart of the panel rendered in the next cell. The values are
# summed record counts, so the titles say "records" rather than "datastores".
c2 = (
    alt.Chart(datastores_per_sensitivity)
    .encode(
        x=alt.X('sensitivity', axis=alt.Axis(labels=True, labelAngle=0)).title('Record sensitivity'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Records'),
        color=alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        text=alt.Text("count"),
        tooltip=["sensitivity", "count"],
    )
    .properties(title='Number of records per sensitivity', width=640, height=200)
)

In [29]:
# Side-by-side panel: each chart is a bar layer plus a text layer showing the count.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

## Data Findings Summary

The analysis offers a detailed overview of the **unique findings** discovered across various **regions** and their **classifications**. 

As demonstrated in the graphs below, nearly **68%** of the resources containing data findings are situated in the **us-east-1** region. 

Over **60%** of these resources are categorized as **buckets**, while around **40%** are identified as **databases**. This reinforces our earlier observations that buckets and databases are the most critical components.

In [30]:

# Number of rows of datastores_classifications_df (findings) for each region value.
findings_per_regions = (
    datastores_classifications_df[['regions', 'uid']]
    .groupby(by=['regions'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Left-hand chart of the findings region/type panel rendered in a later cell.
c1 = (
    alt.Chart(findings_per_regions)
    .encode(
        x=alt.X('regions', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore region'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Findings'),
        color=alt.Color('count', legend=None).scale(scheme="lightorange", reverse=False),
        text=alt.Text('count'),
        tooltip=["regions", "count"],
    )
    .properties(title='Number of findings per region', width=640, height=200)
)

In [31]:
# Number of rows of datastores_classifications_df (findings) for each datastore type.
findings_per_type = (
    datastores_classifications_df[['type', 'uid']]
    .groupby(by=['type'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Right-hand chart of the findings region/type panel rendered in the next cell.
c2 = (
    alt.Chart(findings_per_type)
    .encode(
        x=alt.X('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore type'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Findings'),
        color=alt.Color('count', legend=None).scale(scheme="lightorange", reverse=False),
        text=alt.Text('count'),
        tooltip=["type", "count"],
    )
    .properties(title='Number of findings per type', width=640, height=200)
)

In [32]:
# Side-by-side panel: each chart is a bar layer plus a text layer showing the count.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

The analysis provides a thorough overview of the identified **resources** and their **classifications**. 

The graph below illustrates that key data points, including **names**, **emails**, **phone numbers**, **addresses**, **gender**, and **geo location**, are prominently featured.

In [33]:
# Number of rows of datastores_classifications_df (findings) for each classifier name.
findings_per_classifier = (
    datastores_classifications_df[['name', 'uid']]
    .groupby(by=['name'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Shared encoding; the bar and label layers below are both derived from it.
c = (
    alt.Chart(findings_per_classifier)
    .encode(
        x=alt.X('name', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Findings'),
        color=alt.Color('count', legend=None).scale(scheme="lightorange", reverse=False),
        text=alt.Text('count'),
        tooltip=["name", "count"],
    )
    .properties(title='Number of findings per classifier', width=1330, height=200)
)

# Bars with the count printed above each bar; grid and view border hidden.
(
    (c.mark_bar() + c.mark_text(align='center', dy=-10))
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)


### The most bang for the buck. 

In [34]:
# Number of findings rows for each (datastore type, classifier name) pair.
findings_per_type_and_classifier = (
    datastores_classifications_df[['type', 'name', 'uid']]
    .groupby(by=['type', 'name'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Heatmap encoding: classifier on x, datastore type on y, colour by count.
c = (
    alt.Chart(findings_per_type_and_classifier)
    .encode(
        x=alt.X('name', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        y=alt.Y('type', axis=alt.Axis(labels=False, labelAngle=0)).title('Datastore type'),
        color=alt.Color('count', legend=None).scale(scheme="orangered", reverse=False),
        text=alt.Text('count'),
        tooltip=["name", "type", "count"],
    )
    .properties(title='Number of findings per datastore type and classifier', width=1330, height=200)
)

c.mark_rect()


In [35]:
# Number of findings rows for each (datastore engine, classifier name) pair.
findings_per_engine_and_classifier = (
    datastores_classifications_df[['engine', 'name', 'uid']]
    .groupby(by=['engine', 'name'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Heatmap encoding: classifier on x, datastore engine on y, colour by count.
c = (
    alt.Chart(findings_per_engine_and_classifier)
    .encode(
        x=alt.X('name', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        y=alt.Y('engine', axis=alt.Axis(labels=False, labelAngle=0)).title('Datastore engine'),
        color=alt.Color('count', legend=None).scale(scheme="orangered", reverse=False),
        text=alt.Text('count'),
        tooltip=["name", "engine", "count"],
    )
    .properties(title='Number of findings per datastore engine and classifier', width=1330, height=200)
)

c.mark_rect()


When spending time or money, it is essential to insist on getting the most bang for the buck.

In [36]:
# Number of findings rows for each (datastore type, sensitivity) pair.
findings_per_type_and_sensitivity = (
    datastores_classifications_df[['type', 'sensitivity', 'uid']]
    .groupby(by=['type', 'sensitivity'])
    .count()
    .reset_index()
    .rename(columns={"uid": "count"})
)

# Heatmap encoding: sensitivity on x, datastore type on y, colour by count.
c = (
    alt.Chart(findings_per_type_and_sensitivity)
    .encode(
        x=alt.X('sensitivity', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding sensitivity'),
        y=alt.Y('type', axis=alt.Axis(labels=False, labelAngle=0)).title('Datastore type'),
        color=alt.Color('count', legend=None).scale(scheme="orangered", reverse=False),
        text=alt.Text('count'),
        tooltip=["sensitivity", "type", "count"],
    )
    .properties(title='Number of findings per datastore type and sensitivity', width=1330, height=200)
)

# Heatmap cells with the count overlaid in bold white text.
(
    c.mark_rect()
    + c.mark_text(baseline="middle", fontWeight="bold").encode(color=alt.value("white"))
)

## Total Matches Summary

The analysis provides a comprehensive overview of the **total matches** identified across **regions** and their **classifications**. 

As illustrated in the graphs below, more than **83%** of the resources containing data findings are located in the **us-east-1** region. 

Furthermore, nearly **94%** of these resources are classified as **buckets**, while merely **6%** are recognized as databases. If you're looking to begin your work, start with your buckets..

In [37]:
# Sum of recordCountInDatastore for each region value.
total_matches_per_regions = (
    datastores_classifications_df[['regions', 'recordCountInDatastore']]
    .groupby(by=['regions'])
    .sum()
    .reset_index()
    .rename(columns={"recordCountInDatastore": "count"})
)

# Left-hand chart of the total-matches region/type panel rendered in a later cell.
c1 = (
    alt.Chart(total_matches_per_regions)
    .encode(
        x=alt.X('regions', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore region'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('recordCountInDatastore'),
        color=alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        text=alt.Text('count'),
        tooltip=["regions", "count"],
    )
    .properties(title='Number of total matches per region', width=640, height=200)
)

In [38]:
# Sum of recordCountInDatastore for each datastore type.
total_matches_per_type = (
    datastores_classifications_df[['type', 'recordCountInDatastore']]
    .groupby(by=['type'])
    .sum()
    .reset_index()
    .rename(columns={"recordCountInDatastore": "count"})
)

# Right-hand chart of the total-matches region/type panel rendered in the next cell.
c2 = (
    alt.Chart(total_matches_per_type)
    .encode(
        x=alt.X('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Datastore type'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('recordCountInDatastore'),
        color=alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        text=alt.Text('count'),
        tooltip=["type", "count"],
    )
    .properties(title='Number of total matches per type', width=640, height=200)
)


In [39]:
# Side-by-side panel: each chart is a bar layer plus a text layer showing the count.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

.. and get rid of mushrooms in your yard.

In [40]:
# Sum of recordCountInDatastore for each classifier name.
total_matches_per_classifier = (
    datastores_classifications_df[['name', 'recordCountInDatastore']]
    .groupby(by=['name'])
    .sum()
    .reset_index()
    .rename(columns={"recordCountInDatastore": "count"})
)

# Shared encoding; the bar and label layers below are both derived from it.
# The y axis carries summed match counts, so it is titled "Total matches"
# (not "Datastores") to agree with the chart title.
c = (
    alt.Chart(total_matches_per_classifier)
    .encode(
        x=alt.X('name', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        y=alt.Y('count', axis=alt.Axis(labels=False)).title('Total matches'),
        color=alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        text=alt.Text('count'),
        tooltip=["name", "count"],
    )
    .properties(title='Number of total matches per classifier', width=1330, height=200)
)

# Bars with the count printed above each bar; grid and view border hidden.
(
    (c.mark_bar() + c.mark_text(align='center', dy=-10))
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)


Prioritize addressing the critical findings first, followed by the high findings.

In [41]:
# Sum of recordCountInDatastore for each (datastore type, classifier name) pair.
total_matches_per_type_and_classifier = (
    datastores_classifications_df[['type', 'name', 'recordCountInDatastore']]
    .groupby(by=['type', 'name'])
    .sum()
    .reset_index()
    .rename(columns={"recordCountInDatastore": "count"})
)

# Heatmap encoding: classifier on x, datastore type on y, colour by summed count.
c = (
    alt.Chart(total_matches_per_type_and_classifier)
    .encode(
        x=alt.X('name', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        y=alt.Y('type', axis=alt.Axis(labels=False, labelAngle=0)).title('Datastore type'),
        color=alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        text=alt.Text('count'),
        tooltip=["name", "type", "count"],
    )
    .properties(title='Number of total matches per datastore type and classifier', width=1330, height=200)
)

c.mark_rect()

In [42]:
# Sum of recordCountInDatastore for each (datastore engine, classifier name) pair.
total_matches_per_engine_and_classifier = (
    datastores_classifications_df[['engine', 'name', 'recordCountInDatastore']]
    .groupby(by=['engine', 'name'])
    .sum()
    .reset_index()
    .rename(columns={"recordCountInDatastore": "count"})
)

# Heatmap encoding: classifier on x, datastore engine on y, colour by summed count.
# The y axis encodes `engine`, so it is titled "Datastore engine" (it previously
# read "Datastore type", copied from the per-type chart).
c = (
    alt.Chart(total_matches_per_engine_and_classifier)
    .encode(
        x=alt.X('name', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        y=alt.Y('engine', axis=alt.Axis(labels=False, labelAngle=0)).title('Datastore engine'),
        color=alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        text=alt.Text('count'),
        tooltip=["name", "engine", "count"],
    )
    .properties(title='Number of total matches per datastore engine and classifier', width=1330, height=200)
)

c.mark_rect()

In [43]:
# Sum of recordCountInDatastore for each (datastore type, sensitivity) pair.
total_matches_per_type_and_sensitivity = (
    datastores_classifications_df[['type', 'sensitivity', 'recordCountInDatastore']]
    .groupby(by=['type', 'sensitivity'])
    .sum()
    .reset_index()
    .rename(columns={"recordCountInDatastore": "count"})
)

# Heatmap encoding: sensitivity on x, datastore type on y, colour by summed count.
c = (
    alt.Chart(total_matches_per_type_and_sensitivity)
    .encode(
        x=alt.X('sensitivity', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding sensitivity'),
        y=alt.Y('type', axis=alt.Axis(labels=False, labelAngle=0)).title('Datastore type'),
        color=alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        text=alt.Text('count'),
        tooltip=["sensitivity", "type", "count"],
    )
    .properties(title='Number of total matches per datastore type and sensitivity', width=1330, height=200)
)

# Heatmap cells with the count overlaid in bold white text.
(
    c.mark_rect()
    + c.mark_text(baseline="middle", fontWeight="bold").encode(color=alt.value("white"))
)


In [123]:
#done