In [34]:
%load_ext autoreload
%autoreload 2

In [1]:
import codecs
import csv
import glob
import json
import logging
import os
import re
import requests
import shutil
import time

from contextlib import closing
from datetime import datetime
from requests.auth import HTTPBasicAuth
from services import ImportService

import pandas as pd
import altair as alt
import ipywidgets as widgets

from snowflake.snowpark import Session


In [2]:
# Retry policy used by query(): number of re-sends allowed after a non-OK
# response, and the pause before each re-send (seconds, passed to time.sleep).
MAX_RETRIES_FOR_QUERY = 5
# Upper bound on how many FAILED/EXPIRED reruns get_report_url_and_status()
# tolerates before giving up.
MAX_RETRIES_FOR_DOWNLOAD_REPORT = 5
RETRY_TIME_FOR_QUERY = 2
# Seconds slept after a report rerun has been requested.
RETRY_TIME_FOR_DOWNLOAD_REPORT = 60
# Seconds slept before each report status poll.
CHECK_INTERVAL_FOR_DOWNLOAD_REPORT = 20

# Token endpoints recognised by generate_authentication_params(): URLs in
# AUTH0_URLS get audience 'beyond-api', URLs in COGNITO_URLS get 'wiz-api'.
AUTH0_URLS = ['https://auth.wiz.io/oauth/token', 'https://auth0.gov.wiz.io/oauth/token']
COGNITO_URLS = ['https://auth.app.wiz.io/oauth/token', 'https://auth.gov.wiz.io/oauth/token']

# GraphQL query listing projects. `pageInfo` is requested so that callers can
# follow the cursor (`endCursor`) while `hasNextPage` is true; without it the
# pagination loop in get_projects() cannot advance past the first page.
GET_PROJECTS_QUERY = (
    """
      query ProjectsTable(
        $filterBy: ProjectFilters
        $first: Int
        $after: String
        $orderBy: ProjectOrder
      ) {
        projects(
          filterBy: $filterBy
          first: $first
          after: $after
          orderBy: $orderBy
        ) {
          nodes {
            id
            name
            isFolder
            archived
            businessUnit
            description
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    """
)

# GraphQL query used by get_issues(). Reads the `issuesV2` connection (aliased
# to `issues`) and, per issue, selects the source rule (Control, CloudEventRule
# or CloudConfigurationRule fragment), timestamps, projects, status/severity,
# the entity snapshot, service tickets and notes. `pageInfo` is included for
# cursor-based pagination.
GET_ISSUES_QUERY = (
    """
    query IssuesTable($filterBy: IssueFilters, $first: Int, $after: String, $orderBy: IssueOrder) {
    issues: issuesV2(
        filterBy: $filterBy
        first: $first
        after: $after
        orderBy: $orderBy
    ) {
        nodes {
        id
        sourceRule {
            __typename
            ... on Control {
            id
            name
            controlDescription: description
            resolutionRecommendation
            securitySubCategories {
                title
                category {
                name
                framework {
                    name
                }
                }
            }
            risks
            }
            ... on CloudEventRule {
            id
            name
            cloudEventRuleDescription: description
            sourceType
            type
            risks
            securitySubCategories {
                title
                category {
                name
                framework {
                    name
                }
                }
            }
            }
            ... on CloudConfigurationRule {
            id
            name
            cloudConfigurationRuleDescription: description
            remediationInstructions
            serviceType
            risks
            securitySubCategories {
                title
                category {
                name
                framework {
                    name
                }
                }
            }
            }
        }
        createdAt
        updatedAt
        dueAt
        type
        resolvedAt
        statusChangedAt
        projects {
            id
            name
            slug
            businessUnit
            riskProfile {
            businessImpact
            }
        }
        status
        severity
        entitySnapshot {
            id
            type
            nativeType
            name
            status
            cloudPlatform
            cloudProviderURL
            providerId
            region
            resourceGroupExternalId
            subscriptionExternalId
            subscriptionName
            subscriptionTags
            tags
            createdAt
            externalId
        }
        serviceTickets {
            externalId
            name
            url
        }
        notes {
            createdAt
            updatedAt
            text
            user {
            name
            email
            }
            serviceAccount {
            name
            }
        }
        }
        pageInfo {
        hasNextPage
        endCursor
        }
    }
    }
    """
)

# GraphQL query used by get_resources(). Reads the `cloudResources` connection
# with cursor pagination; each node is expanded through CloudResourceFragment,
# which includes the nested `graphEntity` (with its `properties` and projects).
GET_RESOURCES_QUERY = (
    """
      query CloudResourceSearch(
          $filterBy: CloudResourceFilters
          $first: Int
          $after: String
        ) {
          cloudResources(
            filterBy: $filterBy
            first: $first
            after: $after
          ) {
            nodes {
              ...CloudResourceFragment
            }
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
        fragment CloudResourceFragment on CloudResource {
          id
          name
          type
          subscriptionId
          subscriptionExternalId
          graphEntity{
            id
            providerUniqueId
            name
            type
            projects {
              id
            }
            properties
            firstSeen
            lastSeen
          }
        }
    """
)

# GraphQL query listing reports (id and name) with cursor pagination.
# NOTE(review): no function in the visible part of this notebook references
# this constant — confirm whether it is still needed.
GET_REPORT_QUERY = (
    """
    query ReportsTable($filterBy: ReportFilters, $first: Int, $after: String) {
      reports(first: $first, after: $after, filterBy: $filterBy) {
        nodes {
          id
          name
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    """
)

# GraphQL mutation used by create_report(); returns the id of the new report.
CREATE_REPORT_MUTATION = (
    """
    mutation CreateReport($input: CreateReportInput!) {
      createReport(input: $input) {
        report {
          id
        }
      }
    }
    """
)

# GraphQL mutation used by rerun_report(); takes the report id as `$reportId`
# and returns the id of the report that was rerun.
RERUN_REPORT_MUTATION = (
    """
    mutation RerunReport($reportId: ID!) {
        rerunReport(input: { id: $reportId }) {
            report {
                id
            }
        }
    }
    """
)

# GraphQL query used by get_report_url_and_status(); returns the download URL
# and status of the report's last run.
DOWNLOAD_REPORT_QUERY = (
    """
    query ReportDownloadUrl($reportId: ID!) {
        report(id: $reportId) {
            lastRun {
                url
                status
            }
        }
    }
    """
)

In [3]:
def x(l, k, v):
    """Assign ``v`` to ``l[k]`` in place (statement-free setter, returns None)."""
    l[k] = v

def get_config():
    """Read ``config.json`` from the current working directory.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    logging.getLogger().debug("get config")

    with open('config.json', "r") as config_file:
        return json.load(config_file)

def generate_authentication_params(config):
    """Build the client-credentials form fields for the configured token URL.

    The ``audience`` value depends on which list the token URL belongs to:
    ``'beyond-api'`` for AUTH0_URLS and ``'wiz-api'`` for COGNITO_URLS.

    Args:
        config: Mapping with ``wizio_token_url``, ``wizio_client_id`` and
            ``wizio_client_secret``.

    Returns:
        dict with ``grant_type``, ``audience``, ``client_id`` and
        ``client_secret``.

    Raises:
        Exception: If the token URL is in neither list.
    """
    logging.getLogger().debug("generate authentication params")

    token_url = config['wizio_token_url']

    if token_url in AUTH0_URLS:
        audience = 'beyond-api'
    elif token_url in COGNITO_URLS:
        audience = 'wiz-api'
    else:
        raise Exception('Error: wrong token url')

    return {
        'grant_type': 'client_credentials',
        'audience': audience,
        'client_id': config['wizio_client_id'],
        'client_secret': config['wizio_client_secret']
    }

def get_token(config):
    """Request an access token and store it in ``config['wizio_token']``.

    Args:
        config: Mapping with ``wizio_token_url`` plus the credentials read by
            ``generate_authentication_params``. It is mutated in place.

    Returns:
        The same ``config`` object, now carrying ``wizio_token``.

    Raises:
        Exception: If the response status is not OK, or the response body has
            no ``access_token``.
    """
    logging.getLogger().debug("get token")

    # NOTE(review): no timeout is passed to requests.post here.
    auth_response = requests.post(
        config['wizio_token_url'],
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        data=generate_authentication_params(config)
    )

    if auth_response.status_code != requests.codes.ok:
        raise Exception(f'Error: {auth_response.text}')

    payload = auth_response.json()
    access_token = payload.get('access_token')

    if not access_token:
        raise Exception(f'Error: {payload.get("message")}')

    config['wizio_token'] = access_token

    return config

def send_request(config, query, variables):
    """POST a GraphQL operation to the API endpoint named in ``config``.

    Args:
        config: Mapping with ``wizio_api_endpoint_url`` and the bearer token
            under ``wizio_token`` (``get_token`` stores it there).
        query: GraphQL document text.
        variables: Mapping of GraphQL variables sent alongside the document.

    Returns:
        The ``requests`` response object, unparsed.

    Raises:
        Exception: If ``wizio_token`` is absent from ``config`` or empty.
    """
    logging.getLogger().debug("send request")

    # .get() so that a config that never went through get_token() produces the
    # explicit error below instead of a bare KeyError.
    token = config.get('wizio_token')

    if not token:
        raise Exception('Error: wizio_token not found')

    return requests.post(
        config['wizio_api_endpoint_url'],
        headers={'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token},
        json={'query': query, 'variables': variables}
    )

def query(config, query, variables):
    """Run a GraphQL operation with retries and return its ``data`` payload.

    A non-OK response is retried up to MAX_RETRIES_FOR_QUERY times, sleeping
    RETRY_TIME_FOR_QUERY seconds before each retry. Unauthorized, forbidden and
    not-found responses are never retried, whether they occur on the first
    attempt or on a later one.

    Args:
        config: Configuration mapping forwarded to ``send_request``.
        query: GraphQL document text.
        variables: Mapping of GraphQL variables.

    Returns:
        The value under ``data`` in the JSON response.

    Raises:
        Exception: On a non-retryable status, when retries are exhausted, or
            when the response carries no ``data`` (the message then contains
            the response's ``errors``).
    """
    logger = logging.getLogger()
    logger.debug("query")

    non_retryable = (
        requests.codes.unauthorized,
        requests.codes.forbidden,
        requests.codes.not_found,
    )

    retries = 0

    while True:
        response = send_request(config, query, variables)

        if response.status_code == requests.codes.ok:
            break

        # Re-sending cannot fix an auth or not-found failure, so fail at once.
        if response.status_code in non_retryable or retries >= MAX_RETRIES_FOR_QUERY:
            raise Exception(f'Error: {response.text}')

        time.sleep(RETRY_TIME_FOR_QUERY)

        retries += 1

    # Parse the body once and reuse it for the check, the log and the return.
    payload = response.json()
    data = payload.get('data')

    if not data:
        raise Exception(f'Error: {payload.get("errors")}')

    logger.debug(f"Info: {data}")

    return data

def get_projects(config):
    """Fetch project nodes, following cursor pagination when available.

    Requests pages of 500 non-archived, non-folder, non-root projects. Paging
    stops when the response has no ``pageInfo``, when ``hasNextPage`` is falsy,
    or when no ``endCursor`` is returned. A malformed response ends the loop
    (best effort) and is logged instead of being dropped silently.

    Args:
        config: Configuration mapping forwarded to ``query``.

    Returns:
        list of the project nodes collected so far.
    """
    logger = logging.getLogger()
    logger.debug("get projects")

    variables = {
        "first": 500,
        "filterBy": {
            "includeArchived": False,
            "isFolder": False,
            "root": False
        }
    }

    nodes = []

    while True:
        response = query(config, GET_PROJECTS_QUERY, variables)

        try:
            page = response['projects']
            nodes.extend(page['nodes'])
            page_info = page.get('pageInfo') or {}

        except Exception as error:
            logger.warning(f"get projects: stopping pagination, unexpected response: {error!r}")
            break

        end_cursor = page_info.get('endCursor')

        # Without a cursor the same page would be requested again and again.
        if not page_info.get('hasNextPage') or not end_cursor:
            break

        variables['after'] = end_cursor

    return nodes

def get_issues(config, project_id):
    """Fetch issue nodes for one project, following cursor pagination.

    Paging stops when ``hasNextPage`` is falsy or no ``endCursor`` is returned.
    A malformed response ends the loop (best effort) and is logged instead of
    being dropped silently.

    Args:
        config: Configuration mapping forwarded to ``query``.
        project_id: Project identifier placed in ``filterBy.project``.

    Returns:
        list of the issue nodes collected so far.
    """
    logger = logging.getLogger()
    logger.debug("get issues")

    variables = {
        "first": 500,
        "filterBy": {
            "project": [
                project_id
            ],
        },
        # NOTE(review): "severity" and "stackLayer" are sent as top-level
        # variables, but GET_ISSUES_QUERY declares only $filterBy, $first,
        # $after and $orderBy. They were presumably meant to live inside
        # "filterBy" — confirm against the IssueFilters schema before moving.
        "severity": ["CRITICAL"],
        "stackLayer": ["APPLICATION_AND_DATA", "DATA_STORES"]
    }

    nodes = []

    while True:
        response = query(config, GET_ISSUES_QUERY, variables)

        try:
            page = response['issues']
            nodes.extend(page['nodes'])
            page_info = page.get('pageInfo') or {}

        except Exception as error:
            logger.warning(f"get issues: stopping pagination, unexpected response: {error!r}")
            break

        end_cursor = page_info.get('endCursor')

        # Without a cursor the same page would be requested again and again.
        if not page_info.get('hasNextPage') or not end_cursor:
            break

        variables['after'] = end_cursor

    return nodes

def get_resources(config, project_id):
    """Fetch cloud resource nodes for one project, following cursor pagination.

    Paging stops when ``hasNextPage`` is falsy or no ``endCursor`` is returned.
    A malformed response ends the loop (best effort) and is logged instead of
    being dropped silently.

    Args:
        config: Configuration mapping forwarded to ``query``.
        project_id: Project identifier placed in ``filterBy.projectId``.

    Returns:
        list of the cloud resource nodes collected so far.
    """
    logger = logging.getLogger()
    logger.debug("get resources")

    variables = {
        "first": 500,
        "filterBy": {
            "projectId": [
                project_id
            ],
        }
    }

    nodes = []

    while True:
        response = query(config, GET_RESOURCES_QUERY, variables)

        try:
            page = response['cloudResources']
            nodes.extend(page['nodes'])
            page_info = page.get('pageInfo') or {}

        except Exception as error:
            logger.warning(f"get resources: stopping pagination, unexpected response: {error!r}")
            break

        end_cursor = page_info.get('endCursor')

        # Without a cursor the same page would be requested again and again.
        if not page_info.get('hasNextPage') or not end_cursor:
            break

        variables['after'] = end_cursor

    return nodes

def create_report(config, project_id, report_prefix, report_type):
    """Create a report for a project and return the new report's id.

    The report name is ``{report_prefix}_{datetime.now()}`` with every space,
    dot, colon and hyphen removed (the prefix is stripped the same way).

    Args:
        config: Configuration mapping forwarded to ``query``.
        project_id: Project the report is created for.
        report_prefix: Text placed before the timestamp in the report name.
        report_type: Value sent as the report ``type``.

    Returns:
        The ``id`` of the created report.
    """
    logging.getLogger().debug("create report")

    # Raw-string character class: matches the same characters as the former
    # ' |\.|:|-' alternation without the invalid "\." escape in a normal string.
    report_name = re.sub(r'[ .:-]', '', f'{report_prefix}_{datetime.now()}')

    variables = {
        "input": {
            "name": report_name,
            "type": report_type,
            "projectId": project_id
        }
    }

    response = query(config, CREATE_REPORT_MUTATION, variables)

    return response['createReport']['report']['id']

def rerun_report(config, report_id):
    """Ask the API to rerun a report and return the id echoed in the response.

    Args:
        config: Configuration mapping forwarded to ``query``.
        report_id: Identifier sent as the ``reportId`` variable.

    Returns:
        The ``id`` found under ``rerunReport.report`` in the response.
    """
    logging.getLogger().debug("rerun report")

    result = query(config, RERUN_REPORT_MUTATION, {'reportId': report_id})

    return result['rerunReport']['report']['id']

def get_report_url_and_status(config, report_id):
    """Poll a report until its last run completes, then return the download URL.

    Each poll is preceded by a CHECK_INTERVAL_FOR_DOWNLOAD_REPORT second sleep.
    A FAILED or EXPIRED run triggers a rerun, a RETRY_TIME_FOR_DOWNLOAD_REPORT
    second pause, and counts against MAX_RETRIES_FOR_DOWNLOAD_REPORT. Any other
    status simply leads to another poll and does not count as a retry.

    Args:
        config: Configuration mapping forwarded to ``query`` / ``rerun_report``.
        report_id: Identifier of the report to poll.

    Returns:
        The ``url`` of the last run once its status is ``COMPLETED``.

    Raises:
        Exception: When the rerun budget is exhausted.
    """
    logging.getLogger().debug("get report url and status")

    num_of_retries = 0

    while num_of_retries < MAX_RETRIES_FOR_DOWNLOAD_REPORT:
        time.sleep(CHECK_INTERVAL_FOR_DOWNLOAD_REPORT)

        response = query(config, DOWNLOAD_REPORT_QUERY, {'reportId': report_id})

        last_run = response['report']['lastRun']
        status = last_run['status']

        if status == 'COMPLETED':
            return last_run['url']

        if status in ('FAILED', 'EXPIRED'):
            # rerun_report takes (config, report_id); config was previously
            # omitted, which raised TypeError on the first failed run.
            rerun_report(config, report_id)

            time.sleep(RETRY_TIME_FOR_DOWNLOAD_REPORT)

            num_of_retries += 1

    raise Exception('Error: get report fail')

def get_report_content(download_url):
    """Download a CSV report and return it as a list of rows.

    The response is streamed, decoded line by line as UTF-8 and parsed with
    ``csv.reader``.

    Args:
        download_url: URL the report is fetched from.

    Returns:
        list of rows, each row being the list of strings ``csv.reader`` yields.

    Raises:
        Exception: If no rows were read.
    """
    logging.getLogger().debug("get report content")

    with closing(requests.get(download_url, stream=True)) as r:
        decoded_lines = codecs.iterdecode(r.iter_lines(), 'utf-8')
        rows = list(csv.reader(decoded_lines))

    if not rows:
        raise Exception('Error: download failed')

    logging.debug(f'Info: {rows}')

    return rows

def get_report_content_to_dataframe(download_url):
    """Load the CSV at ``download_url`` into a pandas DataFrame.

    Args:
        download_url: Location handed directly to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    logging.getLogger().debug("get report content to dataframe")

    report_df = pd.read_csv(download_url)

    return report_df

def get_report(config, project_id):
    """Create a DATA_SCAN report for a project and load it as a DataFrame.

    The report type string is also passed as the report-name prefix.

    Args:
        config: Configuration mapping forwarded to the report helpers.
        project_id: Project the report is created for.

    Returns:
        dict with a single key, ``"DATA_SCAN"``, mapped to the report DataFrame.
    """
    logging.getLogger().debug("get report")

    report_type = "DATA_SCAN"

    report_id = create_report(config, project_id, report_type, report_type)
    report_url = get_report_url_and_status(config, report_id)

    # get_report_content(report_url) would give the raw CSV rows instead.
    return {report_type: get_report_content_to_dataframe(report_url)}

In [4]:
def get_external_id(x):
    """Return an external id from ``x``, trying two locations in order.

    ``x['properties']['externalId']`` is tried first, then ``x['externalId']``.
    Any exception raised by a lookup moves on to the next one.

    Returns:
        The first value found, or ``None`` when both lookups fail.
    """
    lookups = (
        lambda entity: entity['properties']['externalId'],
        lambda entity: entity['externalId'],
    )

    for lookup in lookups:
        try:
            return lookup(x)
        except Exception:
            continue

    return None

def get_number_of_findings(x):
    """Return ``len()`` of the JSON value encoded in ``x``.

    Returns:
        The length of the decoded value, or ``None`` when ``x`` cannot be
        decoded or the decoded value has no length.
    """
    try:
        decoded = json.loads(x)
        count = len(decoded)
    except Exception:
        count = None

    return count
    
def get_data_findings(config):
    """Return the two prepared findings tables as pandas DataFrames.

    In its current state the function authenticates via ``get_token``, opens a
    Snowpark session and reads the tables ``DATA_SCAN_RESOURCES_READY`` and
    ``DATA_SCAN_RESOURCES_EXPLODED``. The commented-out statements are the
    disabled pipeline that built those tables; they are kept in their original
    order as provenance.

    Args:
        config: Configuration mapping; ``get_token`` stores the access token in it.

    Returns:
        Tuple ``(data_scan_resources_ready_df, data_scan_resources_exploded_df)``.
    """
    logging.getLogger().debug("get data findings")

    # NOTE(review): with the pipeline below disabled, nothing in this function
    # uses the token afterwards — confirm whether this call is still wanted.
    get_token(config)

    # NOTE(review): the session is never closed in this function.
    session = Session.builder.create()
    
    # --- Disabled stage 1: fetch resources and derive their external id ---
    # resources = get_resources(config, config['wizio_project_id']) 
    # resources_df = pd.DataFrame(resources)
    # session.write_pandas(resources_df, "RESOURCES", auto_create_table=True, overwrite=True)
    # resources_df = session.table("RESOURCES").to_pandas()
    # resources_df['externalId'] = resources_df['graphEntity'].apply(get_external_id)
    # session.write_pandas(resources_df, "RESOURCES_READY", auto_create_table=True, overwrite=True)
    # resources_ready_df = session.table("RESOURCES_READY").to_pandas()

    # --- Disabled stage 2: create and load the DATA_SCAN report ---
    # reports = get_report(config, config['wizio_project_id']) 
    # data_scan_df = reports['DATA_SCAN']
    # session.write_pandas(data_scan_df, "DATA_SCAN", auto_create_table=True, overwrite=True)
    # data_scan_df = session.table("DATA_SCAN").to_pandas()

    # --- Disabled stage 3: join findings to resources on the external id ---
    # data_scan_resources_df = data_scan_df.set_index('Resource External ID') .join(resources_df.set_index('externalId'))
    # data_scan_resources_df.reset_index(inplace=True)
    # session.write_pandas(data_scan_resources_df, "DATA_SCAN_RESOURCES", auto_create_table=True, overwrite=True)
    # data_scan_resources_df = session.table("DATA_SCAN_RESOURCES").to_pandas()

    # --- Disabled stage 4: flatten graphEntity and add derived columns ---
    # data_scan_resources_ready_df = pd.json_normalize(data_scan_resources_df['graphEntity'])
    # data_scan_resources_ready_df[['Finding ID', 'Category', 'Classifier', 'Unique Matches', 'Total Matches', 'Severity', 'Finding Examples']] = data_scan_resources_df[['ID', 'Category', 'Classifier', 'Unique Matches', 'Total Matches', 'Severity', 'Finding Examples']]
    # data_scan_resources_ready_df = data_scan_resources_ready_df.rename(lambda x: x.replace('properties.','_'), axis='columns')
    # data_scan_resources_ready_df['Examples Count'] = data_scan_resources_ready_df['Finding Examples'].apply(get_number_of_findings)
    # data_scan_resources_ready_df['_creationYYMM']=data_scan_resources_ready_df['_creationDate'].str[0:7]
    # data_scan_resources_ready_df.to_csv('datascanresourcesready.csv', index=False)
    # data_scan_resources_ready_df = pd.read_csv('datascanresourcesready.csv')  
    # session.write_pandas(data_scan_resources_ready_df, "DATA_SCAN_RESOURCES_READY", auto_create_table=True, overwrite=True)

    data_scan_resources_ready_df = session.table("DATA_SCAN_RESOURCES_READY").to_pandas()

    # --- Disabled stage 5: explode 'Finding Examples' into one row per example ---
    # NOTE(review): the disabled code below calls eval() on report content; if it
    # is ever re-enabled, json.loads looks like the safer choice — confirm format.
    # data_scan_resources_exploded_df = data_scan_resources_ready_df.query("type in ('BUCKET', 'DATABASE', 'DB_SERVER')")
    # data_scan_resources_exploded_df['Finding Examples'] = data_scan_resources_exploded_df['Finding Examples'].apply(lambda x: eval(x) if x is not None else None)
    # columns=['id', 'name', 'type', '_subscriptionExternalId', 'Finding ID', 'Category', 'Classifier', 'Finding Examples']
    # data_scan_resources_exploded_df = data_scan_resources_exploded_df[columns].explode('Finding Examples', ignore_index=True)
    # exploded_df = pd.json_normalize(data_scan_resources_exploded_df['Finding Examples'])
    # columns=['id', 'name', 'type', '_subscriptionExternalId', 'Finding ID', 'Category', 'Classifier']
    # data_scan_resources_exploded_df = pd.concat([data_scan_resources_exploded_df[columns], exploded_df[['key', 'path']]], axis=1)
    # session.write_pandas(data_scan_resources_exploded_df, "DATA_SCAN_RESOURCES_EXPLODED", auto_create_table=True, overwrite=True)    

    data_scan_resources_exploded_df = session.table("DATA_SCAN_RESOURCES_EXPLODED").to_pandas()

    return data_scan_resources_ready_df, data_scan_resources_exploded_df


In [5]:
# Load settings from config.json, then fetch the two findings tables that every
# chart cell below reads from.
config = get_config()

data_scan_resources_ready_df, data_scan_resources_exploded_df = get_data_findings(config)


## General Dashboard

In [6]:
# Distinct resources (unique id per group) by cloud platform, environment and status.
resources_per_cloud_platform = (
    data_scan_resources_ready_df[['_cloudPlatform', 'id']]
    .drop_duplicates()
    .groupby(by=['_cloudPlatform'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

resources_per_environment = (
    data_scan_resources_ready_df[['__environments', 'id']]
    .drop_duplicates()
    .groupby(by=['__environments'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

resources_per_status = (
    data_scan_resources_ready_df[['_status', 'id']]
    .drop_duplicates()
    .groupby(by=['_status'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

# NOTE(review): the labels below are hard-coded and paired with group rows by
# position (.iloc), so they are only right while the sorted group order matches
# — verify if new platforms, environments or statuses appear.
display(
    f"AWS {resources_per_cloud_platform.iloc[0]['count']} "
    f"Production {resources_per_environment.iloc[0]['count']} "
    f"Active {resources_per_status.iloc[0]['count']} "
    f"Inactive {resources_per_status.iloc[1]['count']}"
)

'AWS 87 Production 87 Active 72 Inactive 7'

## Resources Summary


In [7]:
# Distinct resources per creation month, drawn as a labelled bar chart.
resources_per_creation_date = (
    data_scan_resources_ready_df[['_creationYYMM', 'id']]
    .drop_duplicates()
    .groupby(by=['_creationYYMM'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

c = (
    alt.Chart(resources_per_creation_date)
    .encode(
        alt.X('_creationYYMM:O', axis=alt.Axis(labels=True, labelAngle=0))
            .timeUnit("yearmonth")
            .title('Resource creation'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Resources'),
        alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        alt.Text('count'),
        tooltip=["_creationYYMM:T", "count"],
    )
    .properties(title='Number of resources per date', width=1330, height=200)
)

# Bars with the count printed just above each bar.
(
    (c.mark_bar() + c.mark_text(align='center', dy=-10))
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

The analysis provides a breakdown of the **resources** identified across different **regions** and their **types**. 

As illustrated in the graphs below, more than **70%** of the resources with data findings are located in the **us-east-1** region, nearly **50%** are categorized as **buckets**, while **15%** are classified as **databases**.

In [8]:
# Distinct resources per region; the chart is rendered by a later cell as `c1`.
resources_per_region = (
    data_scan_resources_ready_df[['_region', 'id']]
    .drop_duplicates()
    .groupby(by=['_region'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

c1 = (
    alt.Chart(resources_per_region)
    .encode(
        alt.X('_region', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource region'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Resources'),
        alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        alt.Text('count'),
        tooltip=["_region", "count"],
    )
    .properties(title='Number of resources per region', width=640, height=200)
)

In [9]:
# Distinct resources per resource type; the chart is rendered by a later cell as `c2`.
resources_per_type = (
    data_scan_resources_ready_df[['type', 'id']]
    .drop_duplicates()
    .groupby(by=['type'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

c2 = (
    alt.Chart(resources_per_type)
    .encode(
        alt.X('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Resources'),
        alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        alt.Text('count'),
        tooltip=["type", "count"],
    )
    .properties(title='Number of resources per type', width=640, height=200)
)

In [10]:
# Side-by-side bar charts (c1 | c2), each with its count printed above the bars.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)


The analysis offers a comprehensive overview of the identified **resources**, highlighting their **severity** and **classifications**. 

As shown in the graphs below, **48%** of the resources have significant findings, rated **high** or **critical** in severity, with **Personally Identifiable**, **Financial**, and **Digital Identity** data among the top 5 categories.

In [11]:
# Distinct resources per finding category; the chart is rendered by a later cell as `c1`.
resources_per_category = (
    data_scan_resources_ready_df[['Category', 'id']]
    .drop_duplicates()
    .groupby(by=['Category'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

# Titles say "category" because the plotted field is `Category`; they previously
# read "classifier", which is a different column charted elsewhere.
c1 = (
    alt.Chart(resources_per_category)
    .encode(
        alt.X('Category', axis=alt.Axis(labels=True, labelAngle=0)).title('Finding category'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Resources'),
        alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        alt.Text('count'),
        tooltip=["Category", "count"],
    )
    .properties(title='Number of resources per category', width=640, height=200)
)


In [12]:
# Distinct resources per finding severity; the chart is rendered by a later cell as `c2`.
resources_per_severity = (
    data_scan_resources_ready_df[['Severity', 'id']]
    .drop_duplicates()
    .groupby(by=['Severity'])
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

c2 = (
    alt.Chart(resources_per_severity)
    .encode(
        alt.X('Severity', axis=alt.Axis(labels=True, labelAngle=0)).title('Finding severity'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Resources'),
        alt.Color('count', legend=None).scale(scheme="lightgreyteal", reverse=False),
        alt.Text('count'),
        tooltip=["Severity", "count"],
    )
    .properties(title='Number of resources per severity', width=640, height=200)
)


In [13]:
# Side-by-side bar charts (c1 | c2), each with its count printed above the bars.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

## Data Findings Summary

The analysis offers a detailed overview of the **unique findings** discovered across various **regions** and their **classifications**. 

As demonstrated in the graphs below, nearly **75%** of the resources containing data findings are situated in the **us-east-1** region. 

Over **70%** of these resources are categorized as **buckets**, while around **20%** are identified as **databases**. This reinforces our earlier observations that buckets and databases are the most critical components.

In [14]:
# Number of findings (non-null `Finding ID` rows) per region; rendered later as `c1`.
findings_per_region = (
    data_scan_resources_ready_df[['_region', 'Finding ID']]
    .groupby(by=['_region'])
    .count()
    .reset_index()
    .rename(columns={"Finding ID": "count"})
)

c1 = (
    alt.Chart(findings_per_region)
    .encode(
        alt.X('_region', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource region'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Findings'),
        alt.Color('count', legend=None).scale(scheme="lightorange", reverse=False),
        alt.Text('count'),
        tooltip=["_region", "count"],
    )
    .properties(title='Number of findings per region', width=640, height=200)
)


In [15]:
# Number of findings (non-null `Finding ID` rows) per resource type; rendered later as `c2`.
findings_per_type = (
    data_scan_resources_ready_df[['type', 'Finding ID']]
    .groupby(by=['type'])
    .count()
    .reset_index()
    .rename(columns={"Finding ID": "count"})
)

c2 = (
    alt.Chart(findings_per_type)
    .encode(
        alt.X('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Findings'),
        alt.Color('count', legend=None).scale(scheme="lightorange", reverse=False),
        alt.Text('count'),
        tooltip=["type", "count"],
    )
    .properties(title='Number of findings per type', width=640, height=200)
)


In [16]:
# Side-by-side bar charts (c1 | c2), each with its count printed above the bars.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

The analysis provides a thorough overview of the identified **resources** and their **classifications**. 

The graph below illustrates that key data points, including **names**, **emails**, **phone numbers**, **addresses**, **gender**, and **transaction details**, are prominently featured.

In [17]:
# Number of findings (non-null `Finding ID` rows) per classifier, as a labelled bar chart.
findings_per_classifier = (
    data_scan_resources_ready_df[['Classifier', 'Finding ID']]
    .groupby(by=['Classifier'])
    .count()
    .reset_index()
    .rename(columns={"Finding ID": "count"})
)

c = (
    alt.Chart(findings_per_classifier)
    .encode(
        alt.X('Classifier', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Findings'),
        alt.Color('count', legend=None).scale(scheme="lightorange", reverse=False),
        alt.Text('count'),
        tooltip=["Classifier", "count"],
    )
    .properties(title='Number of findings per classifier', width=1330, height=200)
)

# Bars with the count printed just above each bar.
(
    (c.mark_bar() + c.mark_text(align='center', dy=-10))
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)


### The most bang for the buck. 

In [18]:
# Heatmap of finding counts for every (resource type, classifier) pair.
findings_per_type_and_classifier = (
    data_scan_resources_ready_df[['type', 'Classifier', 'Finding ID']]
    .groupby(by=['type', 'Classifier'])
    .count()
    .reset_index()
    .rename(columns={"Finding ID": "count"})
)

c = (
    alt.Chart(findings_per_type_and_classifier)
    .encode(
        alt.X('Classifier', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        # Y labels are shown: on a heatmap they are the only way to tell the
        # resource-type rows apart (they were disabled, as on the bar charts).
        alt.Y('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Color('count', legend=None).scale(scheme="orangered", reverse=False),
        alt.Text('count'),
        tooltip=["Classifier", "type", "count"],
    )
    .properties(title='Number of findings per resource type and classifier', width=1330, height=200)
)

c.mark_rect()


When spending time or money, it is essential to insist on getting the most bang for the buck.

In [19]:
# Heatmap of finding counts for every (resource type, severity) pair, with the
# count printed inside each cell.
findings_per_type_and_severity = (
    data_scan_resources_ready_df[['type', 'Severity', 'Finding ID']]
    .groupby(by=['type', 'Severity'])
    .count()
    .reset_index()
    .rename(columns={"Finding ID": "count"})
)

c = (
    alt.Chart(findings_per_type_and_severity)
    .encode(
        alt.X('Severity', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding severity'),
        # Y labels are shown: on a heatmap they are the only way to tell the
        # resource-type rows apart (they were disabled, as on the bar charts).
        alt.Y('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Color('count', legend=None).scale(scheme="orangered", reverse=False),
        alt.Text('count'),
        tooltip=["Severity", "type", "count"],
    )
    .properties(title='Number of findings per resource type and severity', width=1330, height=200)
)

(
    c.mark_rect()
    + c.mark_text(baseline="middle", fontWeight="bold").encode(color=alt.value("white"))
)

## Total Matches Summary

The analysis provides a comprehensive overview of the **total matches** identified across **regions** and their **classifications**. 

As illustrated in the graphs below, more than **75%** of the resources containing data findings are located in the **us-east-1** region. 

Furthermore, nearly **97%** of these resources are classified as **buckets**, while merely **2%** are recognized as databases. If you're looking to begin your work, start with your buckets...

In [20]:
# Sum of `Total Matches` per region; the chart is rendered by a later cell as `c1`.
total_matches_per_region = (
    data_scan_resources_ready_df[['_region', 'Total Matches']]
    .groupby(by=['_region'])
    .sum()
    .reset_index()
    .rename(columns={"Total Matches": "count"})
)

c1 = (
    alt.Chart(total_matches_per_region)
    .encode(
        alt.X('_region', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource region'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Total Matches'),
        alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        alt.Text('count'),
        tooltip=["_region", "count"],
    )
    .properties(title='Number of total matches per region', width=640, height=200)
)

In [21]:
# Sum of `Total Matches` per resource type; the chart is rendered by a later cell as `c2`.
total_matches_per_type = (
    data_scan_resources_ready_df[['type', 'Total Matches']]
    .groupby(by=['type'])
    .sum()
    .reset_index()
    .rename(columns={"Total Matches": "count"})
)

c2 = (
    alt.Chart(total_matches_per_type)
    .encode(
        alt.X('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Y('count', axis=alt.Axis(labels=False)).title('Total Matches'),
        alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        alt.Text('count'),
        tooltip=["type", "count"],
    )
    .properties(title='Number of total matches per type', width=640, height=200)
)


In [22]:
# Side-by-side bar charts (c1 | c2), each with its count printed above the bars.
(
    (
        (c1.mark_bar() + c1.mark_text(align='center', dy=-10))
        | (c2.mark_bar() + c2.mark_text(align='center', dy=-10))
    )
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)

... and get rid of mushrooms in your yard.

In [23]:
# Sum of `Total Matches` per classifier, as a labelled bar chart.
total_matches_per_classifier = (
    data_scan_resources_ready_df[['Classifier', 'Total Matches']]
    .groupby(by=['Classifier'])
    .sum()
    .reset_index()
    .rename(columns={"Total Matches": "count"})
)

c = (
    alt.Chart(total_matches_per_classifier)
    .encode(
        alt.X('Classifier', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        # The plotted value is the summed `Total Matches`, so the axis says so
        # (it previously read 'Resources').
        alt.Y('count', axis=alt.Axis(labels=False)).title('Total Matches'),
        alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        alt.Text('count'),
        tooltip=["Classifier", "count"],
    )
    .properties(title='Number of total matches per classifier', width=1330, height=200)
)

# Bars with the total printed just above each bar.
(
    (c.mark_bar() + c.mark_text(align='center', dy=-10))
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
)


Prioritize addressing the critical-severity findings first, followed by the high-severity findings.

In [24]:
# Heatmap of summed `Total Matches` for every (resource type, classifier) pair.
total_matches_per_type_and_classifier = (
    data_scan_resources_ready_df[['type', 'Classifier', 'Total Matches']]
    .groupby(by=['type', 'Classifier'])
    .sum()
    .reset_index()
    .rename(columns={"Total Matches": "count"})
)

c = (
    alt.Chart(total_matches_per_type_and_classifier)
    .encode(
        alt.X('Classifier', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding classifier'),
        # Y labels are shown: on a heatmap they are the only way to tell the
        # resource-type rows apart (they were disabled, as on the bar charts).
        alt.Y('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        alt.Text('count'),
        tooltip=["Classifier", "type", "count"],
    )
    .properties(title='Number of total matches per resource type and classifier', width=1330, height=200)
)

c.mark_rect()

In [25]:
# Heatmap of summed `Total Matches` for every (resource type, severity) pair,
# with the total printed inside each cell.
total_matches_per_type_and_severity = (
    data_scan_resources_ready_df[['type', 'Severity', 'Total Matches']]
    .groupby(by=['type', 'Severity'])
    .sum()
    .reset_index()
    .rename(columns={"Total Matches": "count"})
)

c = (
    alt.Chart(total_matches_per_type_and_severity)
    .encode(
        alt.X('Severity', axis=alt.Axis(labels=True, labelAngle=90)).title('Finding severity'),
        # Y labels are shown: on a heatmap they are the only way to tell the
        # resource-type rows apart (they were disabled, as on the bar charts).
        alt.Y('type', axis=alt.Axis(labels=True, labelAngle=0)).title('Resource type'),
        alt.Color('count', legend=None).scale(scheme="reds", reverse=False),
        alt.Text('count'),
        tooltip=["Severity", "type", "count"],
    )
    .properties(title='Number of total matches per resource type and severity', width=1330, height=200)
)

(
    c.mark_rect()
    + c.mark_text(baseline="middle", fontWeight="bold").encode(color=alt.value("white"))
)


## Data Findings

In [26]:
# Show the prepared findings table (one row per finding, joined to its resource).
# NOTE(review): the 'Finding Examples' column appears to hold sample matched
# values (e.g. keys such as "email") — review the rendered output for sensitive
# data before sharing or committing the notebook.
data_scan_resources_ready_df

Unnamed: 0,id,providerUniqueId,name,type,projects,firstSeen,lastSeen,__environments,__productIDs,__techIDs,...,_requiresSSL,Finding ID,Category,Classifier,Unique Matches,Total Matches,Severity,Finding Examples,Examples Count,_creationYYMM
0,4b6076f9-330e-5f91-a265-e53fe4bd5e50,,PubSecDQ,DB_SERVER,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T22:19:31.737233Z,2025-01-02T03:15:40Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,2024,...,,i-013ebc0d8bff0950a##CloudPlatform/DBServer##/...,DataCategoryFinancial,Bank Name,1,1,MEDIUM,"[{""path"":""postgres.demos.observation"",""key"":""H...",1.0,
1,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,arn:aws:s3:::bryang-bucket,bryang-bucket,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:21.376743Z,2025-01-02T12:18:54Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,bryang-bucket##BUILTIN-374,DataCategoryFinancial,Price,1656,5371,INFO,"[{""path"":""sec/02_EDWDATA/02_EDWDATA/taq_nbbo_2...",12.0,2021-10
2,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,arn:aws:s3:::bryang-bucket,bryang-bucket,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:21.376743Z,2025-01-02T12:18:54Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,bryang-bucket##BUILTIN-1,DataCategoryPII,Email,1502,2514,CRITICAL,"[{""path"":""MOCK_DATA.csv"",""key"":""email"",""value""...",6.0,2021-10
3,a817d3f8-e676-5109-bcb9-b6bf6ce3bf6c,arn:aws:ec2:us-west-1:482457381153:instance/i-...,BG - Ohalo Data X-Ray 7.12.1 - Deployment Script,VIRTUAL_MACHINE,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:16:20.638582Z,2025-01-02T12:18:08Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3018,...,,i-0bdcd0c7f3557a50c##BUILTIN-425,DataCategoryFinancial,Masked Credit Card,533,1599,INFO,"[{""path"":""/folder_datasources/demo_files/CardB...",7.0,2024-10
4,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,arn:aws:s3:::bryang-bucket,bryang-bucket,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:21.376743Z,2025-01-02T12:18:54Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,bryang-bucket##BUILTIN-32,DataCategoryPII,Phone Number,1886,2072,CRITICAL,"[{""path"":""srp/Sample Data/Purchase Order Data/...",6.0,2021-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405,d37535c1-cb0e-5a7b-8c11-bbdba326c6f3,arn:aws:s3:::bailine-buckethead,bailine-buckethead,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:22.901598Z,2025-01-02T12:18:54Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,bailine-buckethead##BUILTIN-20,DataCategoryPII,Gender,2,172,INFO,"[{""path"":""003c6c7e-4f4d-4bc1-a7cc-a16de73c3469...",1.0,2021-08
406,8d073c69-d114-5f47-86f9-31a2fc3500df,arn:aws:s3:::connectivity.collibra.com,connectivity.collibra.com,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:24.96273Z,2025-01-02T12:18:56Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,connectivity.collibra.com##BUILTIN-1,DataCategoryPII,Email,2000,2020,CRITICAL,"[{""path"":""parquetfiles/userdata1.parquet"",""key...",7.0,2020-07
407,c3cd1114-53b7-5f0e-94cb-cd382d7c5055,arn:aws:s3:::marcdq,marcdq,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:27.349245Z,2025-01-02T12:18:57Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,marcdq##BUILTIN-20,DataCategoryPII,Gender,6,333,INFO,"[{""path"":""semantic_test.csv"",""key"":""gender"",""v...",1.0,2022-03
408,dc6c68d3-fb36-5670-9c08-7307d930474a,arn:aws:s3:::kiel-collibra-dq,kiel-collibra-dq,BUCKET,"[{""id"":""b827013d-e8f2-52ba-9903-eadf943ee9da""}]",2024-11-12T18:17:26.469405Z,2025-01-02T12:18:57Z,Production,b827013d-e8f2-52ba-9903-eadf943ee9da,3136,...,,kiel-collibra-dq##BUILTIN-1,DataCategoryPII,Email,2000,2047,CRITICAL,"[{""path"":""MOCK_DATA (1).csv"",""key"":""email"",""va...",6.0,2021-09


## Finding Examples

In [27]:
# Display the exploded finding examples; the trailing comment keeps an optional
# rename of the columns to friendlier headers that is currently disabled.
data_scan_resources_exploded_df #.rename(columns={"id":"Resource Id","name":"Resource Name","type":"Resource Type","_subscriptionExternalId":"Resource Account","Category": "Finding Category","Classifier": "Finding Classifier","key": "File Key","path":" File Path"}))

Unnamed: 0,id,name,type,_subscriptionExternalId,Finding ID,Category,Classifier,key,path
0,4b6076f9-330e-5f91-a265-e53fe4bd5e50,PubSecDQ,DB_SERVER,482457381153,i-013ebc0d8bff0950a##CloudPlatform/DBServer##/...,DataCategoryFinancial,Bank Name,HEARTLAND FINANCIAL USA,postgres.demos.observation
1,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,bryang-bucket,BUCKET,482457381153,bryang-bucket##BUILTIN-374,DataCategoryFinancial,Price,price,sec/02_EDWDATA/02_EDWDATA/taq_nbbo_2020_05.csv
2,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,bryang-bucket,BUCKET,482457381153,bryang-bucket##BUILTIN-374,DataCategoryFinancial,Price,Price,sec/01_DATASETS.zip->etp_merge_mock_v4.csv
3,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,bryang-bucket,BUCKET,482457381153,bryang-bucket##BUILTIN-374,DataCategoryFinancial,Price,Price,sec/01_DATASETS.zip->etp_merge_mock_v4.xlsx##e...
4,4c9c26e1-5b49-55b6-9e67-5dc63b14503e,bryang-bucket,BUCKET,482457381153,bryang-bucket##BUILTIN-374,DataCategoryFinancial,Price,Price,sec/01_DATASETS/etp_merge_mock_v4.csv
...,...,...,...,...,...,...,...,...,...
1399,dc6c68d3-fb36-5670-9c08-7307d930474a,kiel-collibra-dq,BUCKET,482457381153,kiel-collibra-dq##BUILTIN-1,DataCategoryPII,Email,email,MOCK_DATA (1).csv
1400,dc6c68d3-fb36-5670-9c08-7307d930474a,kiel-collibra-dq,BUCKET,482457381153,kiel-collibra-dq##BUILTIN-1,DataCategoryPII,Email,email,Customer.csv
1401,8d073c69-d114-5f47-86f9-31a2fc3500df,connectivity.collibra.com,BUCKET,482457381153,connectivity.collibra.com##BUILTIN-436,DataCategoryFinancial,Credit Cards,Cc,parquetfiles/userdata1.parquet
1402,8d073c69-d114-5f47-86f9-31a2fc3500df,connectivity.collibra.com,BUCKET,482457381153,connectivity.collibra.com##BUILTIN-436,DataCategoryFinancial,Credit Cards,Cc,parquetfiles/userdata2.parquet


## Select 

the community in which you want to find your buckets

In [28]:
def get_collibra(config):
    """Assemble the Collibra connection settings together with an authenticated session.

    Args:
        config: Mapping that provides ``collibra_host``, ``collibra_username``,
            ``collibra_password`` and ``collibra_api_endpoint``.

    Returns:
        dict with the keys ``host`` (``https://`` + configured host), ``username``,
        ``password``, ``endpoint`` (host followed by the configured API endpoint)
        and ``session`` (a ``requests.Session`` using HTTP basic authentication).
    """
    logging.getLogger().debug("get collibra")

    host = f"https://{config['collibra_host']}"
    username = config['collibra_username']
    password = config['collibra_password']
    endpoint = f"{host}{config['collibra_api_endpoint']}"

    # Every request sent through this session carries the basic-auth credentials.
    session = requests.Session()
    session.auth = HTTPBasicAuth(username, password)

    return {
        "host": host,
        "username": username,
        "password": password,
        "endpoint": endpoint,
        "session": session,
    }


In [29]:
# Communities fetched from Collibra, filled through the notebook helper `x` below.
communities = {}

collibra = get_collibra(get_config())

response = collibra.get("session").get(f"{collibra.get('endpoint')}/communities")

# Fail fast with the HTTP status (bad credentials, wrong host, ...) instead of an
# opaque KeyError / JSON decoding error on the next statement.
response.raise_for_status()

# `x` is defined outside this cell; presumably it stores each community under its
# name in `communities` -- confirm against its definition.
for found_community in response.json()["results"]:
    x(communities, found_community.get("name"), found_community)

# Picker whose selected value is read later as the community to query.
community = widgets.Select(
    options=sorted(f"{name}" for name in communities),
    description='Communities',
    layout=widgets.Layout(width='40%'),
)

display(community)



Select(description='Communities', layout=Layout(width='40%'), options=('Airflow', 'Amazon', 'Asia', 'Asset cha…

## Choose 

whether you want to register all finding examples

In [30]:
# Opt-in toggle: main() copies its value into config['do_files'], which decides
# whether the finding examples are registered as files.
do_files = widgets.Checkbox(
    value=False,
    description='Register files',
    indent=True,
)

display(do_files)


Checkbox(value=False, description='Register files')

In [55]:
def _relate_once(importService, record, marker, relation_type, direction, domain, community, target):
    """Add a relation to ``record['entry']`` unless ``marker`` is already in ``record['relations']``."""
    if marker not in record['relations']:
        record['relations'].append(marker)
        importService.add_relations(record['entry'], relation_type, direction, domain, community, target)


def _attribute_once(importService, record, name, value):
    """Add a string attribute to ``record['entry']`` unless ``value`` is already in ``record['attributes']``."""
    # NOTE(review): the marker is the raw value, not the attribute name, so two
    # different attributes holding the same value would suppress each other.
    if value not in record['attributes']:
        record['attributes'].append(value)
        importService.add_attributes(record['entry'], name, value, 'string')


def _register_match_metrics(importService, config, entries, x, measure_prefix, rule_prefix, resource_name, measure_source_relation):
    """Register the measures, dimension, rules and metrics of one finding.

    The same assets are produced for buckets and databases; only the name
    prefixes, the related resource and the measure-to-resource relation type differ.

    Args:
        importService: Object exposing ``get_asset``, ``add_attributes`` and ``add_relations``.
        config: Mapping providing ``community_to_query``.
        entries: Per-step dictionaries; steps 8 (measures), 9 (dimensions) and
            10 (rules and metrics) are filled here.
        x: Finding row providing ``Classifier``, ``_subscriptionExternalId``,
            ``Unique Matches`` and ``Total Matches``.
        measure_prefix: Leading part of the measure asset names.
        rule_prefix: Leading part of the rule and metric asset names.
        resource_name: Name of the resource asset the measures and rules relate to.
        measure_source_relation: Relation type id linking a measure to the resource.
    """
    classifier = x['Classifier']
    account = x['_subscriptionExternalId']
    community = config['community_to_query']
    match_kinds = ("Unique Matches", "Total Matches")

    # measure (always re-created, one per match kind)
    for kind in match_kinds:
        measure_name = f"{measure_prefix}:{classifier}:{kind}"
        measure = importService.get_asset("Governance council", "New Data Findings Metrics", "Measure", measure_name, f"{classifier} {kind}")
        entries[8][measure_name] = {"entry": measure}

        importService.add_attributes(measure, 'Count', x[kind], 'string')
        importService.add_relations(measure, "01930b23-1a84-7d44-b817-275206442bf6", "TARGET", "Business Data Models", "Data Architects community", classifier)
        importService.add_relations(measure, measure_source_relation, "SOURCE", account, community, resource_name)

    # dimension (one per classifier)
    if classifier not in entries[9]:
        entries[9][classifier] = {
            "entry": importService.get_asset("Governance council", "Data Findings Dimensions", "Data Findings Dimension", classifier, classifier)
        }

    # rules first, then metrics, so the insertion order of entries[10] is kept
    for kind in match_kinds:
        rule_name = f"{rule_prefix}:{classifier}:{kind}"
        rule = importService.get_asset("Governance council", "Data Findings Rules", "Data Findings Rule", rule_name, f"{classifier} {kind}")
        entries[10][f"{rule_name}:Rule"] = {"entry": rule}

        importService.add_relations(rule, "00000000-0000-0000-0000-000000007018", "SOURCE", account, community, f"{resource_name}")

    for kind in match_kinds:
        rule_name = f"{rule_prefix}:{classifier}:{kind}"
        metric = importService.get_asset("Governance council", "Data Findings Metrics", "Data Findings Metric", rule_name, f"{classifier} {kind}")
        entries[10][f"{rule_name}:Metric"] = {"entry": metric}

        importService.add_attributes(metric, 'Passing Fraction', x[kind], 'string')
        importService.add_relations(metric, "01931f87-3dca-7b65-a03c-dce0146ade76", "TARGET", "Data Findings Dimensions", "Governance council", classifier)
        importService.add_relations(metric, "01931feb-4b9a-7b6b-a456-e1a2759ceca4", "SOURCE", "Data Findings Rules", "Governance council", rule_name)


def do_finding(importService, config, entries, x):
    """Translate one data-finding row into import entries.

    Assets are cached in ``entries`` (one dictionary per import step) so that each
    asset is created once; relations and attributes are only added the first time
    their marker value is seen for that asset.

    Steps filled here: 0 data categories, 1 data concepts, 2 domains, 3 systems,
    4 S3 file systems, 5 S3 buckets, 6 directories, 7 databases, 8 measures,
    9 dimensions, 10 rules and metrics.

    Args:
        importService: Object exposing ``get_asset``, ``get_domain``,
            ``add_relations`` and ``add_attributes``.
        config: Mapping providing ``community_to_query``.
        entries: List of per-step dictionaries, mutated in place.
        x: Finding row (dict-like). Rows whose ``type`` is neither ``BUCKET`` nor
            ``DATABASE``/``DB_SERVER`` only contribute the steps 0 to 3.

    Returns:
        None.
    """
    category = x['Category']
    classifier = x['Classifier']
    account = x['_subscriptionExternalId']
    community = config['community_to_query']

    # data category
    if category not in entries[0]:
        entries[0][category] = {
            "entry": importService.get_asset("Privacy and Risk community", "Data categories", "Data Category", category, category)
        }

    # data concept
    if classifier not in entries[1]:
        entries[1][classifier] = {
            "entry": importService.get_asset("Data Architects community", "Business Data Models", "Data Concept", classifier, classifier),
            "relations": [],
            "attributes": []
        }

    concept = entries[1][classifier]
    _relate_once(importService, concept, category, "c0e00000-0000-0000-0000-000000007316", "SOURCE", "Data categories", "Privacy and Risk community", category)
    _attribute_once(importService, concept, 'Severity', x['Severity'])

    # domain
    if account not in entries[2]:
        entries[2][account] = {
            "entry": importService.get_domain(community, "Technology Asset Domain", account),
        }

    # system
    if account not in entries[3]:
        entries[3][account] = {
            "entry": importService.get_asset(community, account, "System", account, account),
            "attributes": []
        }

    system = entries[3][account]
    _attribute_once(importService, system, 'Platform', x['_cloudPlatform'])
    _attribute_once(importService, system, 'Account Name', account)

    # if buckets
    if x['type'] == 'BUCKET':
        bucket = x['_externalId']
        bucket_uri = f"s3://{bucket}"
        directory_uri = f"s3://{bucket}/"

        # file system
        if bucket not in entries[4]:
            entries[4][bucket] = {
                "entry": importService.get_asset(community, account, "S3 File System", bucket, bucket),
                "relations": [],
                "attributes": []
            }

        file_system = entries[4][bucket]
        _relate_once(importService, file_system, account, "00000000-0000-0000-0000-000000007054", "SOURCE", account, community, account)
        _attribute_once(importService, file_system, 'Region', x['_region'])
        _attribute_once(importService, file_system, 'Created At', x['_creationDate'])

        # storage container
        if bucket not in entries[5]:
            entries[5][bucket] = {
                "entry": importService.get_asset(community, account, "S3 Bucket", bucket_uri, bucket_uri),
                "relations": [],
                "attributes": []
            }

        storage = entries[5][bucket]
        _relate_once(importService, storage, bucket, "00000000-0000-0000-0001-002600000000", "SOURCE", account, community, bucket)
        _relate_once(importService, storage, category, "01930192-86fb-77b0-8baf-30a80dccb864", "TARGET", "Data categories", "Privacy and Risk community", category)
        _relate_once(importService, storage, classifier, "01930192-f332-70fc-8572-9f7283c4cfd4", "TARGET", "Business Data Models", "Data Architects community", classifier)
        _attribute_once(importService, storage, 'Region', x['_region'])
        _attribute_once(importService, storage, 'Created At', x['_creationDate'])

        # directory (the bucket root)
        if bucket not in entries[6]:
            entries[6][bucket] = {
                "entry": importService.get_asset(community, account, "Directory", directory_uri, "/"),
                "relations": [],
                "attributes": []
            }

        directory = entries[6][bucket]
        _relate_once(importService, directory, bucket, "00000000-0000-0000-0001-002600000001", "SOURCE", account, community, bucket_uri)
        _relate_once(importService, directory, category, "01930192-86fb-77b0-8baf-30a80dccb864", "TARGET", "Data categories", "Privacy and Risk community", category)
        _relate_once(importService, directory, classifier, "01930192-f332-70fc-8572-9f7283c4cfd4", "TARGET", "Business Data Models", "Data Architects community", classifier)
        _attribute_once(importService, directory, 'Region', x['_region'])
        _attribute_once(importService, directory, 'Created At', x['_creationDate'])

        # measure, dimension, rule and metric, all related to the root directory
        _register_match_metrics(
            importService, config, entries, x,
            measure_prefix=bucket,
            rule_prefix=directory_uri,
            resource_name=directory_uri,
            measure_source_relation="01930b24-2617-722b-9502-8c30d4b3818c",
        )

    # if database
    if x['type'] in ('DATABASE', 'DB_SERVER'):
        database_id = x['_externalId']
        database_name = x['name']

        # cached by external id, but registered under the resource name
        if database_id not in entries[7]:
            entries[7][database_id] = {
                "entry": importService.get_asset(community, account, "Database", database_name, database_name),
                "relations": [],
                "attributes": []
            }

        database = entries[7][database_id]
        _relate_once(importService, database, account, "00000000-0000-0000-0000-000000007054", "SOURCE", account, community, account)
        _relate_once(importService, database, category, "01944282-004e-73ea-a9d6-5a418e9738a7", "TARGET", "Data categories", "Privacy and Risk community", category)
        _relate_once(importService, database, classifier, "01944282-9d1a-7185-97a6-3b2aef01c556", "TARGET", "Business Data Models", "Data Architects community", classifier)
        _attribute_once(importService, database, 'Region', x['_region'])
        _attribute_once(importService, database, 'Created At', x['_creationDate'])
        _attribute_once(importService, database, 'Principal Identifier', database_id)

        # measure, dimension, rule and metric, all related to the database asset
        _register_match_metrics(
            importService, config, entries, x,
            measure_prefix=database_name,
            rule_prefix=database_name,
            resource_name=database_name,
            measure_source_relation="01944259-fa74-7122-902a-f019e671cc3a",
        )


def do_finding_example(importService, config, entries, x):
    """Register the file of one finding example (bucket resources only).

    The file asset is created once per file and cached in ``entries[11]``; every
    further example for the same file adds its relations to that cached entry.
    Previously the entry was replaced on every call, so a file with several
    findings kept only the asset entry of the last one.

    Args:
        importService: Object exposing ``get_asset`` and ``add_relations``.
        config: Mapping providing ``community_to_query``.
        entries: List of per-step dictionaries; step 11 (files) is mutated in place.
        x: Finding-example row (dict-like) providing ``type``, ``name``, ``path``,
            ``_subscriptionExternalId``, ``Category`` and ``Classifier``.

    Returns:
        None. Rows whose ``type`` is not ``BUCKET`` are ignored.
    """
    if x['type'] != 'BUCKET':
        return

    directory = f"s3://{x['name']}/"
    file = f"s3://{x['name']}/{x['path']}"

    if file not in entries[11]:
        entries[11][file] = {
            "entry": importService.get_asset(config['community_to_query'], x['_subscriptionExternalId'], "File", file, x['path']),
            "relations": []
        }

    record = entries[11][file]

    # (marker, relation type, direction, domain, community, related asset name)
    relations = (
        (directory, "00000000-0000-0000-0000-000000007060", "SOURCE", x['_subscriptionExternalId'], config['community_to_query'], directory),
        (x['Category'], "01943678-0ab4-7015-ba1f-0f9a168a6ade", "TARGET", "Data categories", "Privacy and Risk community", x['Category']),
        (x['Classifier'], "01943678-ebf1-7cd5-bc9c-c78b2d115f3c", "TARGET", "Business Data Models", "Data Architects community", x['Classifier']),
    )

    for marker, relation_type, direction, domain, community, target in relations:
        # Same add-once convention as do_finding: the marker value guards the relation.
        if marker not in record['relations']:
            record['relations'].append(marker)
            importService.add_relations(record['entry'], relation_type, direction, domain, community, target)


    
def do_all_findings(config, data_scan_resources_ready_df, data_scan_resources_exploded_df):
    """Build the import files for all findings of today's run and harvest them.

    Relies on the module-level ``collibra`` connection created in an earlier cell.

    Args:
        config: Mapping providing at least ``do_files``; it is also handed to
            ``do_finding``, ``do_finding_example`` and ``importService.harvest``.
        data_scan_resources_ready_df: Frame with one finding per row.
        data_scan_resources_exploded_df: Frame with one finding example per row;
            only processed when ``config['do_files']`` is truthy.

    Returns:
        Whatever ``importService.harvest`` returns for the run.
    """
    logging.getLogger().debug("do all findings")

    # One run per day: the run id is the current date.
    run_id = time.strftime("%Y%m%d")

    # Drop the leftovers of an earlier run made on the same day.
    shutil.rmtree(f'./runs/{run_id}', ignore_errors=True)
    for stale_file in glob.glob(f'./runs/{run_id}.json.*'):
        os.remove(stale_file)

    # NOTE(review): the meaning of the arguments 1 and 150000 is defined by ImportService -- confirm there.
    importService = ImportService(run_id, 1, 150000)

    # One dictionary of cached records per import step.
    entries = [{} for _ in range(12)]

    data_scan_resources_ready_df.apply(
        lambda row: do_finding(importService, config, entries, row),
        axis=1,
    )

    if config['do_files']:
        data_scan_resources_exploded_df.apply(
            lambda row: do_finding_example(importService, config, entries, row),
            axis=1,
        )

    # One step file per entry kind, saved in step order. (An all-in-one step file
    # would instead flatten every entry into a single list saved as step 0.)
    entries_per_step = [
        [record['entry'] for record in step_records.values()]
        for step_records in entries
    ]
    for step_number, step_entries in enumerate(entries_per_step):
        importService.save(step_entries, "./runs", run_id, step_number, True)

    return importService.harvest(collibra, config, "./runs", run_id)


In [56]:
def main():
    """Register the data findings in the selected community and print the import results.

    Reads the widget selections made in the earlier cells (``community`` and
    ``do_files``), loads the findings and hands them to ``do_all_findings``.
    """
    logging.getLogger().setLevel(logging.DEBUG)

    config = get_config()

    # Widget selections made above drive where and what gets registered.
    config['community_to_query'] = community.value

    config['do_files'] = do_files.value

    data_scan_resources_ready_df, data_scan_resources_exploded_df = get_data_findings(config)

    # Fixed: the original call carried an unbalanced extra ")" (a SyntaxError).
    results = do_all_findings(config, data_scan_resources_ready_df, data_scan_resources_exploded_df)

    print(json.dumps(results, indent=2))

if __name__ == '__main__':
    main()
    

[
  {
    "step_number": 0,
    "resource_location": "./runs/20250113",
    "file_name": "20250113",
    "part_number": 0,
    "job": {
      "id": "019461b4-f91c-70bb-8e74-0313cd5492b7",
      "result": "SUCCESS"
    }
  },
  {
    "step_number": 1,
    "resource_location": "./runs/20250113",
    "file_name": "20250113",
    "part_number": 0,
    "job": {
      "id": "019461b4-feb6-718a-9769-567ddcc11962",
      "result": "SUCCESS"
    }
  },
  {
    "step_number": 2,
    "resource_location": "./runs/20250113",
    "file_name": "20250113",
    "part_number": 0,
    "job": {
      "id": "019461b5-1098-77cd-ba4d-82d16a486a86",
      "result": "SUCCESS"
    }
  },
  {
    "step_number": 3,
    "resource_location": "./runs/20250113",
    "file_name": "20250113",
    "part_number": 0,
    "job": {
      "id": "019461b5-22dd-73f9-8ab5-9641ff30a207",
      "result": "SUCCESS"
    }
  },
  {
    "step_number": 4,
    "resource_location": "./runs/20250113",
    "file_name": "20250113",
    "pa

In [33]:
#done