In [19]:
import re
import sys
import csv
import time
import json
import codecs
import logging
import requests

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import pygwalker as pyg

from datetime import datetime

from contextlib import closing

import altair as alt


In [20]:
# GraphQL retry / polling configuration (all delays are passed to time.sleep, i.e. seconds).
# Number of times query() re-sends a request after a non-OK HTTP response.
MAX_RETRIES_FOR_QUERY = 5
# Number of report re-runs get_report_url_and_status() allows before giving up.
MAX_RETRIES_FOR_DOWNLOAD_REPORT = 5
# Pause between two consecutive attempts inside query().
RETRY_TIME_FOR_QUERY = 2
# Pause after a report re-run has been requested.
RETRY_TIME_FOR_DOWNLOAD_REPORT = 60
# Pause before every report status check.
CHECK_INTERVAL_FOR_DOWNLOAD_REPORT = 20

In [21]:
# Token endpoints, grouped by the OAuth audience generate_authentication_params() sends.
# A token_url found in this list is sent audience 'beyond-api'.
AUTH0_URLS = ['https://auth.wiz.io/oauth/token', 'https://auth0.gov.wiz.io/oauth/token']
# A token_url found in this list is sent audience 'wiz-api'.
COGNITO_URLS = ['https://auth.app.wiz.io/oauth/token', 'https://auth.gov.wiz.io/oauth/token']

In [22]:
# get projects query
# pageInfo is requested because get_projects() reads hasNextPage / endCursor
# from the response to follow the cursor past the first page.
GET_PROJECTS_QUERY = (
    """
      query ProjectsTable(
        $filterBy: ProjectFilters
        $first: Int
        $after: String
        $orderBy: ProjectOrder
      ) {
        projects(
          filterBy: $filterBy
          first: $first
          after: $after
          orderBy: $orderBy
        ) {
          nodes {
            id
            name
            isFolder
            archived
            businessUnit
            description
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    """
)

In [23]:
# get issues query
GET_ISSUES_QUERY = (
    """
    query IssuesTable($filterBy: IssueFilters, $first: Int, $after: String, $orderBy: IssueOrder) {
    issues: issuesV2(
        filterBy: $filterBy
        first: $first
        after: $after
        orderBy: $orderBy
    ) {
        nodes {
        id
        sourceRule {
            __typename
            ... on Control {
            id
            name
            controlDescription: description
            resolutionRecommendation
            securitySubCategories {
                title
                category {
                name
                framework {
                    name
                }
                }
            }
            risks
            }
            ... on CloudEventRule {
            id
            name
            cloudEventRuleDescription: description
            sourceType
            type
            risks
            securitySubCategories {
                title
                category {
                name
                framework {
                    name
                }
                }
            }
            }
            ... on CloudConfigurationRule {
            id
            name
            cloudConfigurationRuleDescription: description
            remediationInstructions
            serviceType
            risks
            securitySubCategories {
                title
                category {
                name
                framework {
                    name
                }
                }
            }
            }
        }
        createdAt
        updatedAt
        dueAt
        type
        resolvedAt
        statusChangedAt
        projects {
            id
            name
            slug
            businessUnit
            riskProfile {
            businessImpact
            }
        }
        status
        severity
        entitySnapshot {
            id
            type
            nativeType
            name
            status
            cloudPlatform
            cloudProviderURL
            providerId
            region
            resourceGroupExternalId
            subscriptionExternalId
            subscriptionName
            subscriptionTags
            tags
            createdAt
            externalId
        }
        serviceTickets {
            externalId
            name
            url
        }
        notes {
            createdAt
            updatedAt
            text
            user {
            name
            email
            }
            serviceAccount {
            name
            }
        }
        }
        pageInfo {
        hasNextPage
        endCursor
        }
    }
    }
    """
)

In [24]:
# GraphQL query used by get_resources(): one page of cloudResources plus the
# pageInfo (hasNextPage / endCursor) that the pagination loop relies on.
# Each node carries a graphEntity whose `properties` field is what
# get_external_id() looks into first when resolving an external id.
GET_RESOURCES_QUERY = (
    """
      query CloudResourceSearch(
          $filterBy: CloudResourceFilters
          $first: Int
          $after: String
        ) {
          cloudResources(
            filterBy: $filterBy
            first: $first
            after: $after
          ) {
            nodes {
              ...CloudResourceFragment
            }
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
        fragment CloudResourceFragment on CloudResource {
          id
          name
          type
          subscriptionId
          subscriptionExternalId
          graphEntity{
            id
            providerUniqueId
            name
            type
            projects {
              id
            }
            properties
            firstSeen
            lastSeen
          }
        }
    """
)

In [25]:
# GraphQL query listing reports (id and name) with cursor pagination info.
# NOTE(review): no function in the cells shown here references this constant.
GET_REPORT_QUERY = (
    """
    query ReportsTable($filterBy: ReportFilters, $first: Int, $after: String) {
      reports(first: $first, after: $after, filterBy: $filterBy) {
        nodes {
          id
          name
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    """
)

In [26]:

# GraphQL mutation used by create_report(); the response is read as
# createReport -> report -> id.
CREATE_REPORT_MUTATION = (
    """
    mutation CreateReport($input: CreateReportInput!) {
      createReport(input: $input) {
        report {
          id
        }
      }
    }
    """
)

In [27]:
# GraphQL mutation used by rerun_report(); takes the report id as $reportId
# and the response is read as rerunReport -> report -> id.
RERUN_REPORT_MUTATION = (
    """
    mutation RerunReport($reportId: ID!) {
        rerunReport(input: { id: $reportId }) {
            report {
                id
            }
        }
    }
    """
)

In [28]:
# GraphQL query used by get_report_url_and_status(): returns the status and
# download url of the report's last run.
DOWNLOAD_REPORT_QUERY = (
    """
    query ReportDownloadUrl($reportId: ID!) {
        report(id: $reportId) {
            lastRun {
                url
                status
            }
        }
    }
    """
)

In [29]:
# set logging
def set_logging():
    """Configure the root logger to write INFO-and-above records to stderr.

    Any handlers already attached to the root logger are discarded and
    replaced by a single stream handler using a timestamped format.
    """
    root_logger = logging.getLogger()

    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(
        logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
    )

    root_logger.setLevel(logging.INFO)
    root_logger.handlers = [stderr_handler]

In [30]:
# get config
def get_config():
    """Load the notebook configuration from ``config.json`` in the working directory.

    Returns:
        The parsed JSON document (the rest of the notebook indexes it as a
        dict with keys such as ``token_url``, ``client_id``, ``client_secret``,
        ``api_endpoint_url`` and ``project_id``).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    logging.getLogger().debug("get config")

    # JSON text is UTF-8; being explicit avoids locale-dependent decoding.
    with open('config.json', "r", encoding='utf-8') as config_file:
        return json.load(config_file)

In [31]:
# get auth params
def generate_authentication_params(config):
    """Build the form fields for the client-credentials token request.

    The audience depends on which known endpoint list contains
    ``config['token_url']``: 'beyond-api' for AUTH0_URLS, 'wiz-api' for
    COGNITO_URLS.

    Raises:
        Exception: if the token url is in neither list.
    """
    token_url = config['token_url']

    if token_url in AUTH0_URLS:
        audience = 'beyond-api'
    elif token_url in COGNITO_URLS:
        audience = 'wiz-api'
    else:
        raise Exception('Error: wrong token url')

    return {
        'grant_type': 'client_credentials',
        'audience': audience,
        'client_id': config['client_id'],
        'client_secret': config['client_secret']
    }

In [32]:
# get token
def get_token(config):
    """Request an access token and store it in ``config['token']``.

    Posts the client-credentials form to ``config['token_url']``.

    Returns:
        The same ``config`` object, mutated to include the token.

    Raises:
        Exception: on a non-OK HTTP status, or when the response body
            carries no ``access_token``.
    """
    auth_response = requests.post(
        config['token_url'],
        headers = {'Content-Type': 'application/x-www-form-urlencoded'},
        data = generate_authentication_params(config)
    )

    if auth_response.status_code != requests.codes.ok:
        raise Exception(f'Error: {auth_response.text}')

    # Decode the body once and reuse it for both the check and the assignment.
    payload = auth_response.json()
    access_token = payload.get('access_token')

    if not access_token:
        raise Exception(f'Error: {payload.get("message")}')

    config['token'] = access_token

    return config

In [33]:
# send request
def send_request(config, query, variables):
    """POST one GraphQL request to ``config['api_endpoint_url']``.

    Args:
        config: configuration dict; must hold a non-empty ``token``
            (set by get_token) and ``api_endpoint_url``.
        query: GraphQL document text.
        variables: dict of GraphQL variables.

    Returns:
        The ``requests`` response object, unchecked.

    Raises:
        Exception: if ``config`` has no token (key missing or empty).
    """
    # .get() so that a config without the key hits the intended error below
    # instead of surfacing as a bare KeyError.
    token = config.get('token')

    if not token:
        raise Exception('Error: token not found')

    return requests.post(
        config['api_endpoint_url'],
        headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + token},
        json = {'query': query, 'variables': variables}
    )

In [34]:
# query
def query(config, query, variables):
    """Run a GraphQL operation and return the ``data`` member of the response.

    Non-OK responses are retried up to MAX_RETRIES_FOR_QUERY times with
    RETRY_TIME_FOR_QUERY seconds between attempts. 401, 403 and 404 are
    never retried, whichever attempt they occur on.

    Args:
        config: configuration dict forwarded to send_request.
        query: GraphQL document text.
        variables: dict of GraphQL variables.

    Returns:
        The decoded ``data`` object.

    Raises:
        Exception: on a non-retryable status, when retries are exhausted,
            or when the response has no ``data`` (the message then carries
            the ``errors`` member).
    """
    non_retryable = (
        requests.codes.unauthorized,
        requests.codes.forbidden,
        requests.codes.not_found,
    )

    retries = 0

    while True:
        response = send_request(config, query, variables)

        if response.status_code == requests.codes.ok:
            break

        # Auth / not-found failures will not heal by waiting, so fail fast
        # even when they first show up on a retry.
        if response.status_code in non_retryable or retries >= MAX_RETRIES_FOR_QUERY:
            raise Exception(f'Error: {response.text}')

        time.sleep(RETRY_TIME_FOR_QUERY)

        retries += 1

    # Decode the body once instead of on every access.
    payload = response.json()
    data = payload.get('data')

    if not data:
        raise Exception(f'Error: {payload.get("errors")}')

    logging.getLogger().debug(f"Info: {data}")

    return data

In [35]:
# get projects
def get_projects(config):
    """Fetch all non-archived, non-folder, non-root projects.

    Pages through the ``projects`` connection 500 nodes at a time, following
    ``pageInfo.endCursor`` while ``pageInfo.hasNextPage`` is true.

    Returns:
        A list of project node dicts (possibly empty).
    """
    logging.getLogger().debug("get projects")

    variables = {
        "first": 500,
        "filterBy": {
            "includeArchived": False,
            "isFolder": False,
            "root": False
        }
    }

    nodes = []

    while True:
        response = query(config, GET_PROJECTS_QUERY, variables)

        connection = response.get('projects') or {}

        nodes.extend(connection.get('nodes') or [])

        page_info = connection.get('pageInfo')

        if page_info is None:
            # Without pageInfo there is no cursor to follow; say so instead of
            # silently returning a possibly truncated result.
            logging.getLogger().warning(
                "projects response has no pageInfo; returning the first page only"
            )
            break

        end_cursor = page_info.get('endCursor')

        # A missing cursor would re-request the same page forever.
        if not page_info.get('hasNextPage') or not end_cursor:
            break

        variables['after'] = end_cursor

    return nodes

In [36]:
# get issues
def get_issues(config, project_id):
    """Fetch CRITICAL issues of one project for the data-related stack layers.

    Pages through the ``issues`` connection 500 nodes at a time, following
    ``pageInfo.endCursor`` while ``pageInfo.hasNextPage`` is true.

    Args:
        config: configuration dict forwarded to query.
        project_id: id of the project whose issues are requested.

    Returns:
        A list of issue node dicts (possibly empty).
    """
    logging.getLogger().debug("get issues")

    variables = {
        "first": 500,
        "filterBy": {
            "project": [
                project_id
            ],
            # The query only declares $filterBy/$first/$after/$orderBy, so the
            # severity and stack-layer filters must live inside filterBy to be
            # sent to the server as filters at all.
            # NOTE(review): confirm both field names against the IssueFilters schema.
            "severity": ["CRITICAL"],
            "stackLayer": ["APPLICATION_AND_DATA", "DATA_STORES"]
        }
    }

    nodes = []

    while True:
        response = query(config, GET_ISSUES_QUERY, variables)

        connection = response.get('issues') or {}

        nodes.extend(connection.get('nodes') or [])

        page_info = connection.get('pageInfo')

        if page_info is None:
            # Without pageInfo there is no cursor to follow; say so instead of
            # silently returning a possibly truncated result.
            logging.getLogger().warning(
                "issues response has no pageInfo; returning the first page only"
            )
            break

        end_cursor = page_info.get('endCursor')

        # A missing cursor would re-request the same page forever.
        if not page_info.get('hasNextPage') or not end_cursor:
            break

        variables['after'] = end_cursor

    return nodes

In [37]:
# get resources
def get_resources(config, project_id):
    """Fetch all cloud resources of one project.

    Pages through the ``cloudResources`` connection 500 nodes at a time,
    following ``pageInfo.endCursor`` while ``pageInfo.hasNextPage`` is true.

    Args:
        config: configuration dict forwarded to query.
        project_id: id of the project whose resources are requested.

    Returns:
        A list of cloud resource node dicts (possibly empty).
    """
    logging.getLogger().debug("get resources")

    variables = {
        "first": 500,
        "filterBy": {
            "projectId": [
                project_id
            ],
        }
    }

    nodes = []

    while True:
        response = query(config, GET_RESOURCES_QUERY, variables)

        connection = response.get('cloudResources') or {}

        nodes.extend(connection.get('nodes') or [])

        page_info = connection.get('pageInfo')

        if page_info is None:
            # Without pageInfo there is no cursor to follow; say so instead of
            # silently returning a possibly truncated result.
            logging.getLogger().warning(
                "cloudResources response has no pageInfo; returning the first page only"
            )
            break

        end_cursor = page_info.get('endCursor')

        # A missing cursor would re-request the same page forever.
        if not page_info.get('hasNextPage') or not end_cursor:
            break

        variables['after'] = end_cursor

    return nodes

In [38]:
# create report
def create_report(config, project_id, report_prefix, report_type):
    """Create a report for a project and return the new report's id.

    The report name is ``<report_prefix>_<current timestamp>`` with spaces,
    dots, colons and hyphens stripped out.

    Args:
        config: configuration dict forwarded to query.
        project_id: id of the project the report is scoped to.
        report_prefix: leading part of the generated report name.
        report_type: value sent as the report ``type``.

    Returns:
        The id found at createReport -> report -> id in the response.
    """
    # Raw-string character class: same characters as the original
    # alternation, without the invalid '\.' escape in a normal string.
    report_name = re.sub(r'[ .:-]', '', f'{report_prefix}_{datetime.now()}')

    variables = {
        "input": {
            "name": report_name,
            "type": report_type,
            "projectId": project_id
        }
    }

    response = query(config, CREATE_REPORT_MUTATION, variables)

    return response['createReport']['report']['id']

In [39]:
# rerun report
def rerun_report(config, report_id):
    """Trigger a new run of an existing report.

    Returns:
        The id found at rerunReport -> report -> id in the response.
    """
    rerun_result = query(config, RERUN_REPORT_MUTATION, {'reportId': report_id})

    return rerun_result['rerunReport']['report']['id']

In [40]:
# get report url and status
def get_report_url_and_status(config, report_id):
    """Poll a report until its last run completes and return the download url.

    Every CHECK_INTERVAL_FOR_DOWNLOAD_REPORT seconds the last run's status is
    queried. A FAILED or EXPIRED run triggers a re-run followed by a
    RETRY_TIME_FOR_DOWNLOAD_REPORT second pause; at most
    MAX_RETRIES_FOR_DOWNLOAD_REPORT re-runs are attempted.

    NOTE(review): statuses other than COMPLETED / FAILED / EXPIRED do not
    count towards the retry limit, so polling continues for as long as the
    run stays in such a state.

    Args:
        config: configuration dict forwarded to query / rerun_report.
        report_id: id of the report to wait for.

    Returns:
        The ``url`` of the completed last run (despite the function name,
        the status itself is not returned).

    Raises:
        Exception: when the re-run limit is reached without a completed run.
    """
    num_of_retries = 0

    while num_of_retries < MAX_RETRIES_FOR_DOWNLOAD_REPORT:
        time.sleep(CHECK_INTERVAL_FOR_DOWNLOAD_REPORT)

        response = query(config, DOWNLOAD_REPORT_QUERY, {'reportId': report_id})

        last_run = response['report']['lastRun']
        status = last_run['status']

        if status == 'COMPLETED':
            return last_run['url']

        if status in ('FAILED', 'EXPIRED'):
            # rerun_report needs the config as its first argument; calling it
            # with the report id alone raised TypeError on this path.
            rerun_report(config, report_id)

            time.sleep(RETRY_TIME_FOR_DOWNLOAD_REPORT)

            num_of_retries += 1

    raise Exception('Error: get report fail')

In [41]:
# get report content
def get_report_content(download_url):
    """Download a CSV report and return it as a list of rows.

    Args:
        download_url: url of the CSV file to stream.

    Returns:
        A non-empty list of rows, each row a list of strings.

    Raises:
        requests.HTTPError: if the download answers with an error status.
        Exception: if the downloaded file contains no rows.
    """
    with closing(requests.get(download_url, stream=True)) as r:
        # Fail on 4xx/5xx rather than parsing an error page as CSV.
        r.raise_for_status()

        reader = csv.reader(codecs.iterdecode(r.iter_lines(), 'utf-8'))

        report_data = list(reader)

    if not report_data:
        raise Exception('Error: download failed')

    # Log only the size: report rows can contain sensitive finding examples.
    logging.debug(f'Info: downloaded {len(report_data)} report rows')

    return report_data

In [42]:
# get report content to dataframe
def get_report_content_to_dataframe(download_url):
    """Read the CSV report at ``download_url`` into a pandas DataFrame.

    The argument is handed to ``pd.read_csv`` unchanged.
    """
    report_frame = pd.read_csv(download_url)

    return report_frame

In [43]:
# get report
def get_report(config, project_id):
    """Create a DATA_SCAN report for a project and load it once it is ready.

    The report type doubles as the report name prefix.

    Returns:
        A dict with the single key "DATA_SCAN" mapping to the report
        contents as a DataFrame.
    """
    logging.getLogger().debug("get report")

    report_type = "DATA_SCAN"

    new_report_id = create_report(config, project_id, report_type, report_type)

    download_url = get_report_url_and_status(config, new_report_id)

    return {report_type: get_report_content_to_dataframe(download_url)}

In [44]:
# get external id
def get_external_id(x):
    """Return the external id of an entity, or None when it cannot be read.

    Looks at ``x['properties']['externalId']`` first and falls back to
    ``x['externalId']``. Any failure while reading a location (missing key,
    ``x`` not subscriptable, ...) moves on to the next one.
    """
    lookups = (
        lambda entity: entity['properties']['externalId'],
        lambda entity: entity['externalId'],
    )

    for lookup in lookups:
        try:
            return lookup(x)
        except Exception:
            continue

    return None

In [45]:
# main
def main():
    """Authenticate, fetch resources and the data-scan report, and join them.

    Results are published as notebook-level globals so later cells can use
    them: ``resources_df``, ``data_scan_df`` and ``data_scan_resources_df``
    (data-scan rows left-joined to resources on the external id). The
    project / issue steps are currently commented out, so ``projects``,
    ``issues_df`` and ``data_scan_issues_df`` are declared but not assigned.

    Raises:
        Exception: wraps any failure; the original exception is chained as
            the cause.
    """
    global projects
    global issues_df
    global resources_df
    global data_scan_df
    global data_scan_issues_df
    global data_scan_resources_df

    try:
        set_logging()

        config = get_token(get_config())

        #projects = get_projects(config)

        #issues = get_issues(config, config['project_id']) 
        
        #issues_df = pd.DataFrame(issues)
    
        #issues_df['externalId'] = issues_df['entitySnapshot'].apply(get_external_id)
        
        resources = get_resources(config, config['project_id'])

        resources_df = pd.DataFrame(resources)

        resources_df['externalId'] = resources_df['graphEntity'].apply(get_external_id)

        reports = get_report(config, config['project_id'])

        data_scan_df = reports['DATA_SCAN']

        data_scan_resources_df = data_scan_df.set_index('Resource External ID').join(resources_df.set_index('externalId'))

        #data_scan_issues_df = data_scan_df.set_index('Resource External ID').join(issues_df.set_index('externalId'),lsuffix='_i', rsuffix='_r')

        logging.getLogger().debug('done')

    except Exception as error:
        # Exception() does not apply %-formatting to its arguments, so build
        # the message explicitly and keep the original error as the cause.
        raise Exception(f'Error: {error}') from error


In [46]:
if __name__ == '__main__':
    main()    

In [47]:
# save results
#issues_df.to_csv('issues.csv')
#resources_df.to_csv('resources.csv')
#data_scan_df.to_csv('datascans.csv')
#data_scan_issues_df.to_csv('datascanissues.csv')
#data_scan_resources_df.to_csv('datascanresources.csv')

In [48]:
data_scan_resources_df.reset_index(inplace=True)

In [49]:
# expand findings
data_scan_resources_df2=pd.json_normalize(data_scan_resources_df['graphEntity'])

data_scan_resources_df2[['Finding ID', 'Category', 'Classifier', 'Unique Matches', 'Total Matches', 'Severity', 'Finding Examples']]= data_scan_resources_df[['ID', 'Category', 'Classifier', 'Unique Matches', 'Total Matches', 'Severity', 'Finding Examples']]

In [None]:
#data_scan_resources_df2 = pd.read_csv('datascanresources2.csv')  

In [50]:
def replaceDot(x):
    """Return ``x`` with every occurrence of 'properties.' replaced by '_'.

    Used as the column renamer for the flattened resource frame, e.g.
    'properties.region' becomes '_region'.
    """
    pieces = x.split('properties.')

    return '_'.join(pieces)
    
data_scan_resources_df2=data_scan_resources_df2.rename(replaceDot, axis='columns')


In [51]:
# resources per cloud platform
resources_per_cloud_platform = data_scan_resources_df2[['_cloudPlatform','id']].drop_duplicates().groupby(by=['_cloudPlatform']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_cloud_platform

Unnamed: 0,_cloudPlatform,count
0,AWS,96


In [52]:
# resources per environment
resources_per_environment = data_scan_resources_df2[['__environments','id']].drop_duplicates().groupby(by=['__environments']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_environment

Unnamed: 0,__environments,count
0,Production,96


In [53]:
# resources per status
resources_per_status = data_scan_resources_df2[['_status','id']].drop_duplicates().groupby(by=['_status']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_status

Unnamed: 0,_status,count
0,Active,80
1,Inactive,8


In [54]:
# resources per region 
resources_per_region = data_scan_resources_df2[['_region','id']].drop_duplicates().groupby(by=['_region']).count().reset_index().rename(columns={"properties.region":"region","id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_region

Unnamed: 0,_region,count
3,us-east-1,69
5,us-west-1,11
4,us-east-2,8
0,eu-central-1,3
1,eu-west-2,3
2,eu-west-3,2


In [55]:
# resources per type
resources_per_type = data_scan_resources_df2[['type','id']].drop_duplicates().groupby(by=['type']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_type

Unnamed: 0,type,count
0,BUCKET,48
3,SNAPSHOT,23
4,VIRTUAL_MACHINE,11
2,DB_SERVER,9
1,DATABASE,5


In [56]:
# resources per native type
resources_per_native_type = data_scan_resources_df2[['_nativeType','id']].drop_duplicates().groupby(by=['_nativeType']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_native_type

Unnamed: 0,_nativeType,count
1,bucket,48
4,rds#snapshot,17
6,virtualMachine,11
0,HostedTechnologyConfigurationDBServer,8
3,rds#clustersnapshot,6
2,dynamoDB/table,5
5,rds/PostgreSQL/instance,1


In [57]:
# resources per native type and status
resources_per_native_type_and_status = data_scan_resources_df2[['_nativeType','_status','id']].drop_duplicates().groupby(by=['_nativeType','_status']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_native_type_and_status

Unnamed: 0,_nativeType,_status,count
0,bucket,Active,48
3,rds#snapshot,Active,17
6,virtualMachine,Inactive,8
2,rds#clustersnapshot,Active,6
1,dynamoDB/table,Active,5
5,virtualMachine,Active,3
4,rds/PostgreSQL/instance,Active,1


In [58]:
# resources per kind
resources_per_kind = data_scan_resources_df2[['_kind','id']].drop_duplicates().groupby(by=['_kind']).count().reset_index().rename(columns={"id": "count"}).sort_values(by=['count'], ascending=False)
resources_per_kind

Unnamed: 0,_kind,count
2,PostgreSQL,8
0,AmazonDynamoDBTable,5
1,MySQL,1


In [59]:
# findings per type
findings_per_type = data_scan_resources_df2[['type','Finding ID']].groupby(by=['type']).count().reset_index().rename(columns={"Finding ID": "count"}).sort_values(by=['count'], ascending=False)
findings_per_type

Unnamed: 0,type,count
0,BUCKET,356
2,DB_SERVER,78
3,SNAPSHOT,23
4,VIRTUAL_MACHINE,18
1,DATABASE,11


In [60]:
# categories per type
categories_per_type = data_scan_resources_df2[['type','Category','Finding ID']].groupby(by=['type','Category']).count().reset_index().rename(columns={"Finding ID": "count"}).sort_values(by=['count'], ascending=False)
categories_per_type

Unnamed: 0,type,Category,count
4,BUCKET,DataCategoryPII,223
1,BUCKET,DataCategoryFinancial,73
11,DB_SERVER,DataCategoryPII,29
5,BUCKET,DataCategoryStaleData,25
13,SNAPSHOT,DataCategoryStaleData,23
9,DB_SERVER,DataCategoryFinancial,23
3,BUCKET,DataCategoryPHI,16
0,BUCKET,DataCategoryDigitalIdentity,15
8,DB_SERVER,DataCategoryDigitalIdentity,13
10,DB_SERVER,DataCategoryPHI,12


In [61]:
# classifiers per type
classifiers_per_type = data_scan_resources_df2[['type','Classifier','Finding ID']].groupby(by=['type','Classifier']).count().reset_index().rename(columns={"Finding ID": "count"}).sort_values(by=['count'], ascending=False)
classifiers_per_type

Unnamed: 0,type,Classifier,count
15,BUCKET,Email,34
35,BUCKET,Name,30
52,BUCKET,Unmodified Bucket,25
89,SNAPSHOT,Orphaned Snapshot,23
21,BUCKET,Gender,20
...,...,...,...
69,DB_SERVER,Date of Birth,1
70,DB_SERVER,Drug Enforcement Agency (DEA) Number,1
18,BUCKET,Financial Balance,1
44,BUCKET,Salesforce ID,1


In [62]:
# categories and classifiers per type
categories_and_classifiers_per_type = data_scan_resources_df2[['type','Category','Classifier','Finding ID']].groupby(by=['type','Category','Classifier']).count().reset_index().rename(columns={"Finding ID": "count"}).sort_values(by=['count'], ascending=False)
categories_and_classifiers_per_type

Unnamed: 0,type,Category,Classifier,count
37,BUCKET,DataCategoryPII,Email,34
46,BUCKET,DataCategoryPII,Name,30
54,BUCKET,DataCategoryStaleData,Unmodified Bucket,25
89,SNAPSHOT,DataCategoryStaleData,Orphaned Snapshot,23
42,BUCKET,DataCategoryPII,Gender,20
...,...,...,...,...
60,DATABASE,DataCategoryPII,Gender,1
61,DATABASE,DataCategoryPII,Name,1
63,DATABASE,DataCategoryPII,Zip Code,1
11,BUCKET,DataCategoryFinancial,Credit Card Brand,1


In [63]:
# unique matches per categories classifiers and type
unique_matches_percategories_classifiers_and_type = data_scan_resources_df2[['type','Category','Classifier','Unique Matches']].groupby(by=['type','Category','Classifier']).sum().reset_index().sort_values(by=['Unique Matches'], ascending=False)
unique_matches_percategories_classifiers_and_type

Unnamed: 0,type,Category,Classifier,Unique Matches
37,BUCKET,DataCategoryPII,Email,31640
21,BUCKET,DataCategoryFinancial,Price,12574
33,BUCKET,DataCategoryPII,Address,11631
50,BUCKET,DataCategoryPII,Phone Number,11299
46,BUCKET,DataCategoryPII,Name,10342
...,...,...,...,...
95,VIRTUAL_MACHINE,DataCategoryFinancial,Transaction Details,1
79,DB_SERVER,DataCategoryPII,Date of Birth,1
54,BUCKET,DataCategoryStaleData,Unmodified Bucket,0
88,DB_SERVER,DataCategoryStaleData,Inactive Object,0


In [64]:
# total matches per categories classifiers and type
total_matches_percategories_classifiers_and_type = data_scan_resources_df2[['type','Category','Classifier','Total Matches']].groupby(by=['type','Category','Classifier']).sum().reset_index().sort_values(by=['Total Matches'], ascending=False)
total_matches_percategories_classifiers_and_type

Unnamed: 0,type,Category,Classifier,Total Matches
37,BUCKET,DataCategoryPII,Email,78537
23,BUCKET,DataCategoryFinancial,Transaction Details,75511
42,BUCKET,DataCategoryPII,Gender,51047
29,BUCKET,DataCategoryPHI,Patient ID,41837
50,BUCKET,DataCategoryPII,Phone Number,33388
...,...,...,...,...
58,DATABASE,DataCategoryPII,Email,1
60,DATABASE,DataCategoryPII,Gender,1
54,BUCKET,DataCategoryStaleData,Unmodified Bucket,0
88,DB_SERVER,DataCategoryStaleData,Inactive Object,0


In [57]:
walker = pyg.walk(data_scan_resources_df2)

Box(children=(HTML(value='\n<div id="ifr-pyg-000629226094de01rHzQBJEyiDMnqNS4" style="height: auto">\n    <hea…

In [None]:
# unique matches per classifier and type
unique_matches_per_findings_and_type = data_scan_resources_df2[['type', 'Category', 'Classifier', 'Unique Matches']].groupby(by=['type', 'Category', 'Classifier']).sum().reset_index().sort_values(by=['Unique Matches'], ascending=False)
unique_matches_per_findings_and_type

In [None]:
# total matches per classifier and type
total_matches_per_classifier_and_type = data_scan_resources_df2[['type', 'Category', 'Classifier', 'Total Matches']].groupby(by=['type', 'Category', 'Classifier', ]).sum().reset_index().sort_values(by=['Total Matches'], ascending=False)
total_matches_per_classifier_and_type

In [None]:
# unique matches per severity and type
unique_matches_per_severity_and_type = data_scan_resources_df2[['type', 'Severity', 'Unique Matches']].groupby(by=['type', 'Severity', ]).sum().reset_index().sort_values(by=['Unique Matches'], ascending=False)
unique_matches_per_severity_and_type

In [None]:
# total matches per severity and type
total_matches_per_severity_and_type = data_scan_resources_df2[['type', 'Severity', 'Total Matches']].groupby(by=['type', 'Severity', ]).sum().reset_index().sort_values(by=['Total Matches'], ascending=False)
total_matches_per_severity_and_type

In [65]:
def get_number_of_findings(x):
    """Return the length of the JSON value encoded in ``x``.

    Returns None when ``x`` cannot be parsed as JSON or the parsed value
    has no length (any exception is treated as "no count").
    """
    try:
        parsed = json.loads(x)
        count = len(parsed)

    except Exception:
        count = None

    return count
    
data_scan_df['findings'] = data_scan_df['Finding Examples'].apply(get_number_of_findings)


In [66]:
x=data_scan_df[data_scan_df['findings']>50]
x[['Name','Unique Matches','Total Matches','Resource External ID','findings','Finding Examples']]


Unnamed: 0,Name,Unique Matches,Total Matches,Resource External ID,findings,Finding Examples
301,PII/Gender,2,24488,wagners3bucket,85.0,"[{""path"":""003c6c7e-4f4d-4bc1-a7cc-a16de73c3469..."
420,PII/Education Level,4,287,ana-bucket,56.0,"[{""path"":""CSV/employees (2).csv"",""key"":""educat..."
449,Financial/Invoice Details,2,1275,ana-bucket,200.0,"[{""path"":""KBC/datagroup/tablename/pdate=202306..."
452,Financial/Business Unit,1,518,ana-bucket,200.0,"[{""path"":""KBC/datagroup/tablename/pdate=202306..."


In [None]:
walker = pyg.walk(data_scan_df)

In [None]:
#resources_df = pd.read_csv('Annambucketresources_2024_10_30T20_47_42Z.csv')  
#resources_df

In [None]:
#data_scan_df = pd.read_csv('Annamdemodatafindings_2024_10_29T17_11_37Z.csv')  
#data_scan_df

In [None]:
#data_scan_resources_df = data_scan_df.join(resources_df.set_index('External ID'),on="Resource External ID", rsuffix='_r')

#data_scan_resources_df = data_scan_resources_df[data_scan_resources_df["Resource Type_r"]=="BUCKET"] #buckets only

#print(data_scan_resources_df.to_json(orient="records",indent=2))

In [None]:
#def get_new_external_id(s):
#    if (s.rfind("#") != -1):
#        return s[0:s.rfind("#")]
#    else: 
#        return s

In [None]:
#resources_df = pd.read_csv('annamdatabaseserver_2024_10_30T20_48_19Z.csv')  
#resources_df

In [None]:
#data_scan_df = pd.read_csv('Annamdemodatafindings_2024_10_29T17_11_37Z.csv')  

#data_scan_df['Resource External ID']= data_scan_df['Resource External ID'].apply(lambda s: get_new_external_id(s))

#data_scan_df

In [None]:
#data_scan_resources_df = data_scan_df.join(resources_df.set_index('External ID'),on="Resource External ID", rsuffix='_r')

#data_scan_resources_df = data_scan_resources_df[data_scan_resources_df["Resource Type_r"]=="DB_SERVER"] #db_server only

#print(data_scan_resources_df.to_json(orient="records",indent=2))

In [None]:
#done