In [7]:
# RISK CONSTANT
import pandas as pd

# Load the risk-constant table. The path is relative, so it resolves against the
# notebook's current working directory.
df = pd.read_csv('data/risk_constant.csv')

# Frequency tables for three categorical columns. The trailing comments are the
# tallies the author noted down from a run; they are not recomputed here.
contextTypeCounts = df['context_type'].value_counts() # host(58), domain(3), bucket(2)

severityCounts = df['recommended_severity'].value_counts() # critical (3), high (38), medium(10), low (12)

typeCounts = df['type'].value_counts() # 63 different types!

# clean categories

# Characters that only wrap the category list ({ } " ') and are dropped before splitting.
_CATEGORY_WRAPPERS = str.maketrans('', '', '{}"\'')

def _splitLevels(text):
    """Return the list of category levels encoded in ``text``.

    ``text`` is expected to be a raw ``default_categories`` cell such as
    ``'{{"A","B","C"}}'``: braces and quote characters are removed and the
    remainder is split on commas. Anything that is not a string (for example the
    float NaN pandas uses for an empty CSV cell, or None) has no levels and
    yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return text.translate(_CATEGORY_WRAPPERS).split(',')

def cleanCategory(text, index):
    """Return the category level at position ``index`` of ``text``.

    Parameters:
        text: raw category cell (see ``_splitLevels``).
        index: position of the wanted level; negative values count from the end.

    Returns:
        The level as a string, or ``False`` when ``text`` has no level at that
        position (index out of range in either direction, or ``text`` is not a
        string). ``False`` is kept as the "missing" marker so existing tallies
        that count it stay comparable.
    """
    levels = _splitLevels(text)
    # Guard both ends so an out-of-range negative index is reported as missing
    # instead of raising IndexError.
    if not -len(levels) <= index < len(levels):
        return False
    return levels[index]

def numLevels(text):
    """Return how many category levels ``text`` contains.

    An empty string still counts as one (empty) level because splitting ``''``
    on commas yields ``['']``; a non-string value counts as zero levels.
    """
    return len(_splitLevels(text))

# Break `default_categories` out into one column per hierarchy level.
# The tallies under each heading were noted down by the author from the data;
# a `False` entry is cleanCategory's marker for "no level at this position".

# Level 0 -> `parent`
# Exposure(33), Vulnerability(16), Misconfiguration (12), Service or Interface Exposure (1), Compromise (1)
df['parent'] = df['default_categories'].apply(cleanCategory, args=(0,))

# Level 1 -> `child`
# Service or Interface Exposure(23), Software Vulnerability (15), Service Misconfiguration (9), Device Exposure (8)
# Name Infrastrucutre (3), Information Leakage (2), False (1), Evidence of Compromise (1), Web App Security Vulnerabiliy (1)
df['child'] = df['default_categories'].apply(cleanCategory, args=(1,))

# Level 2 -> `subchild`
# EOL Software Vulnerability (8), Database Engine Exposure (8), CVE (7), Internal Network Protocol Exposure (6)
# ISC/SCADA Exposure (5), Remote Access Service Exposure (4), Unencrypted (4), Domain Registration Misconfiguration (3)
# Cloud Storage Exposure (2), DDoS Amplification Exposure (2), TLS Cryptographic Weakness (2), XSS (1)
# Lights Out Server Management Exposure (1), Other Exposure (1), Weak Authenthication (1), File Transfer Protocol Exposure (1)
# Storage Device Exposure (1), Deprecated and Plaintext Protocol Exposure (1), Scan Derived (1), False (1)
# Unmanaged Host (1), IoT Protocol Exposure (1), Mail Misconfiguration (1)
df['subchild'] = df['default_categories'].apply(cleanCategory, args=(2,))

# Questions
# Why do some types have multiple hierarchies?
# Why is one type missing extra levels?

# Depth of each row's category hierarchy, used to find the short ones.
df['levels'] = df['default_categories'].apply(numLevels)

# Not the last expression of the cell, so Jupyter does not display this result.
df['levels'].value_counts()

# Keep only the rows whose hierarchy has a single level.
tf = df.loc[df['levels'].eq(1)]

# Last expression of the cell: shows the raw category text of those rows.
tf.loc[:, ['default_categories']]

Unnamed: 0,default_categories
21,"'{{""Service or Interface Exposure""}}'"


In [58]:
# RISK OBJECT -> RISK TYPE

# Load the risk-object table. This rebinds `df`, which the risk-constant cell
# also uses for a different table, and relies on `pd` having been imported by
# another cell (this cell has no import of its own).
df = pd.read_csv('data/risk_object.csv')

# Rebinds `severityCounts` as well; here it is computed from the `severity`
# column. The trailing tallies are the author's notes from a run.
severityCounts = df['severity'].value_counts() # critical (173), high (425), medium (30870), low (1444)

# host-not-present-in-vm-solution                            29331
# weak-auth-page                                               882
# login-page-missing-csp                                       770
# weak-tls-cipher                                              433
# ssh-service-exposed                                          415
# unencrypted-login-page                                       146
# smtp-service-exposed                                         135
# unencrypted-weak-auth-page                                   124
# eol-php-software                                             107
# ntp-service-exposed                                           73
# vulnerable-log4j-generic-cve-2021-44228                       62
# outdated-tls-version                                          59
# eol-nginx-software                                            59
# domain-expiring-in-30-days                                    54
# ftp-service-exposed                                           37
# aws-storage-bucket-exposed                                    28
# recursive-dns-service-exposed                                 27
# mysql-service-exposed                                         23
# eol-apache-httpd-software                                     20
# eol-microsoft-iis-software                                    19
# vnc-service-exposed                                           14
# ipp-service-exposed                                           13
# postgres-service-exposed                                      12
# unencrypted-imap-service                                      12
# amqp-service-exposed                                          11
# domain-expiring-in-7-days                                     11
# eol-openssl-software                                          10
# unencrypted-pop3-service                                       8
# rdp-service-exposed                                            3
# pop3s-service-exposed                                          3
# smb-service-exposed                                            2
# eol-eclipse-jetty-software                                     2
# unencrypted-cwmp-service                                       1
# fox-service-exposed                                            1
# vulnerable-log4j-unifi-network-appliance-cve-2021-44228        1
# ipmi-service-exposed                                           1
# mongodb-service-exposed                                        1
# gcp-storage-bucket-exposed                                     1
# expired-domain                                                 1
# Number of risk objects per risk type.
typeCounts = df['type'].value_counts()

# Parse the first-seen timestamp, then derive a day-level label (MM/DD/YYYY)
# so risks can be counted per calendar day.
df['first_computed_at'] = pd.to_datetime(df['first_computed_at'])
df['first_date'] = df['first_computed_at'].dt.strftime('%m/%d/%Y')

# first big threat 16586 on 8/3/2021
# (not the last expression of the cell, so Jupyter does not display this result)
df['first_date'].value_counts()

# Use the Series reductions rather than the builtin min()/max(): the builtins
# compare element by element in Python, and because every comparison against
# NaT is False their result depends on where a missing timestamp sits in the
# column. Series.min()/max() skip missing values.
minDate = df['first_computed_at'].min()
maxDate = df['first_computed_at'].max()

# Timeline: 07/28/2020 -> 02/17/2022
# What are the biggest risk types?
# How do risks change over time? In severity?
# Details on Demand -> what are some of the risk details? -> context

Unnamed: 0,id,type,context,status,first_computed_at,last_computed_at,last_updated_at,severity,user_status,first_date
0,2711983,aws-storage-bucket-exposed,"{""cri"": ""cri:bucket:aws:s3:aac"", ""type"": ""buck...",open,2021-06-18 01:24:05.315000-03:00,2022-02-17 01:04:33.094 -0300,2022-02-15 14:28:38.576 -0300,critical,,06/18/2021
1,2711990,aws-storage-bucket-exposed,"{""cri"": ""cri:bucket:aws:s3:aaccservices"", ""typ...",open,2021-06-18 01:58:04.928000-03:00,2022-02-17 01:04:33.094 -0300,2022-02-15 14:28:38.576 -0300,critical,,06/18/2021
2,2711989,aws-storage-bucket-exposed,"{""cri"": ""cri:bucket:aws:s3:adod"", ""type"": ""buc...",open,2021-06-18 01:53:13.777000-03:00,2022-02-17 01:04:33.094 -0300,2022-02-15 14:28:38.576 -0300,critical,,06/18/2021
3,2711977,aws-storage-bucket-exposed,"{""cri"": ""cri:bucket:aws:s3:cube-images"", ""type...",open,2021-06-18 01:14:04.940000-03:00,2022-02-17 01:04:33.094 -0300,2022-02-15 14:28:38.576 -0300,critical,,06/18/2021
4,2711979,aws-storage-bucket-exposed,"{""cri"": ""cri:bucket:aws:s3:cube-static"", ""type...",open,2021-06-18 01:16:04.903000-03:00,2022-02-17 01:04:33.094 -0300,2022-02-15 14:28:38.576 -0300,critical,,06/18/2021
...,...,...,...,...,...,...,...,...,...,...
32907,2741919,domain-expiring-in-30-days,"{""type"": ""domain"", ""domain"": ""transformcsys.com""}",open,2022-02-13 04:16:44.599000-03:00,2022-02-13 04:16:44.599 -0300,2022-02-15 14:28:38.576 -0300,medium,,02/13/2022
32908,2741517,domain-expiring-in-30-days,"{""type"": ""domain"", ""domain"": ""unioncsys.com""}",open,2022-02-05 01:04:58.125000-03:00,2022-02-13 04:16:44.599 -0300,2022-02-15 14:28:38.576 -0300,medium,,02/05/2022
32909,2741920,domain-expiring-in-7-days,"{""type"": ""domain"", ""domain"": ""upcsys.com""}",open,2022-02-13 04:16:44.599000-03:00,2022-02-13 04:16:44.599 -0300,2022-02-15 14:28:38.576 -0300,high,,02/13/2022
32910,2741519,domain-expiring-in-30-days,"{""type"": ""domain"", ""domain"": ""somecsysdomain.c...",open,2022-02-05 01:07:12.655000-03:00,2022-02-13 04:16:58.112 -0300,2022-02-15 14:28:38.576 -0300,medium,,02/05/2022


In [13]:
# RISK EVENTS
import pandas as pd

# Load the risk-event log (rebinds `df`, which other cells use for other tables).
# NOTE(review): the saved output of this cell echoes this read_csv line, which
# looks like the tail of a pandas DtypeWarning (mixed types in a column) —
# confirm. low_memory=False makes pandas infer each column's dtype from the
# whole file instead of chunk by chunk, so a column cannot end up holding a mix
# of numbers and strings that value_counts() would tally separately.
df = pd.read_csv('data/risk_event.csv', low_memory=False)

# Exploratory tallies. Only the last expression of a cell is displayed, so the
# first three produce no visible output unless run on their own.
df['risk_type'].value_counts()

df['reason'].value_counts()

df['risk_id'].value_counts()

# update (59640), close(29958), open(2310)
df['op'].value_counts()

# Questions
# What is a lifecycle for a risk instance using these events?
# How long does it take for a risk event to ...? (the original note is cut off here; presumably "to go from open to close" — confirm)

  df = pd.read_csv('data/risk_event.csv')


update    59640
close     29958
open       2310
Name: op, dtype: int64