In [8]:
# RISK CONSTANT
import pandas as pd

df = pd.read_csv('data/risk_constant.csv')

contextTypeCounts = df['context_type'].value_counts() # host(58), domain(3), bucket(2)

severityCounts = df['recommended_severity'].value_counts() # critical (3), high (38), medium(10), low (12)

typeCounts = df['type'].value_counts() # 63 different types!

# clean categories

def cleanCategory(text, index):
    label = text.replace("{", '').replace("}", '').replace('"', '').replace("'", '')
    levels = label.split(',')
    if index >= len(levels):
        return False
    return levels[index]

def numLevels(text):
    label = text.replace("{", '').replace("}", '').replace('"', '').replace("'", '')
    levels = label.split(',')
    return len(levels)

# Exposure(33), Vulnerability(16), Misconfiguration (12), Service or Interface Exposure (1), Compromise (1)
df['parent'] = df['default_categories'].map(lambda x: cleanCategory(x, 0))

# Service or Interface Exposure(23), Software Vulnerability (15), Service Misconfiguration (9), Device Exposure (8)
# Name Infrastrucutre (3), Information Leakage (2), False (1), Evidence of Compromise (1), Web App Security Vulnerabiliy (1)
df['child'] = df['default_categories'].map(lambda x: cleanCategory(x, 1))

# EOL Software Vulnerability (8), Database Engine Exposure (8), CVE (7), Internal Network Protocol Exposure (6)
# ISC/SCADA Exposure (5), Remote Access Service Exposure (4), Unencrypted (4), Domain Registration Misconfiguration (3)
# Cloud Storage Exposure (2), DDoS Amplification Exposure (2), TLS Cryptographic Weakness (2), XSS (1)
# Lights Out Server Management Exposure (1), Other Exposure (1), Weak Authenthication (1), File Transfer Protocol Exposure (1)
# Storage Device Exposure (1), Deprecated and Plaintext Protocol Exposure (1), Scan Derived (1), False (1)
# Unmanaged Host (1), IoT Protocol Exposure (1), Mail Misconfiguration (1)
df['subchild'] = df['default_categories'].map(lambda x: cleanCategory(x, 2))

# Questions
# Why do some types have multiple hierarchies?
# Why is one type missing extra levels?

df['levels'] = df['default_categories'].map(lambda x: numLevels(x))
df['index'] = df.index
prep_df = df[['index', 'type', 'name', 'context_type', 'recommended_severity', 'parent', 'child', 'subchild']]

prep_df.to_json('data/risk_mapping.json', orient='records')

riskMap = dict(zip(df['type'], df['index']))

riskMap


{'aws-storage-bucket-exposed': 0,
 'bacnet-service-exposed': 1,
 'domain-expiring-in-7-days': 2,
 'eol-apache-traffic-server-software': 3,
 'eol-microsoft-iis-software': 4,
 'eol-apache-httpd-software': 5,
 'dnp3-service-exposed': 6,
 'eol-eclipse-jetty-software': 7,
 'elasticsearch-service-exposed': 8,
 'expired-domain': 9,
 'gcp-storage-bucket-exposed': 10,
 'kubernetes-service-exposed': 11,
 'memcached-service-exposed': 12,
 'modbus-service-exposed': 13,
 'mqtt-service-exposed': 14,
 'host-not-present-in-vm-solution': 15,
 'mongodb-service-exposed': 16,
 'mssql-service-exposed': 17,
 'oracle-service-exposed': 18,
 'outdated-exchange-cumulative-update': 19,
 'pc-anywhere-service-exposed': 20,
 'pop3s-service-exposed': 21,
 'prometheus-service-exposed': 22,
 'qnap-device-compromised': 23,
 'qnap-device-exposed': 24,
 'recursive-dns-service-exposed': 25,
 'redis-service-exposed': 26,
 's7-service-exposed': 27,
 'snmp-service-exposed': 28,
 'telnet-service-exposed': 29,
 'unencrypted-im

In [13]:
# RISK OBJECT -> RISK TYPE

df = pd.read_csv('data/risk_object.csv')

severityCounts = df['severity'].value_counts() # critical (173), high (425), medium (30870), low (1444)

# host-not-present-in-vm-solution                            29331
# weak-auth-page                                               882
# login-page-missing-csp                                       770
# weak-tls-cipher                                              433
# ssh-service-exposed                                          415
# unencrypted-login-page                                       146
# smtp-service-exposed                                         135
# unencrypted-weak-auth-page                                   124
# eol-php-software                                             107
# ntp-service-exposed                                           73
# vulnerable-log4j-generic-cve-2021-44228                       62
# outdated-tls-version                                          59
# eol-nginx-software                                            59
# domain-expiring-in-30-days                                    54
# ftp-service-exposed                                           37
# aws-storage-bucket-exposed                                    28
# recursive-dns-service-exposed                                 27
# mysql-service-exposed                                         23
# eol-apache-httpd-software                                     20
# eol-microsoft-iis-software                                    19
# vnc-service-exposed                                           14
# ipp-service-exposed                                           13
# postgres-service-exposed                                      12
# unencrypted-imap-service                                      12
# amqp-service-exposed                                          11
# domain-expiring-in-7-days                                     11
# eol-openssl-software                                          10
# unencrypted-pop3-service                                       8
# rdp-service-exposed                                            3
# pop3s-service-exposed                                          3
# smb-service-exposed                                            2
# eol-eclipse-jetty-software                                     2
# unencrypted-cwmp-service                                       1
# fox-service-exposed                                            1
# vulnerable-log4j-unifi-network-appliance-cve-2021-44228        1
# ipmi-service-exposed                                           1
# mongodb-service-exposed                                        1
# gcp-storage-bucket-exposed                                     1
# expired-domain                                                 1
typeCounts = df['type'].value_counts()

df['first_computed_at'] = pd.to_datetime(df['first_computed_at'])
df['first_date'] = df['first_computed_at'].dt.strftime('%Y-%m-%d')

# first big threat 16586 on 8/3/2021
df['first_date'].value_counts()

minDate = min(df['first_computed_at'])
maxDate = max(df['first_computed_at'])

# Timeline: 07/28/2020 -> 02/17/2022
# What are the biggest risk types?
# How do risks change ['over time? in severity? 
# Details on Demand -> what are some of the risk details? -> context

# df[df['status'] == 'open']

df['type_id'] = df['type'].map(lambda x : riskMap[x])

prep_df = df[['id', 'status', 'first_date', 'type_id']]

prep_df.to_json('data/riskObjects.json', orient='records')

In [6]:
# RISK EVENTS
import pandas as pd
import pydash as _py
import json

df = pd.read_csv('data/risk_event.csv')

df['risk_type'].value_counts()

df['reason'].value_counts()

df['risk_id'].value_counts()

# update (59640), close(29958), open(2310)
df['op'].value_counts()

# Questions
# What is a lifecycle for a risk instance using these events?
# How long does it take for a risk event t

df['ts'] = pd.to_datetime(df['ts'])
df['ts'] = df['ts'].dt.strftime('%Y-%m-%d')

filtered_df = df[df['risk_id'] > 0]
filtered_df['risk_id'] = filtered_df['risk_id'].map(lambda x: int(x))

filtered_df = filtered_df[['event_id', 'risk_id', 'ts']]

eventGroups = _py.group_by(filtered_df.to_dict(orient="records"), 'risk_id')

with open('riskEvents.json', 'w') as outFile:
    json.dump(eventGroups, outFile)


  df = pd.read_csv('data/risk_event.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['risk_id'] = filtered_df['risk_id'].map(lambda x: int(x))
