# Generation of Mock CIS JSON files
This notebook takes the 4 CIS result files stored in S3 and uses them to generate mock CIS result files for every host of a given inventory, one file per host per week over a date range.


## Read the files in the bucket `s3a://test-data` and explode the rules 

In [2]:
%pyspark
import datetime
import json
from random import random

from jinja2 import Template
from pyspark.sql.functions import col, explode

In [3]:
%pyspark
# Load every CIS result file found in the bucket. multiLine=True lets a single
# JSON document span several lines.
data_json = spark.read.json("s3a://test-data/*", multiLine=True)

# Produce one row per rule: explode the `rules` array, lift the rule fields to
# top-level columns, then drop the array and the temporary exploded struct.
# `col` and `explode` come from pyspark.sql.functions (imported in the imports cell).
df_final = (
    data_json
    .withColumn("rules_explode", explode("rules"))
    .withColumn("rule_id", col("rules_explode.rule-id"))
    .withColumn("rule_title", col("rules_explode.rule-title"))
    .withColumn("rule_result", col("rules_explode.result"))
    .drop("rules", "rules_explode")
)

## Collect the benchmark parameters and the list of all tests (rules) per OS

In [5]:
%pyspark
# Benchmark ids of the four operating systems. Each id is defined once so the
# parameter lookup and the control lookup cannot drift apart.
BENCHMARK_ID_WINDOWS = "xccdf_org.cisecurity.benchmarks_benchmark_1.0.0_CIS_Microsoft_Windows_Server_2022_Benchmark"
BENCHMARK_ID_OL7 = "xccdf_org.cisecurity.benchmarks_benchmark_3.1.1_CIS_Oracle_Linux_7_Benchmark"
BENCHMARK_ID_OL8 = "xccdf_org.cisecurity.benchmarks_benchmark_2.0.0_CIS_Oracle_Linux_8_Benchmark"
BENCHMARK_ID_RHEL9 = "xccdf_org.cisecurity.benchmarks_benchmark_1.0.0_CIS_Red_Hat_Enterprise_Linux_9_Benchmark"


def _benchmark_params(benchmark_id):
    """Return one row of benchmark/profile metadata for `benchmark_id`.

    Raises:
        ValueError: if `df_final` holds no row for that benchmark. The original
            code failed with a bare IndexError on `collect()[0]` in that case.
    """
    rows = (
        df_final
        .filter(col("benchmark-id") == benchmark_id)
        .select("benchmark-id", "benchmark-title", "benchmark-version", "profile-id", "profile-title", "score")
        .limit(1)
        .collect()
    )
    if not rows:
        raise ValueError("No CIS result found for benchmark-id " + benchmark_id)
    return rows[0]


def _benchmark_controls(benchmark_id):
    """Return the (rule_id, rule_title) rows of every rule of `benchmark_id`."""
    return (
        df_final
        .filter(col("benchmark-id") == benchmark_id)
        .select("rule_id", "rule_title")
        .collect()
    )


param_windows = _benchmark_params(BENCHMARK_ID_WINDOWS)
param_ol7 = _benchmark_params(BENCHMARK_ID_OL7)
param_ol8 = _benchmark_params(BENCHMARK_ID_OL8)
param_rhel9 = _benchmark_params(BENCHMARK_ID_RHEL9)
controls_windows = _benchmark_controls(BENCHMARK_ID_WINDOWS)
controls_ol7 = _benchmark_controls(BENCHMARK_ID_OL7)
controls_ol8 = _benchmark_controls(BENCHMARK_ID_OL8)
controls_rhel9 = _benchmark_controls(BENCHMARK_ID_RHEL9)

In [6]:
%pyspark
# Sanity check: show the benchmark id stored in the collected Windows parameter row.
print(param_windows['benchmark-id'])

## Inventory
There are 4 inventories of assets, one per operating system. Each host has a `quality` parameter in the range [0, 1] that gives the probability that each of its controls passes (1 = 100% probability, 0.3 = 30% probability that a check passes).

In [8]:
%pyspark
# Windows hosts. `quality` is the probability (0-1) that a control passes.
# Each server appears exactly once: the report file name is built from the date
# and the server name, so a duplicated server (the original list had 's0006'
# twice) is generated twice and the second file overwrites the first.
inventory_windows = [
    {'server': 's0001', 'quality': 1},
    {'server': 's0002', 'quality': 1},
    {'server': 's0003', 'quality': 1},
    {'server': 's0004', 'quality': 1},
    {'server': 's0005', 'quality': 1},
    {'server': 's0006', 'quality': 1},
    {'server': 's0007', 'quality': 1},
    {'server': 's0008', 'quality': 1},
    {'server': 's0009', 'quality': 0.7},
]
# Oracle Linux 7 hosts, written as (server, quality) pairs and expanded into
# the {'server', 'quality'} dicts the generators read.
inventory_ol7 = [
    {'server': host_name, 'quality': pass_probability}
    for host_name, pass_probability in (
        ('s0011', 0.5),
        ('s0012', 1),
        ('s0013', 1),
        ('s0014', 1),
        ('s0015', 1),
        ('s0016', 1),
        ('s0017', 1),
        ('s0018', 0.9),
        ('s0019', 0.9),
    )
]
# Oracle Linux 8 hosts, written as (server, quality) pairs and expanded into
# the {'server', 'quality'} dicts the generators read.
inventory_ol8 = [
    {'server': host_name, 'quality': pass_probability}
    for host_name, pass_probability in (
        ('s0021', 1),
        ('s0022', 1),
        ('s0023', 1),
        ('s0024', 1),
        ('s0025', 1),
        ('s0026', 1),
        ('s0027', 1),
        ('s0028', 0.9),
        ('s0029', 0.9),
    )
]
# Red Hat Enterprise Linux 9 hosts, written as (server, quality) pairs and
# expanded into the {'server', 'quality'} dicts the generators read.
inventory_rhel9 = [
    {'server': host_name, 'quality': pass_probability}
    for host_name, pass_probability in (
        ('s0031', 0.4),
        ('s0032', 1),
        ('s0033', 1),
        ('s0034', 1),
        ('s0035', 1),
        ('s0036', 1),
        ('s0037', 1),
        ('s0038', 1),
        ('s0039', 1),
    )
]

## Definition of the quality function

In [10]:
%pyspark
def get_result_check(quality):
    """Randomly grade one control for a host of the given quality.

    Args:
        quality: Probability, between 0 and 1, that the control passes.

    Returns:
        'pass' with probability `quality`, otherwise 'fail'.
    """
    # random() returns a value in [0.0, 1.0). The strict comparison makes
    # quality == 0 always fail (with `<=` a draw of exactly 0.0 would pass),
    # while quality == 1 still always passes.
    return 'pass' if random() < quality else 'fail'

## Definition of the CIS Benchmark template in Jinja2

In [12]:
%pyspark
# Jinja2 template of one CIS result file, rendered once per host and date.
# Every placeholder is inserted between double quotes, so values containing
# quotes or backslashes must already be JSON-escaped by the caller. `score` is
# quoted as well and therefore ends up as a JSON string in the rendered text.
# `controls` is iterated to emit one object per rule, with a comma after every
# rule except the last so the array stays valid JSON.
template = Template('''
{
    "server": "{{ server }}",
    "date": "{{ date }}",
    "benchmark-id": "{{ benchmark_id }}",
    "benchmark-title": "{{ benchmark_title }}",
    "benchmark-version": "{{ benchmark_version }}",
    "profile-id": "{{ profile_id }}",
    "profile-title": "{{ profile_title }}",
    "score": "{{ score }}",
    "rules": [
        {% for control in controls %}
        {
            "rule-id": "{{ control.rule_id }}",
            "rule-title": "{{ control.rule_title }}",
            "result": "{{ control.result}}"
        }{% if not loop.last %},{% endif %}
        {% endfor %}
    ]
}
''')

## Generate windows files

In [14]:
%pyspark
def generate_windows_files_on_date(date):
    """Write one mock CIS report per host of `inventory_windows` for `date`.

    Every control in `controls_windows` is graded with `get_result_check` using
    the host's `quality`; the score is the percentage of passed controls. The
    report is rendered with the Jinja2 `template`, parsed back to validate it,
    and saved as `/data-transfer/cis/win/<date>_<server>.json`.

    Args:
        date: Report date as a string; it is written into the report and used
            as the file name prefix.
    """
    def _esc(value):
        # The template inserts values between double quotes, so strings must be
        # JSON-escaped first. json.dumps handles quotes, backslashes and control
        # characters; the surrounding quotes it adds are stripped. Non-string
        # values are passed through and rendered by Jinja2 as before.
        return json.dumps(value)[1:-1] if isinstance(value, str) else value

    for item in inventory_windows:
        server = item['server']
        graded_controls = [
            {
                'rule_id': _esc(control['rule_id']),
                'rule_title': _esc(control['rule_title']),
                'result': get_result_check(item['quality']),
            }
            for control in controls_windows
        ]
        passed = sum(1 for graded in graded_controls if graded['result'] == 'pass')
        # Every control is either 'pass' or 'fail', so the total is the list
        # length. An empty control list scores 0 instead of dividing by zero.
        score = (passed / len(graded_controls)) * 100 if graded_controls else 0.0

        rendered_content = template.render(
            server=_esc(server),
            date=_esc(date),
            benchmark_id=_esc(param_windows['benchmark-id']),
            benchmark_title=_esc(param_windows['benchmark-title']),
            benchmark_version=_esc(param_windows['benchmark-version']),
            profile_id=_esc(param_windows['profile-id']),
            profile_title=_esc(param_windows['profile-title']),
            score=score,
            controls=graded_controls,
        )
        # Parsing the rendered text fails fast if the template produced invalid JSON.
        json_content = json.loads(rendered_content)
        filename = date + "_" + server + ".json"
        with open("/data-transfer/cis/win/" + filename, 'w') as json_file:
            json.dump(json_content, json_file, indent=4)
        print("(WIN) File " + filename + " generated!")

## Generate Oracle Linux 7 files

In [16]:
%pyspark
def generate_ol7_files_on_date(date):
    """Write one mock CIS report per host of `inventory_ol7` for `date`.

    Every control in `controls_ol7` is graded with `get_result_check` using the
    host's `quality`; the score is the percentage of passed controls. The
    report is rendered with the Jinja2 `template`, parsed back to validate it,
    and saved as `/data-transfer/cis/ol7/<date>_<server>.json`.

    Args:
        date: Report date as a string; it is written into the report and used
            as the file name prefix.
    """
    def _esc(value):
        # The template inserts values between double quotes, so strings must be
        # JSON-escaped first. json.dumps handles quotes, backslashes and control
        # characters; the surrounding quotes it adds are stripped. Non-string
        # values are passed through and rendered by Jinja2 as before.
        return json.dumps(value)[1:-1] if isinstance(value, str) else value

    for item in inventory_ol7:
        server = item['server']
        graded_controls = [
            {
                'rule_id': _esc(control['rule_id']),
                'rule_title': _esc(control['rule_title']),
                'result': get_result_check(item['quality']),
            }
            for control in controls_ol7
        ]
        passed = sum(1 for graded in graded_controls if graded['result'] == 'pass')
        # Every control is either 'pass' or 'fail', so the total is the list
        # length. An empty control list scores 0 instead of dividing by zero.
        score = (passed / len(graded_controls)) * 100 if graded_controls else 0.0

        rendered_content = template.render(
            server=_esc(server),
            date=_esc(date),
            benchmark_id=_esc(param_ol7['benchmark-id']),
            benchmark_title=_esc(param_ol7['benchmark-title']),
            benchmark_version=_esc(param_ol7['benchmark-version']),
            profile_id=_esc(param_ol7['profile-id']),
            profile_title=_esc(param_ol7['profile-title']),
            score=score,
            controls=graded_controls,
        )
        # Parsing the rendered text fails fast if the template produced invalid JSON.
        json_content = json.loads(rendered_content)
        filename = date + "_" + server + ".json"
        with open("/data-transfer/cis/ol7/" + filename, 'w') as json_file:
            json.dump(json_content, json_file, indent=4)
        print("(OL7) File " + filename + " generated!")

## Generate Oracle Linux 8 Files

In [18]:
%pyspark
def generate_ol8_files_on_date(date):
    """Write one mock CIS report per host of `inventory_ol8` for `date`.

    Every control in `controls_ol8` is graded with `get_result_check` using the
    host's `quality`; the score is the percentage of passed controls. The
    report is rendered with the Jinja2 `template`, parsed back to validate it,
    and saved as `/data-transfer/cis/ol8/<date>_<server>.json`.

    Args:
        date: Report date as a string; it is written into the report and used
            as the file name prefix.
    """
    def _esc(value):
        # The template inserts values between double quotes, so strings must be
        # JSON-escaped first. json.dumps handles quotes, backslashes and control
        # characters; the surrounding quotes it adds are stripped. Non-string
        # values are passed through and rendered by Jinja2 as before.
        return json.dumps(value)[1:-1] if isinstance(value, str) else value

    for item in inventory_ol8:
        server = item['server']
        graded_controls = [
            {
                'rule_id': _esc(control['rule_id']),
                'rule_title': _esc(control['rule_title']),
                'result': get_result_check(item['quality']),
            }
            for control in controls_ol8
        ]
        passed = sum(1 for graded in graded_controls if graded['result'] == 'pass')
        # Every control is either 'pass' or 'fail', so the total is the list
        # length. An empty control list scores 0 instead of dividing by zero.
        score = (passed / len(graded_controls)) * 100 if graded_controls else 0.0

        rendered_content = template.render(
            server=_esc(server),
            date=_esc(date),
            benchmark_id=_esc(param_ol8['benchmark-id']),
            benchmark_title=_esc(param_ol8['benchmark-title']),
            benchmark_version=_esc(param_ol8['benchmark-version']),
            profile_id=_esc(param_ol8['profile-id']),
            profile_title=_esc(param_ol8['profile-title']),
            score=score,
            controls=graded_controls,
        )
        # Parsing the rendered text fails fast if the template produced invalid JSON.
        json_content = json.loads(rendered_content)
        filename = date + "_" + server + ".json"
        with open("/data-transfer/cis/ol8/" + filename, 'w') as json_file:
            json.dump(json_content, json_file, indent=4)
        # The original logged "(OL7)" here, a copy-paste slip from the OL7 generator.
        print("(OL8) File " + filename + " generated!")

## Generate Red Hat Enterprise Linux 9 Files

In [20]:
%pyspark
def generate_rhel9_files_on_date(date):
    """Write one mock CIS report per host of `inventory_rhel9` for `date`.

    Every control in `controls_rhel9` is graded with `get_result_check` using
    the host's `quality`; the score is the percentage of passed controls. The
    report is rendered with the Jinja2 `template`, parsed back to validate it,
    and saved as `/data-transfer/cis/rhel9/<date>_<server>.json`.

    Args:
        date: Report date as a string; it is written into the report and used
            as the file name prefix.
    """
    def _esc(value):
        # The template inserts values between double quotes, so strings must be
        # JSON-escaped first. json.dumps handles quotes, backslashes and control
        # characters; the surrounding quotes it adds are stripped. Non-string
        # values are passed through and rendered by Jinja2 as before.
        return json.dumps(value)[1:-1] if isinstance(value, str) else value

    for item in inventory_rhel9:
        server = item['server']
        graded_controls = [
            {
                'rule_id': _esc(control['rule_id']),
                'rule_title': _esc(control['rule_title']),
                'result': get_result_check(item['quality']),
            }
            for control in controls_rhel9
        ]
        passed = sum(1 for graded in graded_controls if graded['result'] == 'pass')
        # Every control is either 'pass' or 'fail', so the total is the list
        # length. An empty control list scores 0 instead of dividing by zero.
        score = (passed / len(graded_controls)) * 100 if graded_controls else 0.0

        rendered_content = template.render(
            server=_esc(server),
            date=_esc(date),
            benchmark_id=_esc(param_rhel9['benchmark-id']),
            benchmark_title=_esc(param_rhel9['benchmark-title']),
            benchmark_version=_esc(param_rhel9['benchmark-version']),
            profile_id=_esc(param_rhel9['profile-id']),
            profile_title=_esc(param_rhel9['profile-title']),
            score=score,
            controls=graded_controls,
        )
        # Parsing the rendered text fails fast if the template produced invalid JSON.
        json_content = json.loads(rendered_content)
        filename = date + "_" + server + ".json"
        with open("/data-transfer/cis/rhel9/" + filename, 'w') as json_file:
            json.dump(json_content, json_file, indent=4)
        # The original logged "(OL7)" here, a copy-paste slip from the OL7 generator.
        print("(RHEL9) File " + filename + " generated!")

## Generation of the mock files
This script will generate one report per week per inventory asset for the given dates

In [22]:
%pyspark
# First and last day (inclusive) of the period to generate reports for.
start_date = datetime.date(2023, 1, 1)
end_date = datetime.date(2023, 9, 1)

# Collect one date per week, starting at start_date and never going past end_date.
one_week = datetime.timedelta(days=7)
date_range = []
current_date = start_date
while not current_date > end_date:
    date_range += [current_date]
    current_date = current_date + one_week

# For each weekly date, write the reports of all four inventories in a fixed order.
for date in date_range:
    date_str = date.strftime("%Y-%m-%d")
    print("Generating for date: " + date_str)
    for generate_files in (
        generate_windows_files_on_date,
        generate_ol7_files_on_date,
        generate_ol8_files_on_date,
        generate_rhel9_files_on_date,
    ):
        generate_files(date_str)

In [23]:

%pyspark
