#### Run the cell below to set variables used across the notebook.

In [None]:
# Path of the JSON Lines file that the loading cells open.
jsonl_file = "data/combined_filtered.jsonl"

# Inclusive bounds of the time window used by the filtered-load cell.
# They are compared as plain strings against each record's "time" value.
start_timestamp = "2025-06-01T12:00:00Z"
end_timestamp = "2025-06-01T18:00:00Z"

# Only records whose properties.clientIp equals this value are kept by the
# filtered-load cell.
clientIp_filter = "51.145.5.6"

#### Run the cell below to load the data based on the start and end timestamps, and filter by client IP.

In [None]:
import json

# Load only the records whose properties.clientIp equals `clientIp_filter` and
# whose "time" lies inside the inclusive [start_timestamp, end_timestamp] window.
#
# NOTE(review): the window test compares timestamps as plain strings. That
# matches chronological order only if every "time" value has exactly the same
# layout as the bounds (e.g. "2025-06-01T12:00:00Z"). Values carrying fractional
# seconds or a different UTC offset notation can sort incorrectly right at the
# window edges -- confirm the format used in the data.
rows = []
with open(jsonl_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        if not line.strip():
            # Blank lines (such as a trailing empty line) are not JSON
            # documents; json.loads would raise on them, so skip them.
            continue
        obj = json.loads(line)
        # `or {}` also covers records where "properties" is present but null.
        props = obj.get("properties") or {}
        clientIp = props.get("clientIp")
        ts = obj.get("time")
        # Records without a "time" value are never selected.
        if clientIp == clientIp_filter and ts and start_timestamp <= ts <= end_timestamp:
            rows.append(obj)

print(f"Filtered {len(rows)} rows for client IP {clientIp_filter} between {start_timestamp} and {end_timestamp}")

#### Alternatively, run the cell below to load the whole dataset (this replaces any `rows` loaded by the filtered cell above).

In [None]:
import json

# Load every record of the JSON Lines file into `rows`, one parsed object per
# non-blank line.
rows = []
with open(jsonl_file, 'r', encoding='utf-8') as infile:
    # Whitespace-only lines (such as a trailing empty line) are skipped because
    # json.loads raises on them.
    rows.extend(json.loads(line) for line in infile if line.strip())

print(f"Loaded {len(rows)} rows from {jsonl_file}")

#### The cells below will process the data and output _hopefully_ useful information.

In [None]:
from collections import Counter

# Tally requests per "<requestUri> (<httpStatusCode>)" label in a single pass
# over `rows`. Rows without a requestUri are ignored.
# `or {}` keeps rows whose "properties" is missing or null from raising.
all_props = ((row.get("properties") or {}) for row in rows)
uri_counts = Counter(
    props["requestUri"] + f" ({props.get('httpStatusCode')})"
    for props in all_props
    if props.get("requestUri") is not None
)

# The Counter keys are exactly the distinct labels, so the set is derived from
# them instead of repeating the label expression in a second scan of `rows`.
distinct_request_uris = set(uri_counts)

# Print the number of distinct request URIs
print(f"Number of distinct request URIs: {len(distinct_request_uris)}")

# Print the distinct URIs and their counts, most frequent first
print("Distinct request URIs and their counts:")
for uri, count in uri_counts.most_common():
    print(f"{uri}: {count}")

In [None]:
from collections import Counter

# Tally requests per client IP in a single pass over `rows`.
# Rows without a clientIp are ignored; `or {}` keeps rows whose "properties"
# is missing or null from raising.
client_ips = ((row.get("properties") or {}).get("clientIp") for row in rows)
clientIp_counts = Counter(ip for ip in client_ips if ip is not None)

# The Counter keys are exactly the distinct client IPs, so no second scan of
# `rows` is needed to build the set.
distinct_client_ips = set(clientIp_counts)

# Print the number of distinct Client IPs
print(f"Number of distinct Client IPs: {len(distinct_client_ips)}")

# Print the distinct Client IPs and their counts, most frequent first
print("Distinct Client IPs and their counts:")
for clientIp, count in clientIp_counts.most_common():
    print(f"{clientIp}: {count}")

In [None]:
from collections import Counter

# Tally requests per user agent in a single pass over `rows`.
# Rows without a userAgent are ignored; `or {}` keeps rows whose "properties"
# is missing or null from raising.
user_agents = ((row.get("properties") or {}).get("userAgent") for row in rows)
userAgent_counts = Counter(agent for agent in user_agents if agent is not None)

# The Counter keys are exactly the distinct user agents, so no second scan of
# `rows` is needed to build the set.
distinct_user_agent = set(userAgent_counts)

# Print the number of distinct User Agents
print(f"Number of distinct User Agents: {len(distinct_user_agent)}")

# Print the distinct User Agents and their counts, most frequent first
print("Distinct User Agents and their counts:")
for userAgent, count in userAgent_counts.most_common():
    print(f"{userAgent}: {count}")

In [None]:
# Fields copied from each record's "properties" object, in output order.
sample_fields = (
    "httpMethod",
    "userAgent",
    "clientIp",
    "clientCountry",
    "timeTaken",
    "httpStatusCode",
)

print("Sample of filtered data:")
# Show a compact summary of the first two loaded records.
for row in rows[:2]:
    props = row.get("properties", {})
    # "time" sits at the top level of the record; the rest come from properties.
    summary = {"time": row.get("time")}
    summary.update((field, props.get(field)) for field in sample_fields)
    print(json.dumps(summary, indent=2))