In [1]:
import json
from datetime import datetime, timezone

# Investigation window (inclusive at both ends) and the client IP to isolate.
start_timestamp = "2025-06-01T12:00:00Z"
end_timestamp = "2025-06-01T13:00:00Z"
clientIp_filter = "51.145.5.6"

jsonl_file = 'data/combined.jsonl'


def _parse_timestamp(value):
    """Convert an ISO 8601 timestamp string into a timezone-aware ``datetime``.

    Accepts a trailing ``Z`` and any number of fractional-second digits
    (digits beyond microsecond precision are truncated), so both
    ``2025-06-01T12:00:00Z`` and ``2025-06-01T12:00:01.0000000Z`` parse.

    Args:
        value: Timestamp text.

    Returns:
        An aware ``datetime``. Values without an explicit offset are treated
        as UTC -- NOTE(review): confirm the log source always emits UTC.

    Raises:
        ValueError: If ``value`` is not a string or is not a valid timestamp.
    """
    if not isinstance(value, str):
        raise ValueError(f"timestamp must be a string, got {type(value).__name__}")
    text = value.strip()
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    if "." in text:
        # Normalise the fraction to exactly six digits so that
        # datetime.fromisoformat accepts it on every Python 3.7+ release.
        head, tail = text.split(".", 1)
        digit_count = len(tail) - len(tail.lstrip("0123456789"))
        fraction = tail[:digit_count][:6].ljust(6, "0")
        text = f"{head}.{fraction}{tail[digit_count:]}"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


# Compare real datetimes, not raw strings: the bounds have no fractional
# seconds while the log timestamps do, and '.' sorts before 'Z', so a plain
# string comparison drops rows at exactly the start second and keeps rows
# up to a full second past the end bound.
window_start = _parse_timestamp(start_timestamp)
window_end = _parse_timestamp(end_timestamp)

filtered_rows = []
unparseable_timestamps = 0
with open(jsonl_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        obj = json.loads(line)
        # "properties" may be absent or explicitly null; treat both as empty.
        props = obj.get("properties") or {}
        clientIp = props.get("clientIp")
        if clientIp != clientIp_filter:
            continue  # cheap check first; skip timestamp parsing for other clients
        ts = obj.get("time")
        if not ts:
            continue
        try:
            moment = _parse_timestamp(ts)
        except ValueError:
            # Keep going, but report below so dropped rows are never silent.
            unparseable_timestamps += 1
            continue
        if window_start <= moment <= window_end:
            filtered_rows.append(obj)

print(f"Filtered {len(filtered_rows)} rows for client IP {clientIp_filter} between {start_timestamp} and {end_timestamp}")
if unparseable_timestamps:
    print(f"Warning: skipped {unparseable_timestamps} rows with unparseable timestamps")

Filtered 793 rows for client IP 51.145.5.6 between 2025-06-01T12:00:00Z and 2025-06-01T13:00:00Z


In [18]:
from collections import Counter

# Build one label per request in the form "<requestUri> (<httpStatusCode>)"
# so the same URI answered with different statuses is tallied separately.
request_uris = []

for entry in filtered_rows:
    fields = entry.get("properties", {})
    uri_value = fields.get("requestUri")
    if uri_value is None:
        # Rows without a request URI are left out of the tally.
        continue
    status = fields.get("httpStatusCode")
    request_uris.append(uri_value + f" ({status})")

# Tally how many times each labelled URI occurs.
uri_counts = Counter(request_uris)

# Report every distinct label, most frequent first.
print("Distinct request URIs and their counts:")
for labelled_uri, hits in uri_counts.most_common():
    print(f"{labelled_uri}: {hits}")

Distinct request URIs and their counts:
https://reply-jury-summons.service.gov.uk:443/assets/images/govuk-crest.svg (200): 3418
https://reply-jury-summons.service.gov.uk:443/js/govuk/govuk-frontend.min.js (200): 1894
https://reply-jury-summons.service.gov.uk:443/js/jquery.min.js (200): 1894
https://reply-jury-summons.service.gov.uk:443/assets/fonts/bold-b542beb274-v2.woff2 (200): 1894
https://reply-jury-summons.service.gov.uk:443/assets/fonts/light-94a07e06a1-v2.woff2 (200): 1894
https://reply-jury-summons.service.gov.uk:443/ (200): 1893
https://reply-jury-summons.service.gov.uk:443/js/cookies.js (200): 1893
https://reply-jury-summons.service.gov.uk:443/js/html5shiv.min.js (200): 1893
https://reply-jury-summons.service.gov.uk:443/js/respond.min.js (200): 1893
https://reply-jury-summons.service.gov.uk:443/css/style.css (200): 1893
https://reply-jury-summons.service.gov.uk:443/assets/images/favicon.ico (200): 1555
https://reply-jury-summons.service.gov.uk:443/assets/rebrand/images/favico

In [7]:
print("Sample of filtered data:")

# Properties shown for each sampled row, in display order; the top-level
# "time" value is emitted first.
sample_fields = (
    "httpMethod",
    "userAgent",
    "clientIp",
    "clientCountry",
    "timeTaken",
    "httpStatusCode",
)

# Only the first two rows are shown to keep the output short.
for entry in filtered_rows[:2]:
    details = entry.get("properties", {})
    summary = {"time": entry.get("time")}
    summary.update((field, details.get(field)) for field in sample_fields)
    print(json.dumps(summary, indent=2))

Sample of filtered data:
{
  "time": "2025-06-01T12:00:01.0000000Z",
  "httpMethod": "GET",
  "userAgent": "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Mobile/15E148 Safari/604.1",
  "clientIp": "86.147.16.207",
  "clientCountry": "United Kingdom",
  "timeTaken": "0.058",
  "httpStatusCode": "200"
}
{
  "time": "2025-06-01T12:00:01.0000000Z",
  "httpMethod": "GET",
  "userAgent": "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Mobile/15E148 Safari/604.1",
  "clientIp": "86.147.16.207",
  "clientCountry": "United Kingdom",
  "timeTaken": "0.008",
  "httpStatusCode": "200"
}
