In [1]:
import json
from datetime import datetime
from pathlib import Path
from difflib import SequenceMatcher

In [2]:
LOG_FILE = Path("inferred_log.jsonl")

# Load the JSON Lines log: every non-empty line is parsed as one JSON value
# and collected, in file order, into `records`.
records = []
with open(LOG_FILE, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # A blank line (e.g. an extra newline at the end of the file or left
        # behind by a manual edit) is not valid JSON and would make
        # json.loads raise JSONDecodeError, so skip it instead of aborting.
        if not line:
            continue
        records.append(json.loads(line))

print("Loaded inferred entries:", len(records))
# Last expression of the cell: preview of the first three entries.
records[:3]

Loaded inferred entries: 5


[{'timestamp': '2026-02-17_12-01-27',
  'task': 'User is tracking activity in a Python Jupyter notebook.'},
 {'timestamp': '2026-02-17_12-02-45',
  'task': 'The user is reviewing Tesseract OCR documentation.'},
 {'timestamp': '2026-02-17_12-04-58',
  'task': 'User is viewing a GitHub repository for an auto-timesheet system.'}]

In [3]:
def similarity(a, b):
    """Return how alike two strings are, ignoring letter case.

    Both inputs are lowercased and compared with ``difflib.SequenceMatcher``
    (no junk heuristic function supplied); the matcher's ``ratio()`` is
    returned unchanged.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The ``SequenceMatcher.ratio()`` float for the lowercased inputs.
    """
    left = a.lower()
    right = b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

In [4]:
SIM_THRESHOLD = 0.65  # adjust if needed

# Split the log into sessions. Walking the entries pairwise, a new session
# starts whenever the task text of an entry is less than SIM_THRESHOLD similar
# to the task text of the entry directly before it. Each session keeps the
# task text of its first entry, the timestamp of its first entry as "start"
# and the timestamp of its last entry as "end".
# NOTE(review): a session consisting of a single entry therefore has
# start == end (zero duration) — confirm this is the intended accounting.
sessions = []

if records:
    stamp_format = "%Y-%m-%d_%H-%M-%S"

    opening = records[0]
    start_time = datetime.strptime(opening["timestamp"], stamp_format)
    current_task = opening["task"]

    # zip pairs every entry with its successor: (r0, r1), (r1, r2), ...
    for prev, curr in zip(records, records[1:]):
        prev_time = datetime.strptime(prev["timestamp"], stamp_format)
        curr_time = datetime.strptime(curr["timestamp"], stamp_format)

        sim = similarity(prev["task"], curr["task"])
        if sim >= SIM_THRESHOLD:
            # Similar enough: still the same session, keep scanning.
            continue

        # Task changed: close the running session at the previous entry
        # and open a new one at the current entry.
        sessions.append({"start": start_time, "end": prev_time, "task": current_task})
        start_time, current_task = curr_time, curr["task"]

    # Whatever session is still open ends at the very last log entry.
    closing_time = datetime.strptime(records[-1]["timestamp"], stamp_format)
    sessions.append({"start": start_time, "end": closing_time, "task": current_task})

print("Sessions created:", len(sessions))


Sessions created: 4


In [5]:
# Print a human-readable report: one header line with the HH:MM start/end and
# the duration in minutes, then the task text, then a 60-character rule.
for session in sessions:
    began = session["start"]
    ended = session["end"]
    elapsed = ended - began
    duration = elapsed.total_seconds() / 60

    began_label = began.strftime('%H:%M')
    ended_label = ended.strftime('%H:%M')

    print(f"[{began_label} – {ended_label}]  ({duration:.1f} min)")
    print(session["task"])
    print("-" * 60)


[12:01 – 12:01]  (0.0 min)
User is tracking activity in a Python Jupyter notebook.
------------------------------------------------------------
[12:02 – 12:02]  (0.0 min)
The user is reviewing Tesseract OCR documentation.
------------------------------------------------------------
[12:04 – 12:04]  (0.0 min)
User is viewing a GitHub repository for an auto-timesheet system.
------------------------------------------------------------
[12:07 – 12:08]  (1.2 min)
The user is setting up Tesseract OCR software.
------------------------------------------------------------


In [6]:
OUTPUT_FILE = Path("grouped_sessions.json")

# datetime objects are not JSON-serializable, so each session's start/end is
# rendered as an "HH:MM" string; the task text is copied through unchanged.
serializable = [
    {
        "start": entry["start"].strftime("%H:%M"),
        "end": entry["end"].strftime("%H:%M"),
        "task": entry["task"],
    }
    for entry in sessions
]

# Write the list as indented JSON, replacing any existing file.
with OUTPUT_FILE.open("w", encoding="utf-8") as out_handle:
    json.dump(serializable, out_handle, indent=2)

print("Saved → grouped_sessions.json")


Saved → grouped_sessions.json
