In [12]:
import pandas as pd
# Location of the raw HDFS sample log, relative to the notebook's working directory.
log_path = "../data/raw/HDFS_2k.log"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the notebook reads the same text on every machine. Bytes that
# cannot be decoded are replaced with U+FFFD rather than aborting the load.
with open(log_path, "r", encoding="utf-8", errors="replace") as f:
    lines = f.readlines()  # one string per log line, trailing "\n" kept

# Number of raw lines read.
len(lines)

2000

In [13]:
# Show one raw line (index 10) to inspect the field layout of a log entry.
lines[10]

'081109 204722 567 INFO dfs.DataNode$PacketResponder: Received block blk_5402003568334525940 of size 67108864 from /10.251.214.112\n'

### Regex for HDFS Logs

In [14]:
import re

# One named group per whitespace-separated field of an HDFS log line; the
# pieces are concatenated, in order, into a single pattern.
# NOTE(review): the third field is captured as `ms`, but it may be a
# process/thread id rather than milliseconds (LogHub describes the HDFS layout
# as "Date Time Pid Level Component: Content") - confirm before treating it
# as sub-second time.
_log_fields = [
    r'(?P<date>\d{6})\s+',        # six digits
    r'(?P<time>\d{6})\s+',        # six digits
    r'(?P<ms>\d+)\s+',            # one or more digits
    r'(?P<level>\w+)\s+',         # e.g. INFO
    r'(?P<component>[^:]+):\s+',  # everything up to the first colon
    r'(?P<message>.*)',           # rest of the line (stops before "\n")
]

log_pattern = re.compile("".join(_log_fields))

### Parse Again (Slowly)

In [15]:
# Apply `log_pattern` to every raw line. Matching lines contribute a dict of
# their named fields; non-blank lines that do not match are collected in
# `unparsed_lines` instead of being dropped silently, so a schema mismatch is
# visible rather than hidden behind a smaller row count.
parsed_logs = []
unparsed_lines = []

for line in lines:
    match = log_pattern.match(line)
    if match:
        parsed_logs.append(match.groupdict())
    elif line.strip():
        unparsed_lines.append(line)

# Only report when something was actually skipped.
if unparsed_lines:
    print(
        f"{len(unparsed_lines)} line(s) did not match log_pattern; "
        f"first: {unparsed_lines[0]!r}"
    )

# Number of successfully parsed lines.
len(parsed_logs)

2000

In [16]:
import pandas as pd

# Build the event table from the parsed field dicts. The column list is taken
# from the regex's named groups so the frame has the expected columns even
# when `parsed_logs` is empty (otherwise later column lookups would raise
# KeyError on a column-less frame).
df = pd.DataFrame(parsed_logs, columns=list(log_pattern.groupindex))
df.head()

Unnamed: 0,date,time,ms,level,component,message
0,81109,203615,148,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_38865049064139...
1,81109,203807,222,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_-6952295868487...
2,81109,204005,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...
3,81109,204015,308,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_82291938032499...
4,81109,204106,329,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_-6670958622368...


### Build a Proper Timestamp (One Last Fix)

In [17]:
# Join the `date` and `time` strings and parse the combined text in one pass
# using an explicit format (two-digit year, month, day, hour, minute, second).
datetime_text = df["date"] + df["time"]
df["timestamp"] = pd.to_datetime(datetime_text, format="%y%m%d%H%M%S")

# Confirm the new column is a real datetime dtype.
df.dtypes

date                 object
time                 object
ms                   object
level                object
component            object
message              object
timestamp    datetime64[ns]
dtype: object

### Observations

Real-world event data rarely matches initial assumptions.
Inspecting raw data before parsing is critical.
Event schemas must be derived, not assumed.
Once parsed, system logs behave like event-driven analytics data.
Foundations from earlier days made this dataset manageable.

### Sort Events Properly

In [18]:
# Order rows by component first, then chronologically within each component.
sort_keys = ["component", "timestamp"]
df = df.sort_values(sort_keys)
df.head()

Unnamed: 0,date,time,ms,level,component,message,timestamp
28,81109,205931,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_-49809165198942...,2008-11-09 20:59:31
69,81109,213436,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_-28277162389727...,2008-11-09 21:34:36
175,81110,2337,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_-15479543530655...,2008-11-10 00:23:37
196,81110,11237,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_699619438987858...,2008-11-10 01:12:37
345,81110,83453,13,INFO,dfs.DataBlockScanner,Verification succeeded for blk_314136351752080...,2008-11-10 08:34:53


### Calculate Time Gaps Between Events

In [19]:
# For each event, look up the previous event's timestamp within the same
# component (first event of a component gets NaT), then express the
# difference in seconds. Uses the current row order of `df`.
timestamps_by_component = df.groupby("component")["timestamp"]
df["prev_timestamp"] = timestamps_by_component.shift(1)

elapsed = df["timestamp"] - df["prev_timestamp"]
df["gap_seconds"] = elapsed.dt.total_seconds()

In [20]:
# Preview the gap calculation for the first ten rows.
df.loc[:, ["component", "timestamp", "prev_timestamp", "gap_seconds"]].head(10)

Unnamed: 0,component,timestamp,prev_timestamp,gap_seconds
28,dfs.DataBlockScanner,2008-11-09 20:59:31,NaT,
69,dfs.DataBlockScanner,2008-11-09 21:34:36,2008-11-09 20:59:31,2105.0
175,dfs.DataBlockScanner,2008-11-10 00:23:37,2008-11-09 21:34:36,10141.0
196,dfs.DataBlockScanner,2008-11-10 01:12:37,2008-11-10 00:23:37,2940.0
345,dfs.DataBlockScanner,2008-11-10 08:34:53,2008-11-10 01:12:37,26536.0
346,dfs.DataBlockScanner,2008-11-10 08:50:42,2008-11-10 08:34:53,949.0
347,dfs.DataBlockScanner,2008-11-10 08:59:33,2008-11-10 08:50:42,531.0
357,dfs.DataBlockScanner,2008-11-10 09:36:43,2008-11-10 08:59:33,2230.0
568,dfs.DataBlockScanner,2008-11-10 11:21:55,2008-11-10 09:36:43,6312.0
645,dfs.DataBlockScanner,2008-11-10 12:17:58,2008-11-10 11:21:55,3363.0


### Define Session Breaks

In [21]:
# Threshold compared against `gap_seconds`, i.e. expressed in seconds.
SESSION_GAP = 300

# A row opens a new session when it has no gap value (NaN) or when the gap
# exceeds the threshold.
gap = df["gap_seconds"]
df["new_session"] = gap.isna() | gap.gt(SESSION_GAP)

### Assign Session IDs

In [22]:
# Running count of session starts within each component: every True in
# `new_session` bumps the id by one.
session_flags = df.groupby("component")["new_session"]
df["session_id"] = session_flags.cumsum()

In [23]:
# Preview the session assignment for the first fifteen rows.
df.loc[:, ["component", "timestamp", "gap_seconds", "session_id"]].head(15)

Unnamed: 0,component,timestamp,gap_seconds,session_id
28,dfs.DataBlockScanner,2008-11-09 20:59:31,,1
69,dfs.DataBlockScanner,2008-11-09 21:34:36,2105.0,2
175,dfs.DataBlockScanner,2008-11-10 00:23:37,10141.0,3
196,dfs.DataBlockScanner,2008-11-10 01:12:37,2940.0,4
345,dfs.DataBlockScanner,2008-11-10 08:34:53,26536.0,5
346,dfs.DataBlockScanner,2008-11-10 08:50:42,949.0,6
347,dfs.DataBlockScanner,2008-11-10 08:59:33,531.0,7
357,dfs.DataBlockScanner,2008-11-10 09:36:43,2230.0,8
568,dfs.DataBlockScanner,2008-11-10 11:21:55,6312.0,9
645,dfs.DataBlockScanner,2008-11-10 12:17:58,3363.0,10


### Session-Level Metrics

In [24]:
# One row per (component, session_id): first and last timestamp plus the
# number of non-null messages in the session.
session_groups = df.groupby(["component", "session_id"])
session_summary = session_groups.agg(
    session_start=pd.NamedAgg(column="timestamp", aggfunc="min"),
    session_end=pd.NamedAgg(column="timestamp", aggfunc="max"),
    event_count=pd.NamedAgg(column="message", aggfunc="count"),
).reset_index()

# Session length in seconds (0.0 for single-event sessions, where start == end).
session_span = session_summary["session_end"] - session_summary["session_start"]
session_summary["session_duration_sec"] = session_span.dt.total_seconds()

session_summary.head()

Unnamed: 0,component,session_id,session_start,session_end,event_count,session_duration_sec
0,dfs.DataBlockScanner,1,2008-11-09 20:59:31,2008-11-09 20:59:31,1,0.0
1,dfs.DataBlockScanner,2,2008-11-09 21:34:36,2008-11-09 21:34:36,1,0.0
2,dfs.DataBlockScanner,3,2008-11-10 00:23:37,2008-11-10 00:23:37,1,0.0
3,dfs.DataBlockScanner,4,2008-11-10 01:12:37,2008-11-10 01:12:37,1,0.0
4,dfs.DataBlockScanner,5,2008-11-10 08:34:53,2008-11-10 08:34:53,1,0.0


### Event Sequences (Mini Funnel)

In [25]:
# 1-based position of each event inside its (component, session_id) group,
# following the current row order of `df`.
session_keys = ["component", "session_id"]
df["event_order"] = df.groupby(session_keys).cumcount().add(1)

In [26]:
# Show the ordered events of session 1 for whichever component appears in
# the first row of `df`.
first_component = df["component"].iloc[0]
in_first_session = df["component"].eq(first_component) & df["session_id"].eq(1)
df.loc[in_first_session, ["event_order", "level", "message"]]

Unnamed: 0,event_order,level,message
28,1,INFO,Verification succeeded for blk_-49809165198942...


## Day 7 – Key Takeaways

Sessions convert raw events into meaningful behavior

Time gaps define behavioral boundaries

Components behave like users in product analytics

Sequence analysis reveals system workflows

Interpretation matters more than code