# Authentication Telemetry Ingestion

## Phase
Phase 2 â€” Multi-Source Telemetry Expansion

## Objective
Normalize authentication/login event data into SOC-compatible telemetry format.


In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import random 

PROJECT_ROOT = Path(r"D:\soc-dashboard-suite-main\soc-dashboard-suite-main")

RAW_AUTH_DIR = PROJECT_ROOT / "data" / "raw" / "auth"
NORMALIZED_AUTH_DIR = PROJECT_ROOT / "data" / "normalized" / "auth"

NORMALIZED_AUTH_DIR.mkdir(parents=True, exist_ok=True)


users = [f"user{i}" for i in range(1, 51)]
roles = ["user"] * 40 + ["admin"] * 5 + ["executive"] * 5
countries = ["US", "IN", "UK", "DE", "SG"]

start_time = datetime(2025, 1, 1)

rows = []

for i in range(5000):
    user = random.choice(users)
    role = random.choice(roles)
    time = start_time + timedelta(minutes=random.randint(0, 60*24*30))
    hour = time.hour

    # Mostly normal working hours
    if random.random() < 0.85:
        time = time.replace(hour=random.randint(8, 18))
    else:
        time = time.replace(hour=random.randint(0, 5))

    row = {
        "timestamp": time,
        "user_id": user,
        "user_role": role,
        "source_ip": f"192.168.{random.randint(1,255)}.{random.randint(1,255)}",
        "source_country": random.choice(countries),
        "device_id": f"device_{random.randint(1,20)}",
        "login_status": random.choice(["success", "failed", "success", "success"])
    }
    rows.append(row)

auth_df = pd.DataFrame(rows)
auth_df.to_csv(RAW_AUTH_DIR / "synthetic_authentication_logs.csv", index=False)

print("Synthetic authentication dataset created.")
auth_df.head()


Synthetic authentication dataset created.


Unnamed: 0,timestamp,user_id,user_role,source_ip,source_country,device_id,login_status
0,2025-01-27 15:48:00,user27,user,192.168.155.89,UK,device_18,success
1,2025-01-23 11:02:00,user13,user,192.168.122.32,DE,device_7,success
2,2025-01-12 05:45:00,user21,user,192.168.182.199,SG,device_3,success
3,2025-01-29 00:09:00,user26,user,192.168.107.224,IN,device_17,success
4,2025-01-24 01:26:00,user41,executive,192.168.148.27,IN,device_9,failed


In [14]:
raw_files = list(RAW_AUTH_DIR.glob("*.csv"))

if not raw_files:
    print("No authentication data found yet. Pipeline ready for future ingestion.")
else:
    auth_df = pd.read_csv(raw_files[0], low_memory=False)
    auth_df.head()


In [15]:
if raw_files:
    auth_df.columns = [c.strip().lower() for c in auth_df.columns]

    normalized_auth = pd.DataFrame()

    normalized_auth["event_time"] = pd.to_datetime(auth_df.get("timestamp"), errors="coerce")
    normalized_auth["user_id"] = auth_df.get("user_id")
    normalized_auth["user_role"] = auth_df.get("user_role")
    normalized_auth["source_ip"] = auth_df.get("source_ip")
    normalized_auth["source_country"] = auth_df.get("source_country")
    normalized_auth["device_id"] = auth_df.get("device_id")
    normalized_auth["login_status"] = auth_df.get("login_status")

    normalized_auth["event_type"] = "authentication"

    normalized_auth.head()


In [16]:
if raw_files:
    output_path = NORMALIZED_AUTH_DIR / "authentication_events_normalized.csv"
    normalized_auth.to_csv(output_path, index=False)
    print("Saved normalized authentication events to:", output_path)


Saved normalized authentication events to: D:\soc-dashboard-suite-main\soc-dashboard-suite-main\data\normalized\auth\authentication_events_normalized.csv
