In [1]:

"""
Email Header Investigator
- Parse raw email headers
- Extract Received hops, IPs, authentication results (SPF/DKIM/DMARC if present)
- Attempt reverse DNS lookups of IPs
- Produce a readable investigation report

Author: Akhila (for educational / legitimate investigative use)
"""

import re #for Regular Expressions (to find IP addresses, hostnames)
import socket #to perform reverse DNS lookup of IP addresses.
import textwrap 
from datetime import datetime #to parse timestamps from Received headers.
from email import policy #ensures headers are parsed with standard modern rules.
from email.parser import Parser  #built-in Python library to parse raw email headers.

# Regular expressions
#(These are patterns to detect IP addresses.)
IPV4_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
IPV6_RE = re.compile(r"\b([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}\b")
RECEIVED_HEADER_RE = re.compile(r"^Received:", flags=re.IGNORECASE)
AUTH_RESULTS_RE = re.compile(r"Authentication-Results:", flags=re.IGNORECASE)

#Converts raw text headers into a structured Python object (message)
def parse_raw_headers(raw_header_text: str): #dictionary of header fields
    """
    Parse raw header text into an email.message.Message-like object
    using the stdlib email parser for robust header folding handling.
    """
    parser = Parser(policy=policy.default)
    message = parser.parsestr(raw_header_text)
    headers = dict(message.items())
    # Keep Received lines as list (they may be multiple)
    received_lines = message.get_all('Received') or [] #list of all Received headers
    auth_results = message.get_all('Authentication-Results') or message.get_all('Authentication-Results'.lower()) or [] #SPF/DKIM/DMARC authentication results.
    return headers, received_lines, auth_results

def find_ips(text: str):
    """
    Return list of unique IPs (IPv4 and IPv6) found in a string.
    """
    ipv4 = IPV4_RE.findall(text)
    ipv6 = IPV6_RE.findall(text)
    # ipv6 regex returns tuple groups, reconstruct using finditer for full match
    ipv6_full = [m.group(0) for m in IPV6_RE.finditer(text)]
    ips = []
    for ip in ipv4 + ipv6_full:
        if ip not in ips:
            ips.append(ip)
    return ips

def parse_received_line(received_text: str):
    """
    Attempt to break a Received: header into useful parts:
    - raw text
    - ips found
    - any hostnames found (basic heuristic)
    - date (if present after ';')
    """
    # Split at last semicolon for timestamp (common convention)
    parts = received_text.rsplit(';', 1)
    timestamp = None
    body = received_text
    if len(parts) == 2:
        body = parts[0].strip()
        ts_text = parts[1].strip()
        # Try parsing a variety of date formats; fallback to raw string
        try:
            timestamp = datetime.strptime(ts_text, "%a, %d %b %Y %H:%M:%S %z")
        except Exception:
            # Not a standard format — save raw
            timestamp = ts_text

    ips = find_ips(received_text)
    # Hostnames heuristic: words containing dots and letters/numbers but no spaces and not an email addr
    hostnames = []
    for token in re.split(r"\s+", received_text):
        token = token.strip("();,")
        if token and '.' in token and '@' not in token and not IPV4_RE.match(token):
            # basic sanitation: avoid long sentences
            if len(token) <= 255 and re.search(r'[A-Za-z]', token):
                if token not in hostnames:
                    hostnames.append(token)
    return {
        "raw": received_text,
        "body": body,
        "timestamp": timestamp,
        "ips": ips,
        "hostnames": hostnames
    }

def reverse_dns(ip: str, timeout: float = 3.0):
    """
    Attempt reverse DNS lookup for an IP. Returns hostname or None.
    """
    try:
        # set timeout for socket operations
        original_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(timeout)
        host, aliases, _ = socket.gethostbyaddr(ip)
        socket.setdefaulttimeout(original_timeout)
        return host
    except Exception:
        # could be no reverse DNS or blocked
        try:
            socket.setdefaulttimeout(original_timeout)
        except Exception:
            pass
        return None

def analyze_headers(raw_header_text: str, do_reverse_dns=True):
    headers, received_lines, auth_results = parse_raw_headers(raw_header_text)

    # Important top fields
    top_fields = {
        'From': headers.get('From'),
        'To': headers.get('To'),
        'Date': headers.get('Date'),
        'Subject': headers.get('Subject'),
        'Message-ID': headers.get('Message-ID'),
        'Return-Path': headers.get('Return-Path'),
        'Authentication-Results': auth_results
    }

    # Parse Received lines (note: first in the list is the top-most header added by the receiving MTA;
    # the earliest hop is usually the last Received header)
    hops = []
    for r in received_lines:
        hop = parse_received_line(r)
        if do_reverse_dns and hop['ips']:
            hop['ip_info'] = []
            for ip in hop['ips']:
                rdns = reverse_dns(ip)
                hop['ip_info'].append({"ip": ip, "reverse_dns": rdns})
        else:
            hop['ip_info'] = [{"ip": ip, "reverse_dns": None} for ip in hop['ips']]
        hops.append(hop)

    # Build a concise list of unique IPs (in order of appearance)
    unique_ips = []
    for hop in hops:
        for ip in hop['ips']:
            if ip not in unique_ips:
                unique_ips.append(ip)

    return {
        "top_fields": top_fields,
        "hops": hops,
        "unique_ips": unique_ips,
        "all_received_raw": received_lines
    }

def generate_report(analysis: dict, show_raw_received=False):
    """
    Produce a multi-line string report summarizing findings.
    """
    out = []
    tf = analysis['top_fields']
    out.append("=== Email Header Investigation Report ===\n")
    out.append("Top headers:")
    out.append(f"  From: {tf.get('From')}")
    out.append(f"  To: {tf.get('To')}")
    out.append(f"  Date: {tf.get('Date')}")
    out.append(f"  Subject: {tf.get('Subject')}")
    out.append(f"  Message-ID: {tf.get('Message-ID')}")
    out.append(f"  Return-Path: {tf.get('Return-Path')}\n")

    # Authentication results if present
    if tf.get('Authentication-Results'):
        out.append("Authentication-Results headers (may contain SPF/DKIM/DMARC verdicts):")
        for ar in tf['Authentication-Results']:
            out.append("  " + ar)
    else:
        out.append("Authentication-Results: NOT found in headers (server may not have added it).")

    out.append("\nUnique IP addresses found (ordered by appearance):")
    if not analysis['unique_ips']:
        out.append("  None found in headers.")
    else:
        for ip in analysis['unique_ips']:
            out.append(f"  - {ip}")

    out.append("\nReceived hops (top-to-bottom as in header order):")
    if not analysis['hops']:
        out.append("  No Received headers found.")
    else:
        for idx, hop in enumerate(analysis['hops'], 1):
            out.append(f"\n  Hop #{idx}:")
            out.append(f"    Raw: {hop['raw'][:200]}{'...' if len(hop['raw'])>200 else ''}")
            if hop['timestamp']:
                out.append(f"    Timestamp (from header): {hop['timestamp']}")
            if hop['hostnames']:
                out.append(f"    Hostname-like tokens: {', '.join(hop['hostnames'][:5])}")
            if hop['ips']:
                for info in hop['ip_info']:
                    ip = info['ip']
                    rdns = info.get('reverse_dns')
                    out.append(f"    IP: {ip}  |  reverse DNS: {rdns if rdns else 'N/A'}")
            else:
                out.append("    No IP found in this Received line.")

    out.append("\nSuggested next steps:")
    out.append("  1) For each IP: run WHOIS, ASN lookup, and geolocation. (Use online services or `python-whois` / geoip databases.)")
    out.append("  2) If Authentication-Results indicate 'spf=fail' or 'dkim=fail', collect full raw email (headers+body) and run:")
    out.append("       - pyspf for SPF evaluation (pip install pyspf)")
    out.append("       - dkimpy for DKIM signature verification (pip install dkimpy)")
    out.append("  3) Correlate earliest Received header with your mail server logs to check origin.")
    out.append("  4) If you suspect criminal activity, preserve full raw email, timestamps, and contact your local law enforcement or the email hosting provider.\n")

    if show_raw_received:
        out.append("=== Raw Received headers ===")
        for r in analysis['all_received_raw']:
            out.append(r)
            out.append("-----")

    return "\n".join(out)

# Example usage / CLI
if __name__ == "__main__":
    SAMPLE_HEADER = """Received: from unknown (HELO mail.example.com) (198.51.100.23)
    by mx.google.com with ESMTP id abc123;
    Tue, 30 Sep 2025 10:15:42 +0000
Received: from mail-out.example.org (mail-out.example.org [203.0.113.5])
    by mx.example.com with ESMTP id xyz789;
    Tue, 30 Sep 2025 10:14:02 +0000
From: "Alice Example" <alice@example.org>
To: bob@example.com
Subject: Test email
Message-ID: <CA+abcd1234@mail.example.org>
Date: Tue, 30 Sep 2025 10:13:55 +0000
Return-Path: <alice@example.org>
Authentication-Results: mx.google.com; spf=pass (google.com: domain of example.org designates 203.0.113.5 as permitted sender) smtp.mailfrom=example.org; dkim=pass header.i=@example.org
"""

    print("Parsing SAMPLE_HEADER... (reverse DNS enabled)\n")
    analysis = analyze_headers(SAMPLE_HEADER, do_reverse_dns=True)
    report = generate_report(analysis, show_raw_received=False)
    print(report)

Parsing SAMPLE_HEADER... (reverse DNS enabled)

=== Email Header Investigation Report ===

Top headers:
  From: Alice Example <alice@example.org>
  To: bob@example.com
  Date: Tue, 30 Sep 2025 10:13:55 +0000
  Subject: Test email
  Message-ID: <CA+abcd1234@mail.example.org>
  Return-Path: <alice@example.org>

Authentication-Results headers (may contain SPF/DKIM/DMARC verdicts):
  mx.google.com; spf=pass (google.com: domain of example.org designates 203.0.113.5 as permitted sender) smtp.mailfrom=example.org; dkim=pass header.i=@example.org

Unique IP addresses found (ordered by appearance):
  - 198.51.100.23
  - 10:15:42
  - 203.0.113.5
  - 10:14:02

Received hops (top-to-bottom as in header order):

  Hop #1:
    Raw: from unknown (HELO mail.example.com) (198.51.100.23)    by mx.google.com with ESMTP id abc123;    Tue, 30 Sep 2025 10:15:42 +0000
    Timestamp (from header): 2025-09-30 10:15:42+00:00
    Hostname-like tokens: mail.example.com, mx.google.com
    IP: 198.51.100.23  |  rev