# Reading the Complete Email-Password Dataset

In [None]:
"""
Read the downloaded breach files and keep only Gmail addresses.
Store the selected emails in a single JSON file.
The JSON file maps each selected email to its list of passwords.
"""
import os
import json

def process_files_in_folder(folder_path, email_passwords):
    """
    Collect Gmail credentials from the files located directly in one folder.

    Sub-directories, files whose name starts with a digit, and files named
    'symbols' (any letter case) are skipped. For every non-empty line the
    first character is dropped (assumed to be a leading '0'), and the rest is
    split on the first ':' into email and password. When the email ends with
    "@gmail.com" the password is appended to ``email_passwords[email]``.

    Args:
        folder_path: directory whose direct children are scanned.
        email_passwords: dict mapping email -> list of passwords; updated in place.

    Any exception raised while reading a file is printed and the scan moves
    on to the next file.
    """
    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)

        # Only regular data files are of interest.
        skip_entry = (
            os.path.isdir(entry_path)
            or entry[0].isdigit()
            or entry.lower() == "symbols"
        )
        if skip_entry:
            continue

        try:
            with open(entry_path, 'r', encoding='utf-8', errors='ignore') as handle:
                for raw_line in handle:
                    # Drop surrounding whitespace, then the leading marker character.
                    # An empty line stays empty and is rejected by the ':' check below.
                    record = raw_line.strip()[1:]
                    email, separator, password = record.partition(':')
                    if not separator:
                        continue
                    if email.endswith("@gmail.com"):
                        email_passwords.setdefault(email, []).append(password)
        except Exception as e:
            print(f"Error reading file {entry_path}: {e}")

def process_multiple_folders(parent_directory):
    """
    Gather Gmail credentials from parent_directory and every folder below it.

    Each directory reported by os.walk is handed to process_files_in_folder,
    which fills one shared dict.

    Returns:
        dict mapping email -> list of passwords found across all folders.
    """
    collected = {}
    for current_dir, _subdirs, _filenames in os.walk(parent_directory):
        process_files_in_folder(current_dir, collected)
    return collected

# Scan everything under the data folder, then persist the
# email -> passwords mapping as pretty-printed JSON.
parent_dir = 'data'  # Folder path
result = process_multiple_folders(parent_dir)
with open("email_passwords.json", "w", encoding="utf-8") as json_out:
    json.dump(result, json_out, indent=4)

# Selecting Gmail Accounts with Multiple Password-Breach Incidents (at least 20 leaked passwords)

In [None]:
import json

# Path to your input file
input_file = "email_passwords.json"
output_file = "email_passwords_used.json"

# Read JSON data. On failure, report the problem and stop with exit status 1.
# SystemExit is raised directly: the `exit()` helper is injected by the `site`
# module for interactive sessions and is not meant to be used in programs.
try:
    with open(input_file, "r", encoding="utf-8") as f:
        result = json.load(f)
except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found.")
    raise SystemExit(1) from None
except json.JSONDecodeError as e:
    print(f"Error decoding JSON from '{input_file}': {e}")
    raise SystemExit(1) from None
else:
    print(f"Loaded {len(result)} total email entries from {input_file}")

# Filter to only Gmail accounts with at least 20 leaked passwords.
# The domain check is case-insensitive here.
result_main = {
    email: passwords
    for email, passwords in result.items()
    if email.lower().endswith("@gmail.com") and len(passwords) >= 20
}

# Save filtered data
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(result_main, f, indent=4)



## Getting Metadata and additional information from haveibeenpwned.com

In [None]:
import time
import requests
import urllib.parse

# HIBP API key used for the breach lookups in this cell.
# NOTE(review): `{{ HIBP_KEY }}` looks like a template placeholder — presumably it
# is replaced with a quoted key string before this cell is run; as written it
# refers to the name HIBP_KEY, which is not defined in the visible cells. Confirm.
pwned_Api_key = {{ HIBP_KEY }}

def get_hibp_metadata_for_emails(
    emails_or_dict,
    api_key: str,
    delay: float = 1.6,                 # HIBP >=1.5s between requests for my subscription
    include_unverified: bool = True,
    truncate: bool = False,
    domain: str | None = None,
    retries: int = 3,
    backoff: float = 2.0,
    return_nan_on_404: bool = False     # set True to return ["NAN"] instead of [] when no breaches
):
    """
    Fetch HIBP breach metadata for each email.

    Inputs:
      emails_or_dict: list/iterable of emails OR dict with emails as keys
      api_key: your HIBP API key
      delay: seconds to sleep between requests (rate limit)
      include_unverified: include unverified breaches
      truncate: HIBP 'truncateResponse' option
      domain: optional domain filter (e.g., "adobe.com")
      retries: request retries for transient errors (429/503/network)
      backoff: exponential backoff base
      return_nan_on_404: if True, store ["NAN"] when no breaches; else store []

    Returns:
      dict[email] = list[{"Name","Domain","BreachDate"}] or [] (or ["NAN"] if configured)
    """
    # Accept dict or list
    if isinstance(emails_or_dict, dict):
        emails = list(emails_or_dict.keys())
    else:
        emails = list(emails_or_dict)

    sess = requests.Session()
    headers = {
        "hibp-api-key": api_key,
        "User-Agent": "Research/1.0"
    }
    base_url = "https://haveibeenpwned.com/api/v3/breachedaccount/"

    results: dict[str, list | str] = {}

    for email in emails:
        url = base_url + urllib.parse.quote(email) 
        params = {
            "truncateResponse": "true" if truncate else "false",
            "includeUnverified": "true" if include_unverified else "false",
        }
        if domain:
            params["domain"] = domain

        # Retry loop for transient errors
        last_err = None
        for attempt in range(retries):
            try:
                resp = sess.get(url, headers=headers, params=params, timeout=30)

                if resp.status_code == 200:
                    breaches = resp.json()  # list of breach dicts
                    # Normalize to the requested minimal fields
                    results[email] = [
                        {
                            "Name": b.get("Name"),
                            "Domain": b.get("Domain"),
                            "BreachDate": b.get("BreachDate"),
                        }
                        for b in breaches
                    ]
                    break  # success

                elif resp.status_code == 404:
                    # No breach for this account
                    results[email] = ["NAN"] if return_nan_on_404 else []
                    break

                elif resp.status_code in (429, 503):
                    # Rate limited or service unavailable — backoff and retry
                    sleep_s = backoff ** attempt
                    time.sleep(sleep_s)
                    continue

                else:
                    # Other HTTP errors — record a simple marker
                    results[email] = [f"ERROR {resp.status_code}: {resp.text[:120]}"]
                    break

            except requests.RequestException as e:
                last_err = e
                time.sleep(backoff ** attempt)
                continue

        else:
            # Exhausted retries
            msg = f"REQUEST_FAILED: {last_err}" if last_err else "REQUEST_FAILED"
            results[email] = [msg]

        # Respect HIBP per-request delay
        time.sleep(delay)

    return results
# Run the HIBP lookup with the key configured at the top of this cell.
api_key = pwned_Api_key
# NOTE(review): `reduced_dict` is not defined in any cell visible here —
# presumably an email -> passwords dict such as `result_main` (or a subset of
# it); confirm where it is created before running this cell.
hibp_metadata = get_hibp_metadata_for_emails(reduced_dict, api_key)

## Merging Email-Password Data and Metadata Using Round-Robin Assignment

In [None]:
from typing import Dict, List, Any
import random
from collections import defaultdict

def merge_breach_password_dicts(
    dict_a: Dict[str, List[str]],  # Email -> List of passwords
    dict_b: Dict[str, List[Dict[str, str]]],  # Email -> List of breach dicts
    allowed_names: List[str],  # List of allowed breach names
    assignment_strategy: str = "round_robin",  # How to assign passwords to breaches
    random_seed: int = 42
) -> Dict[str, List[Dict[str, str]]]:
    """
    Merge two dictionaries to create a combined breach-password mapping.

    Only emails present in both inputs are processed, in the order in which
    they appear in dict_a, so the result is reproducible from run to run.

    Args:
        dict_a: Dictionary mapping emails to lists of passwords
        dict_b: Dictionary mapping emails to lists of breach information
        allowed_names: List of breach names to include in the result
        assignment_strategy: Method for assigning passwords to breaches
            - "round_robin": Cycle through passwords for each breach
            - "random": Randomly assign passwords to breaches
            - "sequential": Assign passwords in order to breaches
        random_seed: Seed for random assignment. When not None, the global
            `random` module state is reseeded with it.

    Returns:
        Dictionary mapping emails to filtered breach data with passwords.
        An email with no allowed breaches maps to []; an email that has
        allowed breaches but no passwords is left out.

    Raises:
        ValueError: propagated from assign_passwords_to_breaches for an
            unknown assignment_strategy.
    """

    if random_seed is not None:
        random.seed(random_seed)

    # Convert allowed_names to set for faster lookup
    allowed_names_set = set(allowed_names)

    # Emails present in both dictionaries. A list in dict_a order is used
    # instead of a set intersection: iterating a set of strings depends on
    # per-process hash randomization, which made the output order (and the
    # "random" strategy, despite the fixed seed) differ between runs.
    common_emails = [email for email in dict_a if email in dict_b]

    result = {}
    stats = {
        "processed_emails": 0,
        "total_breaches_before_filter": 0,
        "total_breaches_after_filter": 0,
        "emails_with_no_valid_breaches": 0,
        "breach_name_distribution": defaultdict(int)
    }

    for email in common_emails:
        passwords = dict_a[email]
        breaches = dict_b[email]

        stats["processed_emails"] += 1
        stats["total_breaches_before_filter"] += len(breaches)

        # Filter breaches to only include allowed names. Non-dict entries
        # (e.g. error marker strings) are dropped here as well.
        filtered_breaches = []
        for breach in breaches:
            if isinstance(breach, dict) and breach.get("Name") in allowed_names_set:
                filtered_breaches.append(breach.copy())  # Copy to avoid modifying original
                stats["breach_name_distribution"][breach["Name"]] += 1

        stats["total_breaches_after_filter"] += len(filtered_breaches)

        if not filtered_breaches:
            stats["emails_with_no_valid_breaches"] += 1
            result[email] = []  # Empty list for emails with no valid breaches
            continue

        if not passwords:
            # If no passwords available, skip this email
            continue

        # Assign passwords to breaches based on strategy
        result[email] = assign_passwords_to_breaches(
            filtered_breaches, passwords, assignment_strategy
        )

    # Print statistics
    print_merge_statistics(stats, allowed_names)

    return result

def assign_passwords_to_breaches(
    breaches: List[Dict[str, str]],
    passwords: List[str],
    strategy: str
) -> List[Dict[str, str]]:
    """
    Return copies of the breach records, each with a "Password" entry added.

    Strategies:
        "round_robin" / "sequential": the i-th breach receives
            passwords[i % len(passwords)].
        "random": each breach receives random.choice(passwords).

    The input breach dicts are not modified.

    Raises:
        ValueError: if strategy is none of the names above.
    """
    if strategy in ("round_robin", "sequential"):
        # Both names walk the password list in order and wrap around.
        def pick(position):
            return passwords[position % len(passwords)]
    elif strategy == "random":
        def pick(position):
            return random.choice(passwords)
    else:
        raise ValueError(f"Unknown assignment strategy: {strategy}")

    enhanced_breaches = []
    for position, breach in enumerate(breaches):
        chosen = pick(position)
        record = breach.copy()
        record["Password"] = chosen
        enhanced_breaches.append(record)
    return enhanced_breaches

def print_merge_statistics(stats: Dict[str, Any], allowed_names: List[str]) -> None:
    """
    Print detailed statistics about the merge operation.

    Currently a no-op: every print statement below is commented out, so the
    function writes nothing and returns None. Neither argument is read.
    The commented code shows the report that was produced before it was
    disabled (counts from `stats` plus a per-name distribution).
    """
    # print("\n=== MERGE STATISTICS ===")
    # print(f"Processed emails: {stats['processed_emails']}")
    # print(f"Total breaches before filtering: {stats['total_breaches_before_filter']}")
    # print(f"Total breaches after filtering: {stats['total_breaches_after_filter']}")
    # print(f"Emails with no valid breaches: {stats['emails_with_no_valid_breaches']}")
    
    # if stats['total_breaches_before_filter'] > 0:
    #     filter_rate = (stats['total_breaches_after_filter'] / stats['total_breaches_before_filter']) * 100
    #     print(f"Breach retention rate: {filter_rate:.2f}%")
    
    # print("\nBreach name distribution in result:")
    # for name in allowed_names:
    #     count = stats['breach_name_distribution'].get(name, 0)
    #     print(f"  {name}: {count} occurrences")
    
    # unused_names = set(allowed_names) - set(stats['breach_name_distribution'].keys())
    # if unused_names:
    #     print(f"\nUnused breach names: {sorted(unused_names)}")

def validate_input_data(
    dict_a: Dict[str, List[str]],
    dict_b: Dict[str, List[Dict[str, str]]],
    allowed_names: List[str]
) -> None:
    """
    Print a quick sanity report for the first five entries of each input.

    For dict_a, one line per sampled email with its number of passwords
    (or 'Invalid' when the value is not a list). For dict_b, a line is
    printed only for sampled emails whose value is not a list.

    This is a reporting aid: nothing is raised or returned for bad data.

    Args:
        dict_a: Email -> list of passwords.
        dict_b: Email -> list of breach dicts.
        allowed_names: accepted for interface symmetry; not inspected.
    """
    sample_size = 5  # only a small prefix is inspected

    # Walk the dict views directly instead of copying every item into a list
    # just to slice off the first few.
    for index, (email, passwords) in enumerate(dict_a.items()):
        if index >= sample_size:
            break
        print(f"  {email}: {len(passwords) if isinstance(passwords, list) else 'Invalid'} passwords")

    for index, (email, breaches) in enumerate(dict_b.items()):
        if index >= sample_size:
            break
        if not isinstance(breaches, list):
            print(f"  {email}: Invalid breach data")

# Main function to use
def create_merged_dictionary(
    dict_a: Dict[str, List[str]],
    dict_b: Dict[str, List[Dict[str, str]]],
    allowed_names: List[str],
    validate_inputs: bool = True,
    show_sample: bool = True,
    assignment_strategy: str = "round_robin"
) -> Dict[str, List[Dict[str, str]]]:
    """
    Entry point: optionally report on the inputs, then build the merged dictionary.

    Args:
        dict_a: Email -> List of passwords
        dict_b: Email -> List of breach information
        allowed_names: List of allowed breach names
        validate_inputs: When true, validate_input_data is run first
        show_sample: Accepted but not used by this function
        assignment_strategy: How to assign passwords ("round_robin", "random", "sequential")

    Returns:
        Merged dictionary with breach data + passwords, as produced by
        merge_breach_password_dicts
    """
    if validate_inputs:
        validate_input_data(dict_a, dict_b, allowed_names)

    return merge_breach_password_dicts(
        dict_a,
        dict_b,
        allowed_names,
        assignment_strategy=assignment_strategy,
    )

##################
if __name__ == "__main__":
    # Email and passwords (the filtered Gmail accounts produced earlier in this file)
    dict_a = result_main
    # Email and breach metadata (output of the HIBP lookup)
    dict_b = hibp_metadata
    # Selected the 30 most dominant platforms
    # NOTE(review): `top_30_breach_names` is not defined in the visible cells —
    # confirm where this list is built before running.
    allowed_names = top_30_breach_names
    
    # Create the merged dictionary
    train_data = create_merged_dictionary(
        dict_a,
        dict_b, 
        allowed_names,
        assignment_strategy="round_robin"
    )
    # Persist the merged records; ensure_ascii=False keeps non-ASCII characters unescaped.
    with open("trainData.json", "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False)


## Generating Honeywords Using Simple Substitution and an LLM

In [None]:
"""
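Generate honeywords for each breach password in the input dataset
(`INPUT_FILE`, currently `trainData_enriched_only.json`).

- Adds a "Honeywords" list (`TOTAL_HONEYWORDS` items, currently 100) to each breach dict that has a "Password" key.
- The list is split between OpenAI-generated and locally generated honeywords according to `OPENAI_SHARE`
  (e.g. 0.5 -> 50 OpenAI + 50 local); with the current value of 0, all honeywords come from the local heuristics.
- Saves the augmented dataset to `OUTPUT_FILE` (currently `trainData_enriched_with_honeywords.json`).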
"""

import json
import os
import random
import time
import string
from typing import List, Set, Dict
import openai

# CONFIGURATION
INPUT_FILE = "trainData_enriched_only.json"
OUTPUT_FILE = "trainData_enriched_with_honeywords.json"

TOTAL_HONEYWORDS = 100        # honeywords attached to every breach entry that has a password
OPENAI_SHARE = 0  # fraction generated by OpenAI (0.0..1.0), e.g. 0.5 -> 50%. Currently 0 (the ".5" is commented out), so no OpenAI requests are made
USE_OPENAI = True             # set False to skip OpenAI and use all-local generation
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")  # read from the environment; empty string when unset
OPENAI_MODEL = "gpt-4o-mini"  # change if you prefer another available model
OPENAI_BATCH = 10             # passwords handed to call_openai_generate_honeywords per slice (that helper still sends one request per password)
OPENAI_DELAY = 1.5            # seconds between OpenAI requests to avoid throttling
OPENAI_RETRIES = 3            # attempts per password before falling back to local generation
RANDOM_SEED = 42
LOCAL_MUTATION_TRIES = 400    # safety cap on mutation attempts to reach uniqueness when needed

# Seed the global RNG so the locally generated honeywords are repeatable.
random.seed(RANDOM_SEED)


### ---------- Local honeyword generators (heuristics) ----------

def leetify(s: str) -> str:
    """Return s with a, o, i, e, s, t, l (either case) replaced by @, 0, 1, 3, $, 7, 1."""
    # Position k of the first string maps to position k of the second.
    leet_table = str.maketrans("aAoOiIeEsStTlL", "@@001133$$7711")
    return s.translate(leet_table)

def append_suffix(s: str, suffix: str) -> str:
    """Return s followed by suffix."""
    return "".join((s, suffix))

def prepend_prefix(s: str, prefix: str) -> str:
    """Return prefix followed by s."""
    return "".join((prefix, s))

def inset_char_random(s: str) -> str:
    """Insert one random letter, digit or one of !@#$% at a random position of s (both ends included)."""
    insert_at = random.randint(0, len(s))
    alphabet = string.ascii_letters + string.digits + "!@#$%"
    extra = random.choice(alphabet)
    return "".join((s[:insert_at], extra, s[insert_at:]))

def transpose_two_chars(s: str) -> str:
    """Swap one randomly chosen pair of neighbouring characters; strings shorter than 2 are returned as is."""
    if len(s) < 2:
        return s
    left = random.randint(0, len(s) - 2)
    # Rebuild the string with positions left and left+1 exchanged.
    return s[:left] + s[left + 1] + s[left] + s[left + 2:]

def replace_vowel_with_digit(s: str) -> str:
    """Return s with a/A -> 4, e/E -> 3 and o/O -> 0."""
    return s.translate(str.maketrans("aAeEoO", "443300"))

def reverse_string(s: str) -> str:
    """Return the characters of s in reverse order."""
    return "".join(reversed(s))

# Fragments the local generators append to / prepend to a password.
# Both lists are read by local_mutations and generate_local_honeywords.
COMMON_SUFFIXES = ["123", "2020", "2021", "2022", "!", "!!", "@123", "#1", "XYZ", "_", "-"]
COMMON_PREFIXES = ["!", "x", "the", "my", "NN", "00"]

def local_mutations(base: str) -> List[str]:
    """
    Build one round of heuristic variants of base.

    Returns the non-empty variants that differ from base, without duplicates,
    in the order they were generated (at most 14 entries).
    """
    leet = leetify(base)  # deterministic, reused for two variants
    random_len = min(8, max(4, len(base)))

    # The literal is evaluated top to bottom, so the random draws happen in
    # exactly this order.
    variants = [
        # Basic variations
        base + random.choice(COMMON_SUFFIXES),
        prepend_prefix(base, random.choice(COMMON_PREFIXES)),
        leet,
        leet + random.choice(COMMON_SUFFIXES),
        replace_vowel_with_digit(base),
        reverse_string(base),
        transpose_two_chars(base),
        inset_char_random(base),
        base + str(random.randint(10, 99)),
        base.capitalize(),
        base.lower() + random.choice(["!", "@", "#"]),
        base + "-" + str(random.randint(100, 999)),
        # A random string that merely resembles a password
        "".join(random.choices(string.ascii_letters + string.digits, k=random_len)),
        base + random.choice(["_x", "_1", "_01"]),
    ]

    # dict.fromkeys keeps the first occurrence of each variant in order.
    return list(dict.fromkeys(v for v in variants if v and v != base))

def generate_local_honeywords(password: str, count: int) -> List[str]:
    """
    Generate `count` honeywords locally using heuristic and random mutations.

    Three stages run until enough candidates exist:
      1. the variants returned by local_mutations(password);
      2. up to LOCAL_MUTATION_TRIES randomly chosen single mutations;
      3. random strings of 6..12 characters as a last-resort filler.

    Args:
        password: the real password to derive variants from.
        count: number of honeywords wanted.

    Returns:
        A list of `count` distinct strings, none equal to `password`
        (an empty list when count <= 0).
    """
    out: List[str] = []
    # Set for O(1) duplicate checks; pre-seeded with the real password so it
    # can never be emitted as a honeyword.
    seen: Set[str] = {password}

    def keep(candidate: str) -> None:
        if candidate not in seen:
            seen.add(candidate)
            out.append(candidate)

    # anchor mutations first
    for anchor in local_mutations(password):
        if len(out) >= count:
            break
        keep(anchor)

    # then continue with randomized heuristics
    mutators = [
        leetify, append_suffix, prepend_prefix, inset_char_random,
        transpose_two_chars, replace_vowel_with_digit, reverse_string
    ]
    tries = 0
    while len(out) < count and tries < LOCAL_MUTATION_TRIES:
        tries += 1
        choice = random.choice(mutators)
        if choice is append_suffix:
            candidate = append_suffix(password, random.choice(COMMON_SUFFIXES + [str(random.randint(1, 9999))]))
        elif choice is prepend_prefix:
            candidate = prepend_prefix(password, random.choice(COMMON_PREFIXES + ["user", "admin", "sys"]))
        else:
            candidate = choice(password)
        # small chance to add a random digit tail
        if random.random() < 0.25:
            candidate = candidate + str(random.randint(0, 9999))
        keep(candidate)

    # if still short, fill with random plausible strings
    length = max(6, min(12, len(password)))
    alphabet = string.ascii_letters + string.digits + "!@#"
    while len(out) < count:
        keep("".join(random.choices(alphabet, k=length)))

    return out[:count]


### ---------- OpenAI-based honeyword generation ----------

def _parse_honeyword_reply(text: str, pw: str):
    """
    Extract unique honeyword candidates (never equal to pw) from a model reply.

    Returns a list of strings, or None when the reply is valid JSON that is
    not an array (the caller then generates everything locally).
    """
    # Replies are often wrapped in a markdown code fence or surrounded by
    # prose, so after the raw text also try the outermost [...] span.
    payloads = [text]
    start, end = text.find("["), text.rfind("]")
    if 0 <= start < end:
        payloads.append(text[start:end + 1])

    for payload in payloads:
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            continue
        if not isinstance(parsed, list):
            return None
        return list(dict.fromkeys(
            item for item in parsed if isinstance(item, str) and item != pw
        ))

    # Not JSON at all: salvage tokens line by line, skipping code-fence
    # markers and bare brackets so they do not end up as honeywords.
    candidates: List[str] = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("```"):
            continue
        line = line.strip('",')
        for token in line.replace(';', ',').split(','):
            token = token.strip().strip('"').strip("'")
            if token in ("", "[", "]") or token == pw or token in candidates:
                continue
            candidates.append(token)
    return candidates


def _top_up_with_local(candidates: List[str], pw: str, per_password: int) -> List[str]:
    """Return exactly per_password unique honeywords, filling any gap with local ones."""
    filled = list(candidates[:per_password])
    if len(filled) < per_password:
        known = set(filled)
        # Ask for a full set: even if some collide with the model's output,
        # enough distinct ones remain to close the gap.
        for extra in generate_local_honeywords(pw, per_password):
            if extra not in known:
                known.add(extra)
                filled.append(extra)
                if len(filled) >= per_password:
                    break
    return filled


def call_openai_generate_honeywords(passwords: List[str], per_password: int, api_key: str, model: str = OPENAI_MODEL) -> Dict[str, List[str]]:
    """
    Given a list of passwords, ask OpenAI to generate `per_password` honeywords for each one.

    One chat request is sent per password, with up to OPENAI_RETRIES attempts
    and exponential backoff between failed attempts. Whatever the model
    returns is de-duplicated, stripped of the original password, and topped
    up with locally generated honeywords when it is short. If every attempt
    fails, the honeywords for that password are generated locally.

    NOTE(review): every real password is embedded in the prompt and therefore
    sent to the external API — confirm this is acceptable for the dataset.
    NOTE(review): `openai.ChatCompletion.create` is the interface of the
    pre-1.0 `openai` package — confirm the installed version provides it.

    Args:
        passwords: passwords to generate honeywords for.
        per_password: number of honeywords wanted per password.
        api_key: OpenAI API key.
        model: chat model name.

    Returns:
        dict password -> list[honeywords] (per_password entries each).

    Raises:
        RuntimeError: if the openai module is unavailable or api_key is empty.
    """
    if openai is None:
        raise RuntimeError("openai package not available. Install via `pip install openai` or set USE_OPENAI=False.")

    if not api_key:
        raise RuntimeError("OpenAI API key not provided. Set OPENAI_API_KEY or pass api_key.")

    openai.api_key = api_key

    results: Dict[str, List[str]] = {}
    for pw in passwords:
        # craft a careful prompt: request honeywords only, in JSON array, avoid including the original password.
        prompt = (
            f"You are a helpful assistant that generates plausible password variants (honeywords) derived from "
            f"an original password. Do NOT output the original password. For the input password: '{pw}', "
            f"produce exactly {per_password} plausible honeywords (password-like strings). "
            "Each honeyword should look believable, contain a mix of letters/digits/punctuation, "
            "and be different from the original password and from each other. Output ONLY a JSON array of strings. "
            "Example output: [\"p@ssw0rd123\",\"Password!2020\", ...]\n"
        )

        for attempt in range(1, OPENAI_RETRIES + 1):
            try:
                resp = openai.ChatCompletion.create(
                    model=model,
                    messages=[
                        {"role":"system", "content": "You are a password mutation generator; produce plausible password variants."},
                        {"role":"user", "content": prompt}
                    ],
                    temperature=0.8,
                    max_tokens=512,
                    n=1
                )
                text = resp.choices[0].message.content.strip()
                candidates = _parse_honeyword_reply(text, pw)
            except Exception:
                # Deliberately broad: this is a best-effort call and any
                # failure is handled by retrying or falling back locally.
                if attempt >= OPENAI_RETRIES:
                    # on final failure, fallback to local generator (no point
                    # in backing off when there is no further attempt)
                    results[pw] = generate_local_honeywords(pw, per_password)
                    break
                time.sleep(2 ** attempt)
                continue

            if candidates is None:
                # unexpected structure: fall back to local generation
                candidates = []
            results[pw] = _top_up_with_local(candidates, pw, per_password)
            break

        # small delay between requests
        time.sleep(OPENAI_DELAY)

    return results


### ---------- Orchestration ----------

def generate_honeywords_for_dataset(input_path: str = INPUT_FILE,
                                    output_path: str = OUTPUT_FILE,
                                    total_honeywords: int = TOTAL_HONEYWORDS,
                                    openai_share: float = OPENAI_SHARE,
                                    use_openai: bool = USE_OPENAI,
                                    openai_api_key: str = OPENAI_API_KEY):
    """
    Attach a "Honeywords" list to every breach entry that carries a password.

    The JSON at input_path is loaded as a dict whose list values are scanned
    for dict entries with a "Password" key. Each such entry receives
    total_honeywords distinct strings, none equal to the password:
    int(total_honeywords * openai_share) of them are requested from OpenAI
    (when enabled and a key is present), the rest — and any shortfall — come
    from generate_local_honeywords. The augmented data is written to
    output_path.

    Args:
        input_path: JSON file to read.
        output_path: JSON file to write.
        total_honeywords: honeywords per password.
        openai_share: fraction (0.0..1.0) of honeywords to request from OpenAI.
        use_openai: master switch for the OpenAI portion.
        openai_api_key: key passed to call_openai_generate_honeywords.

    Raises:
        FileNotFoundError: if input_path does not exist.
    """
    # 1. load dataset
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    #print(f"Loaded {len(data)} emails from {input_path}")

    # Split of the per-password quota between the two generators.
    per_pw_openai = int(total_honeywords * openai_share)
    per_pw_local = total_honeywords - per_pw_openai

    #print(f"Per-password: {per_pw_local} local, {per_pw_openai} openai (use_openai={use_openai})")

    # Counters; total_breaches is only referenced by a commented-out print below.
    total_breaches = 0
    emails_processed = 0

    # If OpenAI is enabled, collect each distinct password once (order preserved)
    model_password_queue = []
    model_passwords_set = set()

    # First pass: determine which breach entries need honeywords and collect model queue
    for email, breaches in data.items():
        if not isinstance(breaches, list):
            continue
        for b in breaches:
            if isinstance(b, dict) and "Password" in b:
                total_breaches += 1
                if per_pw_openai > 0 and use_openai:
                    pw = b["Password"]
                    # avoid duplicate model calls for same password
                    if pw not in model_passwords_set:
                        model_passwords_set.add(pw)
                        model_password_queue.append(pw)

    #print(f"Found {total_breaches} breaches with passwords; unique passwords queued for model: {len(model_password_queue)}")

    # 2. Run OpenAI generation (if enabled)
    openai_results: Dict[str, List[str]] = {}
    if use_openai and per_pw_openai > 0:
        # NOTE(review): this branch only triggers if `openai` was bound to None
        # elsewhere (e.g. by a guarded import) — confirm how the name is imported.
        if openai is None:
            print("openai package not installed, falling back to local generation for all honeywords.")
            openai_results = {}
        elif not openai_api_key:
            print("No OPENAI_API_KEY provided; falling back to local generation for OpenAI portion.")
            openai_results = {}
        else:
            # The queue is handed to call_openai_generate_honeywords in slices of
            # OPENAI_BATCH passwords, so a failure only loses one slice.
            # For big workloads you might want to adapt to a batching scheme where you ask the model
            # to return mappings for multiple passwords at once.
            print(f"Calling OpenAI for {len(model_password_queue)} unique passwords...")
            # process in slices to avoid giant calls
            for i in range(0, len(model_password_queue), OPENAI_BATCH):
                batch = model_password_queue[i:i+OPENAI_BATCH]
                try:
                    # call model for each password in batch (sequentially inside helper to keep prompt simple)
                    batch_results = call_openai_generate_honeywords(batch, per_pw_openai, openai_api_key, model=OPENAI_MODEL)
                    openai_results.update(batch_results)
                except Exception as e:
                    print(f"OpenAI batch failed at index {i}: {e}. Falling back to local for this batch.")
                    # fallback handled below when assembling final honeywords:
                    # passwords missing from openai_results are filled locally
                # tiny pause between slices
                time.sleep(OPENAI_DELAY)

    # 3. Second pass: attach honeywords to each breach entry
    for email_idx, (email, breaches) in enumerate(list(data.items()), start=1):
        modified = False
        if not isinstance(breaches, list):
            continue
        for b in breaches:
            if isinstance(b, dict) and "Password" in b:
                pw = b["Password"]
                # assemble honeywords list: first include OpenAI outputs (if available), then local ones to fill
                honey_final: List[str] = []
                # OpenAI part
                if per_pw_openai > 0 and use_openai:
                    ai_list = openai_results.get(pw) or []
                    # filter and ensure not equal to pw
                    ai_list = [h for h in ai_list if isinstance(h, str) and h != pw]
                    # dedupe and take up to per_pw_openai
                    seen = set()
                    ai_filtered = []
                    for item in ai_list:
                        if item not in seen:
                            seen.add(item); ai_filtered.append(item)
                            if len(ai_filtered) >= per_pw_openai:
                                break
                    honey_final.extend(ai_filtered)
                # Local part: the local quota plus whatever the OpenAI part failed to deliver
                needed_local = per_pw_local + max(0, per_pw_openai - len(honey_final))
                local_list = generate_local_honeywords(pw, needed_local)
                # remove any accidental equality with pw or items already in honey_final
                local_list = [h for h in local_list if h != pw and h not in honey_final]
                honey_final.extend(local_list)
                # final dedupe and trim to total_honeywords
                final_unique = []
                seen = set()
                for h in honey_final:
                    if h not in seen and h != pw:
                        seen.add(h)
                        final_unique.append(h)
                    if len(final_unique) >= total_honeywords:
                        break
                # if still short (e.g. local results collided with OpenAI ones), fill with random 10-character strings
                while len(final_unique) < total_honeywords:
                    filler = "".join(random.choices(string.ascii_letters + string.digits + "!@#$%", k=10))
                    if filler != pw and filler not in final_unique:
                        final_unique.append(filler)
                # attach field (the breach dict inside `data` is modified in place)
                b["Honeywords"] = final_unique[:total_honeywords]
                modified = True
        if modified:
            emails_processed += 1
        # optional progress print every N emails
        if email_idx % 1000 == 0:
            print(f"Processed {email_idx} emails; {emails_processed} emails had honeywords attached so far...")

    # 4. save augmented dataset
    with open(output_path, "w", encoding="utf-8") as out_f:
        json.dump(data, out_f, ensure_ascii=False, indent=2)

    #print(f"Saved augmented dataset to {output_path}")
    #print(f"Total emails processed with honeywords: {emails_processed}")

if __name__ == "__main__":
    # Run the honeyword pipeline with the configuration constants of this cell.
    run_settings = {
        "input_path": INPUT_FILE,
        "output_path": OUTPUT_FILE,
        "total_honeywords": TOTAL_HONEYWORDS,
        "openai_share": OPENAI_SHARE,
        "use_openai": USE_OPENAI,
        "openai_api_key": OPENAI_API_KEY,
    }
    generate_honeywords_for_dataset(**run_settings)


# Split trainData_enriched_with_honeywords.json into separate CSV files by breach name

In [None]:
"""
Split trainData_enriched_with_honeywords.json into separate CSV files by breach name.

Each CSV file will contain:
- Columns: Email, Password, Timestamp, Honeyword_1, Honeyword_2, ..., Honeyword_1000
- Rows: One row per email that was involved in that specific breach
"""

import json
import csv
import os
from collections import defaultdict
from typing import Dict, List
from datetime import datetime

# Configuration
# JSON file read by load_data(); organize_by_breach() expects it to map each
# email to a list of breach dicts.
INPUT_FILE = "trainData_enriched_with_honeywords.json"
# Directory that receives one CSV per breach plus the summary report.
OUTPUT_DIR = "breach_csv_files"
# NOTE(review): not referenced by any function in this cell -- write_breach_csv()
# sizes the honeyword columns from the data itself.
MAX_HONEYWORDS = 1000  # Adjust if your data has different number

def load_data(filepath: str) -> Dict:
    """Read and parse the JSON document stored at *filepath*.

    Args:
        filepath: Path of the JSON file to load.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
    """
    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as handle:
            return json.load(handle)

    raise FileNotFoundError(f"Input file not found: {filepath}")

def organize_by_breach(data: Dict) -> Dict[str, List[Dict]]:
    """
    Group the per-email breach entries by breach name.

    Args:
        data: Mapping of email -> list of breach dicts. Values that are not
            lists, and list items that are not dicts, are ignored.

    Returns:
        A defaultdict mapping breach name -> list of records. Each record has
        the keys "Email", "Password", "Timestamp" (taken from "BreachDate")
        and "Honeywords". Missing fields fall back to "Unknown" for the name,
        "" for password/date and [] for honeywords.
    """
    grouped = defaultdict(list)

    for email, entries in data.items():
        if not isinstance(entries, list):
            continue

        for entry in entries:
            if isinstance(entry, dict):
                grouped[entry.get("Name", "Unknown")].append({
                    "Email": email,
                    "Password": entry.get("Password", ""),
                    "Timestamp": entry.get("BreachDate", ""),
                    "Honeywords": entry.get("Honeywords", []),
                })

    return grouped

def write_breach_csv(breach_name: str, records: List[Dict], output_dir: str):
    """
    Dump the records of one breach into ``<output_dir>/<sanitized name>.csv``.

    Args:
        breach_name: Name of the breach; used (sanitized) as the file name
        records: Records with "Email", "Password", "Timestamp", "Honeywords"
        output_dir: Directory to save the CSV file in

    Returns:
        The path of the file written, or None when *records* is empty
        (no file is created in that case).
    """
    if not records:
        return

    # File name: keep alphanumerics, '-' and '_'; anything else becomes '_'.
    safe_chars = [ch if (ch.isalnum() or ch in ('-', '_')) else '_' for ch in breach_name]
    filepath = os.path.join(output_dir, "".join(safe_chars) + ".csv")

    # Every row is padded to the widest honeyword list in this breach.
    widest = max(len(rec["Honeywords"]) for rec in records)
    headers = ["Email", "Password", "Timestamp"]
    headers += [f"Honeyword_{n}" for n in range(1, widest + 1)]

    with open(filepath, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        for rec in records:
            row = [rec["Email"], rec["Password"], rec["Timestamp"]]
            words = rec["Honeywords"]
            row.extend(words)
            row.extend("" for _ in range(widest - len(words)))
            writer.writerow(row)

    return filepath

def create_summary_report(breach_data: Dict[str, List[Dict]], output_dir: str):
    """Write ``_SUMMARY_REPORT.txt`` into *output_dir*.

    The report lists every breach with its record count (largest first),
    followed by the grand total. The report path is printed when done.
    """
    summary_path = os.path.join(output_dir, "_SUMMARY_REPORT.txt")
    heavy_rule = "=" * 60 + "\n"
    light_rule = "-" * 60 + "\n"

    with open(summary_path, "w", encoding="utf-8") as f:
        # Title block
        f.write(heavy_rule)
        f.write("BREACH DATA SUMMARY REPORT\n")
        f.write(heavy_rule + "\n")
        generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        f.write(f"Generated: {generated_at}\n")
        f.write(f"Total breaches: {len(breach_data)}\n\n")

        # Breaches ranked by record count, biggest first
        ranked = sorted(breach_data.items(), key=lambda item: len(item[1]), reverse=True)

        f.write(light_rule)
        f.write(f"{'Breach Name':<40} {'Records':>10}\n")
        f.write(light_rule)

        total_records = 0
        for breach_name, records in ranked:
            count = len(records)
            f.write(f"{breach_name:<40} {count:>10}\n")
            total_records += count

        f.write(light_rule)
        f.write(f"{'TOTAL':<40} {total_records:>10}\n")
        f.write(heavy_rule)

    print(f"\nSummary report saved to: {summary_path}")

def main():
    """Split INPUT_FILE into one CSV per breach and write a summary report.

    Steps: load the JSON, group records by breach name, write one CSV per
    breach into OUTPUT_DIR, then write the summary report there as well.
    Progress and final counts are printed to stdout.
    """
    print(f"Loading data from {INPUT_FILE}...")
    data = load_data(INPUT_FILE)

    print("\nOrganizing data by breach name...")
    breach_data = organize_by_breach(data)
    print(f"Found {len(breach_data)} unique breaches")

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"\nCreating CSV files in '{OUTPUT_DIR}/' directory...")

    # Write CSV for each breach; write_breach_csv() returns None when it
    # writes nothing, so only real files are counted.
    csv_count = 0
    for breach_name, records in breach_data.items():
        if write_breach_csv(breach_name, records, OUTPUT_DIR):
            csv_count += 1

    # Create summary report
    create_summary_report(breach_data, OUTPUT_DIR)

    # The file count used to be accumulated and then dropped; report it so the
    # run summary matches what was actually written.
    print(f"\n✓ Successfully created {csv_count} CSV files in '{OUTPUT_DIR}/' directory")
    print(f"✓ Total records processed: {sum(len(records) for records in breach_data.values())}")

if __name__ == "__main__":
    main()


### Splitting trainData_enriched_with_honeywords.json into separate CSV files by breach name

In [None]:
"""
Split trainData_enriched_with_honeywords.json into separate CSV files by breach name.

Each CSV file will contain:
- Columns: Email, Password, Timestamp, Honeyword_1, Honeyword_2, ..., Honeyword_N (N = the largest honeyword count found in that breach)
- Rows: One row per email that was involved in that specific breach
"""

import json
import csv
import os
from collections import defaultdict
from typing import Dict, List
from datetime import datetime

# Configuration
# JSON file read by load_data(); organize_by_breach() and create_breach_list()
# expect it to map each email to a list of breach dicts.
INPUT_FILE = "trainData_enriched_with_honeywords.json"
# Directory that receives the per-breach CSVs, breach_list.csv and the summary report.
OUTPUT_DIR = "output"
# NOTE(review): not referenced by any function in this cell -- write_breach_csv()
# sizes the honeyword columns from the data itself.
MAX_HONEYWORDS = 100  # Adjust if your data has different number

def load_data(filepath: str) -> Dict:
    """Return the parsed contents of the JSON file at *filepath*.

    Raises:
        FileNotFoundError: If no file exists at *filepath*.
    """
    file_is_missing = not os.path.exists(filepath)
    if file_is_missing:
        raise FileNotFoundError(f"Input file not found: {filepath}")

    with open(filepath, "r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

def organize_by_breach(data: Dict) -> Dict[str, List[Dict]]:
    """
    Re-key the dataset from "per email" to "per breach".

    Args:
        data: Mapping of email -> list of breach dicts. Non-list values and
            non-dict list items are skipped.

    Returns:
        A defaultdict of breach name -> list of records, each record holding
        "Email", "Password", "Timestamp" (from "BreachDate") and "Honeywords".
        Absent fields default to "Unknown" (name), "" (password, date) and []
        (honeywords).
    """
    by_breach = defaultdict(list)

    def to_record(owner, item):
        # Flatten one breach dict into the row layout used by the CSV writer.
        return {
            "Email": owner,
            "Password": item.get("Password", ""),
            "Timestamp": item.get("BreachDate", ""),
            "Honeywords": item.get("Honeywords", []),
        }

    for email, breaches in data.items():
        if isinstance(breaches, list):
            for item in breaches:
                if not isinstance(item, dict):
                    continue
                by_breach[item.get("Name", "Unknown")].append(to_record(email, item))

    return by_breach

def write_breach_csv(breach_name: str, records: List[Dict], output_dir: str):
    """
    Write the CSV file for a single breach.

    Args:
        breach_name: Name of the breach; sanitized to form the file name
        records: Records with "Email", "Password", "Timestamp", "Honeywords"
        output_dir: Directory to save the CSV file in

    Returns:
        Path of the CSV written, or None if *records* is empty (nothing is
        written in that case).
    """
    if not records:
        return

    def keep_or_underscore(ch):
        # Only alphanumerics, '-' and '_' survive in the file name.
        return ch if ch.isalnum() or ch in ('-', '_') else '_'

    safe_name = "".join(map(keep_or_underscore, breach_name))
    filepath = os.path.join(output_dir, f"{safe_name}.csv")

    # Column count follows the longest honeyword list in this breach.
    hw_columns = max(len(entry["Honeywords"]) for entry in records)
    headers = ["Email", "Password", "Timestamp"]
    for position in range(hw_columns):
        headers.append(f"Honeyword_{position+1}")

    with open(filepath, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        for entry in records:
            row = [entry["Email"], entry["Password"], entry["Timestamp"]]
            honeywords = entry["Honeywords"]
            row.extend(honeywords)
            # Shorter lists are right-padded with empty cells.
            row.extend([""] * (hw_columns - len(honeywords)))
            writer.writerow(row)

    return filepath

def create_breach_list(data: Dict) -> List[tuple]:
    """
    Collect each distinct breach name together with a breach date.

    When a name occurs more than once, the date of its first occurrence (in
    iteration order of *data*) is kept. Non-list values and non-dict list
    items are skipped.

    Returns:
        List of (breach_name, breach_date) tuples sorted by breach_date.
    """
    first_seen = {}

    for breaches in data.values():
        if not isinstance(breaches, list):
            continue

        for item in breaches:
            if isinstance(item, dict):
                # setdefault keeps the date recorded the first time a name shows up
                first_seen.setdefault(item.get("Name", "Unknown"), item.get("BreachDate", ""))

    # Order by the date component of each (name, date) pair
    return sorted(first_seen.items(), key=lambda pair: pair[1])

def save_breach_list(breach_list: List[tuple], output_dir: str):
    """Write *breach_list* to ``<output_dir>/breach_list.csv``.

    The file has a header row (Breach_Name, Breach_Date) followed by one row
    per (name, date) pair. The path is printed and returned.
    """
    filepath = os.path.join(output_dir, "breach_list.csv")

    with open(filepath, "w", encoding="utf-8", newline="") as out:
        sheet = csv.writer(out)
        sheet.writerow(["Breach_Name", "Breach_Date"])
        sheet.writerows([name, date] for name, date in breach_list)

    print(f"\nBreach list saved to: {filepath}")
    return filepath

def create_summary_report(breach_data: Dict[str, List[Dict]], output_dir: str):
    """Produce the plain-text ``_SUMMARY_REPORT.txt`` inside *output_dir*.

    Contents: a title block with generation time and breach count, a table of
    breach name vs. record count ordered by descending count, and a TOTAL row.
    The location of the report is printed afterwards.
    """
    summary_path = os.path.join(output_dir, "_SUMMARY_REPORT.txt")
    double_line = "=" * 60
    single_line = "-" * 60

    def table_row(label, value):
        # 40-char left-aligned label, one space, 10-char right-aligned value
        return f"{label:<40} {value:>10}\n"

    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(double_line + "\n")
        f.write("BREACH DATA SUMMARY REPORT\n")
        f.write(double_line + "\n\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total breaches: {len(breach_data)}\n\n")

        # Largest breaches first
        by_size = sorted(breach_data.items(), key=lambda kv: len(kv[1]), reverse=True)

        f.write(single_line + "\n")
        f.write(table_row('Breach Name', 'Records'))
        f.write(single_line + "\n")

        total_records = 0
        for breach_name, records in by_size:
            f.write(table_row(breach_name, len(records)))
            total_records += len(records)

        f.write(single_line + "\n")
        f.write(table_row('TOTAL', total_records))
        f.write(double_line + "\n")

    print(f"\nSummary report saved to: {summary_path}")

def main():
    """Run the full split: breach list, per-breach CSVs and summary report.

    Everything is written into OUTPUT_DIR; progress is printed to stdout.
    """
    rule = "=" * 60

    print(f"Loading data from {INPUT_FILE}...")
    data = load_data(INPUT_FILE)
    print(f"Loaded {len(data)} emails")

    # Output directory must exist before anything is written
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Breach list: extract, persist, then echo to the console
    print("\nExtracting breach list...")
    breach_list = create_breach_list(data)
    print(f"Found {len(breach_list)} unique breaches")
    save_breach_list(breach_list, OUTPUT_DIR)

    print("\n" + rule)
    print("BREACH LIST (Name and Date)")
    print(rule)
    for name, date in breach_list:
        print(f"{name:<40} {date}")
    print(rule)

    print("\nOrganizing data by breach name...")
    breach_data = organize_by_breach(data)

    print(f"\nCreating CSV files in '{OUTPUT_DIR}/' directory...")

    # One CSV per breach; only files that were actually written are counted
    csv_count = 0
    breach_total = len(breach_data)
    for breach_name, records in breach_data.items():
        filepath = write_breach_csv(breach_name, records, OUTPUT_DIR)
        if not filepath:
            continue
        csv_count += 1
        print(f"  [{csv_count}/{breach_total}] {breach_name}: {len(records)} records -> {os.path.basename(filepath)}")

    # Summary report
    create_summary_report(breach_data, OUTPUT_DIR)

    record_total = sum(len(records) for records in breach_data.values())
    print(f"\n✓ Successfully created {csv_count} CSV files in '{OUTPUT_DIR}/' directory")
    print(f"✓ Breach list saved to: breach_list.csv")
    print(f"✓ Total records processed: {record_total}")

if __name__ == "__main__":
    main()

## Random Simulation adding breach Time

In [None]:
"""
Add breachTime column to all CSV files in breach_csv_files directory.
"""
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import random

# Configuration
# Directory scanned by main() for the input CSV files.
BASE_DIR = 'breach_csv_files'
# Augmented copies are written here; the originals in BASE_DIR are not modified.
OUTPUT_DIR = os.path.join(BASE_DIR, 'with_breach_time')
# Upper bound of the simulated time window passed to generate_breach_times().
END_DATE = datetime(2025, 12, 14)
# Base seed; main() seeds the global RNGs with it and process_csv_file()
# offsets it per file.
RANDOM_SEED = 42

# Step size range (number of rows affected by each time increment)
# Both bounds are inclusive: they are passed straight to random.randint().
MIN_STEP_SIZE = 31
MAX_STEP_SIZE = 78

def parse_timestamp(ts_str):
    """Parse a timestamp value into a datetime, or return None if it cannot be parsed.

    The explicit formats are tried in order (so an ambiguous ``01/02/2020`` is
    read month-first); anything else is handed to ``pd.to_datetime``.

    Args:
        ts_str: Scalar timestamp value, typically a string read from a CSV cell.

    Returns:
        A ``datetime`` (or pandas ``Timestamp`` when the pandas fallback was
        used), or None for missing or unparseable input.
    """
    if pd.isna(ts_str):
        return None

    text = str(ts_str)
    for fmt in ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%m/%d/%Y', '%d/%m/%Y'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            # Format mismatch only; do not mask KeyboardInterrupt/SystemExit
            # the way a bare ``except`` would.
            continue

    # Try pandas parser as fallback
    try:
        parsed = pd.to_datetime(ts_str)
    except (ValueError, TypeError, OverflowError):
        return None

    # pandas reports some unparseable inputs as NaT instead of raising; callers
    # test for None, so normalise that case as well.
    return None if pd.isna(parsed) else parsed

def generate_breach_times(base_timestamp, num_rows, end_date, random_seed=None):
    """
    Generate *num_rows* non-decreasing timestamps between base_timestamp and end_date.

    Rows are split into consecutive groups of MIN_STEP_SIZE..MAX_STEP_SIZE rows
    (the last group may be smaller). Each group shares one timestamp; the gaps
    between consecutive group timestamps are exponentially distributed weights
    scaled to fill the whole time span.

    Args:
        base_timestamp: Start of the window.
        num_rows: Number of timestamps to return.
        end_date: End of the window. If base_timestamp is not earlier than
            end_date, the two are swapped.
        random_seed: When given, re-seeds the global ``random`` and
            ``numpy.random`` generators before drawing.

    Returns:
        List of *num_rows* timestamps. If the window has zero length, every
        entry is the (possibly swapped) base timestamp.
    """
    if random_seed is not None:
        random.seed(random_seed)
        np.random.seed(random_seed)

    # Make sure the window runs forward in time
    if base_timestamp >= end_date:
        base_timestamp, end_date = end_date, base_timestamp

    span_seconds = (end_date - base_timestamp).total_seconds()
    if span_seconds <= 0:
        # Degenerate window: nothing to spread out
        return [base_timestamp] * num_rows

    # Partition the rows into groups that will share one timestamp each
    group_sizes = []
    rows_left = num_rows
    while rows_left > 0:
        group = min(random.randint(MIN_STEP_SIZE, MAX_STEP_SIZE), rows_left)
        group_sizes.append(group)
        rows_left -= group

    # Exponential weights, normalised so the gaps add up to the full span
    shares = np.random.exponential(scale=1.0, size=len(group_sizes))
    shares = shares / shares.sum()
    gaps = shares * span_seconds

    timestamps = []
    cursor = base_timestamp
    for group, gap in zip(group_sizes, gaps):
        cursor = cursor + timedelta(seconds=gap)
        # Guard against overshooting the end of the window
        if cursor > end_date:
            cursor = end_date
        timestamps.extend([cursor] * group)

    return timestamps

def process_csv_file(filepath, output_dir, end_date):
    """Add a simulated ``breachTime`` column to one CSV and save the result.

    The file's third column is treated as the timestamp column; the value in
    its first row is the start of the simulated window, which runs to
    *end_date*. The augmented frame is written to *output_dir* under the same
    file name, with ``breachTime`` inserted as the fourth column.

    Args:
        filepath: Path of the CSV to process. Files that do not end in
            ``.csv`` or whose name starts with ``_`` are ignored.
        output_dir: Directory that receives the augmented CSV.
        end_date: End of the simulated time window.

    Returns:
        A dict with 'filename', 'rows', 'base_timestamp', 'first_breach_time'
        and 'last_breach_time', or None if the file was skipped, empty, or
        failed (the error is printed, not raised).
    """
    import zlib  # local import: only needed for the stable per-file seed below

    filename = os.path.basename(filepath)

    # Skip non-CSV files and special files
    if not filename.endswith('.csv') or filename.startswith('_'):
        return None

    print(f"\nProcessing: {filename}")

    try:
        # Read CSV
        df = pd.read_csv(filepath)

        if len(df) == 0:
            print(f"  Skipping empty file: {filename}")
            return None

        # Identify the timestamp column (should be 3rd column, index 2)
        timestamp_col = df.columns[2]
        print(f"  Timestamp column: '{timestamp_col}'")
        print(f"  Number of rows: {len(df)}")

        # Parse the base timestamp from first row
        base_timestamp_str = df.iloc[0][timestamp_col]
        base_timestamp = parse_timestamp(base_timestamp_str)

        if base_timestamp is None:
            print(f"  Warning: Could not parse base timestamp: {base_timestamp_str}")
            base_timestamp = datetime(2020, 1, 1)  # Default fallback

        print(f"  Base timestamp: {base_timestamp}")
        print(f"  End date: {end_date}")

        # Unique seed per file. The built-in hash() of a str is salted per
        # interpreter run, which made the "seeded" output differ between runs;
        # CRC32 of the file name is stable.
        file_seed = RANDOM_SEED + zlib.crc32(filename.encode('utf-8')) % 10000

        # Generate breach times
        breach_times = generate_breach_times(
            base_timestamp=base_timestamp,
            num_rows=len(df),
            end_date=end_date,
            random_seed=file_seed
        )

        # Convert to strings for CSV
        breach_times_str = [dt.strftime('%Y-%m-%d %H:%M:%S') for dt in breach_times]

        # Insert breachTime as 4th column (index 3)
        df.insert(3, 'breachTime', breach_times_str)

        # Save to output directory
        output_path = os.path.join(output_dir, filename)
        df.to_csv(output_path, index=False)

        print(f"  ✓ Saved to: {output_path}")
        print(f"  First breachTime: {breach_times_str[0]}")
        print(f"  Last breachTime: {breach_times_str[-1]}")

        return {
            'filename': filename,
            'rows': len(df),
            'base_timestamp': base_timestamp,
            'first_breach_time': breach_times[0],
            'last_breach_time': breach_times[-1]
        }

    except Exception as e:
        # Best-effort batch job: report the failure and let the caller move on
        print(f"  Error processing {filename}: {e}")
        return None

def main():
    """Add a breachTime column to every CSV in BASE_DIR and print a summary.

    Files whose name starts with ``_`` are left out. Results are written to
    OUTPUT_DIR by process_csv_file().
    """
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    rule = "=" * 70
    thin_rule = "-" * 70

    # Banner with the effective settings
    print(rule)
    print("ADD BREACH TIME COLUMN TO CSV FILES")
    print(rule)
    print(f"Input directory: {BASE_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"End date: {END_DATE.strftime('%Y-%m-%d')}")
    print(f"Step size range: {MIN_STEP_SIZE} to {MAX_STEP_SIZE} rows")
    print(rule)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Candidate inputs: *.csv directly inside BASE_DIR, excluding '_'-prefixed files
    csv_files = []
    for entry in os.listdir(BASE_DIR):
        if entry.endswith('.csv') and not entry.startswith('_'):
            csv_files.append(os.path.join(BASE_DIR, entry))

    file_total = len(csv_files)
    print(f"\nFound {file_total} CSV files to process")

    # Keep only the outcomes of files that were actually processed
    results = []
    for position, filepath in enumerate(csv_files, 1):
        print(f"\n[{position}/{file_total}]", end=" ")
        outcome = process_csv_file(filepath, OUTPUT_DIR, END_DATE)
        if outcome:
            results.append(outcome)

    print("\n" + rule)
    print("PROCESSING SUMMARY")
    print(rule)
    print(f"Total files processed: {len(results)}")
    print(f"Total rows processed: {sum(r['rows'] for r in results):,}")

    if results:
        print("\nSample breach time ranges:")
        print(thin_rule)
        for outcome in results[:10]:  # Show first 10
            print(f"{outcome['filename']:<40}")
            print(f"  Base: {outcome['base_timestamp'].strftime('%Y-%m-%d')}")
            print(f"  First breachTime: {outcome['first_breach_time'].strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"  Last breachTime: {outcome['last_breach_time'].strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n" + rule)
    print("✓ PROCESSING COMPLETE!")
    print(rule)
    print(f"Output files saved to: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()

## Merge all CSV files, with the breach source as the first column

In [None]:
import pandas as pd
import os
from typing import List

# Configuration
# Directory whose CSV files are merged.
# NOTE(review): absolute, machine-specific path -- presumably the output of the
# "adding breach time" step; confirm it matches where that step wrote its files.
INPUT_DIR = '/home/Passwords/breach_csv_files/with_breach_time'
# Destination of the single merged CSV.
OUTPUT_FILE = '/home/Passwords/breach_csv_files/merged_all_breaches_100hw.csv'
# Columns kept from each input file, counted before Breach_Source is prepended.
MAX_COLUMNS = 104  # First 104 columns (4 base columns + 100 honeywords)

def get_csv_files(directory: str) -> List[str]:
    """Return the sorted paths of the CSV files directly inside *directory*.

    Entries whose name starts with ``_`` are excluded.

    Raises:
        FileNotFoundError: If *directory* does not exist.
    """
    if not os.path.exists(directory):
        raise FileNotFoundError(f"Directory not found: {directory}")

    matches = []
    for entry in os.listdir(directory):
        if entry.startswith('_') or not entry.endswith('.csv'):
            continue
        matches.append(os.path.join(directory, entry))

    matches.sort()
    return matches

def read_and_process_csv(filepath: str, max_columns: int = MAX_COLUMNS) -> pd.DataFrame:
    """
    Load one breach CSV, cap its width, and tag every row with its source.

    Args:
        filepath: Path to CSV file
        max_columns: Maximum number of leading columns to keep from the file

    Returns:
        DataFrame whose first column is 'Breach_Source' (the file name with
        '.csv' removed), followed by at most max_columns original columns.
        None if reading or processing fails; the error is printed.
    """
    filename = os.path.basename(filepath)
    breach_name = filename.replace('.csv', '')

    try:
        frame = pd.read_csv(filepath)

        # Drop everything beyond the first max_columns columns
        too_wide = len(frame.columns) > max_columns
        if too_wide:
            frame = frame.iloc[:, :max_columns]
            print(f"  Trimmed to {max_columns} columns")

        # Source tag goes in front of the original columns
        frame.insert(0, 'Breach_Source', breach_name)
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        return None

    return frame

def merge_all_csv_files(input_dir: str, output_file: str, max_columns: int = MAX_COLUMNS):
    """
    Merge all CSV files from input directory into a single CSV file.

    Each file is loaded through read_and_process_csv() (which trims it to
    max_columns and prepends 'Breach_Source'), the frames are concatenated
    row-wise, and the result is written to output_file without the index.
    Progress, column structure, per-source row counts, a small sample and
    size statistics are printed along the way.

    Args:
        input_dir: Directory containing CSV files
        output_file: Path to output merged CSV file
        max_columns: Maximum number of columns to keep from each file

    Returns:
        None. Returns early (writing nothing) when no CSV files are found or
        when every file was empty or failed to load.

    Raises:
        FileNotFoundError: From get_csv_files() when input_dir does not exist.
    """
    print("=" * 70)
    print("MERGE ALL BREACH CSV FILES (LIMITED TO 100 HONEYWORDS)")
    print("=" * 70)
    print(f"Input directory: {input_dir}")
    print(f"Output file: {output_file}")
    print(f"Max columns per file: {max_columns}")
    print("=" * 70)

    # Get all CSV files
    csv_files = get_csv_files(input_dir)

    if not csv_files:
        print("No CSV files found in the directory!")
        return

    print(f"\nFound {len(csv_files)} CSV files to merge")

    # Read and process each file
    all_dataframes = []
    total_rows = 0

    for i, filepath in enumerate(csv_files, 1):
        filename = os.path.basename(filepath)
        print(f"\n[{i}/{len(csv_files)}] Processing: {filename}")

        df = read_and_process_csv(filepath, max_columns)

        # None signals a read error; zero-row frames are skipped as well
        if df is not None and len(df) > 0:
            print(f"  Rows: {len(df):,}")
            print(f"  Columns: {len(df.columns)} (including Breach_Source)")

            # Show column structure
            base_cols = [col for col in df.columns if not col.startswith('Honeyword_')]
            hw_cols = [col for col in df.columns if col.startswith('Honeyword_')]
            print(f"  Base columns: {base_cols}")
            print(f"  Honeyword columns: {len(hw_cols)} (Honeyword_1 to Honeyword_{len(hw_cols)})")

            all_dataframes.append(df)
            total_rows += len(df)
        else:
            print(f"  Skipped (empty or error)")

    # Merge all dataframes
    if not all_dataframes:
        print("\nNo data to merge!")
        return

    print(f"\n{'='*70}")
    print("MERGING DATAFRAMES...")
    print(f"{'='*70}")

    # Row-wise concatenation with a fresh 0..n-1 index
    merged_df = pd.concat(all_dataframes, ignore_index=True)

    print(f"Total rows in merged file: {len(merged_df):,}")
    print(f"Total columns: {len(merged_df.columns)}")

    # Display column names
    print(f"\nColumn structure:")
    print(f"  1. Breach_Source (added)")
    # columns[1:] skips Breach_Source so it is not counted as a base column
    base_cols = [col for col in merged_df.columns[1:] if not col.startswith('Honeyword_')]
    hw_cols = [col for col in merged_df.columns if col.startswith('Honeyword_')]

    print(f"  Base columns ({len(base_cols)}): {base_cols}")
    print(f"  Honeyword columns ({len(hw_cols)}): Honeyword_1 to Honeyword_{len(hw_cols)}")

    # Display breach source distribution
    print(f"\n{'='*70}")
    print("BREACH SOURCE DISTRIBUTION")
    print(f"{'='*70}")
    # sort_index() orders the counts alphabetically by source name
    breach_counts = merged_df['Breach_Source'].value_counts().sort_index()
    for breach, count in breach_counts.items():
        print(f"{breach:<40} {count:>8,} rows")

    # Save merged file
    print(f"\n{'='*70}")
    print("SAVING MERGED FILE...")
    print(f"{'='*70}")

    merged_df.to_csv(output_file, index=False)

    print(f"✓ Successfully saved to: {output_file}")
    print(f"✓ Total rows: {len(merged_df):,}")
    print(f"✓ Total columns: {len(merged_df.columns)}")

    # Display sample rows (first few columns only)
    print(f"\n{'='*70}")
    print("SAMPLE ROWS (First 5 rows, first 8 columns)")
    print(f"{'='*70}")
    sample_cols = min(8, len(merged_df.columns))
    print(merged_df.iloc[:5, :sample_cols].to_string())

    # Display statistics
    print(f"\n{'='*70}")
    print("SUMMARY STATISTICS")
    print(f"{'='*70}")
    print(f"Files merged: {len(all_dataframes)}")
    print(f"Total rows: {len(merged_df):,}")
    print(f"Total columns: {len(merged_df.columns)}")
    print(f"  - Breach_Source: 1")
    print(f"  - Base columns: {len(base_cols)}")
    print(f"  - Honeyword columns: {len(hw_cols)}")
    print(f"Unique breach sources: {merged_df['Breach_Source'].nunique()}")

    # Calculate file size
    file_size_mb = os.path.getsize(output_file) / (1024*1024)
    print(f"File size: {file_size_mb:.2f} MB")

    # Memory usage
    memory_mb = merged_df.memory_usage(deep=True).sum() / (1024*1024)
    print(f"Memory usage: {memory_mb:.2f} MB")

    print(f"{'='*70}")

    # Verify honeyword columns
    print(f"\n{'='*70}")
    print("HONEYWORD COLUMN VERIFICATION")
    print(f"{'='*70}")
    hw_columns = [col for col in merged_df.columns if col.startswith('Honeyword_')]
    print(f"Total Honeyword columns: {len(hw_columns)}")
    if hw_columns:
        print(f"First Honeyword column: {hw_columns[0]}")
        print(f"Last Honeyword column: {hw_columns[-1]}")

        # Check for any missing honeyword numbers
        # NOTE(review): the expected range is fixed at Honeyword_1..Honeyword_100
        # regardless of the max_columns argument.
        expected_hw = [f'Honeyword_{i}' for i in range(1, 101)]
        actual_hw = [col for col in hw_columns if col in expected_hw]
        print(f"Expected 100 honeywords, found: {len(actual_hw)}")

        if len(actual_hw) < 100:
            missing = set(expected_hw) - set(actual_hw)
            # sorted() compares the names as strings, so 'Honeyword_10' is
            # listed before 'Honeyword_2'; only the first ten are shown.
            print(f"Missing honeyword columns: {sorted(missing)[:10]}...")

    print(f"{'='*70}")

def main():
    """Merge the CSVs in INPUT_DIR into OUTPUT_FILE and print a closing note."""
    merge_all_csv_files(INPUT_DIR, OUTPUT_FILE, MAX_COLUMNS)

    # Closing message, one entry per printed line
    closing_lines = (
        "\n✓ MERGE COMPLETE!",
        f"\nMerged file location: {OUTPUT_FILE}",
        "This file contains:",
        "  - Breach_Source (1st column)",
        "  - Email, Password, Timestamp, breachTime (base columns)",
        "  - Honeyword_1 through Honeyword_100 (100 honeywords)",
    )
    for line in closing_lines:
        print(line)

if __name__ == "__main__":
    main()

# Training Pipeline for Breach Detection

In [None]:
#!/usr/bin/env python3
"""
Feature Engineering + Label Simulation + Temporal Splits
This module handles:
- Per-email timeline construction
- Feature computation (similarity, temporal, cross-platform)
- Label simulation with campaign drift
- Temporal train/val/test splitting
- Caching to parquet files

GPU Acceleration:
- Uses PyTorch for TF-IDF similarity computation on GPU when available
"""

from __future__ import annotations

import os
import json
import math
import argparse
import warnings
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional, Any

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

import torch

# -------------------------
# Configuration
# -------------------------

@dataclass
class BuildConfig:
    """Tunable parameters for the feature-building stage.

    The code that consumes these fields is outside this excerpt, so the
    per-field notes below are assumptions to confirm.
    """
    # presumably the number of Honeyword_<i> columns used per row -- TODO confirm
    n_honeywords: int = 50
    # presumably the character n-gram size for the TF-IDF vectorizer -- TODO confirm
    tfidf_ngram: int = 3
    max_features: int = 5000  # NEW: Limit vocabulary size to prevent OOM
    # NOTE(review): looks like a similarity cut-off in [0, 1]; verify against its use
    sim_threshold_noisy: float = 0.85
    # NOTE(review): a fraction (0.40); what it is a fraction of is not visible here
    early_frac: float = 0.40
    # NOTE(review): a window length in days, judging by the name; confirm
    cross_platform_leak_days: int = 30

@dataclass
class SplitConfig:
    """Fractions for the train/validation/test split.

    The defaults add up to 1.0. How the fractions are applied is not visible
    in this excerpt (presumably the temporal split named in the module
    docstring -- TODO confirm).
    """
    train_frac: float = 0.60
    val_frac: float = 0.20
    test_frac: float = 0.20


# -------------------------
# Utils
# -------------------------

def ensure_dir(path: str) -> None:
    """Create directory *path*, including missing parents; no error if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def save_json(obj: Any, path: str) -> None:
    """Serialise *obj* as 2-space-indented JSON into the file at *path*.

    Values the json module cannot encode natively are written as ``str(value)``.
    """
    with open(path, "w") as sink:
        json.dump(obj, sink, default=str, indent=2)

def to_datetime_safe(s: pd.Series) -> pd.Series:
    """Convert *s* to timezone-naive datetimes expressed in UTC.

    Values that cannot be parsed become NaT instead of raising.
    """
    as_utc = pd.to_datetime(s, errors="coerce", utc=True)
    # Drop the UTC tz marker while keeping the UTC wall-clock values
    return as_utc.dt.tz_convert(None)

def days_between(a: pd.Timestamp, b: pd.Timestamp) -> float:
    """Return ``a - b`` in (fractional) days, or NaN if either value is missing."""
    if not (pd.isna(a) or pd.isna(b)):
        seconds_per_day = 3600 * 24
        elapsed = a - b
        return elapsed.total_seconds() / seconds_per_day
    return float("nan")

def get_device() -> torch.device:
    """Pick the compute device, preferring CUDA, then Apple MPS, then CPU.

    The choice is printed before the device is returned.
    """
    if torch.cuda.is_available():
        chosen = torch.device("cuda")
        print(f"Using CUDA GPU: {torch.cuda.get_device_name(0)}")
        return chosen

    # Older torch builds have no torch.backends.mps attribute at all
    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        chosen = torch.device("mps")
        print("Using Apple MPS GPU")
        return chosen

    chosen = torch.device("cpu")
    print("Using CPU")
    return chosen


# -------------------------
# GPU-Accelerated Similarity Index (FIXED)
# -------------------------

def _collect_all_strings(df: pd.DataFrame, n_hw: int) -> List[str]:
    """Collect all password and honeyword strings for TF-IDF fitting."""
    cols_hw = [f"Honeyword_{i}" for i in range(1, n_hw + 1) if f"Honeyword_{i}" in df.columns]
    all_strings = []
    all_strings.extend(df["Password"].astype(str).fillna("").tolist())latform Detection using Models
    for c in cols_hw:
        all_strings.extend(df[c].astype(str).fillna("").tolist())
    all_strings = [s if isinstance(s, str) else str(s) for s in all_strings]
    all_strings = [s for s in all_strings if s is not None]
    if "" not in all_strings:
        all_strings.append("")
    return all_strings


class GPUSimilarityIndex:
    """
    Character n-gram TF-IDF index over a de-duplicated list of strings.

    The TF-IDF matrix is kept as a float32 SciPy CSR matrix and per-row L2
    norms are pre-computed, so cosine similarities between one string and a
    list of others can be obtained from sparse dot products; only the small
    result vector is ever densified. The selected torch device is stored on
    the instance, but the similarity code in this class runs on NumPy/SciPy.
    """

    def __init__(
        self,
        strings: List[str],
        ngram: int = 3,
        max_features: int = 5000,
        device: Optional[torch.device] = None
    ):
        """
        Fit the vectorizer on the unique entries of *strings* and cache norms.

        Args:
            strings: Strings to index; duplicates are dropped, first-seen order kept.
            ngram: Character n-gram length (used as both min and max).
            max_features: Vocabulary cap passed to the vectorizer.
            device: Device to remember; auto-detected when not given.
        """
        self.device = device or get_device()

        # Capped vocabulary and float32 storage keep the matrix small.
        self.vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=(ngram, ngram),
            lowercase=False,
            max_features=max_features,
            dtype=np.float32,
        )

        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_strings = list(dict.fromkeys(strings))
        self.strings = unique_strings
        self.s2i = dict(zip(unique_strings, range(len(unique_strings))))

        print(f"Fitting TF-IDF on {len(self.strings)} unique strings...")

        matrix = self.vectorizer.fit_transform(self.strings)
        n_rows, n_cols = matrix.shape

        print(f"TF-IDF vocabulary size: {n_cols}")
        print(f"TF-IDF matrix shape: {matrix.shape}")
        print(f"TF-IDF sparsity: {1 - matrix.nnz / (n_rows * n_cols):.4f}")

        # CSR gives cheap row slicing, which is all the lookups below need.
        self.tfidf_sparse = matrix.tocsr().astype(np.float32)

        # Row-wise L2 norms, floored so all-zero rows never divide by zero.
        row_sq_sums = self.tfidf_sparse.multiply(self.tfidf_sparse).sum(axis=1)
        squared_norms = np.asarray(row_sq_sums).ravel()
        self.norms = np.clip(np.sqrt(squared_norms), 1e-8, None)

        print("TF-IDF index ready (kept sparse to save memory)")

    def idx(self, s: str) -> int:
        """Return the row index of *s*; unknown strings map to the row of ``""`` (or 0)."""
        lookup = self.s2i
        if s in lookup:
            return lookup[s]
        return lookup.get("", 0)

    def sims_to_history(self, current_idx: int, hist_indices: List[int]) -> Tuple[float, float]:
        """
        Cosine similarity of row *current_idx* against the rows in *hist_indices*.

        Returns:
            ``(max, mean)`` of the similarities, or ``(0.0, 0.0)`` when the
            history is empty.
        """
        if not hist_indices:
            return 0.0, 0.0

        query_row = self.tfidf_sparse[current_idx]
        history_rows = self.tfidf_sparse[hist_indices]

        # (n_hist, vocab) x (vocab, 1) -> (n_hist, 1); only this result is densified.
        dots = history_rows.dot(query_row.T).toarray().ravel()

        query_norm = self.norms[current_idx]
        history_norms = self.norms[hist_indices]
        sims = dots / (history_norms * query_norm + 1e-8)

        if sims.size == 0:
            return 0.0, 0.0

        return float(sims.max()), float(sims.mean())

    def batch_sims_to_history(
        self,
        current_indices: List[int],
        hist_indices_list: List[List[int]]
    ) -> List[Tuple[float, float]]:
        """
        Apply :meth:`sims_to_history` to each ``(current, history)`` pair.

        The two lists are walked in lockstep; the result has one ``(max, mean)``
        tuple per pair.
        """
        return [
            self.sims_to_history(query_idx, history)
            for query_idx, history in zip(current_indices, hist_indices_list)
        ]


# -------------------------
# Feature Engineering
# -------------------------

def build_features_and_labels(
    df: pd.DataFrame,
    build_cfg: BuildConfig,
    device: Optional[torch.device] = None,
) -> pd.DataFrame:
    """
    Build all features and simulated labels for the dataset.

    Rows are processed per email in AttackTime order. Every feature of a row is
    computed from the state accumulated over that email's EARLIER rows only;
    the state is updated after the row's features have been recorded.

    Features computed:
    - Password similarity features (current vs historical passwords/honeywords)
    - Honeyword features: ``honeyword_trigger`` (the current password equals a
      honeyword seen on an earlier row) and ``current_hw_matches_prev_pw`` (one
      of the current honeywords equals an earlier password)
    - Temporal features (time since last leak/attack, sequence index)
    - Cross-platform features (platforms seen, overlap score, cross-platform reuse)

    Labels (column ``y``) are simulated with campaign drift: rows in the early
    part of an email's history are positive only on exact password reuse; later
    rows are also positive on a honeyword trigger, recent cross-platform reuse,
    or a password similarity above the configured threshold.

    Args:
        df: Input frame; must contain ``Breach_Source``, ``Email``, ``Password``,
            ``Timestamp`` and ``AttackTime``. ``Honeyword_<i>`` columns are optional.
        build_cfg: Build settings (honeyword count, n-gram size, thresholds).
        device: Torch device handed to the similarity index; auto-detected if None.

    Returns:
        A sorted copy of *df* (by Email, then AttackTime) with the feature
        columns and ``y`` added. The input frame is not modified.

    Raises:
        ValueError: If a required column is missing.
    """
    required = {"Breach_Source", "Email", "Password", "Timestamp", "AttackTime"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    if device is None:
        device = get_device()

    def _clean_str(col: pd.Series) -> pd.Series:
        # Replace missing values with "" BEFORE casting to str. Casting first
        # turns NaN into the literal "nan", which a later fillna cannot undo
        # and which would then be treated as a real password/honeyword.
        return col.astype(object).where(col.notna(), "").astype(str)

    # Normalize types
    df = df.copy()
    df["Breach_Source"] = df["Breach_Source"].astype(str)
    df["Email"] = df["Email"].astype(str)
    df["Password"] = _clean_str(df["Password"])
    df["Timestamp"] = to_datetime_safe(df["Timestamp"])
    df["AttackTime"] = to_datetime_safe(df["AttackTime"])

    # Stable sort so rows with equal (Email, AttackTime) keep their input order.
    df = df.sort_values(["Email", "AttackTime"], kind="mergesort").reset_index(drop=True)

    n_hw = build_cfg.n_honeywords
    hw_cols = [f"Honeyword_{i}" for i in range(1, n_hw + 1) if f"Honeyword_{i}" in df.columns]
    for c in hw_cols:
        df[c] = _clean_str(df[c])

    # Build TF-IDF similarity index with memory optimizations
    print("Building TF-IDF similarity index...")
    all_strings = _collect_all_strings(df, n_hw=n_hw)
    sim_index = GPUSimilarityIndex(
        all_strings,
        ngram=build_cfg.tfidf_ngram,
        # NOTE(review): BuildConfig's full field list is not visible from this
        # function; fall back to the index's own default if the field is absent.
        max_features=getattr(build_cfg, "max_features", 5000),
        device=device
    )

    feat_cols = [
        "sim_to_prev_passwords_max",
        "sim_to_prev_passwords_mean",
        "exact_password_reuse",
        "sim_to_prev_honeywords_max",
        "sim_to_prev_honeywords_mean",
        "honeyword_trigger",
        "current_hw_matches_prev_pw",
        "time_since_last_leak",
        "time_since_last_attack",
        "attack_sequence_index",
        "delta_similarity",
        "num_platforms_seen",
        "platform_overlap_score",
        "cross_platform_reuse",
    ]

    # Features are written into plain arrays and attached to the frame once at
    # the end; per-cell DataFrame writes inside the row loop are very slow.
    n_rows = len(df)
    feats: Dict[str, np.ndarray] = {col: np.zeros(n_rows, dtype=np.float64) for col in feat_cols}

    # Pull the columns the loop reads into Python lists for the same reason.
    emails = df["Email"].tolist()
    passwords = df["Password"].tolist()
    platforms = df["Breach_Source"].tolist()
    leak_times = df["Timestamp"].tolist()
    attack_times = df["AttackTime"].tolist()
    hw_rows = df[hw_cols].to_numpy(dtype=object).tolist() if hw_cols else None

    # Per-email state tracking
    state_prev_pw_set: Dict[str, set] = {}
    state_prev_pw_indices: Dict[str, List[int]] = {}
    state_prev_hw_set: Dict[str, set] = {}
    state_prev_hw_indices: Dict[str, List[int]] = {}
    state_last_leak: Dict[str, pd.Timestamp] = {}
    state_last_attack: Dict[str, pd.Timestamp] = {}
    state_last_simmax: Dict[str, float] = {}
    state_seq_idx: Dict[str, int] = {}
    state_platforms_seen: Dict[str, set] = {}
    state_pw_to_platforms: Dict[str, Dict[str, set]] = {}

    print("Computing features...")
    for i in range(n_rows):
        if i % 5000 == 0 and i > 0:
            print(f"  Processed {i}/{n_rows} rows...")

        email = emails[i]
        pw = passwords[i]
        platform = platforms[i]
        leak_time = leak_times[i]
        attack_time = attack_times[i]
        cur_idx = sim_index.idx(pw)

        raw_hw = hw_rows[i] if hw_rows is not None else []
        cur_hw = [h for h in raw_hw if isinstance(h, str) and h != ""]

        prev_pw_set = state_prev_pw_set.get(email, set())
        prev_pw_indices = state_prev_pw_indices.get(email, [])
        prev_hw_set = state_prev_hw_set.get(email, set())
        prev_hw_indices = state_prev_hw_indices.get(email, [])

        exact_reuse = 1.0 if pw in prev_pw_set else 0.0
        sim_pw_max, sim_pw_mean = sim_index.sims_to_history(cur_idx, prev_pw_indices)
        sim_hw_max, sim_hw_mean = sim_index.sims_to_history(cur_idx, prev_hw_indices)

        # A honeyword trigger is the current password matching a previously
        # seen honeyword. (This used to duplicate the expression below, which
        # left prev_hw_set unused and made the two features identical.)
        hw_trigger = 1.0 if pw in prev_hw_set else 0.0
        cur_hw_matches_prev_pw = 1.0 if any(h in prev_pw_set for h in cur_hw) else 0.0

        last_leak = state_last_leak.get(email, pd.NaT)
        last_attack = state_last_attack.get(email, pd.NaT)
        tsl = days_between(attack_time, last_leak)
        tsa = days_between(attack_time, last_attack)

        seq = int(state_seq_idx.get(email, 0))
        last_simmax = state_last_simmax.get(email, 0.0)
        delta_sim = sim_pw_max - last_simmax

        platforms_seen = state_platforms_seen.get(email, set())
        pw_map = state_pw_to_platforms.get(email, {})
        platforms_that_used_pw = pw_map.get(pw, set())

        num_platforms_seen = float(len(platforms_seen))
        overlap_score = float(len(platforms_that_used_pw) / len(platforms_seen)) if platforms_seen else 0.0
        cross_platform_reuse = 1.0 if (len(platforms_that_used_pw - {platform}) > 0) else 0.0

        # Record features (unknown or negative time gaps are stored as 0.0)
        feats["sim_to_prev_passwords_max"][i] = sim_pw_max
        feats["sim_to_prev_passwords_mean"][i] = sim_pw_mean
        feats["exact_password_reuse"][i] = exact_reuse
        feats["sim_to_prev_honeywords_max"][i] = sim_hw_max
        feats["sim_to_prev_honeywords_mean"][i] = sim_hw_mean
        feats["honeyword_trigger"][i] = hw_trigger
        feats["current_hw_matches_prev_pw"][i] = cur_hw_matches_prev_pw
        feats["time_since_last_leak"][i] = 0.0 if math.isnan(tsl) else float(max(tsl, 0.0))
        feats["time_since_last_attack"][i] = 0.0 if math.isnan(tsa) else float(max(tsa, 0.0))
        feats["attack_sequence_index"][i] = seq
        feats["delta_similarity"][i] = delta_sim
        feats["num_platforms_seen"][i] = num_platforms_seen
        feats["platform_overlap_score"][i] = overlap_score
        feats["cross_platform_reuse"][i] = cross_platform_reuse

        # Update state AFTER computing features so a row never sees itself.
        state_last_leak[email] = leak_time if not pd.isna(leak_time) else last_leak
        state_last_attack[email] = attack_time if not pd.isna(attack_time) else last_attack
        state_last_simmax[email] = sim_pw_max
        state_seq_idx[email] = seq + 1

        state_prev_pw_set.setdefault(email, set()).add(pw)
        state_prev_pw_indices.setdefault(email, []).append(cur_idx)
        state_prev_hw_set.setdefault(email, set()).update(cur_hw)
        state_prev_hw_indices.setdefault(email, []).extend(sim_index.idx(h) for h in cur_hw)

        state_platforms_seen.setdefault(email, set()).add(platform)
        state_pw_to_platforms.setdefault(email, {}).setdefault(pw, set()).add(platform)

    for col in feat_cols:
        df[col] = feats[col]

    # Label simulation with campaign drift
    print("Simulating labels with campaign drift...")
    df["y"] = 0

    # Position of each row within its email's history, scaled to [0, 1];
    # single-row emails get 0 (the zero denominator is replaced by 1).
    df["_rank_in_email"] = df.groupby("Email").cumcount()
    df["_len_in_email"] = df.groupby("Email")["Email"].transform("size")
    df["_frac_in_email"] = (df["_rank_in_email"] / (df["_len_in_email"].clip(lower=1) - 1).replace(0, 1)).astype(float)

    sim_thr = build_cfg.sim_threshold_noisy
    early_frac = build_cfg.early_frac
    leak_days = build_cfg.cross_platform_leak_days

    base_exact = df["exact_password_reuse"] >= 1.0
    base_honey = df["honeyword_trigger"] >= 1.0
    base_cross = (df["cross_platform_reuse"] >= 1.0) & (df["time_since_last_leak"] < float(leak_days))

    late = df["_frac_in_email"] > early_frac
    noisy_sim = df["sim_to_prev_passwords_max"] > sim_thr

    # Early rows: exact reuse only. Late rows: any of the four signals.
    y_late = (base_exact | base_honey | base_cross | noisy_sim).astype(int)
    y_early = (base_exact).astype(int)

    df["y"] = np.where(late, y_late, y_early).astype(int)

    df = df.drop(columns=["_rank_in_email", "_len_in_email", "_frac_in_email"])

    print(f"Label distribution: {df['y'].value_counts().to_dict()}")

    return df


# -------------------------
# Temporal Split
# -------------------------

def temporal_split(
    df: pd.DataFrame,
    split_cfg: SplitConfig
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split *df* chronologically by ``AttackTime`` into (train, val, test) copies.

    The earliest ``train_frac`` of the rows become train, the next ``val_frac``
    become val, and whatever remains becomes test. The sort is stable and the
    index is reset before slicing.
    """
    ordered = df.sort_values("AttackTime", kind="mergesort").reset_index(drop=True)
    total = len(ordered)

    train_end = int(total * split_cfg.train_frac)
    val_end = train_end + int(total * split_cfg.val_frac)

    # Consecutive, non-overlapping slices of the time-ordered frame.
    bounds = ((None, train_end), (train_end, val_end), (val_end, None))
    train, val, test = (ordered.iloc[lo:hi].copy() for lo, hi in bounds)

    assert len(train) + len(val) + len(test) == total

    print(f"Split sizes - Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

    return train, val, test


# -------------------------
# Caching
# -------------------------

def cache_paths(out_dir: str) -> Dict[str, str]:
    """Map each cache artifact key to its file path inside *out_dir*."""
    filenames = {
        "features": "features.parquet",
        "train": "train.parquet",
        "val": "val.parquet",
        "test": "test.parquet",
        "build_meta": "build_meta.json",
    }
    return {key: os.path.join(out_dir, fname) for key, fname in filenames.items()}

def build_and_cache(
    csv_path: str,
    out_dir: str,
    build_cfg: BuildConfig,
    split_cfg: SplitConfig,
    device: Optional[torch.device] = None
) -> None:
    """
    Run stage 1 end to end: load the CSV, build features/labels, split, and cache.

    Writes the full feature frame and the three splits as parquet files, plus a
    JSON file with summary counts and both configs, into *out_dir* (created if
    missing).
    """
    ensure_dir(out_dir)

    print(f"Loading CSV from {csv_path}...")
    raw = pd.read_csv(csv_path)
    print(f"Loaded {len(raw)} rows")

    featured = build_features_and_labels(raw, build_cfg, device=device)
    train, val, test = temporal_split(featured, split_cfg)

    paths = cache_paths(out_dir)

    print(f"Saving cached data to {out_dir}...")
    artifacts = (("features", featured), ("train", train), ("val", val), ("test", test))
    for key, frame in artifacts:
        frame.to_parquet(paths[key], index=False)

    labels = featured["y"]
    meta = {
        "csv": csv_path,
        "rows": int(len(featured)),
        "platforms": int(featured["Breach_Source"].nunique()),
        "unique_emails": int(featured["Email"].nunique()),
        "positive_labels": int(labels.sum()),
        "negative_labels": int((labels == 0).sum()),
        "build_cfg": asdict(build_cfg),
        "split_cfg": asdict(split_cfg),
    }
    save_json(meta, paths["build_meta"])

    print("Stage 1 complete!")


def load_cached(out_dir: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load the cached (train, val, test) splits from *out_dir*.

    Raises:
        FileNotFoundError: If any of the three split files is missing.
    """
    paths = cache_paths(out_dir)
    split_names = ("train", "val", "test")

    # Verify all files up front so nothing is read when the cache is incomplete.
    for name in split_names:
        if not os.path.exists(paths[name]):
            raise FileNotFoundError(f"Missing cached {name} split in {out_dir}. Run stage 1 first.")

    frames = [pd.read_parquet(paths[name]) for name in split_names]

    # Re-coerce the time columns to datetime after the parquet round trip.
    for frame in frames:
        for column in ("AttackTime", "Timestamp"):
            frame[column] = pd.to_datetime(frame[column], errors="coerce")

    train, val, test = frames
    return train, val, test


# -------------------------
# Main
# -------------------------

import torch

def main():
    """Stage-1 entry point: configure, pick a device, then build and cache the dataset."""
    # ---- user configuration -------------------------------------------
    csv_path = "clean_merged_all_breaches_50hw.csv"
    out_dir = "outputFolder"

    # Device override: "cuda", "cpu", "mps", or None to auto-detect.
    device_name = None

    # ---- device setup ---------------------------------------------------
    device = torch.device(device_name) if device_name else get_device()

    # ---- config objects -------------------------------------------------
    build_cfg = BuildConfig(
        n_honeywords=50,
        tfidf_ngram=3,
        sim_threshold_noisy=0.85,
        early_frac=0.40,
        cross_platform_leak_days=30,
    )
    split_cfg = SplitConfig(
        train_frac=0.60,
        val_frac=0.20,
        test_frac=0.20,
    )

    # ---- run pipeline ---------------------------------------------------
    build_and_cache(
        csv_path=csv_path,
        out_dir=out_dir,
        build_cfg=build_cfg,
        split_cfg=split_cfg,
        device=device,
    )


if __name__ == "__main__":
    main()


# Training Pipeline for Platform Detection

In [None]:
"""
GPU-Optimized Monthly Time-Series Classification Pipeline for Platform Detection
"""

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

class BreachTimeSeriesProcessor:
    """
    Processes breach data into a monthly time-series classification dataset.

    For every (email, attack month) pair a sequence of ``lookback_months``
    monthly feature vectors is produced, labelled with the most common breach
    platform of the target month and an average time-to-attack bucket.
    """

    # Feature names, grouped in the order they appear in each monthly vector.
    _BASIC_FEATURES = (
        'num_attacks_in_month',
        'num_platforms',
    )
    _HW_FEATURES = (
        'hw_mean_similarity',
        'hw_max_similarity',
        'hw_min_similarity',
        'hw_std_similarity',
        'hw_median_similarity',
        'hw_top5_mean_similarity',
        'hw_top10_mean_similarity',
        'hw_bottom10_mean_similarity',
        'hw_q25_similarity',
        'hw_q75_similarity',
        'hw_iqr_similarity',
        'hw_range_similarity',
    )
    _HIST_FEATURES = (
        'hist_leak_count',
        'hist_months_since_last_leak',
        'hist_max_pw_similarity',
        'hist_mean_pw_similarity',
        'hist_max_hw_similarity',
        'hist_mean_hw_similarity',
        'hist_platform_diversity',
        'hist_reuse_indicator',
    )
    _TIME_FEATURES = (
        'avg_time_to_attack_hours',
        'std_time_to_attack_hours',
        'min_time_to_attack_hours',
        'max_time_to_attack_hours',
    )
    # Display labels for the bucket ids 0..4 assigned by create_time_buckets.
    _TIME_BUCKET_LABELS = ('<1h', '1-6h', '6-24h', '1-7d', '>7d')

    def __init__(self, lookback_months=12):
        """
        Args:
            lookback_months: Number of past months to include in each sample (default: 12 months = 1 year)
        """
        self.lookback_months = lookback_months
        self.platform_encoder = LabelEncoder()
        self.vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 4),
            max_features=500,  # Reduced for memory efficiency
            lowercase=True
        )
        self.num_honeywords = None  # Will be detected from data

    def load_and_prepare_data(self, csv_path):
        """
        Load the CSV at *csv_path* and clean it for time-series processing.

        Detects the ``Honeyword_*`` columns, parses ``Timestamp``/``AttackTime``,
        drops rows with invalid dates, missing/empty passwords or fewer than
        90% of the honeywords present, and adds ``LeakMonth``/``AttackMonth``
        period columns.

        Returns:
            The cleaned DataFrame.
        """
        print("Loading data...")
        df = pd.read_csv(csv_path)

        # Detect honeyword columns
        honeyword_cols = [col for col in df.columns if col.startswith('Honeyword_')]
        self.num_honeywords = len(honeyword_cols)
        print(f"Detected {self.num_honeywords} honeyword columns")

        # Convert timestamps to datetime (handle mixed formats)
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='mixed', errors='coerce')
        df['AttackTime'] = pd.to_datetime(df['AttackTime'], format='mixed', errors='coerce')

        # Drop any rows with invalid dates
        df = df.dropna(subset=['Timestamp', 'AttackTime'])

        # Drop rows with missing or empty passwords
        df = df.dropna(subset=['Password'])
        df = df[df['Password'].astype(str).str.len() > 0]

        # Drop rows with too many missing honeywords (need at least 90% present)
        if honeyword_cols:
            min_honeywords = int(self.num_honeywords * 0.9)
            df = df.dropna(subset=honeyword_cols, thresh=min_honeywords)

        # Extract month-year for aggregation
        df['LeakMonth'] = df['Timestamp'].dt.to_period('M')
        df['AttackMonth'] = df['AttackTime'].dt.to_period('M')

        print(f"Loaded {len(df)} records")
        print(f"Date range: {df['Timestamp'].min()} to {df['AttackTime'].max()}")
        print(f"Platforms: {df['Breach_Source'].nunique()}")
        print(f"Unique emails: {df['Email'].nunique()}")

        return df

    def compute_password_embeddings(self, passwords):
        """
        Fit the vectorizer on *passwords* and return their dense TF-IDF embeddings.

        Note: this REFITS ``self.vectorizer``, replacing any earlier vocabulary.
        """
        return self.vectorizer.fit_transform([str(p) for p in passwords]).toarray()

    def compute_honeyword_features(self, row, pw_embedding):
        """
        Compute similarity statistics between a password and the row's honeywords.

        Args:
            row: Record providing ``Honeyword_1`` .. ``Honeyword_<num_honeywords>``.
            pw_embedding: 1-D TF-IDF embedding of the row's password.

        Returns:
            Dict with the twelve ``hw_*`` statistics of the cosine similarities;
            all zeros when the row has no usable honeyword.
        """
        candidates = (row[f'Honeyword_{i}'] for i in range(1, self.num_honeywords + 1))
        # Skip missing/blank honeywords; stringify the rest because the
        # lowercasing vectorizer cannot handle non-string values.
        honeywords = [str(hw) for hw in candidates if pd.notna(hw) and str(hw).strip() != '']

        if not honeywords:
            return dict.fromkeys(self._HW_FEATURES, 0.0)

        hw_embeddings = self.vectorizer.transform(honeywords).toarray()
        similarities = cosine_similarity(pw_embedding.reshape(1, -1), hw_embeddings)[0]

        # Sort once; the top/bottom-k means are taken from the sorted copy,
        # with k capped at the number of available similarities.
        ordered = np.sort(similarities)
        count = len(ordered)
        top5 = min(5, count)
        top10 = min(10, count)
        bottom10 = min(10, count)
        q25, q75 = np.percentile(similarities, [25, 75])
        lowest, highest = ordered[0], ordered[-1]

        return {
            'hw_mean_similarity': np.mean(similarities),
            'hw_max_similarity': highest,
            'hw_min_similarity': lowest,
            'hw_std_similarity': np.std(similarities),
            'hw_median_similarity': np.median(similarities),
            'hw_top5_mean_similarity': np.mean(ordered[-top5:]),
            'hw_top10_mean_similarity': np.mean(ordered[-top10:]),
            'hw_bottom10_mean_similarity': np.mean(ordered[:bottom10]),
            'hw_q25_similarity': q25,
            'hw_q75_similarity': q75,
            'hw_iqr_similarity': q75 - q25,
            'hw_range_similarity': highest - lowest,
        }

    def compute_historical_features(self, email, current_leak_month, current_password,
                                   historical_data, pw_embedding):
        """
        Compute features from this email's leaks that precede the current leak.

        Only rows of *historical_data* with a matching ``Email`` and a
        ``Timestamp`` strictly before the start of *current_leak_month* are used.

        Args:
            email: Email whose history is examined.
            current_leak_month: Monthly ``Period`` of the current leak.
            current_password: Unused; kept for interface compatibility.
            historical_data: Frame to search (may contain other emails too).
            pw_embedding: 1-D TF-IDF embedding of the current password.

        Returns:
            Dict with the eight ``hist_*`` features; all zeros without history.
        """
        # Convert Period to Timestamp (start of the month) for comparison
        current_leak_ts = current_leak_month.to_timestamp()

        is_earlier = (
            (historical_data['Email'] == email) &
            (historical_data['Timestamp'] < current_leak_ts)
        )
        hist = historical_data[is_earlier].sort_values('Timestamp')

        if len(hist) == 0:
            return {
                'hist_leak_count': 0,
                'hist_months_since_last_leak': 0,
                'hist_max_pw_similarity': 0.0,
                'hist_mean_pw_similarity': 0.0,
                'hist_max_hw_similarity': 0.0,
                'hist_mean_hw_similarity': 0.0,
                'hist_platform_diversity': 0,
                'hist_reuse_indicator': 0.0,
            }

        # Time since the most recent historical leak, in average-length months
        most_recent_leak = hist.iloc[-1]
        months_since = (current_leak_ts - most_recent_leak['Timestamp']).days / 30.44

        query = pw_embedding.reshape(1, -1)

        # Similarity with historical passwords
        hist_passwords = [str(p) for p in hist['Password'].tolist()]
        hist_pw_embeddings = self.vectorizer.transform(hist_passwords).toarray()
        pw_similarities = cosine_similarity(query, hist_pw_embeddings)[0]
        max_pw_sim = np.max(pw_similarities)

        # Similarity with every non-blank historical honeyword
        hw_cols = [f'Honeyword_{i}' for i in range(1, self.num_honeywords + 1)]
        all_hist_honeywords = [
            str(hw)
            for _, hist_row in hist.iterrows()
            for hw in (hist_row[c] for c in hw_cols)
            if pd.notna(hw) and str(hw).strip() != ''
        ]

        if all_hist_honeywords:
            hist_hw_embeddings = self.vectorizer.transform(all_hist_honeywords).toarray()
            hw_similarities = cosine_similarity(query, hist_hw_embeddings)[0]
            max_hw_sim = np.max(hw_similarities)
            mean_hw_sim = np.mean(hw_similarities)
        else:
            max_hw_sim = 0.0
            mean_hw_sim = 0.0

        return {
            'hist_leak_count': len(hist),
            'hist_months_since_last_leak': months_since,
            'hist_max_pw_similarity': max_pw_sim,
            'hist_mean_pw_similarity': np.mean(pw_similarities),
            'hist_max_hw_similarity': max_hw_sim,
            'hist_mean_hw_similarity': mean_hw_sim,
            'hist_platform_diversity': hist['Breach_Source'].nunique(),
            # High similarity to any earlier password is treated as reuse.
            'hist_reuse_indicator': 1.0 if max_pw_sim > 0.8 else 0.0,
        }

    def create_time_buckets(self, df):
        """
        Add a ``time_bucket`` column (0-4) from the leak-to-attack delay.

        Buckets: 0 = under 1 hour, 1 = 1-6 hours, 2 = 6-24 hours,
        3 = 1-7 days, 4 = everything else (including an undefined delay).
        The column is added to *df* in place and *df* is returned.
        """
        hours = ((df['AttackTime'] - df['Timestamp']).dt.total_seconds() / 3600).to_numpy()

        # The first matching condition wins, so the thresholds must ascend.
        df['time_bucket'] = np.select(
            [hours < 1, hours < 6, hours < 24, hours < 168],
            [0, 1, 2, 3],
            default=4,
        )
        return df

    def build_monthly_sequences(self, df):
        """
        Build monthly time-series sequences for each email.

        For every month in which an email was attacked, one sequence of
        ``lookback_months`` monthly feature rows ending at that month is built;
        months without attacks contribute an all-zero row.

        Returns:
            ``(sequences, labels_platform, labels_time_bucket, metadata)`` as
            parallel lists with one entry per (email, target month).
        """
        print(f"\nBuilding monthly sequences (lookback={self.lookback_months} months)...")

        # Fit vectorizer on all passwords first
        print("Fitting password vectorizer...")
        self.vectorizer.fit(df['Password'].astype(str).tolist())

        # Sort by email and attack time
        df = df.sort_values(['Email', 'AttackMonth'])

        sequences = []
        labels_platform = []
        labels_time_bucket = []
        metadata = []

        # groupby visits each email's rows once; filtering the full frame per
        # email would rescan every row for every email.
        for email, email_data in df.groupby('Email', sort=False):
            for target_month in email_data['AttackMonth'].unique():
                # All attacks in this target month
                target_attacks = email_data[email_data['AttackMonth'] == target_month]

                # Lookback window: (target - lookback, target]
                start_month = target_month - self.lookback_months
                lookback_data = email_data[
                    (email_data['AttackMonth'] > start_month) &
                    (email_data['AttackMonth'] <= target_month)
                ]

                if len(lookback_data) == 0:
                    continue

                # One feature row per month, oldest first
                monthly_features = []
                for month_offset in range(self.lookback_months):
                    current_month = target_month - (self.lookback_months - month_offset - 1)
                    month_data = lookback_data[lookback_data['AttackMonth'] == current_month]

                    if len(month_data) == 0:
                        month_features = self._get_zero_features()
                    else:
                        # The historical features filter by email anyway, so
                        # this email's rows are a sufficient search space.
                        month_features = self._aggregate_month_features(
                            month_data, email, email_data
                        )

                    monthly_features.append(month_features)

                # Stack into sequence: (lookback_months, num_features)
                sequences.append(np.vstack(monthly_features))

                # Label is the most common platform in target month
                labels_platform.append(target_attacks['Breach_Source'].mode()[0])

                # Time bucket label: truncated mean bucket of the target month
                labels_time_bucket.append(int(target_attacks['time_bucket'].mean()))

                metadata.append({
                    'email': email,
                    'target_month': str(target_month),
                    'num_attacks_in_month': len(target_attacks),
                    'platforms_in_month': target_attacks['Breach_Source'].unique().tolist()
                })

        print(f"Generated {len(sequences)} sequences")

        return sequences, labels_platform, labels_time_bucket, metadata

    def _get_zero_features(self):
        """Return an all-zero feature row of shape (1, num_features) for months with no attacks."""
        return np.zeros((1, len(self.get_feature_names())))

    def _aggregate_month_features(self, month_data, email, full_df):
        """
        Aggregate features for a single month.

        Args:
            month_data: The email's attack rows that fall in the month.
            email: Email the rows belong to.
            full_df: Frame searched for the email's earlier leaks.

        Returns:
            numpy array of shape (1, num_features), ordered as in
            :meth:`get_feature_names`.
        """
        features = []

        # Basic statistics
        features.append(len(month_data))  # num_attacks_in_month
        features.append(month_data['Breach_Source'].nunique())  # num_platforms

        # Per-attack honeyword and historical features; the password is
        # embedded once per row and shared by both computations.
        hw_features_list = []
        hist_features_list = []
        for _, row in month_data.iterrows():
            pw_embedding = self.vectorizer.transform([str(row['Password'])]).toarray()[0]
            hw_features_list.append(self.compute_honeyword_features(row, pw_embedding))
            hist_features_list.append(self.compute_historical_features(
                email, row['LeakMonth'], row['Password'], full_df, pw_embedding
            ))

        # Average each feature across all attacks in the month
        for per_attack in (hw_features_list, hist_features_list):
            frame = pd.DataFrame(per_attack)
            features.extend(frame[col].mean() for col in frame.columns)

        # Time-based features
        time_delta_hours = (month_data['AttackTime'] - month_data['Timestamp']).dt.total_seconds() / 3600
        spread = time_delta_hours.std()
        features.append(time_delta_hours.mean())  # avg_time_to_attack_hours
        # The sample std of a single attack is NaN; store 0.0 so NaN never
        # reaches the training array.
        features.append(0.0 if pd.isna(spread) else spread)  # std_time_to_attack_hours
        features.append(time_delta_hours.min())   # min_time_to_attack_hours
        features.append(time_delta_hours.max())   # max_time_to_attack_hours

        return np.array(features).reshape(1, -1)

    def prepare_gpu_ready_dataset(self, sequences, labels_platform, labels_time_bucket, metadata):
        """
        Convert the sequence lists into arrays.

        Returns:
            ``X`` (float32, shape (samples, lookback_months, features)),
            ``y_platform`` (label-encoded platforms; fits the encoder),
            ``y_time_bucket`` (int32) and *metadata* unchanged.
        """
        print("\nPreparing GPU-ready dataset...")

        # Stack sequences into 3D array: (num_samples, lookback_months, num_features)
        X = np.stack(sequences).astype(np.float32)

        # Encode platform labels
        y_platform = self.platform_encoder.fit_transform(labels_platform)

        # Time bucket labels (already integers)
        y_time_bucket = np.array(labels_time_bucket, dtype=np.int32)

        print(f"X shape: {X.shape} (samples, time_steps, features)")
        print(f"y_platform shape: {y_platform.shape} - {len(self.platform_encoder.classes_)} classes")
        print(f"y_time_bucket shape: {y_time_bucket.shape} - {len(np.unique(y_time_bucket))} classes")
        print(f"Memory usage: {X.nbytes / 1024**2:.2f} MB")

        return X, y_platform, y_time_bucket, metadata

    def get_feature_names(self):
        """Return the names of all per-month features, in feature-vector order."""
        return [
            *self._BASIC_FEATURES,
            *self._HW_FEATURES,
            *self._HIST_FEATURES,
            *self._TIME_FEATURES,
        ]

    def save_dataset(self, X, y_platform, y_time_bucket, metadata, output_path):
        """Save the arrays, metadata, platform classes, feature names and lookback to a compressed ``.npz``."""
        print(f"\nSaving dataset to {output_path}...")

        np.savez_compressed(
            output_path,
            X=X,
            y_platform=y_platform,
            y_time_bucket=y_time_bucket,
            metadata=metadata,
            platform_classes=self.platform_encoder.classes_,
            feature_names=self.get_feature_names(),
            lookback_months=self.lookback_months
        )

        print("Dataset saved successfully!")

    def get_summary_stats(self, df, X, y_platform, y_time_bucket):
        """
        Generate summary statistics for the source frame and the built dataset.

        Returns:
            Dict with record/email/platform counts, the date range, the shape
            of *X*, and per-class counts for both label arrays.
        """
        platform_classes = self.platform_encoder.classes_
        bucket_labels = self._TIME_BUCKET_LABELS
        stats = {
            'total_records': len(df),
            'total_emails': df['Email'].nunique(),
            'total_platforms': df['Breach_Source'].nunique(),
            'date_range': f"{df['Timestamp'].min()} to {df['AttackTime'].max()}",
            'num_sequences': X.shape[0],
            'sequence_length': X.shape[1],
            'num_features': X.shape[2],
            # minlength keeps classes with zero samples in the distribution;
            # without it zip() would silently drop trailing empty classes.
            'platform_distribution': dict(zip(
                platform_classes,
                np.bincount(y_platform, minlength=len(platform_classes))
            )),
            'time_bucket_distribution': dict(zip(
                bucket_labels,
                np.bincount(y_time_bucket, minlength=len(bucket_labels))
            ))
        }
        return stats


def main():
    """Run the preprocessing pipeline end to end.

    Loads the merged breach CSV, builds monthly sequences with a 12-month
    lookback, prints a summary, and writes three output files: the
    compressed dataset, a metadata CSV and a JSON file of summary stats.
    """
    rule = "=" * 70

    # Processor configured with a 12-month lookback window
    processor = BreachTimeSeriesProcessor(lookback_months=12)

    # Read the source CSV, then attach time buckets to it
    df = processor.create_time_buckets(
        processor.load_and_prepare_data('merged_all_breaches_100hw.csv')
    )

    # Monthly sequences and their labels, before conversion
    seqs, platform_labels, bucket_labels, seq_meta = processor.build_monthly_sequences(df)

    # Convert to the GPU-ready format used for training
    X, y_platform, y_time_bucket, metadata = processor.prepare_gpu_ready_dataset(
        seqs, platform_labels, bucket_labels, seq_meta
    )

    stats = processor.get_summary_stats(df, X, y_platform, y_time_bucket)

    # Report: scalar entries on one line, dict entries as an indented list
    print("\n" + rule)
    print("DATASET SUMMARY")
    print(rule)
    for name, entry in stats.items():
        if not isinstance(entry, dict):
            print(f"{name}: {entry}")
            continue
        print(f"\n{name}:")
        for label, count in entry.items():
            print(f"  {label}: {count}")

    # Main dataset archive
    processor.save_dataset(
        X, y_platform, y_time_bucket, metadata,
        'breach_dataset_monthly.npz'
    )

    # Metadata goes to its own CSV so it can be inspected without NumPy
    pd.DataFrame(metadata).to_csv('breach_metadata.csv', index=False)

    # Summary stats as a one-record JSON document
    pd.DataFrame([stats]).to_json('dataset_stats.json', orient='records', indent=2)

    print("\n" + rule)
    print("DATASET FILES CREATED:")
    print(rule)
    for created_line in (
        "1. breach_dataset_monthly.npz - Main dataset (X, y_platform, y_time_bucket)",
        "2. breach_metadata.csv - Sequence metadata",
        "3. dataset_stats.json - Summary statistics",
        "\nReady for GPU training with PyTorch/TensorFlow!",
    ):
        print(created_line)


# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Previewing Dataset

In [None]:
import numpy as np

# Open the archive written by the preprocessing step. allow_pickle=True lets
# NumPy unpickle object arrays stored in it, so only load files you trust.
data = np.load('breach_dataset_monthly.npz', allow_pickle=True)

# Arrays pulled out of the archive (shapes as expected by the original notes):
#   X                - (num_samples, 12, 26); num_samples varies with dataset
#                      size, 12 = months of lookback, 26 = features per month
#   y_platform       - (num_samples,), integer-encoded platform labels
#   y_time_bucket    - time-to-attack classification labels
#   platform_classes - platform names
#   feature_names    - names of the 26 features
X, y_platform, y_time_bucket, platform_classes, feature_names = (
    data[key]
    for key in ('X', 'y_platform', 'y_time_bucket', 'platform_classes', 'feature_names')
)
print(feature_names)