In [None]:
import os
import time
import json
import warnings
import string
import random
import re
from pathlib import Path
from datetime import datetime
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
import argparse
import pandas as pd

In [1]:
# Hub model id passed to `from_pretrained` when the tokenizer and model are loaded.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

In [2]:
import json

# The Hugging Face access token lives outside the notebook, in a local JSON
# file shaped like {"HUGGINGFACE_HUB_TOKEN": "<token>"}.
# - The file is read as UTF-8 explicitly instead of relying on the platform
#   default encoding.
# - The parsed payload is kept in a private name so that no generic `data`
#   global is left behind in the notebook namespace.
# A missing file raises FileNotFoundError and a missing key raises KeyError.
with open("secrets/token.json", "r", encoding="utf-8") as token_file:
    _hf_secrets = json.load(token_file)
HUGGINGFACE_HUB_TOKEN = _hf_secrets["HUGGINGFACE_HUB_TOKEN"]

In [3]:
import torch

# Compatibility shim: only when this torch build exposes no
# `torch.get_default_device`, install a stand-in for the current session.
if not hasattr(torch, "get_default_device"):
    def _get_default_device():
        """Return the MPS device when it is built and available, otherwise the CPU device."""
        mps_backend = torch.backends.mps
        if mps_backend.is_built() and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")
    torch.get_default_device = _get_default_device

In [None]:
# Ignore UserWarning instances whose originating module matches
# "transformers.pytorch_utils"; every other warning is still shown.
warnings.filterwarnings(action="ignore", category=UserWarning, module="transformers.pytorch_utils")

# Timestamped console logger
def log(msg: str):
    """Print `msg` prefixed with the current local time as `[YYYY-MM-DD HH:MM:SS]`."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] {msg}")

# Choose the compute device: Apple's MPS backend when it is both built and
# available, otherwise the CPU (the same rule the `get_default_device` shim uses).
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

    
# Model and tokenizer setup
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
log(f"Loading tokenizer for {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HUGGINGFACE_HUB_TOKEN,
)
log("Tokenizer loaded.")

# The weights are loaded with no device map and moved to `device` explicitly
# below, so the log line reports `device_map=None` rather than "auto".
log(f"Loading model {MODEL_ID} (float16, device_map=None)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HUGGINGFACE_HUB_TOKEN,
    torch_dtype=torch.float16,
    device_map=None,
    low_cpu_mem_usage=True,
)
# Move model to MPS (Apple GPU) if available, else CPU
model = model.to(device)
# Inference only: switch the module to evaluation mode.
model.eval()
log(f"Model loaded and moved to {device}.")

# Baseline keyword arguments later unpacked into `model.generate`.
# The tokenizer's EOS id is used both as the stop token and as the padding id.
gen_kwargs = dict(
    max_new_tokens=512,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    temperature=1.0,
    top_p=0.95,
)

# System prompt, built once and stamped with the date at definition time
date = datetime.today().date().isoformat()
SYSTEM_PROMPT = (
    "\n"
    "You are an expert macOS system administrator that classifies file paths as safe-to-delete cache files or not.\n"
    "Respond in JSON only.\n"
    f"Current date: {date}\n"
)

# User-turn prompt builder
def get_user_prompt(n: int) -> str:
    """Return the instruction text asking for a JSON array of at least `n` dev-cache entries.

    Args:
        n: Minimum number of objects requested; interpolated into the prompt.

    Returns:
        The multi-line prompt string, including its leading and trailing whitespace.
    """
    prompt = f"""
    Generate a `multi_language_dev_dataset`: a JSON array of at least {n} objects covering common dev-cache/build folders on macOS across various ecosystems:
          - Python (`__pycache__`, `venv`)
          - Node.js (`~/.npm`, `node_modules`)
          - Java/Maven (`~/.m2/repository`, `target`)
          - Android/Gradle (`.gradle`, `build/`)
          - Go (`~/go/pkg`, `~/go/bin`)
          - Rust/Cargo (`target/`)
          - Docker (`~/.docker`)
          - VSCode (`~/.vscode/extensions`)
          - .NET/NuGet (`~/.nuget/packages`)
          - Ruby/Bundler (`~/.bundle`, `~/.gem`)
        For each entry pick realistic:
          • `file_name` & `file_path`
          • `extension` (or empty string)
          • `size_mb`
          • `last_accessed_days`
          • `category`
          • `tool`
          • `label`
          • `reason`
        Respond only with the JSON array.
    """
    return prompt

# Tokenization helper
def build_input(system: str, user: str):
    """Tokenize the system and user prompts and join them into one model input.

    Args:
        system: System prompt text, tokenized with the tokenizer's default
            special tokens so that it opens the sequence.
        user: User prompt text, appended after the system tokens.

    Returns:
        A tuple `(input_ids, attention_mask)`: the two encodings concatenated
        along dim 1 and moved to `device`.
    """
    enc_sys = tokenizer(system, return_tensors="pt")
    # The user text continues the same sequence. Encoding it with the default
    # special tokens would, for tokenizers that prepend a beginning-of-sequence
    # token, put a second BOS in the middle of the concatenated input.
    enc_usr = tokenizer(user, return_tensors="pt", add_special_tokens=False)
    # NOTE(review): the prompts are joined as plain text; the model's chat
    # template (`tokenizer.apply_chat_template`) is not applied — confirm this
    # is intended for an instruction-tuned checkpoint.
    # Keep both halves on the same dtype so the concatenation cannot be
    # type-promoted for a degenerate (e.g. empty) user prompt.
    usr_ids = enc_usr.input_ids.to(enc_sys.input_ids.dtype)
    usr_mask = enc_usr.attention_mask.to(enc_sys.attention_mask.dtype)
    input_ids = torch.cat([enc_sys.input_ids, usr_ids], dim=1).to(device)
    attention_mask = torch.cat([enc_sys.attention_mask, usr_mask], dim=1).to(device)
    return input_ids, attention_mask

# Locate JSON arrays inside free-form model output
def _extract_json_arrays(text: str) -> list:
    """Return every top-level JSON array that can be decoded from `text`, in order.

    Each `[` is tried as the start of a JSON value using the standard decoder,
    so brackets inside strings and nested arrays are handled correctly.
    Candidates that do not decode are skipped, and scanning resumes after each
    decoded array.
    """
    decoder = json.JSONDecoder()
    arrays = []
    pos = text.find("[")
    while pos != -1:
        try:
            value, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not valid JSON from this bracket; try the next one.
            pos = text.find("[", pos + 1)
            continue
        arrays.append(value)
        pos = text.find("[", end)
    return arrays


# Generate and save raw + parse JSON
def generate_dataset(n_examples: int = 50, raw_dir: Path = Path("raw_txt"), json_dir: Path = Path("output_json")):
    """Run one generation pass, persist the raw text, and return the parsed entries.

    Args:
        n_examples: Number of entries requested in the user prompt.
        raw_dir: Directory for the raw model output (`raw_<ts>_<uid>.txt`);
            created with parents if missing.
        json_dir: Directory for the parsed entries (`output_<ts>_<uid>.json`);
            created with parents if missing.

    Returns:
        A list of the JSON objects (dicts) found in the model output. The list
        is empty when no JSON array could be decoded, so callers can always
        `extend` a list with the result.

    File-write failures are logged and do not abort the call.
    """
    raw_dir.mkdir(parents=True, exist_ok=True)
    user_prompt = get_user_prompt(n_examples)
    ids, mask = build_input(SYSTEM_PROMPT, user_prompt)
    start = time.perf_counter()
    with torch.no_grad():
        out = model.generate(
            input_ids=ids,
            attention_mask=mask,
            **gen_kwargs
        )
    log(f"Batch time: {time.perf_counter()-start}")

    # Keep only the tokens produced after the prompt.
    gen_tokens = out[0, ids.shape[1]:]
    raw_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)

    # Timestamp plus a random suffix keeps the file names of different batches apart.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    uid = ''.join(random.choices(string.ascii_uppercase + string.digits, k=6))
    raw_file = raw_dir / f"raw_{ts}_{uid}.txt"
    try:
        raw_file.write_text(raw_text, encoding="utf-8")
        log(f"Raw output written to {raw_file}")
    except Exception as e:
        log(f"ERROR writing raw output: {e}")

    # Short preview of the generation in the log.
    log(raw_text[:50])

    arrays = _extract_json_arrays(raw_text)
    if not arrays:
        log(f"Failed to extract any JSON array from raw output, see {raw_file}")
        return []

    data = []
    for idx, array in enumerate(arrays, start=1):
        # Only JSON objects are dataset rows; stray scalars or nested lists are dropped.
        records = [item for item in array if isinstance(item, dict)]
        log(f"Parsed JSON array #{idx} with {len(records)} items.")
        skipped = len(array) - len(records)
        if skipped:
            log(f"Skipped {skipped} non-object items in JSON array #{idx}.")
        data.extend(records)

    # Write the parsed entries next to the raw dump, using the same ts/uid pair.
    json_dir.mkdir(parents=True, exist_ok=True)
    json_file = json_dir / f"output_{ts}_{uid}.json"
    try:
        # indent=2 for readability
        json_file.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
        log(f"Parsed JSON written to {json_file}")
    except Exception as e:
        log(f"ERROR writing parsed JSON: {e}")
    return data

In [None]:
# Command-line style options. `parse_known_args` is used so that arguments not
# declared here (for example those injected by the notebook kernel) are ignored.
parser = argparse.ArgumentParser(description="Generate multi-language dev dataset.")
parser.add_argument("-n", "--num",           type=int,   default=50000, help="Total entries to generate.")
parser.add_argument("--chunk_size",          type=int,   default=12,    help="Entries per batch prompt.")
parser.add_argument("--temperature",         type=float, default=1.0,   help="Sampling temperature.")
parser.add_argument("--max_new_tokens",      type=int,   default=2400,  help="Max tokens per batch")
parser.add_argument("--do_sample",           action="store_true",    help="Enable sampling.")
args, _ = parser.parse_known_args()

# Update generation kwargs from CLI
gen_kwargs.update({
    "max_new_tokens": args.max_new_tokens,
    "do_sample":      args.do_sample,
    "temperature":    args.temperature if args.do_sample else 1.0,
    "top_p":          0.95 if args.do_sample else 1.0,
})

total = args.num
chunk = args.chunk_size
# A zero step would crash `range` and a negative one would silently generate nothing.
if chunk < 1:
    raise ValueError("--chunk_size must be a positive integer")
if not args.do_sample and total > chunk:
    # Every full batch sends the same prompt, so without sampling the model is
    # likely to return the same text each time.
    log("WARNING: --do_sample is off; identical prompts will likely yield identical batches.")

all_data = []
log(f"Generating {total} entries in chunks of {chunk}...")
for start in range(0, total, chunk):
    batch = min(chunk, total - start)
    log(f"Batch {start//chunk+1}: generating {batch} entries...")
    batch_rows = generate_dataset(batch)       # uses updated gen_kwargs
    # Only accept a list of parsed entries. Extending with anything else
    # (e.g. a raw-text string) would add its characters as bogus rows.
    if isinstance(batch_rows, list):
        all_data.extend(batch_rows)
    else:
        log(f"Batch {start//chunk+1}: no parsed entries returned; skipping.")


# Save JSON dataset inside parsed_json/
parsed_dir = Path("parsed_json")
parsed_dir.mkdir(parents=True, exist_ok=True)
out_json = parsed_dir / f"multi_language_dev_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
try:
    # Serialize before touching the file so that a non-serializable entry
    # cannot leave a truncated JSON file behind.
    payload = json.dumps(all_data, indent=2, ensure_ascii=False)
    out_json.write_text(payload, encoding="utf-8")
except TypeError as e:
    log(f"Serialization error: {e}")
except Exception as e:
    log(f"ERROR writing JSON: {e}")
else:
    # Reported only when the file was actually written.
    log(f"Parsed JSON written to {out_json}")

# Build DataFrame and save CSV
df = None  # stays None when the DataFrame could not be built
try:
    df = pd.DataFrame(all_data)
    # Keep the expected columns, in this order, skipping any that are absent.
    cols = ["file_name", "file_path", "extension", "size_mb", "last_accessed_days", "category" ,"tool", "label", "reason"]
    df = df[[c for c in cols if c in df.columns]]
    csv_file = out_json.with_suffix('.csv')
    df.to_csv(csv_file, index=False)
    log(f"DataFrame CSV written to {csv_file}")
except Exception as e:
    log(f"ERROR creating/saving DataFrame: {e}")
# Print preview (skipped when no DataFrame exists, instead of raising NameError)
if df is not None:
    print(df.head())