In [1]:
%env CUDA_VISIBLE_DEVICES=0


env: CUDA_VISIBLE_DEVICES=0


In [2]:
import os
#os.makedirs(r"D:\hf_cache\hub", exist_ok=True)
os.makedirs(r"~/luudh/MyFile/vr_lab/hf_cache/hub", exist_ok=True)

# Best: point directly to the hub cache folder
os.environ["HUGGINGFACE_HUB_CACHE"] = r"~/luudh/MyFile/vr_lab/hf_cache/hub"

In [3]:
from huggingface_hub import login

login(token = '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')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
base_model = "meta-llama/Llama-3.1-8B-Instruct"
fine_tuned_model = os.path.expanduser(
    "~/luudh/MyFile/AI_Scheduling/Llama-3.1-8B-Instruct-finetuned-version2"
)

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from peft import PeftModel
import torch
import re
from trl import setup_chat_format

In [6]:
def start_model(base, fine_tuned):
    #reload tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.pad_token = tokenizer.eos_token
    #print(tokenizer.chat_template)

    # QLoRA config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    max_memory = {0: "46GiB", "cpu": "64GiB"} # adjust according to your GPU

    """
    max_memory = {
        0: "6GiB",         
        "cpu": "32GiB"
    }
    """
    base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map="auto",                  # let it split GPU/CPU
        attn_implementation="eager",        # avoid flash-attn
        torch_dtype=torch.float16,
        max_memory=max_memory,
    )

    tokenizer.chat_template = None
    base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

    #merge adapter with base model
    model = PeftModel.from_pretrained(base_model_reload, fine_tuned_model)

    model = model.merge_and_unload()
    return model, tokenizer

In [7]:
import json, re

def extract_json(text: str):
    # 1) take everything from first { to last }
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return None

    snippet = text[start:end+1].strip()

    # 2) remove code fences if present
    if snippet.startswith("```"):
        # remove ```json or ``` and trailing ```
        snippet = re.sub(r"^```[a-zA-Z]*\n?", "", snippet)
        snippet = re.sub(r"```$", "", snippet).strip()

    # 3) fix python-style literals if model produced them
    snippet = (
        snippet.replace("True", "true")
               .replace("False", "false")
               .replace("None", "null")
    )

    # 4) now try to load
    try:
        return json.loads(snippet)
    except json.JSONDecodeError:
        # optional: print(snippet) to inspect
        return None


def generate(user_input, base_model=base_model, fine_tuned_model=fine_tuned_model):
    model, tokenizer = start_model(base_model, fine_tuned_model)

    instruction = """You are an assistant that extracts structured metadata from task descriptions.

Return the following fields:
- Task Priority (high, medium, low)
- Task Complexity (1-10 scale)
- Required Skills (list of skills, including both technical and soft/inferred skills)
- Estimated Time (in days), calculated as the number of days between today and the task deadline.
  The Estimated Time must be less than or equal to the total number of days until the Deadline.
- Deadline (in YYYY-MM-DD HH:MM format)
- NTS_skills (object with the keys: ambiguity_tolerance, communication, planning, collaboration, reasoning, risk_awareness, ownership, stakeholder_mgmt; each value is an integer 1-5)
- importance (object with the keys: ambiguity_tolerance, communication, planning, collaboration, reasoning, risk_awareness, ownership, stakeholder_mgmt; each value is an integer 1-5)

Respond only with the structured output in JSON format.
"""

    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_input}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    data = extract_json(text)
    if data is not None:
        print(data)
        return data
    else:
        # fall back: show the raw text so you can see what model produced
        print("No valid JSON found.\n--- RAW ---\n", text)
        return None

user_input = """Design and implement a machine learning pipeline to predict customer churn using historical transaction and interaction data. 
Use Python with scikit-learn or XGBoost, and ensure the model can be evaluated using ROC-AUC and F1 metrics. 
Collaborate with the data engineering team to source clean datasets and set up daily retraining jobs via Airflow. 
The project must be delivered by 2026-06-26 14:00."""

generate(user_input)

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.72s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


No valid JSON found.
--- RAW ---
 system
You are an assistant that extracts structured metadata from task descriptions.

Return the following fields:
- Task Priority (high, medium, low)
- Task Complexity (1-10 scale)
- Required Skills (list of skills, including both technical and soft/inferred skills)
- Estimated Time (in days), calculated as the number of days between today and the task deadline.
  The Estimated Time must be less than or equal to the total number of days until the Deadline.
- Deadline (in YYYY-MM-DD HH:MM format)
- NTS_skills (object with the keys: ambiguity_tolerance, communication, planning, collaboration, reasoning, risk_awareness, ownership, stakeholder_mgmt; each value is an integer 1-5)
- importance (object with the keys: ambiguity_tolerance, communication, planning, collaboration, reasoning, risk_awareness, ownership, stakeholder_mgmt; each value is an integer 1-5)

Respond only with the structured output in JSON format.

user
Design and implement a machine lea