In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Checkpoint to load. This used to be assigned twice (first "google/gemma-3-4b-it",
# annotated "~6.8 GB VRAM", then immediately overwritten); only the value that is
# actually used is kept so the cell says what it loads.
model_id = "microsoft/phi-4"

# Only referenced by the optional (commented-out) PEFT adapter step at the end of
# this cell; kept in case other cells rely on it.
output_dir = "../gemma-3-4b-it"

# 4-bit NF4 quantization so the model fits in less GPU memory.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="sdpa",                   # Switch to "flash_attention_2" if the GPU/install supports it
    dtype='auto',                                 # Use the checkpoint's dtype; set bfloat16 explicitly if the GPU supports it
    device_map='cuda',
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,                        # Load the weights in 4-bit precision to save memory
        bnb_4bit_compute_dtype=torch.float16,     # Data type used for the computations on the quantized weights
        bnb_4bit_use_double_quant=True,           # Nested quantization: also quantize the quantization constants to save extra memory
        bnb_4bit_quant_type="nf4"                 # Quantization data type; "nf4" is the usual choice for LLM weights
    )
)
# Optional: wrap the base model with a fine-tuned adapter (requires `from peft import PeftModel`).
# fine_tuned_model = PeftModel.from_pretrained(base_model, output_dir)

tokenizer = AutoTokenizer.from_pretrained(model_id)


In [None]:
def inference(fine_tuned_model, tokenizer, messages, **generation_kwargs):
    """Generate a single reply for a chat-formatted conversation.

    Args:
        fine_tuned_model: Causal language model exposing ``.device`` and
            ``.generate(**kwargs)``.
        tokenizer: Tokenizer paired with the model; must provide
            ``apply_chat_template``, ``__call__`` and ``decode``.
        messages: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts.
        **generation_kwargs: Optional overrides of (or additions to) the
            default sampling settings forwarded to ``generate``.

    Returns:
        The decoded text of the newly generated tokens only: the prompt tokens
        are sliced off and special tokens are skipped during decoding.

    Note:
        No ``eos_token_id`` is passed by default, so stopping relies on the
        model's own generation settings or on ``max_new_tokens``. Pass
        ``eos_token_id=...`` explicitly if a model needs extra terminators.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # add_special_tokens=False: presumably the chat template already emitted the
    # special tokens, so the tokenizer must not add them a second time.
    model_inputs = tokenizer(
        [prompt], return_tensors="pt", add_special_tokens=False
    ).to(fine_tuned_model.device)

    # Low-temperature sampling with mild repetition control; caller overrides win.
    generate_kwargs = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.2,
        "top_p": 0.9,
        "repetition_penalty": 1.05,
        "no_repeat_ngram_size": 4,
    }
    generate_kwargs.update(generation_kwargs)

    generated_ids = fine_tuned_model.generate(**model_inputs, **generate_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = len(model_inputs["input_ids"][0])
    output_ids = generated_ids[0][prompt_length:]

    return tokenizer.decode(output_ids, skip_special_tokens=True)

In [None]:
import csv 

# System prompt shared by every request. It is never mutated: each row builds its
# own conversation from it, so a failed generation cannot leave a stale user turn
# behind in the kernel.
messages = [
    {
        'role': 'system',
        'content': (
            'Given a line of dialogue from the user, generate a single, natural-sounding line that someone else could have said immediately before it in a realistic conversation. '
            'The response should feel contextually appropriate, emotionally coherent, and distinct in voice from the user’s line, as if two different people were talking. '
            'Output only the preceding line, with no quotes, explanations, or additional text.'
        ),
    },
]

# Explicit UTF-8 keeps non-ASCII dialogue intact regardless of the platform's
# default encoding; newline='' is what the csv module expects for both reading
# and writing.
with open('VNKurisuDialogues.csv', 'r', newline='', encoding='utf-8') as source_file, \
        open('synthetic_data.csv', 'w', newline='', encoding='utf-8') as output_file:
    reader = csv.reader(source_file)
    writer = csv.writer(output_file)
    writer.writerow(['user', 'assistant'])  # Header row

    next(reader, None)  # Skip the input header (tolerates an empty file)
    for row in reader:
        # Blank lines or rows without a non-empty second column carry no dialogue.
        if len(row) < 2 or not row[1].strip():
            continue
        dialogue_line = row[1]

        conversation = messages + [{'role': 'user', 'content': dialogue_line}]
        generated_text = inference(base_model, tokenizer, conversation)

        # The generated line is the synthetic *preceding* ("user") turn; the
        # original dialogue line becomes the "assistant" reply.
        writer.writerow([generated_text, dialogue_line])
        # Flush so that data is written incrementally and survives an interruption.
        output_file.flush()
            

        