# Imports and env settings

In [1]:
# %pip install "cohere" "datasets" "transformers" "accelerate" "peft" "bitsandbytes"

In [None]:
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling,Trainer,pipeline
from peft import LoraConfig, get_peft_model, TaskType
from typing import List, Dict, Optional, Any
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pandas as pd
import numpy as np
import torch
import random
import cohere
import json
import os
load_dotenv(override=True)

True

In [3]:
# Import helpers & constants from the RAG file (generated automatically from ipynb)
from RAG_generic_func import (
    load_and_embedd_dataset,
    create_pinecone_index,
    upsert_vectors,       # we'll override here
    build_context,
    build_user_style,     # same
    augment_prompt,
    EMBEDDING_MODEL,
    COHERE_API_KEY,
    PINECONE_API_KEY,
)


In [None]:
# API keys are read from the process environment (empty string when unset).
# NOTE(review): these two assignments rebind the COHERE_API_KEY / PINECONE_API_KEY
# names imported from RAG_generic_func in the previous cell.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY_PAY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

# JSONL files read by distil_file().
INPUT_PATH_TRAIN = r"C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\Fine_Tune\fine_tune_data\bbt_train_cleaned.jsonl"
INPUT_PATH_VAL   = r"C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\Fine_Tune\fine_tune_data\bbt_val_cleaned.jsonl"

# JSONL files written by distil_file(); they are also the fine-tuning inputs (see data_files).
OUTPUT_PATH_TRAIN = r"C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\Fine_Tune\fine_tune_data\bbt_train_distilled.jsonl"
OUTPUT_PATH_VAL   = r"C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\Fine_Tune\fine_tune_data\bbt_val_distilled.jsonl"

# NOTE(review): MODEL_OUTPUT_PATH is not referenced by any cell visible in this file.
MODEL_OUTPUT_PATH = r"C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\Fine_Tune\distilled"
# Split name -> file mapping passed to load_dataset("json", ...).
data_files = {
    "train": OUTPUT_PATH_TRAIN,
    "validation": OUTPUT_PATH_VAL
}
# ds = load_dataset("json", data_files=data_files)
# Module names that LoraConfig(target_modules=...) adapts.
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] # full attention
# TARGET_MODULES = ["q_proj","v_proj"]
# TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]



# Sentence-transformer model name (rebinds the EMBEDDING_MODEL imported from RAG_generic_func).
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Chat knowledge base (CSV) and the distillation dataset produced from it.
KB_PATH = r"RAG_data\KB_data.csv"
OUTPUT_KB_JSONL = r"RAG_data\distillation_dataset.jsonl"

AUGMENT_FRACTION = 0.3   # fraction of human examples that also get a teacher-label version
RANDOM_SEED = 42
# Seeds Python's `random` module only (used later by random.sample).
random.seed(RANDOM_SEED)

USER = "Barbara"         # change if your Barbara user_id is different
INDEX_NAME = "chats-index"

# parameters for fine tuning
# NOTE(review): BATCH_SIZE, MAX_LENGTH, NUM_EPOCHS and LEARNING_RATE are not read by the
# TrainingArguments cells visible in this file, which hard-code their own values.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
BATCH_SIZE = 4
MAX_LENGTH = 512
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

# "Weights" via oversampling: how many times to duplicate human examples
HUMAN_DUP_FACTOR = 2   # 2 = roughly double weight vs teacher

# Informational only in the cells shown here; the bare expression displays the value.
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cpu'

# Distillation to improve fine-tune

## Preparing the distilled dataset 

In [None]:
# Cohere v2 client used by distil_file() (chat API with a `messages` list).
co = cohere.ClientV2(COHERE_API_KEY)

# Simple persona descriptions per speaker
# Keys are speaker names as looked up by get_persona(); values are the opening
# part of the system prompt sent to the teacher model.
PERSONAS = {
    "Sheldon": (
        "You are Sheldon Cooper from The Big Bang Theory. "
        "You are a brilliant, pedantic theoretical physicist: literal, arrogant, and verbose. "
        "You speak in precise, formal, slightly condescending language and often reference science, physics, and your own intellect."
    ),
    "Leonard": (
        "You are Leonard Hofstadter from The Big Bang Theory. "
        "You are kind, self-conscious, often nervous, and try to keep the peace between your friends. "
        "You speak in a casual, slightly awkward but caring tone, and you often try to sound reasonable and supportive."
    ),
    "Penny": (
        "You are Penny from The Big Bang Theory. "
        "You are friendly, sarcastic, and down-to-earth, with good social intuition. "
        "You use casual everyday language, sometimes tease the guys, and react emotionally and humorously to their geeky behavior."
    ),
    "Howard": (
        "You are Howard Wolowitz from The Big Bang Theory. "
        "You are an aerospace engineer with an overconfident, sometimes creepy flirtatious style. "
        "You crack innuendo-filled jokes, brag about your accomplishments, and speak in a playful, comedic tone, especially about space and women."
    ),
    "Raj": (
        "You are Rajesh Koothrappali from The Big Bang Theory. "
        "You are sensitive, romantic, and somewhat socially awkward, with a love of pop culture and fantasy. "
        "You speak in an emotional, sometimes dramatic way, and you often talk about love, loneliness, and your interests like movies and comics."
    ),
    "Amy": (
        "You are Amy Farrah Fowler from The Big Bang Theory. "
        "You are a neurobiologist with a mix of scientific seriousness and socially awkward earnestness. "
        "You speak in a formal, analytical tone about emotions and relationships, and you are intensely devoted to Sheldon and your friends."
    ),
    "Bernadette": (
        "You are Bernadette Rostenkowski-Wolowitz from The Big Bang Theory. "
        "You have a sweet, high-pitched speaking style that can turn surprisingly strict or intimidating. "
        "You are practical, sometimes bossy, and you often mix cute phrasing with sharp, no-nonsense comments."
    ),
    # Fallback for other, non-central characters
    "DEFAULT": (
        "You are a character from The Big Bang Theory. "
        "Respond in a style consistent with that character's personality and the show's comedic tone."
    ),
}

def get_persona(speaker: str) -> str:
    """Return the persona prompt for ``speaker``.

    A falsy speaker (empty string, ``None``) or a speaker without an entry in
    ``PERSONAS`` yields the ``"DEFAULT"`` persona.
    """
    fallback = PERSONAS["DEFAULT"]
    # A falsy speaker is mapped onto the "DEFAULT" key so one lookup covers both cases.
    return PERSONAS.get(speaker or "DEFAULT", fallback)

In [None]:
def distil_file(input_path: str, output_path: str, max_examples: int | None = None):
    """Add a Cohere-generated ``teacher_target`` to every example of a JSONL file.

    Every non-blank line of ``input_path`` is parsed as one JSON object. For each
    object the teacher model is asked to continue ``prompt`` in the voice of
    ``target_speaker`` (persona text from ``get_persona``). The object, extended
    with the key ``teacher_target``, is written as one line to ``output_path``.

    Args:
        input_path: JSONL file with the original examples.
        output_path: JSONL file to create; an existing file is overwritten.
        max_examples: Process only the first ``max_examples`` examples;
            ``None`` processes all of them.

    Notes:
        This is deliberately best-effort: when the API call fails, or the teacher
        returns an empty completion, ``teacher_target`` falls back to the example's
        original ``target`` so the output keeps one line per input example.
    """
    # Load everything first so the input handle is closed before the slow API loop.
    # Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(input_path, "r", encoding="utf-8") as fin:
        examples = [json.loads(line) for line in fin if line.strip()]

    if max_examples is not None:
        examples = examples[:max_examples]

    # Instruction appended to every persona; it does not change between examples.
    task_instruction = (
        " You will be given the dialogue context. "
        "Continue the next line exactly as this character would speak. "
        "Respond with ONLY the next line of dialogue, no quotes, "
        "and do NOT add speaker tags."
    )

    with open(output_path, "w", encoding="utf-8") as fout:
        for ex in tqdm(examples, desc=f"Distilling {input_path}"):
            prompt = ex.get("prompt", "")
            persona = get_persona(ex.get("target_speaker", ""))
            # `or ""` guards against an explicit null target in the JSON.
            script_target = (ex.get("target") or "").strip()

            # We use Cohere chat endpoint with system + user message
            try:
                response = co.chat(
                    model="command-a-03-2025",
                    messages=[
                        {"role": "system", "content": persona + task_instruction},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.7,
                    max_tokens=96,
                )
                teacher_text = response.message.content[0].text.strip()
            except Exception as e:
                print(f"Error on example with ep={ex.get('ep')} scene={ex.get('scene')}:", e)
                # if Cohere fails we just use original script target
                teacher_text = script_target

            # An empty completion would become an empty training label downstream.
            if not teacher_text:
                teacher_text = script_target

            ex["teacher_target"] = teacher_text
            fout.write(json.dumps(ex, ensure_ascii=False) + "\n")

In [None]:
# Distil both splits. Each call sends one Cohere chat request per example and
# overwrites its output file.
distil_file(INPUT_PATH_TRAIN, OUTPUT_PATH_TRAIN, max_examples=None) # meaning all - depends on restrictions of cohere account
distil_file(INPUT_PATH_VAL, OUTPUT_PATH_VAL, max_examples=None)

## Running fine tune again

In [None]:
# Where checkpoints and the final adapter/tokenizer of this run are written.
output_dir = r"Fine_Tune\outputs\tinyllama_bbt_distilled_lora"

# Load the distilled train/validation JSONL files produced by distil_file().
ds = load_dataset("json", data_files=data_files)
train_ds = ds["train"]
val_ds = ds["validation"]

tok = AutoTokenizer.from_pretrained(MODEL_ID)
# Batching needs a pad token; reuse EOS when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)

# Keep the model config in sync with the tokenizer's pad token.
base.config.pad_token_id = tok.pad_token_id

# LoRA adapter settings: rank 8, alpha 16, dropout 0.05 on TARGET_MODULES.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=TARGET_MODULES,
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()

In [None]:
max_len = 512

def build_example(ex):
    """Tokenize one distilled example into ``input_ids`` / ``attention_mask`` / ``labels``.

    The training text is ``prompt`` followed by the teacher's answer
    (``teacher_target``), falling back to the original ``target`` when the teacher
    answer is missing or empty. Prompt positions are set to -100 in ``labels`` so
    only the answer contributes to the loss.

    Args:
        ex: Mapping with ``prompt`` and ``teacher_target`` and/or ``target``.

    Returns:
        Dict of plain Python lists, all of the same length (at most ``max_len``).
    """
    # `or` (instead of a .get default) also covers keys that exist but hold None/"".
    prompt = ex.get("prompt") or ""
    # Use teacher's answer as label (distillation)
    target = ex.get("teacher_target") or ex.get("target") or ""

    enc_full   = tok(prompt + target, max_length=max_len, truncation=True)
    enc_prompt = tok(prompt, max_length=max_len, truncation=True)

    input_ids = list(enc_full["input_ids"])
    attention_mask = list(enc_full["attention_mask"])

    # Close the answer with EOS (when it still fits) so the student learns where a
    # reply ends; without it generation only stops at the token limit.
    eos_id = tok.eos_token_id
    if eos_id is not None and len(input_ids) < max_len:
        if not input_ids or input_ids[-1] != eos_id:
            input_ids.append(eos_id)
            attention_mask.append(1)

    # mask prompt part
    n_prompt = min(len(enc_prompt["input_ids"]), len(input_ids))
    labels = [-100] * n_prompt + input_ids[n_prompt:]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

cols = ["input_ids", "attention_mask", "labels"]

# Training split: tokenize every example, drop the raw columns, expose torch tensors.
train_tok = train_ds.map(build_example, remove_columns=train_ds.column_names)
train_tok.set_format(type="torch", columns=cols)

# Validation split: same treatment.
val_tok = val_ds.map(build_example, remove_columns=val_ds.column_names)
val_tok.set_format(type="torch", columns=cols)


In [None]:
# build_example() already produced prompt-masked ``labels``, so the collator must
# only pad. DataCollatorForLanguageModeling(mlm=False) is the wrong tool here: it
# cannot pad the variable-length ``labels`` column and it rebuilds ``labels`` from
# ``input_ids``, which would throw the prompt masking away. DataCollatorForSeq2Seq
# pads ``input_ids`` / ``attention_mask`` with the tokenizer and ``labels`` with -100.
import inspect

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    padding=True,
    label_pad_token_id=-100,
)

# ``evaluation_strategy`` was renamed to ``eval_strategy`` in newer transformers
# releases; use whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=3,
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    # fp16 mixed precision needs a GPU; requesting it on a CPU-only machine
    # (this notebook reported device 'cpu') makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
    bf16=False,          #  True if on A100
    # Name the label column explicitly; Trainer may not be able to infer it
    # through the PEFT wrapper, which would leave eval without a loss.
    label_names=["labels"],
    **{_eval_strategy_key: "steps"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning, then store the trained model and the tokenizer side by
# side in output_dir.
trainer.train()
trainer.save_model(output_dir)
tok.save_pretrained(output_dir)
print("Finished training + saved model + tokenizer.")

## Evaluation

# Distillation on actual chats

## Getting teacher labels

In [5]:
def build_user_style(
    df: pd.DataFrame,
    user_id: str,
    k: int = 10,
    text_col: str = "text",
    random_sample: bool = True,
    seed: int | None = 42,
) -> tuple[list[str], str]:
    """
    Return:
      - list of example messages (lines)
      - a single multi-line string user_style

    If there are no messages for this user_id, returns ([], "").
    """
    user_df = df[df["sender_user_id"] == user_id].copy()

    if len(user_df) == 0:
        return [], ""

    user_df = user_df.sort_values("sent_at")

    if random_sample and len(user_df) > k:
        rng = np.random.default_rng(seed)
        idx = rng.choice(user_df.index.to_list(), size=k, replace=False)
        user_df = user_df.loc[idx].sort_values("sent_at")
    else:
        user_df = user_df.tail(k)

    lines = [str(msg) for msg in user_df[text_col].tolist()]
    user_style = "\n".join(lines)
    return lines, user_style


def upsert_vectors(
    index,               # Pinecone index object
    dataset: pd.DataFrame,
    embeddings: np.ndarray,
    batch_size: int = 128,
):
    """
    Send every embedding, together with its row's metadata, to a Pinecone index.

    Vector ids are the row positions ("0", "1", ...) rendered as strings; the
    metadata of a vector is its dataset row without the "embedding" column.

    Args:
        index: object exposing ``upsert(vectors=...)`` (a pc.Index instance).
        dataset: DataFrame containing metadata; must align with embeddings.
        embeddings: numpy array [n_rows, dim].
        batch_size: number of vectors sent per upsert call.

    Returns:
        The ``index`` that was passed in.

    Raises:
        ValueError: if dataset and embeddings have different row counts.
    """
    from tqdm import tqdm

    print("Upserting the embeddings to the Pinecone index...")

    if embeddings.shape[0] != len(dataset):
        raise ValueError(
            f"Embeddings rows ({embeddings.shape[0]}) != dataset rows ({len(dataset)})"
        )

    kept_columns = [name for name in dataset.columns if name != "embedding"]

    # One (id, vector, metadata) triple per row, in dataset order.
    records = [
        (str(position), vector, {name: row[name] for name in kept_columns})
        for position, (vector, (_, row)) in enumerate(zip(embeddings, dataset.iterrows()))
    ]

    # Slicing past the end is harmless, so the last batch needs no special case.
    for start in tqdm(range(0, len(records), batch_size)):
        index.upsert(vectors=records[start:start + batch_size])

    print("Upserting complete!")
    return index

In [6]:
# One-time setup of the shared RAG state (client, embeddings, Pinecone index,
# context and style) that cohere_rag_answer() reads as globals.
print("=== RAG initialization ===")

# Cohere client
# NOTE(review): rebinds the global `co`, which was created earlier as
# cohere.ClientV2 and is what distil_file() calls with `messages=`.
co = cohere.Client(api_key=COHERE_API_KEY)

# 1) Load KB
whatsapp_chats = pd.read_csv(KB_PATH)

# 2) Embed entire KB once
model_emb = SentenceTransformer(EMBEDDING_MODEL)
kb_df_all, embeddings = load_and_embedd_dataset(whatsapp_chats, model_emb)

# 3) Keep only rows where Barbara is the receiver (for retrieval)
kb_df_to_barbara = kb_df_all[kb_df_all["receiver_user_id"] == USER].sort_values("conv_turn")
# Index labels are used as row positions into `embeddings` -- assumes kb_df_all
# has a default 0..n-1 index; TODO confirm against load_and_embedd_dataset.
embeddings_to_barbara = embeddings[kb_df_to_barbara.index.to_list()]

print("Rows for Barbara as receiver:", len(kb_df_to_barbara))

# 4) Create Pinecone index once
pc = create_pinecone_index(INDEX_NAME, embeddings_to_barbara.shape[1])

# 5) Upsert embeddings once
index = pc.Index(INDEX_NAME)
index = upsert_vectors(index, kb_df_to_barbara, embeddings_to_barbara)

# 6) Shared context & style for Barbara (can tune conv_id)
context = build_context(
    kb_df_all,
    conv_id="chat:u_1_u_2",  # adjust conv_id as needed
    k=10,
)

# Only the joined style string is kept; the list of lines is discarded.
_, user_style = build_user_style(
    kb_df_all,
    user_id=USER,
    k=10,
)

print("RAG initialization done!")

=== RAG initialization ===
Loading and embedding the dataset


Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Done!
Rows for Barbara as receiver: 1181
Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


100%|██████████| 10/10 [00:36<00:00,  3.63s/it]

Upserting complete!
RAG initialization done!





In [7]:
def cohere_rag_answer(query: str) -> Optional[str]:
    """
    Answer ``query`` with Cohere, using the prompt built by ``augment_prompt``
    from the cached globals (``user_style``, ``context``, ``model_emb``, ``index``).

    Returns the stripped answer text, or None when the query is blank, the
    answer is empty, or any step raises.
    """
    query = str(query).strip()
    if query == "":
        return None

    try:
        # Only the prompt is needed; the second returned value is ignored.
        augmented_prompt, _ = augment_prompt(
            query=query,
            user_style=user_style,
            context=context,
            model=model_emb,
            index=index,
        )
        answer = co.chat(model="command-a-03-2025", message=augmented_prompt).text.strip()
    except Exception as e:
        print(f"[WARN] Cohere failed for query: {query[:60]!r}... ({e})")
        return None

    # An empty answer is reported the same way as a failure.
    return answer or None


def generate_teacher_answer(query: str, kb_path: str = KB_PATH) -> Optional[str]:
    """
    Wrapper used by the dataset builder.
    Now uses the cached RAG state instead of re-embedding each time.

    Args:
        query: Message to answer.
        kb_path: Not used by the body; the answer comes from the cached RAG
            state via cohere_rag_answer().

    Returns:
        Whatever cohere_rag_answer(query) returns (answer text or None).
    """
    return cohere_rag_answer(query)


In [8]:
def build_input_text(row: pd.Series) -> str:
    """
    Render the student-model prompt for one KB row.

    The prompt has three parts separated by blank lines: a persona line, a
    "### QUERY" section with the stripped ``row["text"]``, and an
    "### INSTRUCTIONS" section. The persona name comes from the global ``USER``.
    """
    message = str(row["text"]).strip()
    sections = [
        f"You are {USER}. Answer in her natural WhatsApp style.",
        "",
        "### QUERY",
        message,
        "",
        "### INSTRUCTIONS",
        f"Reply as {USER} would reply in WhatsApp.",
    ]
    return "\n".join(sections)


In [9]:
# Build one "human" training example per KB row that was sent to USER and has a
# non-empty recorded answer.
df = pd.read_csv(KB_PATH)

# Fill missing answers BEFORE casting: astype(str) alone turns NaN into the
# literal string "nan", which would pass the non-empty filter below and end up
# as a training target.
df["answer"] = df["answer"].fillna("").astype(str)
mask_receiver_barbara = df["receiver_user_id"] == USER

# rows with non-empty human answer
mask_has_human = df["answer"].str.strip().ne("")
human_df = df[mask_receiver_barbara & mask_has_human].copy()
print(f"Total rows in KB: {len(df)}")
print(f"Rows with receiver == {USER!r} and non-empty human answer: {len(human_df)}")

examples: List[Dict[str, Any]] = []
for _, row in human_df.iterrows():
    input_text = build_input_text(row)
    human_answer = row["answer"].strip()

    examples.append({
        "input_text": input_text,
        "target_text": human_answer,
        "label_source": "human",   # used later for sampling/weighting
    })

print(f"Base human examples: {len(examples)}")

Total rows in KB: 2360
Rows with receiver == 'Barbara' and non-empty human answer: 1181
Base human examples: 1181


In [10]:
# Randomly pick AUGMENT_FRACTION of the human rows; each picked row additionally
# gets a teacher-labelled example (same input_text, Cohere+RAG answer as target).
indices = list(human_df.index)
n_aug = int(AUGMENT_FRACTION * len(indices))
# random.sample draws without replacement; the module was seeded with RANDOM_SEED.
augment_indices = set(random.sample(indices, n_aug))
print(f"Will augment {n_aug} rows with teacher answers")

for idx in augment_indices:
    row = human_df.loc[idx]
    query = str(row["text"]).strip()
    input_text = build_input_text(row)

    teacher_answer = generate_teacher_answer(query)
    # None means the teacher call failed or returned nothing: skip this row.
    if teacher_answer is None:
        print("Haven't generated answer")
        continue

    examples.append({
        "input_text": input_text,
        "target_text": teacher_answer,
        "label_source": "teacher",
    })

print(f"Total examples after adding teacher labels: {len(examples)}")

Will augment 354 rows with teacher answers
Total examples after adding teacher labels: 1535


In [11]:
# Persist all examples (human + teacher) as JSON Lines, one record per line,
# keeping non-ASCII characters unescaped.
out_df = pd.DataFrame(examples)
out_df.to_json(
    OUTPUT_KB_JSONL,
    orient="records",
    lines=True,
    force_ascii=False,
)
print(f"Saved distillation dataset to {OUTPUT_KB_JSONL}")

Saved distillation dataset to RAG_data\distillation_dataset.jsonl


## Cleaning the dataset - Run only when the KB wasn't clean enough

In [None]:
def extract_query_from_input(input_text: str) -> str:
    """
    Pull the query section out of a prompt built by build_input_text.

    The query is the text after the line containing '### QUERY' and before
    '\\n\\n### INSTRUCTIONS' (or the end of the text when that marker is
    absent), stripped of surrounding whitespace. Returns "" for non-string
    input, when '### QUERY' is missing, or when no newline follows it.
    """
    if not isinstance(input_text, str):
        return ""

    # Everything after the first '### QUERY' marker.
    _, marker_found, after_marker = input_text.partition("### QUERY")
    if not marker_found:
        return ""

    # Skip the remainder of the marker line, including its newline.
    _, newline_found, body = after_marker.partition("\n")
    if not newline_found:
        return ""

    # Cut at the instructions marker; without it the whole body is the query.
    query, _, _ = body.partition("\n\n### INSTRUCTIONS")
    return query.strip()


def cohere_barbara_reply(query: str) -> Optional[str]:
    """
    Generate a Barbara-style WhatsApp reply to ``query`` with Cohere.

    No retrieval is involved; the prompt contains only the query and the style
    instructions. Returns the stripped reply, or None when the query is blank,
    the reply is empty, or the request raises.
    """
    query = str(query).strip()
    if query == "":
        return None

    instructions = (
        "Reply as Barbara would reply in WhatsApp. Use natural, short WhatsApp-style messages, "
        "can include line breaks and emojis. Only output the reply, no explanations."
    )
    prompt = "\n".join([
        "You are Barbara. Answer in her natural WhatsApp style.",
        "",
        "### QUERY",
        query,
        "",
        "### INSTRUCTIONS",
        instructions,
    ])

    try:
        reply = co.chat(model="command-r-08-2024", message=prompt).text.strip()
    except Exception as e:
        print(f"[WARN] Cohere failed for query: {query[:60]!r}... ({e})")
        return None

    # An empty reply is reported the same way as a failure.
    return reply or None


In [15]:
# Exact text of the WhatsApp system notice that is removed from input_text below.
ENCRYPTION_LINE = (
    "Messages and calls are end-to-end encrypted. "
    "Only people in this chat can read, listen to, or share them."
)

In [16]:
# Reload the saved distillation dataset (rebinds `df`, previously the raw KB).
df = pd.read_json(OUTPUT_KB_JSONL, lines=True)
# Remove all mentions of the WhatsApp system message from input_text
# (regex=False: the line is matched literally).
df["input_text"] = df["input_text"].str.replace(ENCRYPTION_LINE, "", regex=False)

In [17]:
def is_audio_only_query(input_text: str) -> bool:
    """Tell whether the prompt's query is empty or just the 'audio omitted' placeholder.

    The query is taken with extract_query_from_input(); left-to-right marks
    (U+200E) are removed and the comparison ignores case and surrounding
    whitespace.
    """
    normalized = extract_query_from_input(input_text).replace("\u200e", "").strip().lower()
    return normalized in ("", "audio omitted")


# Flag rows whose query carries no usable text, report the count, and drop them.
audio_only_mask = df["input_text"].apply(is_audio_only_query)
print("Rows with audio-only query:", audio_only_mask.sum())

df = df[~audio_only_mask].copy()
print("Rows after dropping audio-only queries:", len(df))

Rows with audio-only query: 65
Rows after dropping audio-only queries: 1470


In [18]:
# Find rows where the *answer* contains "audio omitted"
def has_audio_omitted_answer(target_text: str) -> bool:
    """Tell whether an answer contains the 'audio omitted' placeholder.

    Non-string values never match. Left-to-right marks (U+200E) are removed
    first and the search is case-insensitive.
    """
    return (
        isinstance(target_text, str)
        and "audio omitted" in target_text.replace("\u200e", "").lower()
    )

mask_audio_answer = df["target_text"].apply(has_audio_omitted_answer)
print("Rows with 'audio omitted' in target_text:", mask_audio_answer.sum())

# For each such row: generate a new Barbara-style answer using Cohere
# Iterate over a copy so that dropping rows from `df` inside the loop is safe.
rows_to_fix = df[mask_audio_answer].copy()

for idx, row in rows_to_fix.iterrows():
    query = extract_query_from_input(row["input_text"])
    new_answer = cohere_barbara_reply(query)

    if new_answer is not None:
        # Replace the placeholder answer in place.
        df.at[idx, "target_text"] = new_answer
    else:
        # If Cohere fails for some reason, you can either:
        #  - keep the old target_text, or
        #  - drop the row. Let's drop to keep dataset clean.
        df = df.drop(index=idx)
        print(f"Dropped row {idx} because Cohere couldn't generate answer.")


Rows with 'audio omitted' in target_text: 153


In [19]:
# Cleaned dataset file; the cell below appends to it and the fine-tune section loads it.
OUTPUT_JSONL = r"RAG_data\distillation_dataset_clean.jsonl"  # clean appended file

In [20]:
print("Final cleaned rows:", len(df))

# Append to the existing file instead of overwriting
# NOTE(review): because the file is opened in append mode, running this cell
# again adds the same rows a second time.
with open(OUTPUT_JSONL, "a", encoding="utf-8") as f:
    for _, row in df.iterrows():
        # One JSON object per line, non-ASCII characters kept as-is.
        f.write(json.dumps(row.to_dict(), ensure_ascii=False))
        f.write("\n")

print(f"Appended cleaned rows to {OUTPUT_JSONL}")

Final cleaned rows: 1470
Appended cleaned rows to RAG_data\distillation_dataset_clean.jsonl


## Running fine-tune

In [None]:
# Output folder and token limit for the KB fine-tune; both rebind the globals
# of the same name used by the earlier BBT run.
output_dir = r"Fine_Tune\distilled\KB_lora"
max_len = 800

In [None]:
# Load the cleaned JSONL as a single split named "data".
ds = load_dataset("json", data_files={"data": OUTPUT_JSONL})["data"]
print(ds)

# 90/10 train/validation split with a fixed seed.
# NOTE(review): a prompt that has both a human and a teacher example can land
# on both sides of the split -- confirm this is acceptable for evaluation.
splits = ds.train_test_split(test_size=0.1, seed=42)
train_ds = splits["train"]
val_ds   = splits["test"]

print("Train size:", len(train_ds), "| Val size:", len(val_ds))


Dataset({
    features: ['input_text', 'target_text', 'label_source'],
    num_rows: 1470
})
Train size: 1323 | Val size: 147


In [None]:
# Separate the training examples by the origin of their label.
train_human   = train_ds.filter(lambda ex: ex.get("label_source", "") == "human")
train_teacher = train_ds.filter(lambda ex: ex.get("label_source", "") == "teacher")
print("Train human:", len(train_human), "| Train teacher:", len(train_teacher))

# Oversampling human responses by concatenating them HUMAN_DUP_FACTOR times
train_human_oversampled = concatenate_datasets([train_human] * HUMAN_DUP_FACTOR)
# Merge with the teacher examples and shuffle with a fixed seed.
train_balanced = concatenate_datasets([train_human_oversampled, train_teacher]).shuffle(seed=42)
train_ds = train_balanced
print("New train size after oversampling:", len(train_ds))

Train human: 1024 | Train teacher: 299
New train size after oversampling: 2347


In [None]:
# Fresh tokenizer and base model for the KB fine-tune.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
# Batching needs a pad token; reuse EOS when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
# Keep the model config in sync with the tokenizer's pad token.
base.config.pad_token_id = tok.pad_token_id

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# LoRA adapter settings: rank 8, alpha 16, dropout 0.05 on TARGET_MODULES.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=TARGET_MODULES,
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()

In [None]:
def build_example(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize one KB example into ``input_ids`` / ``attention_mask`` / ``labels``.

    The training text is ``input_text``, the answer header
    ``"\\n\\n### Barbara:\\n"`` and ``target_text``. Prompt positions (everything
    up to and including the header) are set to -100 in ``labels`` so only the
    answer contributes to the loss.

    Args:
        ex: Mapping with ``input_text`` and ``target_text``.

    Returns:
        Dict of plain Python lists, all of the same length (at most ``max_len``).
    """
    # `or` (instead of a .get default) also covers keys that exist but hold None.
    input_text  = ex.get("input_text") or ""
    target_text = ex.get("target_text") or ""

    # Build full prompt + answer
    prompt = input_text + "\n\n### Barbara:\n"

    # Tokenize full and prompt separately
    enc_full   = tok(prompt + target_text, max_length=max_len, truncation=True)
    enc_prompt = tok(prompt, max_length=max_len, truncation=True)

    input_ids = list(enc_full["input_ids"])
    attention_mask = list(enc_full["attention_mask"])

    # Close the answer with EOS (when it still fits) so the student learns where a
    # reply ends; without it generation only stops at the token limit.
    eos_id = tok.eos_token_id
    if eos_id is not None and len(input_ids) < max_len:
        if not input_ids or input_ids[-1] != eos_id:
            input_ids.append(eos_id)
            attention_mask.append(1)

    # Mask prompt part in labels
    n_prompt = min(len(enc_prompt["input_ids"]), len(input_ids))
    labels = [-100] * n_prompt + input_ids[n_prompt:]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [None]:
cols = ["input_ids", "attention_mask", "labels"]

# Training split: tokenize every example, drop the raw columns, expose torch tensors.
train_tok = train_ds.map(build_example, remove_columns=train_ds.column_names)
train_tok.set_format(type="torch", columns=cols)

# Validation split: same treatment.
val_tok = val_ds.map(build_example, remove_columns=val_ds.column_names)
val_tok.set_format(type="torch", columns=cols)

# Displayed as the cell result: sizes of both tokenized splits.
len(train_tok), len(val_tok)


In [None]:
# build_example() already produced prompt-masked ``labels``, so the collator must
# only pad. DataCollatorForLanguageModeling(mlm=False) is the wrong tool here: it
# cannot pad the variable-length ``labels`` column and it rebuilds ``labels`` from
# ``input_ids``, which would throw the prompt masking away. DataCollatorForSeq2Seq
# pads ``input_ids`` / ``attention_mask`` with the tokenizer and ``labels`` with -100.
import inspect

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    padding=True,
    label_pad_token_id=-100,
)

# ``evaluation_strategy`` was renamed to ``eval_strategy`` in newer transformers
# releases; use whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=3,
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    # fp16 mixed precision needs a GPU; requesting it on a CPU-only machine
    # (this notebook reported device 'cpu') makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
    bf16=False,    # True if on A100 etc.
    report_to="none",
    # Name the label column explicitly; Trainer may not be able to infer it
    # through the PEFT wrapper, which would leave eval without a loss.
    label_names=["labels"],
    **{_eval_strategy_key: "steps"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
)

trainer.train()


In [None]:
# Store the trained model and the tokenizer side by side in output_dir.
trainer.save_model(output_dir)
tok.save_pretrained(output_dir)
print("Finished training + saved model + tokenizer.")

Generation test

In [None]:
# Text-generation pipeline that loads the model saved in output_dir and reuses
# the in-memory tokenizer.
pipe = pipeline(
    "text-generation",
    model=output_dir,
    tokenizer=tok,
    device_map="auto",
)

def generate_barbara_reply(query: str, max_new_tokens: int = 80):
    """Sample a Barbara-style reply to ``query`` from the fine-tuned model.

    The prompt has the same layout as the training examples (persona line,
    QUERY section, INSTRUCTIONS section, then the "### Barbara:" header). The
    returned string is whatever follows the first "### Barbara:" in the
    generated text, stripped; if the header is absent the whole text is returned.
    """
    answer_header = "### Barbara:"
    prompt = "\n".join([
        "You are Barbara. Answer in her natural WhatsApp style.",
        "",
        "### QUERY",
        f"{query}",
        "",
        "### INSTRUCTIONS",
        "Reply as Barbara would reply in WhatsApp.",
        "",
        answer_header,
        "",
    ])

    generations = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
    generated = generations[0]["generated_text"]

    # Keep only what comes after the first answer header, if there is one.
    _, header_found, after_header = generated.partition(answer_header)
    return after_header.strip() if header_found else generated.strip()

# Quick manual check: print one sampled reply.
print(generate_barbara_reply("Hi, I'm sick today, can I get an extension?"))
