In this part we applied a knowledge-distillation approach where a large model (Cohere Command + our RAG pipeline) acts as a teacher and a smaller TinyLlama model serves as the student.

For selected messages, Cohere generated high-quality “teacher answers” based on the query, context, retrieved similar messages, and examples of some user's style. The student model was then fine-tuned on a mixed dataset that included both the real human replies and these teacher-generated replies.

To preserve users authentic style, human-labeled examples were oversampled, while teacher examples were added selectively.
For some important inputs, we created two versions of the same training sample—one with the human answer and one with the teacher answer—to enrich supervision.

Through this setup, the student model learns user's tone while also benefiting from the teacher model clarity and completeness.

Themain goal of this experimental part to see if the distillation technique can improve generalization and produces higher-quality from the smaller model. If so if we will have even bigger student model in theory we can dramatically reduce the resources in production.

# Imports and env  settings

In [None]:
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling,Trainer,pipeline
from peft import LoraConfig, get_peft_model, TaskType
from typing import List, Dict, Optional, Any
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pandas as pd
import numpy as np
import torch
import random
import cohere
import json
import os
load_dotenv(override=True)

In [None]:
# Import helpers & constants from the RAG file (generated automatically from ipynb)
from RAG_generic_func import (
    load_and_embedd_dataset,
    create_pinecone_index,
    upsert_vectors,       # we'll override here
    build_context,
    build_user_style,     # same
    augment_prompt,
    EMBEDDING_MODEL,
    COHERE_API_KEY,
    PINECONE_API_KEY,
)


When running on VM we got errors from imports that were used in distillation part so we created separate imports. Errors were mostly from newer versiom packages like numpy not supporting the GPU we used (it was older version)

In [None]:
# modules used during fine-tuning itself(on VM) 
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,Trainer, default_data_collator,BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from typing import List, Dict, Optional, Any
import torch
import numpy as np
import os 
import random

In [None]:
COHERE_API_KEY = os.environ.get("COHERE_API_KEY_PAY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

output_dir = "/home/student/Whatsapp_webApp_-Django-/Fine_Tune/distilled/KB_lora"
max_len = 800

TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] # full attention
# TARGET_MODULES = ["q_proj","v_proj"]
# TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

EMBEDDING_MODEL = "all-MiniLM-L6-v2"
KB_PATH = r"RAG_data\KB_data_distilled.csv"
OUTPUT_KB_JSONL = r"RAG_data\distillation_dataset.jsonl"

AUGMENT_FRACTION = 0.3   # fraction of human examples that also get a teacher-label version
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

USER = "Barbara" # the user whose tone we try to copy
INDEX_NAME = "chats-index"

# parameters for fine tuning
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
BATCH_SIZE = 4
MAX_LENGTH = 512
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

# "Weights" via oversampling: how many times to duplicate human examples
HUMAN_DUP_FACTOR = 2   # 2 = roughly double weight vs teacher
OUTPUT_JSONL = "RAG_data/distillation_dataset_clean.jsonl"  # clean appended file
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

# Distillation to improve fine-tune

## Getting teacher labels

In [5]:
def build_user_style(
    df: pd.DataFrame,
    user_id: str,
    k: int = 10,
    text_col: str = "text",
    random_sample: bool = True,
    seed: int | None = 42,
) -> tuple[list[str], str]:
    """
    Return:
      - list of example messages (lines)
      - a single multi-line string user_style

    If there are no messages for this user_id, returns ([], "").
    """
    user_df = df[df["sender_user_id"] == user_id].copy()

    if len(user_df) == 0:
        return [], ""

    user_df = user_df.sort_values("sent_at")

    if random_sample and len(user_df) > k:
        rng = np.random.default_rng(seed)
        idx = rng.choice(user_df.index.to_list(), size=k, replace=False)
        user_df = user_df.loc[idx].sort_values("sent_at")
    else:
        user_df = user_df.tail(k)

    lines = [str(msg) for msg in user_df[text_col].tolist()]
    user_style = "\n".join(lines)
    return lines, user_style


def upsert_vectors(
    index,               # Pinecone index object
    dataset: pd.DataFrame,
    embeddings: np.ndarray,
    batch_size: int = 128,
):
    """
    Upsert vectors to a Pinecone index.

    Args:
        index: pc.Index instance.
        dataset: DataFrame containing metadata; must align with embeddings.
        embeddings: numpy array [n_rows, dim].
    """
    from tqdm import tqdm

    print("Upserting the embeddings to the Pinecone index...")

    if embeddings.shape[0] != len(dataset):
        raise ValueError(
            f"Embeddings rows ({embeddings.shape[0]}) != dataset rows ({len(dataset)})"
        )

    metadata_fields = [col for col in dataset.columns if col != "embedding"]

    num_rows = embeddings.shape[0]
    ids = [str(i) for i in range(num_rows)]

    meta = []
    for _, row in dataset.iterrows():
        entry = {col: row[col] for col in metadata_fields}
        meta.append(entry)

    to_upsert = list(zip(ids, embeddings, meta))

    for i in tqdm(range(0, len(to_upsert), batch_size)):
        i_end = min(i + batch_size, len(to_upsert))
        index.upsert(vectors=to_upsert[i:i_end])

    print("Upserting complete!")
    return index

In [6]:
print("=== RAG initialization ===")

# Cohere client
co = cohere.Client(api_key=COHERE_API_KEY)

# 1) Load KB
whatsapp_chats = pd.read_csv(KB_PATH)

# 2) Embed entire KB once
model_emb = SentenceTransformer(EMBEDDING_MODEL)
kb_df_all, embeddings = load_and_embedd_dataset(whatsapp_chats, model_emb)

# 3) Keep only rows where Barbara is the receiver (for retrieval)
kb_df_to_barbara = kb_df_all[kb_df_all["receiver_user_id"] == USER].sort_values("conv_turn")
embeddings_to_barbara = embeddings[kb_df_to_barbara.index.to_list()]

print("Rows for Barbara as receiver:", len(kb_df_to_barbara))

# 4) Create Pinecone index once
pc = create_pinecone_index(INDEX_NAME, embeddings_to_barbara.shape[1])

# 5) Upsert embeddings once
index = pc.Index(INDEX_NAME)
index = upsert_vectors(index, kb_df_to_barbara, embeddings_to_barbara)

# 6) Shared context & style for Barbara (can tune conv_id)
context = build_context(
    kb_df_all,
    conv_id="chat:u_1_u_2",  # adjust conv_id as needed
    k=10,
)

_, user_style = build_user_style(
    kb_df_all,
    user_id=USER,
    k=10,
)

print("RAG initialization done!")

=== RAG initialization ===
Loading and embedding the dataset


Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Done!
Rows for Barbara as receiver: 1181
Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


100%|██████████| 10/10 [00:36<00:00,  3.63s/it]

Upserting complete!
RAG initialization done!





In [7]:
def cohere_rag_answer(query: str) -> Optional[str]:
    """
    Use cached embeddings + Pinecone index + user_style + context
    to get a Cohere+RAG answer for a query.

    Returns None if something fails.
    """
    query = str(query).strip()
    if not query:
        return None

    try:
        augmented_prompt, _ = augment_prompt(
            query=query,
            user_style=user_style,
            context=context,
            model=model_emb,
            index=index,
        )

        response = co.chat(
            model="command-a-03-2025",
            message=augmented_prompt,
        )
        text = response.text.strip()
        if not text:
            return None
        return text

    except Exception as e:
        print(f"[WARN] Cohere failed for query: {query[:60]!r}... ({e})")
        return None


def generate_teacher_answer(query: str, kb_path: str = KB_PATH) -> Optional[str]:
    """
    Wrapper used by the dataset builder.
    Now uses the cached RAG state instead of re-embedding each time.
    """
    return cohere_rag_answer(query)


In [8]:
def build_input_text(row: pd.Series) -> str:
    """
    Build the text that will go into the student model.
    For now it's simple; later you can plug in full RAG context, etc.
    """
    query = str(row["text"]).strip()

    prompt = (
        f"You are {USER}. Answer in her natural WhatsApp style.\n\n"
        "### QUERY\n"
        f"{query}\n\n"
        "### INSTRUCTIONS\n"
        f"Reply as {USER} would reply in WhatsApp."
    )
    return prompt


In [9]:
df = pd.read_csv(KB_PATH)

df["answer"] = df["answer"].astype(str)
mask_receiver_barbara = df["receiver_user_id"] == USER

# rows with non-empty human answer
mask_has_human = df["answer"].str.strip().ne("")
human_df = df[mask_receiver_barbara & mask_has_human].copy()
print(f"Total rows in KB: {len(df)}")
print(f"Rows with receiver == {USER!r} and non-empty human answer: {len(human_df)}")

examples: List[Dict[str, Any]] = []
for _, row in human_df.iterrows():
    input_text = build_input_text(row)
    human_answer = row["answer"].strip()

    examples.append({
        "input_text": input_text,
        "target_text": human_answer,
        "label_source": "human",   # used later for sampling/weighting
    })

print(f"Base human examples: {len(examples)}")

Total rows in KB: 2360
Rows with receiver == 'Barbara' and non-empty human answer: 1181
Base human examples: 1181


In [10]:
# randomly sampling rows where we add teacher
indices = list(human_df.index)
n_aug = int(AUGMENT_FRACTION * len(indices))
augment_indices = set(random.sample(indices, n_aug))
print(f"Will augment {n_aug} rows with teacher answers")

for idx in augment_indices:
    row = human_df.loc[idx]
    query = str(row["text"]).strip()
    input_text = build_input_text(row)

    teacher_answer = generate_teacher_answer(query)
    if teacher_answer is None:
        print("Haven't generated answer")
        continue

    examples.append({
        "input_text": input_text,
        "target_text": teacher_answer,
        "label_source": "teacher",
    })
    
print(f"Total examples after adding teacher labels: {len(examples)}")

Will augment 354 rows with teacher answers
Total examples after adding teacher labels: 1535


In [11]:
out_df = pd.DataFrame(examples)
out_df.to_json(
    OUTPUT_KB_JSONL,
    orient="records",
    lines=True,
    force_ascii=False,
)
print(f"Saved distillation dataset to {OUTPUT_KB_JSONL}")

Saved distillation dataset to RAG_data\distillation_dataset.jsonl


## Cleaningthe dataset - Run only when the KB wasn't clean enough

In [None]:
def extract_query_from_input(input_text: str) -> str:
    """
    Extract the text between '### QUERY' and '\\n\\n### INSTRUCTIONS'
    from the input_text. Returns an empty string if pattern not found.
    """
    if not isinstance(input_text, str):
        return ""
    
    marker_query = "### QUERY"
    marker_instr = "\n\n### INSTRUCTIONS"
    
    pos_q = input_text.find(marker_query)
    if pos_q == -1:
        return ""
    
    # start after the line "### QUERY\n"
    pos_start = input_text.find("\n", pos_q)
    if pos_start == -1:
        return ""
    pos_start += 1  # move past the newline
    
    pos_end = input_text.find(marker_instr, pos_start)
    if pos_end == -1:
        # take until the end if instructions marker not found
        pos_end = len(input_text)
    
    query = input_text[pos_start:pos_end]
    return query.strip()


def cohere_barbara_reply(query: str) -> Optional[str]:
    """
    Ask Cohere to answer as Barbara in WhatsApp style given just the query.
    No RAG here - quick cleaning only.
    """
    query = str(query).strip()
    if not query:
        return None
    
    prompt = (
        "You are Barbara. Answer in her natural WhatsApp style.\n\n"
        "### QUERY\n"
        f"{query}\n\n"
        "### INSTRUCTIONS\n"
        "Reply as Barbara would reply in WhatsApp. Use natural, short WhatsApp-style messages, "
        "can include line breaks and emojis. Only output the reply, no explanations."
    )
    
    try:
        resp = co.chat(
            model="command-r-08-2024",
            message=prompt,
        )
        text = resp.text.strip()
        if not text:
            return None
        return text
    except Exception as e:
        print(f"[WARN] Cohere failed for query: {query[:60]!r}... ({e})")
        return None


In [15]:
ENCRYPTION_LINE = (
    "Messages and calls are end-to-end encrypted. "
    "Only people in this chat can read, listen to, or share them."
)

In [16]:
df = pd.read_json(OUTPUT_KB_JSONL, lines=True)
# Remove all mentions of the WhatsApp system message from input_text
df["input_text"] = df["input_text"].str.replace(ENCRYPTION_LINE, "", regex=False)

In [17]:
def is_audio_only_query(input_text: str) -> bool:
    q = extract_query_from_input(input_text)
    # Clean possible invisible chars (like RTL mark) and lower
    q_clean = q.replace("\u200e", "").strip().lower()
    return (q_clean == "audio omitted") or (q_clean == "")


audio_only_mask = df["input_text"].apply(is_audio_only_query)
print("Rows with audio-only query:", audio_only_mask.sum())

df = df[~audio_only_mask].copy()
print("Rows after dropping audio-only queries:", len(df))

Rows with audio-only query: 65
Rows after dropping audio-only queries: 1470


In [18]:
# Find rows where the *answer* contains "audio omitted"
def has_audio_omitted_answer(target_text: str) -> bool:
    if not isinstance(target_text, str):
        return False
    t_clean = target_text.replace("\u200e", "").lower()
    return "audio omitted" in t_clean

mask_audio_answer = df["target_text"].apply(has_audio_omitted_answer)
print("Rows with 'audio omitted' in target_text:", mask_audio_answer.sum())

# For each such row: generate a new Barbara-style answer using Cohere
rows_to_fix = df[mask_audio_answer].copy()

for idx, row in rows_to_fix.iterrows():
    query = extract_query_from_input(row["input_text"])
    new_answer = cohere_barbara_reply(query)
    
    if new_answer is not None:
        df.at[idx, "target_text"] = new_answer
    else:
        # If Cohere fails for some reason, you can either:
        #  - keep the old target_text, or
        #  - drop the row. Let's drop to keep dataset clean.
        df = df.drop(index=idx)
        print(f"Dropped row {idx} because Cohere couldn't generate answer.")


Rows with 'audio omitted' in target_text: 153


In [10]:
OUTPUT_JSONL = "RAG_data/distillation_dataset_clean.jsonl"  # clean appended file

In [20]:
print("Final cleaned rows:", len(df))

# Append to the existing file instead of overwriting
with open(OUTPUT_JSONL, "a", encoding="utf-8") as f:
    for _, row in df.iterrows():
        f.write(json.dumps(row.to_dict(), ensure_ascii=False))
        f.write("\n")

print(f"Appended cleaned rows to {OUTPUT_JSONL}")

Final cleaned rows: 1470
Appended cleaned rows to RAG_data\distillation_dataset_clean.jsonl


## Runing fine-tune

In [None]:
ds = load_dataset("json", data_files={"data": OUTPUT_JSONL})["data"]
print(ds)

splits = ds.train_test_split(test_size=0.1, seed=RANDOM_SEED)
train_ds = splits["train"]
val_ds   = splits["test"]

print("Train size:", len(train_ds), "| Val size:", len(val_ds))

# splitting on human and teacher generated
train_human   = train_ds.filter(lambda ex: ex.get("label_source", "") == "human")
train_teacher = train_ds.filter(lambda ex: ex.get("label_source", "") == "teacher")
print("Train human:", len(train_human), "| Train teacher:", len(train_teacher))

# oversampling human 
train_human_oversampled = concatenate_datasets([train_human] * HUMAN_DUP_FACTOR)
train_balanced = concatenate_datasets([train_human_oversampled, train_teacher]).shuffle(seed=RANDOM_SEED)
train_ds = train_balanced
print("New train size after oversampling:", len(train_ds))


Dataset({
    features: ['input_text', 'target_text', 'label_source'],
    num_rows: 1470
})
Train size: 1323 | Val size: 147
Train human: 1024 | Train teacher: 299
New train size after oversampling: 2347


In [None]:
tok = AutoTokenizer.from_pretrained(MODEL_ID)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# 4-bit quantization (QLoRA style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# model in 4-bit, note that automatically is must be placed on GPU
base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)
base.config.pad_token_id = tok.pad_token_id

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=TARGET_MODULES,
)
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()

# to save up memory during training (we got a lot of out ofmemory errors)
model.gradient_checkpointing_enable()
model.config.use_cache = False


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


In [None]:
def build_example(ex: Dict[str, Any]) -> Dict[str, Any]:
    input_text  = ex.get("input_text", "")
    target_text = ex.get("target_text", "")

    # propmt + full text
    prompt = input_text + "\n\n### Barbara:\n"
    x = prompt + target_text

    # full padded to max_len for batching
    enc_full = tok(
        x,
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )
    # prompt onl with NO padding 
    enc_prompt = tok(
        prompt,
        truncation=True,
        padding=False,
        add_special_tokens=False,
    )

    input_ids = enc_full["input_ids"]
    labels    = input_ids.copy()

    # asking only pront
    n_prompt = len(enc_prompt["input_ids"])
    n_prompt = min(n_prompt, len(labels))
    for i in range(n_prompt):
        labels[i] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": enc_full["attention_mask"],
        "labels": labels,
    }


cols = ["input_ids", "attention_mask", "labels"]

train_tok = train_ds.map(
    build_example,
    remove_columns=train_ds.column_names,
)
val_tok = val_ds.map(
    build_example,
    remove_columns=val_ds.column_names,
)

train_tok.set_format(type="torch", columns=cols)
val_tok.set_format(type="torch", columns=cols)

print("Tokenized train size:", len(train_tok), "| val size:", len(val_tok))
ex0 = train_tok[0]
print("len(input_ids):", len(ex0["input_ids"]))
print("len(labels):   ", len(ex0["labels"]))
print("num supervised tokens:", sum(1 for t in ex0["labels"].tolist() if t != -100))


Map:   0%|          | 0/2347 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Tokenized train size: 2347 | val size: 147
len(input_ids): 800
len(labels):    800
num supervised tokens: 736


In [23]:
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    fp16=True,
    learning_rate=2e-4,
    logging_steps=20,
    eval_strategy="epoch",  
    save_strategy="epoch", 
    save_total_limit=2,
    report_to="none",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=default_data_collator,
    tokenizer=tok,
)

trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,0.1062,0.097299
1,0.1014,0.092053
2,0.0955,0.091213


  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


TrainOutput(global_step=219, training_loss=0.6032676092565876, metrics={'train_runtime': 44557.8061, 'train_samples_per_second': 0.158, 'train_steps_per_second': 0.005, 'total_flos': 3.48651690196992e+16, 'train_loss': 0.6032676092565876, 'epoch': 2.9846678023850086})

In [24]:
output_dir = "/home/student/Whatsapp_webApp_-Django-/Fine_Tune/distilled/KB_lora"
trainer.save_model(output_dir)
tok.save_pretrained(output_dir)
print("Finished training + saved model + tokenizer.")

Finished training + saved model + tokenizer.




Generation test

In [None]:
pipe = pipeline(
    "text-generation",
    model=output_dir,
    tokenizer=tok,
    device_map="auto",
)

def generate_barbara_reply(query: str, max_new_tokens: int = 80):
    input_text = (
        "You are Barbara. Answer in her natural WhatsApp style.\n\n"
        "### QUERY\n"
        f"{query}\n\n"
        "### INSTRUCTIONS\n"
        "Reply as Barbara would reply in WhatsApp."
    )
    prompt = input_text + "\n\n### Barbara:\n"

    out = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )[0]["generated_text"]

    if "### Barbara:" in out:
        reply = out.split("### Barbara:", 1)[1].strip()
    else:
        reply = out.strip()

    return reply

print(generate_barbara_reply("Hi, I'm sick today, can I get an extension?"))
