# **Importation & Lib :** 

In [None]:
!pip install --upgrade --quiet bitsandbytes triton accelerate transformers
!pip install -q --upgrade transformers accelerate peft trl faiss-cpu
!pip install -q wandb

In [13]:
import os
os.environ["TRANSFORMERS_NO_ADDITIONAL_CHAT_TEMPLATES"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

import sys, importlib
import transformers
import datasets
import trl
import tqdm
import json, random, re, numpy as np, pandas as pd
from datasets import load_dataset, Dataset  
from transformers import AutoTokenizer
import torch, faiss
from transformers import AutoTokenizer, AutoModel
from collections import Counter, defaultdict
import bitsandbytes, triton
from transformers import (
    AutoTokenizer,  
    AutoModelForCausalLM,
    TrainingArguments
)
from peft import LoraConfig, prepare_model_for_kbit_training
from kaggle_secrets import UserSecretsClient
from trl import SFTConfig, SFTTrainer

print("‚úÖ All libraries imported successfully!")
print("\n" + "="*50)
print("‚úÖ LIBRARY VERSION CHECK ‚úÖ")
print("="*50)
print(f"üêç Python: {sys.version.split()[0]}")
print("\n--- Core Libraries ---")
print(f"üî• Torch: {torch.__version__} | CUDA: {torch.version.cuda}")
print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")
print("\n--- Hugging Face Ecosystem ---")
print(f"ü§ó Transformers: {transformers.__version__}")
print(f"üìì Datasets: {datasets.__version__}")
print(f"üß™ TRL: {trl.__version__}")
print("\n--- Hardware & Optimization ---")
print(f"‚ö° BitsAndBytes: {bitsandbytes.__version__}")
print(f"üî± Triton: {triton.__version__}")
print("\n--- Data & Utility ---")
print(f"üìä Numpy: {np.__version__}")
print(f"üìà Pandas: {pd.__version__}")
print(f"üîç Faiss: {faiss.__version__}")
print(f"‚è≥ Tqdm: {tqdm.__version__}") 
print("="*50)

‚úÖ All libraries imported successfully!

‚úÖ LIBRARY VERSION CHECK ‚úÖ
üêç Python: 3.11.13

--- Core Libraries ---
üî• Torch: 2.6.0+cu124 | CUDA: 12.4
‚úÖ CUDA available: True

--- Hugging Face Ecosystem ---
ü§ó Transformers: 4.57.1
üìì Datasets: 4.1.1
üß™ TRL: 0.24.0

--- Hardware & Optimization ---
‚ö° BitsAndBytes: 0.48.1
üî± Triton: 3.2.0

--- Data & Utility ---
üìä Numpy: 1.26.4
üìà Pandas: 2.2.3
üîç Faiss: 1.12.0
‚è≥ Tqdm: 4.67.1


In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/bert-embeddings/bert_embeddings.npy
/kaggle/input/bert-embeddings/ft_news_dialogue_FINAL_2.jsonl
/kaggle/input/bert-embeddings/news_dialogue_two_roles.json
/kaggle/input/bert-embeddings/news_dialogue.json
/kaggle/input/bert-embeddings/news_articles_cleaned_for_bert.csv
/kaggle/input/bert-embeddings/ft_news_dialogue_host_interview.jsonl


# -----------------------------------------------------

-----------------------------------------------------

# **1. Topic Semantic Search**

In [4]:
# ------------------------------------------------------------
# 1. Load embeddings and cleaned dataset
# ------------------------------------------------------------
try:
    embeddings = np.load("/kaggle/input/bert-embeddings/bert_embeddings.npy")
    df = pd.read_csv("/kaggle/input/bert-embeddings/news_articles_cleaned_for_bert.csv")
except FileNotFoundError:
    print("‚ùå Error: Could not find input files.")
    print("Please ensure '/kaggle/input/bert-embeddings/bert_embeddings.npy' and")
    print("'/kaggle/input/bert-embeddings/news_articles_cleaned_for_bert.csv' exist.")
    # Stop execution if files aren't found
    raise

print(f"Embeddings: {embeddings.shape[0]} | Dataset rows: {len(df)}")

# --- FIXED SYNTAX ERROR ---
# Removed the stray code ", gs, axis=1, keepdims=True)"
assert len(df) == embeddings.shape[0], "‚ùå Mismatch between embeddings and dataset rows!"
print("‚úÖ Data loaded and verified.")

# ------------------------------------------------------------
# 2. Build & save FAISS index
# ------------------------------------------------------------
index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product ‚âà cosine
index.add(embeddings)
faiss.write_index(index, "topic_retriever.index")

# Save category labels aligned with embeddings
df["category"].to_csv("topic_labels.csv", index=False)
print("‚úÖ FAISS index and topic labels saved.")

# ------------------------------------------------------------
# 3. Load same BERT model/tokenizer used for embeddings
# ------------------------------------------------------------
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()
print(f"‚úÖ Model '{model_name}' loaded onto {device}.")

# --- FIXED NORMALIZATION & INDENTATION ---
def embed_text(text):
    """Encode new text into same vector space as dataset."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden.size()).float()
        sum_emb = torch.sum(last_hidden * mask, 1)
        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_pool = sum_emb / sum_mask
    emb = mean_pool.cpu().numpy()
    
    # --- THIS IS THE CORRECT NORMALIZATION ---
    # The (1, 768) embedding is normalized along axis=1
    norm = np.linalg.norm(emb, axis=1, keepdims=True)
    emb = emb / (norm + 1e-9)  # Add epsilon for numerical stability
    
    return emb.astype("float32")

# ------------------------------------------------------------
# 4. Load FAISS index & topic labels
# ------------------------------------------------------------
index = faiss.read_index("topic_retriever.index")
topics = pd.read_csv("topic_labels.csv")["category"].tolist()
print("‚úÖ FAISS index and labels re-loaded for inference.")

# ------------------------------------------------------------
# 5. Example: predict topic of a new speech
# ------------------------------------------------------------
# --- FIXED INDENTATION ---
speech = "We are moving from a mobile and cloud era to an era of ubiquitous computing and ambient intelligence, an era which will experience more digitization over the next 10 years than the last 40. Going forward, every business process will be collaborative, powered by data and AI, and will bridge the digital and physical worlds. One thing underlying everything is how large-scale AI models are becoming platforms in their own right, creating that ambient intelligence all around us."

speech_emb = embed_text(speech)
D, I = index.search(speech_emb, k=10)  # Search for 10 nearest neighbors
retrieved_topics = [topics[i] for i in I[0]]

# Find the most common topic among the neighbors
predicted = Counter(retrieved_topics).most_common(1)[0][0]

print("\n" + "="*35)
print("     TOPIC RETRIEVAL RESULT")
print("="*35)
print("User Speech:")
print(f"  '{speech}'")
print("-----------------------------------")
print(f"Predicted Topic: {predicted}")
print("===================================")

Embeddings: 6844 | Dataset rows: 6844
‚úÖ Data loaded and verified.
‚úÖ FAISS index and topic labels saved.
‚úÖ Model 'bert-base-uncased' loaded onto cuda.
‚úÖ FAISS index and labels re-loaded for inference.

     TOPIC RETRIEVAL RESULT
User Speech:
  'We are moving from a mobile and cloud era to an era of ubiquitous computing and ambient intelligence, an era which will experience more digitization over the next 10 years than the last 40. Going forward, every business process will be collaborative, powered by data and AI, and will bridge the digital and physical worlds. One thing underlying everything is how large-scale AI models are becoming platforms in their own right, creating that ambient intelligence all around us.'
-----------------------------------
Predicted Topic: TECH


# -----------------------------------------------------

----------------------------------------------------------------------------

# **2. Data Filtration and Cleaning for finetuning use later :**

In [5]:
# ============================================================
# Inspect the uploaded MediaSum-like dataset (array format)
# ============================================================
INPUT_PATH = "/kaggle/input/bert-embeddings/news_dialogue.json"

# Load full JSON or only first few entries if it's large
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"‚úÖ Loaded {len(data)} records\n")

# Convert to DataFrame
df = pd.DataFrame(data)
print("Columns:", df.columns.tolist(), "\n")

# Show one example in detail
print(df.iloc[1])

# If it has utt/speaker, preview them
if "utt" in df.columns and "speaker" in df.columns:
    print("\nFirst 3 utterances:", df.iloc[0]['utt'][:3])
    print("First 3 speakers:", df.iloc[0]['speaker'][:3])

‚úÖ Loaded 463596 records

Columns: ['id', 'program', 'date', 'url', 'title', 'summary', 'utt', 'speaker'] 

id                                                     NPR-2
program                               Weekend Edition Sunday
date                                              2016-10-23
url        https://www.npr.org/2016/10/23/499042298/young...
title      Young, First-Time Voters Share Views On Electi...
summary    NPR's Rachel Martin speaks with young voters w...
utt        [You have heard it again and again - this is a...
speaker    [RACHEL MARTIN, HOST, ASHANTI MARTINEZ, LAUREN...
Name: 1, dtype: object

First 3 utterances: ['Now, moving on, Forest Whitaker as Moses, Tisha Campbell Martin as Mary Magdalene - well, that\'s all in "The Bible Experience." A New Testament edition was released in 2006. This edition is billed as "The Complete Bible." It doesn\'t have one person reading the gospels. It features nearly 400 African-American artists, actors and ministers, plus sound eff

In [6]:
# ============================================================
# Analyze alignment and length statistics
# ============================================================
INPUT_PATH = "/kaggle/input/bert-embeddings/news_dialogue.json"

with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Verify alignment
utt_lens = [len(d["utt"]) for d in data]
spk_lens = [len(d["speaker"]) for d in data]
aligned_mask = [u == s for u, s in zip(utt_lens, spk_lens)]
aligned = sum(aligned_mask)
print(f"‚úÖ {aligned:,}/{len(data):,} dialogues have matching utt/speaker lengths "
      f"({aligned/len(data)*100:.2f}% alignment).\n")

# Compute averages and percentiles
utt_arr = np.array(utt_lens)
print(f"üó£Ô∏è Average number of utterances per dialogue: {utt_arr.mean():.2f}")
print(f"üîπ Median: {np.median(utt_arr):.0f}")
print(f"üîπ 90th percentile: {np.percentile(utt_arr, 90):.0f}")
print(f"üîπ 99th percentile: {np.percentile(utt_arr, 99):.0f}")
print(f"üìè Min: {utt_arr.min()} | Max: {utt_arr.max()}\n")

# Show one well-formed example
for d in data:
    if len(d["utt"]) == len(d["speaker"]):
        print("--- Example dialogue ---")
        for u, s in zip(d["utt"][:8], d["speaker"][:8]):  # show first 8 turns
            print(f"{s}: {u}")
        break

‚úÖ 463,596/463,596 dialogues have matching utt/speaker lengths (100.00% alignment).

üó£Ô∏è Average number of utterances per dialogue: 30.02
üîπ Median: 21
üîπ 90th percentile: 56
üîπ 99th percentile: 188
üìè Min: 1 | Max: 2827

--- Example dialogue ---
FARAI CHIDEYA, host: Now, moving on, Forest Whitaker as Moses, Tisha Campbell Martin as Mary Magdalene - well, that's all in "The Bible Experience." A New Testament edition was released in 2006. This edition is billed as "The Complete Bible." It doesn't have one person reading the gospels. It features nearly 400 African-American artists, actors and ministers, plus sound effects.
FARAI CHIDEYA, host: Just listen to Blair Underwood's rendition of Jesus on the cross.
Mr. BLAIR UNDERWOOD (Actor): (As Jesus) My God, my God, why have you forsaken me?
FARAI CHIDEYA, host: Now, we've got two people affiliated with the project with us today. Kyle Bowser, he co-produced "The Bible Experience" and actress Wendy Raquel Robinson, one of the ac

In [7]:
# ============================================================
# Analyze token length and speaker role vocabulary
# ============================================================
INPUT_PATH = "/kaggle/input/bert-embeddings/news_dialogue.json"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# --- Compute token counts on a sample ---
sample_size = 5000  # adjust for speed
token_counts = []
for d in data[:sample_size]:
    for utt in d["utt"]:
        n_tokens = len(tokenizer.encode(utt, add_special_tokens=False))
        token_counts.append(n_tokens)

token_counts = np.array(token_counts)
print(f"üßæ Sample size: {len(token_counts):,} utterances")
print(f"Average tokens per utterance: {token_counts.mean():.2f}")
print(f"Median: {np.median(token_counts):.0f}")
print(f"90th percentile: {np.percentile(token_counts,90):.0f}")
print(f"Max: {token_counts.max()}\n")

# --- Extract role descriptors from speaker strings ---
role_pattern = re.compile(r"(?:,|[\(])\s*([A-Za-z\s\&\.]+?)(?:[\)\,]|$)")
roles = []

for d in data[:20000]:  # sample 20k dialogues for coverage
    for spk in d["speaker"]:
        match = role_pattern.search(spk)
        if match:
            role = match.group(1).strip().upper()
            # remove clutter like "MR." "MS." etc.
            role = re.sub(r"\b(MR|MS|MRS|DR|PROF|THE|A|AN)\b\.?\s*", "", role)
            roles.append(role)

role_counts = Counter(roles)
print(f"üéôÔ∏è Found {len(role_counts)} distinct role descriptors.\n")

# Display top 30 most frequent roles
for role, freq in role_counts.most_common(40):
    print(f"{role:40s} {freq}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1051 > 512). Running this sequence through the model will result in indexing errors


üßæ Sample size: 205,854 utterances
Average tokens per utterance: 48.05
Median: 40
90th percentile: 102
Max: 1051

üéôÔ∏è Found 4094 distinct role descriptors.

HOST                                     240830
BYLINE                                   69938
CALLER                                   14512
AUTHOR                                   6088
DIRECTOR                                 2447
COLUMNIST                                2360
DEMOCRAT                                 1789
PRESIDENT                                1727
EXECUTIVE DIRECTOR                       1573
REPORTER                                 1441
ACTOR                                    1191
REPUBLICAN                               1073
EDITOR                                   1029
WRITER                                   1003
SINGER                                   755
FOUNDER                                  661
SENIOR FELLOW                            600
MUSICIAN                                 588
EDITORIAL

In [10]:
INPUT_PATH = "/kaggle/input/bert-embeddings/news_dialogue.json"
OUTPUT_PATH = "news_dialogue_known_roles.json"

KNOWN_ROLES = {
    "HOST", "AUTHOR", "DIRECTOR", "DEMOCRAT", "PRESIDENT",
    "EXECUTIVE DIRECTOR", "ACTOR", "REPUBLICAN", "WRITER",
    "FOUNDER", "VICE PRESIDENT", "PROFESSOR", "CEO",
    "CHAIRMAN", "PRESIDENT AND CEO", "OWNER"
}

role_pattern = re.compile(r"(?:,|[\(])\s*([A-Za-z\s\&\.]+?)(?:[\)\,]|$)")

def extract_role(spk: str) -> str:
    match = role_pattern.search(spk)
    if match:
        role = match.group(1).strip().upper()
        role = re.sub(r"\b(MR|MS|MRS|DR|PROF|THE|A|AN)\b\.?\s*", "", role)
        return role
    return ""

with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

clean_data = []
for d in tqdm.tqdm(data, desc="Filtering dialogues"):
    roles = {extract_role(s) for s in d["speaker"] if s}
    if roles and all(r in KNOWN_ROLES for r in roles):
        clean_data.append(d)

print(f"‚úÖ Filtered dataset: {len(clean_data):,} dialogues kept")

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(clean_data, f, ensure_ascii=False, indent=2)

print(f"üíæ Saved filtered dataset to '{OUTPUT_PATH}'")


Filtering dialogues: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 463596/463596 [00:09<00:00, 47582.00it/s]


‚úÖ Filtered dataset: 4,216 dialogues kept
üíæ Saved filtered dataset to 'news_dialogue_known_roles.json'


In [14]:
# ============================================================
# Keep only 2-role dialogues and compute average utt length per combo
# ============================================================
INPUT_PATH = "news_dialogue_known_roles.json"   
OUTPUT_2ROLE_PATH = "news_dialogue_two_roles.json"
OUTPUT_STATS_PATH = "two_role_combo_stats.csv"

# ------------------------------------------------------------
# Regex for role extraction
# ------------------------------------------------------------
role_pattern = re.compile(r"(?:,|[\(])\s*([A-Za-z\s\&\.]+?)(?:[\)\,]|$)")
def extract_role(spk: str) -> str:
    match = role_pattern.search(spk)
    if match:
        role = match.group(1).strip().upper()
        role = re.sub(r"\b(MR|MS|MRS|DR|PROF|THE|A|AN)\b\.?\s*", "", role)
        return role
    return ""

# ------------------------------------------------------------
# Load dataset
# ------------------------------------------------------------
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    clean_data = json.load(f)

# ------------------------------------------------------------
# Keep only dialogues with exactly 2 distinct roles
# ------------------------------------------------------------
two_role_data = []
for d in tqdm.tqdm(clean_data, desc="Filtering to 2-role dialogues"):
    roles = {extract_role(s) for s in d["speaker"] if s}
    if len(roles) == 2:
        two_role_data.append((tuple(sorted(roles)), len(d["utt"]), d))

print(f"\n‚úÖ Kept {len(two_role_data):,} dialogues with exactly 2 roles "
      f"out of {len(clean_data):,} ({len(two_role_data)/len(clean_data)*100:.2f}%)")

# ------------------------------------------------------------
# Compute average utterance count per combination
# ------------------------------------------------------------
combo_stats = defaultdict(list)
for roles, n_utt, _ in two_role_data:
    combo_stats[roles].append(n_utt)

combo_summary = []
for combo, utt_list in combo_stats.items():
    combo_summary.append({
        "roles": " - ".join(combo),
        "count_dialogues": len(utt_list),
        "avg_utterances": np.mean(utt_list),
        "median_utterances": np.median(utt_list),
        "max_utterances": np.max(utt_list)
    })

df_combo = pd.DataFrame(combo_summary).sort_values("count_dialogues", ascending=False)
df_combo.reset_index(drop=True, inplace=True)

# ------------------------------------------------------------
# Print top 20 combinations
# ------------------------------------------------------------
print("\nüéôÔ∏è Top 20 two-role combinations with average dialogue length:\n")
for i, row in df_combo.head(20).iterrows():
    print(f"{row['roles']:60s}  {row['count_dialogues']:5d} dialogs  "
          f"avg len: {row['avg_utterances']:.1f}  "
          f"(median {row['median_utterances']:.0f}, max {row['max_utterances']})")

# ------------------------------------------------------------
# Save filtered dataset (for training)
# ------------------------------------------------------------
only_dialogues = [d for _, _, d in two_role_data]

with open(OUTPUT_2ROLE_PATH, "w", encoding="utf-8") as f:
    json.dump(only_dialogues, f, ensure_ascii=False, indent=2)

print(f"\nüíæ Saved {len(only_dialogues):,} two-role dialogues to '{OUTPUT_2ROLE_PATH}'")

# ------------------------------------------------------------
# Save combination statistics
# ------------------------------------------------------------
df_combo.to_csv(OUTPUT_STATS_PATH, index=False)
print(f"üíæ Saved role combo stats to '{OUTPUT_STATS_PATH}'")


Filtering to 2-role dialogues: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4216/4216 [00:00<00:00, 76118.92it/s]


‚úÖ Kept 784 dialogues with exactly 2 roles out of 4,216 (18.60%)

üéôÔ∏è Top 20 two-role combinations with average dialogue length:

AUTHOR - HOST                                                   202 dialogs  avg len: 24.8  (median 22, max 111)
DEMOCRAT - HOST                                                 128 dialogs  avg len: 18.9  (median 19, max 49)
DIRECTOR - HOST                                                  94 dialogs  avg len: 22.1  (median 20, max 66)
HOST - PRESIDENT                                                 62 dialogs  avg len: 23.8  (median 22, max 48)
HOST - REPUBLICAN                                                62 dialogs  avg len: 18.4  (median 18, max 52)
HOST - WRITER                                                    47 dialogs  avg len: 21.1  (median 20, max 52)
EXECUTIVE DIRECTOR - HOST                                        30 dialogs  avg len: 21.4  (median 20, max 71)
FOUNDER - HOST                                                   30 dialogs  av




# -----------------------------------------------------

-----------------------------------------------------

# **3. Processing the data for prompt ready sample into JSONL file for finetuning :**

In [15]:
# ============================================================
# Inspect dialogue samples from the two-role dataset
# ============================================================
INPUT_PATH = "/kaggle/input/bert-embeddings/news_dialogue_two_roles.json"

# --- Load JSON data ---
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"‚úÖ Loaded {len(data):,} records\n")

# --- Convert small sample to DataFrame for a quick glance ---
df = pd.DataFrame(data)
print("Columns:", df.columns.tolist(), "\n")

# --- Show 2 random samples (compact form) ---
sample_df = df.sample(2, random_state=42)
for idx, row in sample_df.iterrows():
    print(f"üÜî ID: {row['id']}")
    print(f"üéôÔ∏è Program: {row.get('program', '')}")
    print(f"üìÖ Date: {row.get('date', '')}")
    print(f"üì∞ Title: {row.get('title', '')}")
    print(f"üßæ Summary: {row.get('summary', '')[:180]}...")
    print(f"üó£Ô∏è Number of utterances: {len(row['utt'])}")
    print(f"üë• Speakers: {set(row['speaker'])}")
    print("First 5 utterances:")
    for u, s in list(zip(row['utt'], row['speaker']))[:3]:
        print(f"  {s}: {u}")
    print("-" * 90)


‚úÖ Loaded 784 records

Columns: ['id', 'program', 'date', 'url', 'title', 'summary', 'utt', 'speaker'] 

üÜî ID: NPR-41326
üéôÔ∏è Program: All Things Considered
üìÖ Date: 2007-03-04
üì∞ Title: Obama, Clinton Reflect on Selma's Lengthy Shadow
üßæ Summary: Democratic senators Hillary Clinton and Barack Obama courted black voters today with speeches in Selma, Ala., on the 42nd anniversary of the Selma March ‚Äî a civil rights demonstrat...
üó£Ô∏è Number of utterances: 13
üë• Speakers: {'Senator HILLARY CLINTON (Democrat, New York)', 'DEBBIE ELLIOTT, host', 'Senator BARACK OBAMA (Democrat, Illinois)'}
First 5 utterances:
  DEBBIE ELLIOTT, host: And now we return to the top story of the hour and take a few minutes to hear extended excerpts from today's speeches in Selma, Alabama by Democratic presidential hopefuls Hillary Clinton and Barack Obama.
  DEBBIE ELLIOTT, host: Here's Senator Obama speaking at the Brown Chapel, A.M.E. Church.
  Senator BARACK OBAMA (Democrat, Illinois): A 

In [17]:
# ============================================================
# Convert two-role dialogues into fine-tuning JSONL format
# ============================================================
INPUT_PATH = "/kaggle/input/bert-embeddings/news_dialogue_two_roles.json"
OUTPUT_PATH = "ft_news_dialogue_host_interview.jsonl"

# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def merge_consecutive(utterances, speakers):
    """Merge consecutive lines from the same speaker."""
    merged = []
    current_speaker = speakers[0]
    current_text = [utterances[0]]
    for u, s in zip(utterances[1:], speakers[1:]):
        if s == current_speaker:
            current_text.append(u)
        else:
            merged.append((current_speaker, " ".join(current_text)))
            current_speaker, current_text = s, [u]
    merged.append((current_speaker, " ".join(current_text)))
    return merged

def identify_host(speakers):
    for s in speakers:
        if "host" in s.lower():
            return s
    return None

# ------------------------------------------------------------
# Professional system prompt templates
# ------------------------------------------------------------
def build_system_prompt(title, summary):
    """Dynamic prompt generation with professional context injection."""
    base_prompt = (
        "You are an experienced journalist conducting a live interview. "
        "Your style is conversational, respectful, and inquisitive. "
        "You guide discussions with clear, insightful questions that encourage depth and authenticity. "
        "You respond naturally, reference prior remarks when relevant, and balance professionalism with empathy. "
        "If the guest provides complex or emotional information, acknowledge it gracefully and keep the discussion coherent."
    )

    # 50% of samples include contextual metadata
    if random.random() < 0.5 and title:
        context_prompt = (
            f"Today's topic: {title.strip()} "
            f"‚Äî Background: {summary.strip() if summary else 'no summary available.'} "
            "Keep your questions grounded in this context, but do not rigidly quote or summarize it. "
            "Your goal is to create an engaging dialogue that flows naturally while remaining factually grounded."
        )
        return f"{base_prompt}\n\n{context_prompt}"
    else:
        fallback_prompt = (
            "Engage the guest naturally. Ask follow-up questions that build on what they say. "
            "Avoid monologues; favor short, well-phrased questions. End gracefully when appropriate."
        )
        return f"{base_prompt}\n\n{fallback_prompt}"

# ------------------------------------------------------------
# Load dataset
# ------------------------------------------------------------
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"‚úÖ Loaded {len(data):,} dialogues")

# ------------------------------------------------------------
# Transform dialogues
# ------------------------------------------------------------
records = []
for d in tqdm.tqdm(data, desc="Converting"):
    host = identify_host(d["speaker"])
    if not host:
        continue

    merged = merge_consecutive(d["utt"], d["speaker"])
    sys_prompt = build_system_prompt(d.get("title", ""), d.get("summary", ""))

    messages = [{"role": "system", "content": sys_prompt.strip()}]

    for spk, utt in merged:
        if not utt.strip():
            continue
        role = "assistant" if spk == host else "user"
        messages.append({"role": role, "content": utt.strip()})

    if sum(1 for m in messages if m["role"] == "assistant") >= 1 and \
       sum(1 for m in messages if m["role"] == "user") >= 1:
        records.append({"messages": messages})

print(f"‚úÖ Prepared {len(records):,} usable dialogues")

# ------------------------------------------------------------
# Save as JSONL
# ------------------------------------------------------------
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for r in records:
        json.dump(r, f, ensure_ascii=False)
        f.write("\n")

print(f"üíæ Saved fine-tuning file to '{OUTPUT_PATH}'")


‚úÖ Loaded 784 dialogues


Converting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 784/784 [00:00<00:00, 48537.02it/s]

‚úÖ Prepared 784 usable dialogues
üíæ Saved fine-tuning file to 'ft_news_dialogue_host_interview.jsonl'





In [18]:
# ============================================================
#  Add "Let's begin the interview." after system message
#  and merge consecutive messages with same role
# ============================================================
OLD_PATH = "/kaggle/input/bert-embeddings/ft_news_dialogue_host_interview.jsonl"
NEW_PATH = "ft_news_dialogue_FINAL_2.jsonl"

def add_intro_and_merge(example):
    msgs = example["messages"]
    fixed = []
    # --- Keep system message if present ---
    if msgs and msgs[0]["role"] == "system":
        fixed.append(msgs[0])
        msgs = msgs[1:]
    # --- Always insert neutral intro user message ---
    fixed.append({"role": "user", "content": "Let's begin the interview."})
    # --- Merge consecutive messages with same role ---
    for msg in msgs:
        if fixed and fixed[-1]["role"] == msg["role"]:
            # Merge text if same role as previous
            fixed[-1]["content"] += " " + msg["content"].strip()
        else:
            fixed.append(msg)
    example["messages"] = fixed
    return example

# --- Process and save new file ---
with open(OLD_PATH, "r", encoding="utf-8") as fin, open(NEW_PATH, "w", encoding="utf-8") as fout:
    for line in fin:
        example = json.loads(line)
        example = add_intro_and_merge(example)
        fout.write(json.dumps(example, ensure_ascii=False) + "\n")
print(f"‚úÖ New merged dataset saved at: {NEW_PATH}")

‚úÖ New merged dataset saved at: ft_news_dialogue_FINAL_2.jsonl


In [19]:
# ============================================================
#  Validate final dataset structure (role alternation)
# ============================================================
FILE_PATH = "/kaggle/input/bert-embeddings/ft_news_dialogue_FINAL_2.jsonl"

bad_rows = []
double_user = []
double_assistant = []

with open(FILE_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        ex = json.loads(line)
        msgs = ex.get("messages", [])

        if not msgs:
            bad_rows.append(i)
            continue

        # --- Check if system is followed by user ---
        if msgs[0]["role"] == "system":
            if len(msgs) < 2 or msgs[1]["role"] != "user":
                bad_rows.append(i)

        # --- Check for consecutive duplicate roles ---
        for j in range(1, len(msgs)):
            if msgs[j]["role"] == msgs[j - 1]["role"]:
                if msgs[j]["role"] == "user":
                    double_user.append((i, j))
                elif msgs[j]["role"] == "assistant":
                    double_assistant.append((i, j))

# --- Summary report ---
print("="*60)
print("üîç Dataset Validation Summary")
print("="*60)
print(f"Total rows checked: {i}")
print(f"‚ùå Rows with system not followed by user: {len(bad_rows)}")
print(f"‚ö†Ô∏è  Consecutive user-user pairs: {len(double_user)}")
print(f"‚ö†Ô∏è  Consecutive assistant-assistant pairs: {len(double_assistant)}")
print("="*60)

if bad_rows:
    print("Example problematic rows (system‚Üínon-user):", bad_rows[:5])
if double_user:
    print("Example user-user rows:", double_user[:5])
if double_assistant:
    print("Example assistant-assistant rows:", double_assistant[:5])

üîç Dataset Validation Summary
Total rows checked: 784
‚ùå Rows with system not followed by user: 0
‚ö†Ô∏è  Consecutive user-user pairs: 0
‚ö†Ô∏è  Consecutive assistant-assistant pairs: 0


# -----------------------------------------------------

-----------------------------------------------------------------------

# **4. Finetuning Using Mistral 7B v3 :**

In [34]:
# ============================================================
# 2. Load and Prepare Dataset
# ============================================================
DATA_PATH = "/kaggle/input/bert-embeddings/ft_news_dialogue_FINAL_2.jsonl"

with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = [json.loads(l) for l in f]

random.shuffle(data)
n = len(data)
train_data = data[:int(0.8*n)]
eval_data  = data[int(0.8*n):int(0.9*n)]
test_data  = data[int(0.9*n):]

train_ds = Dataset.from_list(train_data)
eval_ds  = Dataset.from_list(eval_data)
test_ds  = Dataset.from_list(test_data)
dataset = {"train": train_ds, "eval": eval_ds, "test": test_ds}

print(f"‚úÖ Train: {len(train_ds)}, Eval: {len(eval_ds)}, Test: {len(test_ds)}")
print(dataset["train"][0])


‚úÖ Train: 627, Eval: 78, Test: 79
{'messages': [{'content': "You are an experienced journalist conducting a live interview. Your style is conversational, respectful, and inquisitive. You guide discussions with clear, insightful questions that encourage depth and authenticity. You respond naturally, reference prior remarks when relevant, and balance professionalism with empathy. If the guest provides complex or emotional information, acknowledge it gracefully and keep the discussion coherent.\n\nToday's topic: Rep. Maxine Waters on Blacks and Immigration ‚Äî Background: Where do black lawmakers stand on the issue of undocumented workers? U.S. Rep. Maxine Waters (D-CA), discusses the issue with Ed Gordon. Waters' district includes Los Angeles and surrounding cities, one of the hotbeds of protests against an immigration crackdown. Keep your questions grounded in this context, but do not rigidly quote or summarize it. Your goal is to create an engaging dialogue that flows naturally while 

In [35]:
# ============================================================
# 3. Load Base Model & Tokenizer (QLoRA Setup)
# ============================================================
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [37]:
# --- 1. Log in to W&B using Kaggle Secrets ---
# (This securely gets the key you just added)
try:
    api_key = UserSecretsClient().get_secret("WANDB_API_KEY")
    os.environ["WANDB_API_KEY"] = api_key
    print("‚úÖ W&B login successful using Kaggle Secret.")
except Exception as e:
    print(f"‚ö†Ô∏è Could not log in using Kaggle Secrets. Error: {e}")
    print("Please go to 'Add-ons' > 'Secrets' and add your W&B API key.")

# --- 2. Set your W&B Project and Entity names ---
# (This tells W&B where to send the run)
os.environ["WANDB_PROJECT"] = "mistral7b-newsbot-ft"  # You can change this project name
os.environ["WANDB_ENTITY"] = "adam-pro-01-esprit"    # This is your team name from the screenshot

‚úÖ W&B login successful using Kaggle Secret.


In [None]:
# ============================================================
# 4. Training Configuration
# ============================================================
sft_config = SFTConfig(
    output_dir="./mistral7b-newsbot-ft",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_steps=15,
    bf16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="wandb",  
)

# ============================================================
# 5. Fine-tune with TRL SFTTrainer
# ============================================================
# (This code is fixed, with invisible characters removed)
print("Initializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    peft_config=lora_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["eval"],
)

print("Starting trainer.train()... This will create a run in W&B.")
trainer.train()

print("‚úÖ Training complete! Check your W&B dashboard.")

Initializing SFTTrainer...


Tokenizing train dataset:   0%|          | 0/627 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/627 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/78 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/78 [00:00<?, ? examples/s]

Starting trainer.train()... This will create a run in W&B.


  return fn(*args, **kwargs)


Step,Training Loss
15,2.341
30,2.1322
45,2.0789
60,2.0414


In [None]:
# ============================================================
# 6. Save Trained Adapter
# ============================================================
SAVE_DIR = "mistral7b-newsbot-lora"
trainer.model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("‚úÖ LoRA fine-tuning complete and model saved at:", SAVE_DIR)

-------------------

-----------------------------------------------------------------------

-----------------------------------------------------------------------