In [39]:
!pip show bitsandbytes

Name: bitsandbytes
Version: 0.48.1
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/bitsandbytes-foundation/bitsandbytes
Author: 
Author-email: Tim Dettmers <dettmers@cs.washington.edu>
License-Expression: MIT
Location: C:\Users\Webbies\anaconda3\Lib\site-packages
Requires: numpy, packaging, torch
Required-by: 


# CELL - 1: Installing the HF_XET module

In [2]:
# !pip install -q hf-xet

# CELL - 2: Importing All Required Packages

In [None]:
import os
from pathlib import Path
import json
import random
import getpass
from dataclasses import dataclass, field

# Set the HF Token and Configuration for Future Use
HF_TOKEN = os.environ.get("HF_TOKEN", "your_hf_token")
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

# helper
print("HF_TOKEN set:", bool(HF_TOKEN))

HF_TOKEN set: True


# CELL - 3: Model Configuration and Setting Hyper-parameters

In [6]:
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"  # meta-llama/Llama-2-7b-chat-hf or mistralai/Mistral-7B-Instruct-v0.2

OUTPUT_DIR = "./lora_personalized_lbm"

os.makedirs(OUTPUT_DIR, exist_ok = True)

# Training hyperparams
BATCH_SIZE = 1   # small for low VRAM; use gradient accumulation to simulate larger B
GRAD_ACCUM_STEPS = 8
EPOCHS = 2
LEARNING_RATE = 1e-5
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.2
MAX_SEQ_LEN = 1024

print("Config:")
print("MODEL_NAME:", MODEL_NAME)
print("OUTPUT_DIR:", OUTPUT_DIR)

Config:
MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
OUTPUT_DIR: ./lora_personalized_lbm


# CELL - 4: Installing Accelerate Library

In [8]:
# !pip install -q accelerate

# CELL - 5: Model and Tokenizer Loading in 4 bit quantization

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
import warnings
warnings.filterwarnings('ignore')

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4"
)

# bnb_config = BitsAndBytesConfig(load_in_8bit = True, llm_int8_enable_fp32_cpu_offload = True)

print("Loading tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast = False)
tokenizer.pad_token = tokenizer.eos_token

print("Loading model in 4-bit (this may take a while)...")

# AutoModelForCausalLM supports bitsandbytes integration (depends on model repo)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config = bnb_config,
    device_map = "auto",  # let accelerate choose device placement
    trust_remote_code = True,
    use_auth_token = HF_TOKEN or None # low_cpu_mem_usage=True  and torch_dtype=torch.float16
)

print("Model loaded.")

Loading tokenizer...
Loading model in 4-bit (this may take a while)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.


# CELL - 6: Loading the PENS data from Local Directory

In [12]:
# import os, csv, json
# from pathlib import Path
# import pandas as pd

# DATA_DIR = "./data/pens"

# train_path = os.path.join(DATA_DIR, "train.tsv")
# val_path = os.path.join(DATA_DIR, "valid.tsv")
# news_path = os.path.join(DATA_DIR, "news.tsv")

# assert os.path.exists(train_path), "Missing train.tsv"
# assert os.path.exists(val_path), "Missing validation.tsv"
# assert os.path.exists(news_path), "Missing news.tsv (required for article text)"

# # Load TSVs
# train_df_full = pd.read_csv(train_path, sep = "\t")
# val_df_full = pd.read_csv(val_path, sep = "\t")
# news_df_full = pd.read_csv(news_path, sep = "\t")


# # Sampling the train_df, val_df and news_df with 2000 samples with random state 42
# train_df = train_df_full.sample(15000, random_state = 42)
# val_df = val_df_full.sample(15000, random_state = 42)
# news_df = news_df_full.sample(15000, random_state = 42)

# print("=========================================================")

# print("The shapes of Respective dataframes are:")
# print("Train shape:", train_df.shape)
# print("Val shape:", val_df.shape)
# print("News shape:", news_df.shape)

# print("=========================================================")

# print("The Column names of Respective dataframes are:")
# print("Sample Train columns:", train_df.columns.tolist())
# print("Sample Validation columns:", val_df.columns.tolist())
# print("Sample news columns:", news_df.columns.tolist())

############################################## NEW MODIFIED CODE ############################################
import os, csv, json
from pathlib import Path
import pandas as pd

DATA_DIR = "./data/pens"

train_path = os.path.join(DATA_DIR, "train.tsv")
val_path = os.path.join(DATA_DIR, "valid.tsv")
news_path = os.path.join(DATA_DIR, "news.tsv")

assert os.path.exists(train_path), "Missing train.tsv"
assert os.path.exists(val_path), "Missing validation.tsv"
assert os.path.exists(news_path), "Missing news.tsv (required for article text)"

# Load TSVs
train_df_full = pd.read_csv(train_path, sep = "\t")
val_df_full = pd.read_csv(val_path, sep = "\t")
news_df_full = pd.read_csv(news_path, sep = "\t")

# 1. Sample larger train set and smaller validation set for better ratio
TRAIN_SAMPLE_SIZE = 1500  # Increased for higher training sample count
VAL_SAMPLE_SIZE = 300    # Decreased for lower validation sample count
RANDOM_STATE = 42

# Sample the train and validation logs
train_df = train_df_full.sample(TRAIN_SAMPLE_SIZE, random_state = RANDOM_STATE).reset_index(drop=True)
val_df = val_df_full.sample(VAL_SAMPLE_SIZE, random_state = RANDOM_STATE).reset_index(drop=True)

# 2. Extract ALL unique news IDs required by the sampled train and val sets
# The 'pos' column contains semicolon/pipe separated news IDs (which Code-2 parses)
def extract_all_news_ids(df):
    all_ids = set()
    for ids_str in df['pos'].dropna():
        # Handle both ';' and '|' separators, and split
        ids = str(ids_str).replace(";", "|").split("|")
        all_ids.update([n.strip() for n in ids if n.strip()])
    return all_ids

required_train_ids = extract_all_news_ids(train_df)
required_val_ids = extract_all_news_ids(val_df)
required_news_ids = required_train_ids.union(required_val_ids)

# 3. Filter the full news_df to only include the articles we actually need
# This ensures data linkage is maintained and dramatically increases the hit rate in the build_prompt_target function.
news_df = news_df_full[news_df_full['News ID'].isin(required_news_ids)].reset_index(drop = True)

print(f"Total unique news IDs needed by sampled data: {len(required_news_ids)}")
# --- END OF MODIFIED SECTION ---

print("=========================================================")

print("The shapes of Respective dataframes are:")
print("Train shape (User Logs):", train_df.shape)
print("Val shape (User Logs):", val_df.shape)
print("News shape (Filtered Articles):", news_df.shape) # This size will now be variable
print("=========================================================")

print("The Column names of Respective dataframes are:")
print("Sample Train columns:", train_df.columns.tolist())
print("Sample Validation columns:", val_df.columns.tolist())
print("Sample news columns:", news_df.columns.tolist())

Total unique news IDs needed by sampled data: 1572
The shapes of Respective dataframes are:
Train shape (User Logs): (1500, 9)
Val shape (User Logs): (300, 9)
News shape (Filtered Articles): (786, 7)
The Column names of Respective dataframes are:
Sample Train columns: ['UserID', 'ClicknewsID', 'dwelltime', 'exposure_time', 'pos', 'neg', 'start', 'end', 'dwelltime_pos']
Sample Validation columns: ['UserID', 'ClicknewsID', 'dwelltime', 'exposure_time', 'pos', 'neg', 'start', 'end', 'dwelltime_pos']
Sample news columns: ['News ID', 'Category', 'Topic', 'Headline', 'News body', 'Title entity', 'Entity content']


# CELL - 7: Merge Data to Build Behaviour-text pairs

In [14]:
def parse_news_ids(news_ids):
    """Parse semicolon or pipe separated news IDs into a clean list."""
    if pd.isna(news_ids):
        return []
    ids = str(news_ids).replace(";", "|").split("|")
    return [n.strip() for n in ids if n.strip()]


def get_user_profile(row, news_df):
    """
    Aggregate user's positive-click history (headlines of clicked articles)
    to build behavioral context.
    """
    pos_ids = parse_news_ids(row.get("pos", ""))
    clicked_headlines = []
    for nid in pos_ids[:5]:  # limit to last 5 clicked
        if nid in news_df.index:
            clicked_headlines.append(str(news_df.loc[nid, "Headline"]))
    avg_dwell = row.get("dwelltime_pos", 0)
    return {
        "user_id": row["UserID"],
        "clicked_headlines": clicked_headlines,
        "avg_dwell": avg_dwell,
    }


# Set News ID as index for faster lookup
news_df = news_df.set_index("News ID")


def build_prompt_target(row, news_df):
    """
    Build training samples combining user behavior and article text.
    - Input (prompt): combines user profile + article text
    - Target: the article's headline (as a personalized summary)
    """
    pos_ids = parse_news_ids(row.get("pos", ""))
    samples = []

    for nid in pos_ids:
        if nid in news_df.index:
            article = news_df.loc[nid]

            # User profile context
            profile = get_user_profile(row, news_df)
            profile_str = (
                f"<USER:{profile['user_id']}> "
                f"<AVG_DWELL:{profile['avg_dwell']}> "
                f"<HISTORY:{'|'.join(profile['clicked_headlines'])}>"
            )

            # Use news text and headline
            headline = str(article.get("Headline", "")).strip()
            news_body = str(article.get("News body", "")).strip()
            topic = str(article.get("Topic", "")).strip()
            category = str(article.get("Category", "")).strip()

            doc_text = f"[Category: {category}] [Topic: {topic}] {news_body}"

            # Headline is the personalized summary target
            target = headline

            # Prompt
            prompt = (
                f"Summarize the following news article for a user with preferences: {profile_str}\n\n"
                f"Article:\n{doc_text}\n\nSummary:"
            )

            samples.append({"prompt": prompt, "target": target})

    return samples


# Build dataset
train_samples = []
for _, row in train_df.iterrows():
    train_samples.extend(build_prompt_target(row, news_df))

val_samples = []
for _, row in val_df.iterrows():
    val_samples.extend(build_prompt_target(row, news_df))

print(f"Generated {len(train_samples)} training pairs and {len(val_samples)} validation pairs.")

Generated 837 training pairs and 176 validation pairs.


# CELL - 8: Converting the data into JSONL files for Fine Tuning

In [16]:
os.makedirs("processed_pens", exist_ok = True)

with open("processed_pens/pens_train.jsonl", "w", encoding = "utf-8") as f:
    for ex in train_samples:
        f.write(json.dumps(ex, ensure_ascii = False) + "\n")
        
with open("processed_pens/pens_val.jsonl", "w", encoding = "utf-8") as f:
    for ex in val_samples:
        f.write(json.dumps(ex, ensure_ascii = False) + "\n")

print("Saved preprocessed data to processed_pens/")

Saved preprocessed data to processed_pens/


# CELL - 9: Create Dataset class for PENS

In [18]:
from torch.utils.data import Dataset
import json

class PENSJsonlDataset(Dataset):
    def __init__(self, jsonl_path: str, tokenizer, max_length: int = 1024):
        self.tokenizer = tokenizer
        with open(jsonl_path, "r", encoding = "utf-8") as f:
            self.examples = [json.loads(line) for line in f]
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]
        prompt = ex["prompt"]
        target = ex["target"]
        text = prompt + " " + target
        enc = self.tokenizer(
            text,
            truncation = True,
            max_length = self.max_length,
            padding = "max_length",
            return_tensors = "pt"
        )
        input_ids = enc["input_ids"].squeeze()
        attention_mask = enc["attention_mask"].squeeze()
        labels = input_ids.clone()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

print("The dataset Class is created Successfully")

The dataset Class is created Successfully


# CELL - 10: Initialize training dataset from your processed files

In [20]:
train_jsonl_path = "processed_pens/pens_train.jsonl"
val_jsonl_path = "processed_pens/pens_val.jsonl"

ds = PENSJsonlDataset(train_jsonl_path, tokenizer, max_length = 512)
val_ds = PENSJsonlDataset(val_jsonl_path, tokenizer, max_length = 512)

print("Dataset ready. Training samples:", len(ds))
print("Dataset ready. Validation samples:", len(val_ds))

Dataset ready. Training samples: 837
Dataset ready. Validation samples: 176


# CELL - 11: Setup LoRA with PEFT and prepare Trainer

In [22]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, EarlyStoppingCallback

# Prepare model for int8/4bit training (PEFT helper)
# Note: prepare_model_for_int8_training supports int8; for 4-bit many users still apply similar prep.
try:
    model = prepare_model_for_kbit_training(model)
except Exception as e:
    print("prepare_model_for_int8_training hit an issue (ok for 4-bit), continuing:", e)

lora_config = LoraConfig(
    r = LORA_R,
    lora_alpha = LORA_ALPHA,
    target_modules = ["q_proj", "v_proj"] if "llama" in MODEL_NAME.lower() else None,
    lora_dropout = LORA_DROPOUT,
    bias = "none",
    task_type = TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
print("PEFT LoRA model prepared. Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

PEFT LoRA model prepared. Trainable params: 8388608


# CELL - 12: Create DataLoader and simple training loop using HuggingFace Trainer

In [24]:
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

print("Data Collator Created")

# Setting the Training Arguements
training_args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = GRAD_ACCUM_STEPS,
    num_train_epochs = EPOCHS,
    learning_rate = LEARNING_RATE,
    eval_strategy="steps",
    eval_steps=20,
    load_best_model_at_end=True,
    warmup_steps = 10,
    logging_steps = 10,
    save_strategy = "steps",
    save_steps = 20,
    logging_dir = "./logs",
    report_to = "none",
    fp16 = True,
    save_total_limit = 2,
    metric_for_best_model="eval_loss",
)
print("Training Parameters are set")

# Creating the Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = ds,
    eval_dataset = val_ds,
    tokenizer = tokenizer,
    data_collator = data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 2)],
)
print("Trainer Created Successfully")

print("Starting Hugging Face Trainer fine-tuning...")

# THE ONLY REQUIRED CHANGE
# Set 'resume_from_checkpoint=True' to enable automatic resumption.
trainer.train() # resume_from_checkpoint = True

print("Training complete!")

# Save the LoRA adapter weights
trainer.save_model(OUTPUT_DIR)
print("Saved LoRA weights to", OUTPUT_DIR)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Data Collator Created
Training Parameters are set
Trainer Created Successfully
Starting Hugging Face Trainer fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
20,2.3487,2.337042
40,2.2089,2.250175
60,2.1503,2.174112
80,2.0614,2.115283
100,2.0411,2.065123
120,1.9952,2.021264
140,1.9519,1.984319
160,1.9059,1.955437
180,1.945,1.937767
200,1.8827,1.929712


Training complete!
Saved LoRA weights to ./lora_personalized_lbm


# CELL - 13: Inference demo: LoRA + 4-bit generate with conditioning

In [29]:
from peft import PeftModel
from transformers import GenerationConfig
import torch

# Load base model (same as during training)
model_base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map = "auto",
    trust_remote_code = True,
    load_in_4bit = True,  # if using 4-bit quantization
    bnb_4bit_compute_dtype = torch.float16
)

# Load LoRA adapter on top of base model
model = PeftModel.from_pretrained(model_base, OUTPUT_DIR)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("The model is loaded in Evaluation Mode with LoRA adapter.")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# Define a function to generate summary
def generate_summary(document: str, user_profile: dict, max_new_tokens = 200):
    profile_str = f"<FOCUS:{user_profile['focus']}> <TONE:{user_profile['tone']}> <LENGTH:{user_profile['length']}> <HISTORY:{'|'.join(user_profile['history'])}>"
    prompt = f"Summarize the following document for a user with preferences: {profile_str}\n\nDocument:\n{document}\n\nSummary:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    gen_config = GenerationConfig(
        temperature=0.2,
        top_p=0.95,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id
    )
    with torch.no_grad():
        out = model.generate(**inputs, generation_config=gen_config)
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    # post-process: strip prompt prefix
    summary_text = text.split("Summary:")[-1].strip()
    return summary_text

# CELL - 14: Creating a Sample Document

In [None]:
sample_doc = """Nearly 2,000 miners across four states may lose their jobs after yet another major coal company filed for bankruptcy this week 
the third since May and fourth since last October. The bankruptcy filing from Revelation Energy LLC and its affiliate Blackjewel LLC, 
the nation's sixth-top coal producing company in 2017, comes amid President Donald Trump's ongoing efforts to boost the flagging industry. 
The Trump administration rolled out a rule last month aiming to extend the lives of aging coal-fired power plants across the nation. 
Environmentalists say the Affordable Clean Energy rule would trigger premature deaths, including from lung disease. During his 2016 
presidential campaign, Trump promised to revitalize coal and save miners' jobs, despite scientists linking the burning of the fossil fuels 
to global warming , but the industry has continued to suffer losses. Coal comeback? Trump plan breathes new life into aging power plants, 
but critics say climate will suffer At mines and facilities in Virginia, Kentucky, West Virginia and Wyoming, Revelation Energy and 
Blackjewel employ 1,800 workers, according to court documents and The Casper (Wyo.) Star-Tribune . Company officials estimate they owe 
$156 million for goods and services, West Virginia Public Radio reported. Last month, Cambrian Coal LLC also filed for bankruptcy . 
The company operating in Kentucky and Virginia blamed its bankruptcy on changes in demand and regulations related to the Clean Air Act. 
Another coal-producing leader filed for Chapter 11 bankruptcy in May. Once the nation's third-largest coal company, Wyoming-based Cloud Peak Energy 
employed 1,300 people at the time of its filing. It accounted for 7.4% of total U.S. coal production in 2017, according to the Department of Labor . 
And, the nation's ninth-leading coal company went to bankruptcy court late in 2018. Colorado-based Westmoreland Coal Co. had more than $1.4 
billion in debt at the time, The Associated Press reported. Gone by 2030?: On World Environment Day, everything you know about energy in 
the US might be wrong Although Trump has touted coal's rebirth, 51 coal plants have closed and eight coal companies have filed for bankruptcy 
since his election, CBS News reported last month. Coal's share of the U.S. electricity mix fell from 48% in 2008 to 27% in 2018 and is 
projected to be 22% in 2020, according to the Department of Energy. \"We're retiring a coal plant every month. Coal will all be gone by 2030,
said Bruce Nilles , a managing director at the Rocky Mountain Institute, a think tank in Colorado that focuses on energy and resource efficiency. 
Coal policy, including Trump's Affordable Clean Energy rule , could influence the 2020 election in swing states where coal is still mined, 
such as Ohio and Pennsylvania. Contributing: Beth Weise and Ledyard King, USA TODAY This article originally appeared on USA TODAY: 
Is President Donald Trump losing his fight to save coal? Third major company since May files for bankruptcy. 
"""

sample_doc[:500]

# CELL - 15: Creating A User Profile for use

In [None]:
# Build a demo user profile based on the sample_doc content
demo_profile = {
    "focus": "results",      # focus on the outcome of the events: job losses, bankruptcies
    "tone": "analytical",    # factual, professional summary style
    "length": "medium",      # medium length to capture key points
    "history": [
        "Coal industry layoffs in Appalachia",
        "Bankruptcy filings of major mining companies",
        "Trump administration energy policies",
        "Impact of Clean Air Act on coal sector",
        "US electricity mix and coal decline"
    ]
}

print("Demo user profile:\n", demo_profile)

# CELL - 16: Running the generate_summary function on the Sample Document and User Profile

In [None]:
demo_profile = {"focus":"results","tone":"analytical","length":"short","history":["previous_article_title_1"]}
print("Generated summary:\n", generate_summary(sample_doc, demo_profile))

# CELL 17: Save artifacts and small README

In [None]:
with open("README_MODEL_ARTIFACTS.md", "w") as f:
    f.write("Artifacts: LoRA weights, tokenizer (if needed), FAISS index (docs_index.faiss).")
print("Done.")