In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, entry) for entry in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade pip
!pip install torch==2.5.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
# Load the CNN/DailyMail training CSV from the Kaggle input directory.
train_df=pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Remove the "id" column; only "article" and "highlights" are used further down.
# Rebinding (no inplace) with errors="ignore" keeps this cell safe to re-run
# after the column has already been dropped.
train_df = train_df.drop(columns="id", errors="ignore")

In [None]:
# Print a structural summary of the training frame.
train_df.info()

In [None]:
# Look at the first record.
train_df.iloc[0]

In [None]:
# Show the article text of the first record.
train_df.iloc[0]['article']

In [None]:
# Keep the first article text in `script`.
script = train_df.iloc[0]['article']

In [None]:
# Show the reference summary ("highlights") of the first record.
train_df.iloc[0]['highlights']

In [None]:
# Keep the first reference summary in `summary`.
# NOTE: the inference cell near the end of the notebook reassigns `summary`
# to the generated text.
summary = train_df.iloc[0]['highlights']

In [None]:
# Display the stored reference summary.
summary

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from within the notebook.
# Presumably required for the push_to_hub calls in the last cell — confirm.
notebook_login()

In [None]:
# Show the column names of the training frame.
train_df.columns

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options; `max_seq_length` is reused when the trainer is built.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# Collect the arguments first so the load call itself stays a one-liner.
model_load_kwargs = dict(
    model_name = "unsloth/mistral-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_kwargs)

In [None]:
def find_target_modules(model, target_class_name="Linear4bit", return_full_names=False):
    """
    Collect the names of sub-modules whose class name contains a given substring.

    Args:
        model (torch.nn.Module): Model whose ``named_modules()`` are scanned.
        target_class_name (str): Substring matched against each module's class
            name (e.g., "Linear4bit").
        return_full_names (bool): When True, keep the full dotted module path;
            otherwise keep only the final path component.

    Returns:
        List[str]: Sorted, de-duplicated names of the matching modules.
    """
    matches = {
        qualified_name if return_full_names else qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if target_class_name in type(submodule).__name__
    }
    return sorted(matches)


In [None]:
# Collect the short names of every module whose class name contains "Linear4bit";
# the list is passed as `target_modules` when the LoRA adapters are attached.
res = find_target_modules(model, target_class_name="Linear4bit", return_full_names=False)
print(res)

In [None]:
# Wrap the loaded model with LoRA adapters on the modules collected in `res`.
# `model` is rebound to the adapter-wrapped model, so this cell expects the
# freshly loaded base model as its input.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = res,
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None # And LoftQ
)

In [None]:
# Draw a 5,000-row random sample with a fixed random_state and renumber its index from 0.
subset_df = train_df.sample(n=5000, random_state=42).reset_index(drop=True)

In [None]:
# Display the sampled subset.
subset_df

In [None]:
from datasets import Dataset

# Alpaca-style template: first slot takes the article, second the summary.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following text:

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every training example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of article/highlights pairs into training prompts.

    Args:
        examples: Batch mapping with parallel "article" and "highlights" lists.

    Returns:
        dict: {"text": [...]} with one filled-in prompt (plus EOS_TOKEN) per pair.
    """
    texts = []
    for article, highlights in zip(examples["article"], examples["highlights"]):
        texts.append(alpaca_prompt.format(article, highlights) + EOS_TOKEN)
    return { "text": texts }


# Turn the sampled DataFrame into a Hugging Face Dataset and add the "text" column.
dataset = Dataset.from_pandas(subset_df).map(formatting_prompts_func, batched=True)


In [None]:
# Display the formatted dataset object.
dataset

In [None]:
from transformers import AutoTokenizer

# Load tokenizer for Unsloth's Mistral-7B 4bit model
# NOTE(review): this rebinds `tokenizer`, replacing the instance returned by
# FastLanguageModel.from_pretrained together with the model. EOS_TOKEN in the
# dataset-formatting cell was read from that earlier instance. Confirm the
# reload is actually needed.
tokenizer = AutoTokenizer.from_pretrained("unsloth/mistral-7b-bnb-4bit", use_fast=True)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the single-epoch fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1,
    num_train_epochs = 1,
    warmup_steps = 10,
    learning_rate = 5e-5,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 5,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none"
)

# Supervised fine-tuning trainer over the "text" column, with sequence packing enabled.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True,
    args = training_args,
)


In [None]:
# Report the GPU model, its total memory, and the peak memory reserved so far.
BYTES_PER_GIB = 1024 * 1024 * 1024

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning; the value returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
# Use the article at position 10 of train_df as the inference example.
# NOTE(review): this row comes from the training frame, so it may also be part
# of the sampled training subset — not a held-out example.
script_2 = train_df.iloc[10]['article']

In [None]:
# Display the article that will be summarised.
script_2

In [None]:
from unsloth import FastLanguageModel
import torch, re

# Enable fast inference mode
FastLanguageModel.for_inference(model)

# Make sure the tokenizer can mark end-of-sequence and pad a batch.
if tokenizer.eos_token is None:
    tokenizer.eos_token = "</s>"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

MAX_NEW_TOKENS = 200

# Build the prompt with the same Alpaca-style template used for fine-tuning
# (`alpaca_prompt`), leaving the response slot empty for the model to fill.
# If the article is too long, trim the ARTICLE (not the whole prompt) so the
# trailing "### Response:" marker is never truncated away and the prompt plus
# the generated tokens stay within max_seq_length.
template_token_count = len(tokenizer(alpaca_prompt.format("", ""))["input_ids"])
article_budget = max(1, max_seq_length - MAX_NEW_TOKENS - template_token_count)
article_ids = tokenizer(script_2, add_special_tokens=False)["input_ids"]
if len(article_ids) > article_budget:
    article_text = tokenizer.decode(article_ids[:article_budget], skip_special_tokens=True)
else:
    article_text = script_2
prompt = alpaca_prompt.format(article_text, "")

# Tokenize (truncation stays on as a safety net).
inputs = tokenizer(
    [prompt],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=max_seq_length,
).to("cuda")

# Greedy decoding. Sampling knobs (temperature/top_p) and beam-search knobs
# (early_stopping/length_penalty) have no effect with do_sample=False and a
# single beam, so they are not passed.
outputs = model.generate(
    **inputs,
    max_new_tokens=MAX_NEW_TOKENS,
    min_new_tokens=20,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    use_cache=True,
)

# Keep only the newly generated tokens. Slicing by token count is exact, whereas
# slicing the decoded string by the decoded prompt length can drift when
# decode(encode(text)) does not round-trip character-for-character.
prompt_token_count = inputs["input_ids"].shape[1]
summary = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True).strip()

# Final cleanup to ensure we have only the summary
summary = re.sub(r'(?i)(summary:|the summary is:|here\'s a summary:|step-by-step|instructions:|note:|###)', '', summary)
summary = re.sub(r'^\d+\.[\s]*', '', summary, flags=re.MULTILINE)
summary = re.sub(r'^\*[\s]*', '', summary, flags=re.MULTILINE)
summary = re.sub(r'<\|im_end\|>.*', '', summary, flags=re.DOTALL)  # Remove anything after end token
summary = summary.strip()

# Ensure the summary ends with proper punctuation: if it stops mid-sentence,
# cut it back to the last complete sentence.
if summary and summary[-1] not in ".!?":
    last_sentence_end = max(summary.rfind(mark) for mark in ".!?")
    if last_sentence_end > 0:
        summary = summary[:last_sentence_end + 1]

print("SUMMARY:\n", summary.strip())

# Post-processing function to call if you still get incomplete summaries
def get_complete_sentences(text):
    """Return only the complete sentences found in ``text``.

    A sentence is any run of characters terminated by ``.``, ``!`` or ``?``;
    trailing text without terminal punctuation is dropped. Each sentence is
    stripped before joining so the result is separated by exactly one space
    (the raw regex matches keep the whitespace that preceded them).

    Args:
        text (str): Text to filter.

    Returns:
        str: The complete sentences joined by single spaces, or ``text``
        unchanged when it contains no terminated sentence.
    """
    sentences = [match.strip() for match in re.findall(r'[^.!?]*[.!?]', text)]
    if not sentences:
        return text  # Return original if no complete sentences found
    return ' '.join(sentences)

# Uncomment this line if you still get incomplete summaries
# print("CLEANED SUMMARY:\n", get_complete_sentences(summary))

In [None]:
# Upload the trained model object and then its tokenizer to the same Hub repository.
hub_repo_id = "badbrock/mistral-7b-finetuned"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo_id)