# Install Libraries

In [1]:
! pip install -p transformers accelerate bitsandbytes peft torch


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -p


# Import Libraries

In [32]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup

# Global Variables

In [7]:
INPUT_BASE_PATH = "/kaggle/input/"

In [26]:
NEWS_DATASET_PATH = os.path.join(INPUT_BASE_PATH, "european-commission-news/eu_commission_news.csv")

In [10]:
LLAMA_MODEL_PATH = os.path.join(INPUT_BASE_PATH, "llama-3.1/transformers/8b-instruct/2")

In [12]:
OUTPUT_BASE_PATH = '/kaggle/working/'

In [33]:
CLEAN_DATASET_PATH = os.path.join(OUTPUT_BASE_PATH, "eu_commission_news_clean.csv")

# Import Dataset

In [29]:
df = pd.read_csv(NEWS_DATASET_PATH)

In [30]:
df.head()

Unnamed: 0,title,link,date,summary,description
0,Mounting risks threaten survival of wild Europ...,https://environment.ec.europa.eu/news/mounting...,11 October 2025,Nearly 100 additional wild bee species in Euro...,"Abu Dhabi, United Arab Emirates, 11 October 20..."
1,Energy Efficiency Directive: Advancing the EU’...,https://energy.ec.europa.eu/news/energy-effici...,10 October 2025,Tomorrow (11 October) marks the deadline for E...,Tomorrow (11 October) marks the deadline for E...
2,EU’s Leading Role Instrumental in advancing su...,https://research-and-innovation.ec.europa.eu/n...,10 October 2025,G20 reinforces international collaboration in ...,"The G20 Research, Science and Innovation Minis..."
3,President von der Leyen travels to the Western...,https://enlargement.ec.europa.eu/news/presiden...,10 October 2025,"The President of the European Commission, Ursu...","The President of the European Commission, Ursu..."
4,Fishing vessel engine power - new Commission g...,https://oceans-and-fisheries.ec.europa.eu/news...,10 October 2025,The European Commission has published two tech...,The European Commission has published two tech...


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        266 non-null    object
 1   link         266 non-null    object
 2   date         266 non-null    object
 3   summary      266 non-null    object
 4   description  266 non-null    object
dtypes: object(5)
memory usage: 10.5+ KB


# Data Preparation & Cleaning

In [34]:
# Define a function to clean text
def clean_text(text):
    if pd.isna(text):
        return ""
    # Remove HTML tags
    text = BeautifulSoup(str(text), "html.parser").get_text()
    # Remove non-UTF characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to relevant text columns
text_columns = ['title', 'summary', 'description']  # adjust based on your actual columns
for col in text_columns:
    if col in df.columns:
        df[col] = df[col].apply(clean_text)

# Save the cleaned data
df.to_csv(CLEAN_DATASET_PATH, index=False)

# Preview cleaned data
df.head()

Unnamed: 0,title,link,date,summary,description
0,Mounting risks threaten survival of wild Europ...,https://environment.ec.europa.eu/news/mounting...,11 October 2025,Nearly 100 additional wild bee species in Euro...,"Abu Dhabi, United Arab Emirates, 11 October 20..."
1,Energy Efficiency Directive: Advancing the EU ...,https://energy.ec.europa.eu/news/energy-effici...,10 October 2025,Tomorrow (11 October) marks the deadline for E...,Tomorrow (11 October) marks the deadline for E...
2,EU s Leading Role Instrumental in advancing su...,https://research-and-innovation.ec.europa.eu/n...,10 October 2025,G20 reinforces international collaboration in ...,"The G20 Research, Science and Innovation Minis..."
3,President von der Leyen travels to the Western...,https://enlargement.ec.europa.eu/news/presiden...,10 October 2025,"The President of the European Commission, Ursu...","The President of the European Commission, Ursu..."
4,Fishing vessel engine power - new Commission g...,https://oceans-and-fisheries.ec.europa.eu/news...,10 October 2025,The European Commission has published two tech...,The European Commission has published two tech...


# Try Llama 3.1

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 1. Define the correct local path for Version 2 (Note the '2' at the end)

# 2. Define the 4-bit quantization config (Crucial for VRAM on a T4 GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # Normalized Float 4-bit (recommended)
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for computation if supported (faster)
)

In [3]:
# 3. Load Tokenizer and Model
# Note: You may need to add trust_remote_code=True for some models, but it is often unnecessary 
# for officially hosted models.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto"
)

# Set the padding token, which is often missing or incorrectly set for Llama models
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Llama 3.1 8B Model loaded successfully!")

2025-10-28 13:47:57.328550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761659277.511718      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761659277.560629      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Llama 3.1 8B Model loaded successfully!


In [4]:
# Define the conversation history. We'll use a simple system message and a user query.
messages = [
    # Optional: A system message to define the model's persona or rules.
    {"role": "system", "content": "You are a concise, factual European news assistant."},
    # The user's question to test the model.
    {"role": "user", "content": "Explain why the Euro zone was created in one short paragraph."}
]

In [5]:
# Apply the template to create the final prompt string (input_ids)
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True, # Tells the tokenizer to add the final 'assistant' header
    return_tensors="pt"
).to(model.device) # Move the prompt tokens to the GPU where the model is loaded

In [7]:
# The messages list is already defined from your previous cell
# messages = [...]

# 1. Apply the template to create the final prompt *string* (not a tensor yet)
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False, # Important: Returns a string
    add_generation_prompt=True
)

In [8]:
# 2. Tokenize the resulting string to get the required dictionary of tensors
# This is where the dictionary containing 'input_ids' and 'attention_mask' is created.
input_dict = tokenizer(prompt, return_tensors="pt").to(model.device)

In [9]:
import torch

with torch.no_grad():
    outputs = model.generate(
        **input_dict, # Pass the dictionary of tensors here
        max_new_tokens=256,    
        do_sample=True,        
        temperature=0.7        
    )

# Decode and print the output
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the assistant response
assistant_response = response_text.split("assistant\n")[-1].strip()

print("\n--- ASSISTANT RESPONSE ONLY ---")
print(assistant_response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- ASSISTANT RESPONSE ONLY ---
The Eurozone was created to facilitate economic integration among European countries. It began with the signing of the Maastricht Treaty in 1992, which established the European Monetary Union (EMU). The treaty aimed to create a single currency, the Euro, to promote economic unity, increase trade, and reduce transaction costs among participating countries. The Euro was introduced in 1999 and replaced the national currencies of participating countries in 2002.
