In [None]:
!pip install datasets transformers
!pip install evaluate

In [23]:
!pip install faiss-cpu 
!pip install langchain
!pip install langchain_community

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain_community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-core<1.0.0,>=0.3.59 (from langchain_community)
  Downloading langchain_core-0.3.60-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.25 (from langchain_community)
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.25->langchain_community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading

In [24]:
from datasets import load_dataset
from transformers import PreTrainedTokenizerBase
from transformers import AutoTokenizer, DPRContextEncoder,DPRContextEncoderTokenizerFast, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer, pipeline
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, pipeline

from datasets import DatasetDict
import evaluate
import torch
import faiss
from datasets import Dataset
from langchain.schema    import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings   import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms         import HuggingFacePipeline
from langchain.chains       import RetrievalQA

## Helper functions

In [4]:
def _locate_answer_tokens(offsets, seq_ids, start_char, end_char):
    """Map a character span of the context onto token indices of one feature.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs of the feature.
        seq_ids: per-token sequence ids; tokens with id ``1`` belong to the context.
        start_char: index of the first answer character in the context.
        end_char: index one past the last answer character in the context.

    Returns:
        ``(start_token, end_token)`` (both inclusive), or ``None`` when the
        feature has no context tokens, the answer is not fully inside this
        feature, or no token overlaps the answer span.
    """
    context_tokens = [idx for idx, s_id in enumerate(seq_ids) if s_id == 1]
    if not context_tokens:
        return None

    chunk_start_char = offsets[context_tokens[0]][0]
    chunk_end_char = offsets[context_tokens[-1]][1]
    if start_char < chunk_start_char or end_char > chunk_end_char:
        return None

    # First token that ends after the answer starts / last token that starts
    # before the answer ends. Unlike matching each boundary independently, this
    # still yields a consistent pair when a boundary falls on whitespace
    # between two tokens.
    token_start = next(
        (idx for idx in context_tokens if offsets[idx][1] > start_char), None
    )
    token_end = next(
        (idx for idx in reversed(context_tokens) if offsets[idx][0] < end_char), None
    )
    if token_start is None or token_end is None or token_start > token_end:
        return None
    return token_start, token_end


def preprocess(examples, tokenizer: PreTrainedTokenizerBase):
    """Tokenize a batch of QA examples and attach start/end token labels.

    Each (question, context) pair is encoded with a 256-token window and a
    stride of 128, so one example may produce several features. Only the
    context is truncated.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"answers"``
            columns; every ``answers`` entry holds ``"text"`` and
            ``"answer_start"`` lists, of which only the first item is used.
        tokenizer: tokenizer that supports ``return_overflowing_tokens`` and
            ``return_offsets_mapping`` and whose output exposes ``sequence_ids``.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, plus ``start_positions`` / ``end_positions``.
        Features with no answer, or whose window does not fully contain the
        answer, are labelled with index 0 for both positions.
    """
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=256,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    cls_index = 0  # label used for "no answer in this feature"
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        # Several features can originate from the same example.
        answers = examples["answers"][sample_mapping[i]]
        start_token, end_token = cls_index, cls_index
        if len(answers["answer_start"]) > 0:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            span = _locate_answer_tokens(
                offsets, tokenized.sequence_ids(i), start_char, end_char
            )
            if span is not None:
                start_token, end_token = span
        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized


In [3]:
def print_metrics(ds, custom_pipeline, metric=None):
    """Run a QA pipeline over ``ds`` and print exact-match and F1.

    Args:
        ds: dataset that supports column access (``ds["question"]``,
            ``ds["context"]``), ``len`` and row iteration; each row needs
            ``"id"`` and ``"answers"``.
        custom_pipeline: callable taking ``question=`` and ``context=`` lists
            and returning one ``{"answer": ...}`` dict per example.
        metric: object with ``compute(predictions=..., references=...)`` that
            returns ``"exact_match"`` and ``"f1"``. When omitted, the
            ``"squad"`` metric is loaded, so the function does not depend on a
            global defined in some other cell.

    Raises:
        ValueError: if the pipeline returns a different number of answers
            than there are examples.
    """
    if metric is None:
        metric = evaluate.load("squad")

    # Plain lists keep the pipeline's argument parsing independent of the
    # concrete column type returned by the dataset.
    outputs = custom_pipeline(
        question=list(ds["question"]),
        context=list(ds["context"]),
    )
    # A one-example batch comes back from the HF QA pipeline as a bare dict.
    if isinstance(outputs, dict):
        outputs = [outputs]
    if len(outputs) != len(ds):
        raise ValueError(
            f"pipeline returned {len(outputs)} answers for {len(ds)} examples"
        )

    preds = []
    refs = []
    for ex, out in zip(ds, outputs):
        preds.append({"id": ex["id"], "prediction_text": out["answer"]})
        refs.append({"id": ex["id"], "answers": ex["answers"]})

    results = metric.compute(predictions=preds, references=refs)
    print(f"EM: {results['exact_match']:.2f}, F1: {results['f1']:.2f}")

## Dataset loading

In [2]:
# Load the "squad" dataset; `raw` keeps the untouched original splits.
raw = load_dataset("squad")

# Work on a fixed 20,000-example sample of the training split, then carve an
# 80/20 train/validation split out of it (both steps seeded with 42).
sub = raw["train"].shuffle(seed=42).select(range(20000))
split = sub.train_test_split(test_size=0.2, seed=42)

# The original validation split is used as the held-out test set.
data = DatasetDict(
    train=split["train"],
    validation=split["test"],
    test=raw["validation"],
)

## Experiment 1

In [None]:
# Checkpoint shared by the tokenizer and the QA model of this experiment.
model_name = "distilbert-base-uncased-distilled-squad"

# Model that Experiment 1 fine-tunes, and the tokenizer matching it.
model_distilbert = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize every split in batches. The original columns are dropped because
# `preprocess` may emit more rows (features) than it receives examples.
tokenized = data.map(
    preprocess,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=raw["train"].column_names,
)

In [None]:
# Hyper-parameters for the full fine-tuning run (Experiment 1).
args = TrainingArguments(
    output_dir="distilbert-qa",
    # optimisation
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    # logging
    logging_steps=200,
    log_level="info",
    report_to=["none"],
    disable_tqdm=False,
)

In [None]:
# Full fine-tuning: all weights of `model_distilbert` are updated.
# No `compute_metrics` callback is passed: none is defined in this notebook,
# and `args` does not enable periodic evaluation. EM/F1 are computed after
# training with `print_metrics`.
trainer_full = Trainer(
    model=model_distilbert,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
)
trainer_full.train()

In [None]:
# SQuAD metric (exact match / F1); `print_metrics` reads it.
metric = evaluate.load("squad")

# Load the fine-tuned checkpoint written under `output_dir="distilbert-qa"`.
# The path is relative to the working directory, like `output_dir` itself,
# and the pipeline falls back to CPU when no CUDA device is available.
qa = pipeline(
    "question-answering",
    model="distilbert-qa/checkpoint-1740",
    tokenizer="distilbert-qa/checkpoint-1740",
    device=0 if torch.cuda.is_available() else -1,
)

ds = data["test"]

print_metrics(ds, qa)

## Experiment 2

In [None]:
# Separate model object loaded from `model_name` for Experiment 2
# (Experiment 1 trains `model_distilbert`).
model_partial = AutoModelForQuestionAnswering.from_pretrained(model_name)

#### Partial fine-tuning

In [None]:
# Freeze every parameter of the `distilbert` submodule; anything outside
# `model_partial.distilbert` is left as it is.
model_partial.distilbert.requires_grad_(False)


In [None]:
# Hyper-parameters for the partial fine-tuning run (Experiment 2).
args_partial = TrainingArguments(
    output_dir="distilbert-qa-partial",
    # optimisation
    num_train_epochs=3,
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    # logging
    logging_steps=200,
    log_level="info",
    report_to=["none"],
    disable_tqdm=False,
)

In [None]:
# Trainer for the partially frozen model. The tokenizer is passed as
# `processing_class`, the same keyword the Experiment 1 trainer uses; the
# older `tokenizer=` keyword is deprecated in recent transformers releases.
trainer_partial = Trainer(
    model=model_partial,
    args=args_partial,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
)

In [None]:
# Run the Experiment 2 training loop configured by `args_partial`.
trainer_partial.train()

In [None]:
# SQuAD metric (exact match / F1); `print_metrics` reads it.
metric = evaluate.load("squad")

# Checkpoint directory written by the partial fine-tuning run; it provides
# both the model weights and the tokenizer files.
partial_checkpoint_dir = "distilbert-qa-partial/checkpoint-417"

qa_partial = pipeline(
    "question-answering",
    model=partial_checkpoint_dir,
    tokenizer=partial_checkpoint_dir,
    device=0 if torch.cuda.is_available() else -1,
)

print_metrics(data["test"], qa_partial)

## Experiment 3

In [27]:
# Gather every passage from both original splits. Each column is converted to
# a plain list first so that `+` works whatever sequence type the dataset
# returns for a column.
contexts = list(raw["train"]["context"]) + list(raw["validation"]["context"])
# dict.fromkeys de-duplicates while keeping first-seen order.
unique = list(dict.fromkeys(contexts))
docs = [Document(page_content=c) for c in unique]

# Cut passages into chunks of at most 500 characters with a 50-character overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = splitter.split_documents(docs)

# Name the embedding model explicitly instead of relying on the library
# default, so the index is reproducible if that default ever changes.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
db = FAISS.from_documents(split_docs, embeddings)
# Return the 4 nearest chunks per query.
retriever = db.as_retriever(search_kwargs={"k": 4})

  embeddings = HuggingFaceEmbeddings()


In [28]:
# NOTE(review): this rebinds the notebook-wide name `tokenizer`, which the
# Experiment 1/2 cells use for the DistilBERT tokenizer; re-running those
# cells after this one would pick up the RAG tokenizer instead. No later cell
# visible here appears to use this object (`qa_pipe` is given a model name
# for its tokenizer) — confirm whether this cell is still needed.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [30]:
# Reader for Experiment 3: the off-the-shelf distilled SQuAD checkpoint, used
# for both the model weights and the tokenizer.
reader_checkpoint = "distilbert-base-uncased-distilled-squad"

qa_pipe = pipeline(
    "question-answering",
    model=reader_checkpoint,
    tokenizer=reader_checkpoint,
)

def answer_with_distilbert(query: str, docs: list[Document]) -> str:
    """Answer ``query`` by reading the retrieved documents with ``qa_pipe``.

    Args:
        query: the question to answer.
        docs: retrieved documents; their ``page_content`` values are joined
            with single spaces to form the reader's context.

    Returns:
        The ``"answer"`` string produced by ``qa_pipe``, or ``""`` when there
        is no text to read (no documents, or only blank content), in which
        case the pipeline is not called.
    """
    context = " ".join(d.page_content for d in docs)
    if not context.strip():
        # Nothing was retrieved: do not hand an empty context to the reader.
        return ""
    out = qa_pipe(question=query, context=context)
    return out["answer"]


# End-to-end check: retrieve chunks for one question, then let the reader
# answer it. `invoke` replaces the deprecated `get_relevant_documents`.
demo_query = "Who is Thomas Hardy?"
retrieved_docs = retriever.invoke(demo_query)
print(answer_with_distilbert(demo_query, retrieved_docs))

Device set to use cuda:0


novelists and poets
