In [1]:
%pip install langchain-community langchain-huggingface faiss-gpu sentence-transformers datasets

Collecting langchain-community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.4.0,>=0.3.4 (from langchain-community)
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.12 (from langchain-community)
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-communi

In [2]:
import os
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load your scraped data and split into chunks
def load_and_split_data(directory_path):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    docs = []

    for file in os.listdir(directory_path):
      if file.endswith(".txt"):
        with open(os.path.join(directory_path, file), 'r') as f:
            text = f.read()
            # Split the text into chunks
            docs.extend(text_splitter.split_text(text))

    return docs

# Example usage
context_docs = load_and_split_data('./data')

In [3]:
len(context_docs)

2099

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

vector_store = FAISS.from_texts(context_docs, embeddings)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
def retrieve_documents(query, k=5):
    docs = vector_store.similarity_search(query, k=k)
    return [doc.page_content for doc in docs]

In [6]:
query = "When was the Pittsburgh Soul Food Festival established?"
context = retrieve_documents(query)

In [7]:
# Load questions and answers from separate files
with open("/content/train/questions.txt", "r") as f:
    questions = [line.strip() for line in f]

with open("/content/train/reference_answers.txt", "r") as f:
    answers = [line.strip() for line in f]

# Combine questions and answers
qa_data = [{"question": q, "answer": a} for q, a in zip(questions, answers)]

In [8]:
len(qa_data)

2051

In [9]:
def prepare_training_data(qa_data):
    training_data = []

    for qa in qa_data:
        query = qa["question"]
        true_answers = qa["answer"].split(';')

        retrieved_docs = retrieve_documents(query, k=5)
        context = " ".join(retrieved_docs)

        for true_answer in true_answers:
            true_answer = true_answer.strip()
            answer_start = context.find(true_answer)
            if answer_start != -1:
                training_example = {
                    "question": query,
                    "context": context,
                    "answer_text": true_answer,
                    "answer_start": answer_start,
                    "answer_end": answer_start + len(true_answer)
                }
                training_data.append(training_example)

    return training_data

In [10]:
training_data = prepare_training_data(qa_data)

In [11]:
from datasets import Dataset
dataset = Dataset.from_list(training_data)

In [12]:
print(dataset)

dataset.to_pandas().head()

Dataset({
    features: ['question', 'context', 'answer_text', 'answer_start', 'answer_end'],
    num_rows: 640
})


Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the name of the arena where the Pittsb...,"Where to See Them:PNC Park\nCapacity:38,362O\n...",PPG Paints Arena,645,661
1,Which company sponsored the naming rights for ...,"AVE (PPG Paints Arena), 1000 FORT DUQUESNE BLV...",PPG Paints,5,15
2,How do inclusive processes and practices aim t...,Dr. Wanda Heading-Grant\nVice Provost for Dive...,Inclusive processes and practices strive to br...,1027,1156
3,What year was the University Center re-dedicat...,Since the 1990s\n---------------\nIn the 1990s...,2014,3435,3439
4,What is the name given to the large grassy are...,Title: Carnegie Mellon University\n===========...,The Cut,7841,7848


In [13]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
# Load RoBERTa model and tokenizer
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



In [14]:
def tokenize_and_align_labels(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answer_text = examples["answer_text"][sample_index]
        start_char = examples["answer_start"][sample_index]
        end_char = examples["answer_end"][sample_index]

        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Check if the answer is within the span (token_start_index, token_end_index)
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)

            while token_end_index >= token_start_index and offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

        # Sanity check
        if tokenized_examples["start_positions"][-1] > tokenized_examples["end_positions"][-1]:
            tokenized_examples["start_positions"][-1] = cls_index
            tokenized_examples["end_positions"][-1] = cls_index

    return tokenized_examples

In [15]:
# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to='none',
    disable_tqdm=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)



In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.449064
2,No log,0.254077
3,0.601800,0.19243


TrainOutput(global_step=657, training_loss=0.5314832132882361, metrics={'train_runtime': 1414.2244, 'train_samples_per_second': 7.408, 'train_steps_per_second': 0.465, 'total_flos': 2737344823566336.0, 'train_loss': 0.5314832132882361, 'epoch': 3.0})

In [18]:
model.save_pretrained("./fine_tuned_roberta")
tokenizer.save_pretrained("./fine_tuned_roberta")

('./fine_tuned_roberta/tokenizer_config.json',
 './fine_tuned_roberta/special_tokens_map.json',
 './fine_tuned_roberta/vocab.json',
 './fine_tuned_roberta/merges.txt',
 './fine_tuned_roberta/added_tokens.json',
 './fine_tuned_roberta/tokenizer.json')

In [19]:
import torch

# First, determine the available device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the appropriate device
model = model.to(device)

def get_answer_finetuned(question, context):
    # Tokenize the input
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512, padding="max_length")

    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get the model's outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # Move output tensors to CPU for further processing
    start_logits = outputs.start_logits.cpu()
    end_logits = outputs.end_logits.cpu()

    # Get the most probable start and end indices
    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits) + 1

    # Convert tokens back to text
    answer = tokenizer.decode(inputs['input_ids'][0][answer_start:answer_end])

    return answer

In [20]:
test_question = "What city in China is a sister city of Pittsburgh?"
retrieved_docs = retrieve_documents(test_question, k=5)
context = " ".join(retrieved_docs)
predicted_answer = get_answer_finetuned(test_question, context)

print(f"Question: {test_question}")
print(f"Predicted Answer: {predicted_answer}")

Question: What city in China is a sister city of Pittsburgh?
Predicted Answer: Wuhan


In [21]:
# Load questions and answers from separate files
with open("/content/test/questions.txt", "r") as f:
    questions = [line.strip() for line in f]

with open("/content/test/reference_answers.txt", "r") as f:
    answers = [line.strip() for line in f]

# Combine questions and answers
test_qa_data = [{"question": q, "answer": a} for q, a in zip(questions, answers)]

In [22]:
len(test_qa_data)

116

In [38]:
from nltk.tokenize import word_tokenize
import string
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
def get_answer_finetuned(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits.cpu()
    end_logits = outputs.end_logits.cpu()

    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits) + 1

    answer = tokenizer.decode(inputs['input_ids'][0][answer_start:answer_end])

    return answer

In [42]:
def evaluate_model(test_data):
    exact_matches = []
    f1_scores = []
    answer_recalls = []

    for item in tqdm(test_data):
        question = item['question']
        retrieved_docs = retrieve_documents(question, k=5)
        context = " ".join(retrieved_docs)
        gold_answer = item['answer']

        predicted_answer = get_answer_finetuned(question, context)

        exact_matches.append(compute_exact(gold_answer, predicted_answer))
        f1_scores.append(compute_f1(gold_answer, predicted_answer))
        answer_recalls.append(compute_answer_recall(gold_answer, context))

    exact_match = sum(exact_matches) / len(exact_matches)
    f1 = sum(f1_scores) / len(f1_scores)
    answer_recall = sum(answer_recalls) / len(answer_recalls)

    return {
        'exact_match': exact_match,
        'f1': f1,
        'answer_recall': answer_recall
    }

def compute_answer_recall(gold_answer, context):
    gold_tokens = set(get_tokens(gold_answer))
    context_tokens = set(get_tokens(context))
    # overlap = gold_tokens.intersection(context_tokens)

    answer_recall = any(all(word in context_tokens for word in ans) for ans in gold_tokens)

    return answer_recall

In [43]:
results = evaluate_model(test_qa_data)

print(f"Exact Match: {results['exact_match']:.4f}")
print(f"F1 Score: {results['f1']:.4f}")
print(f"Answer Recall: {results['answer_recall']:.4f}")

100%|██████████| 116/116 [00:05<00:00, 20.49it/s]

Exact Match: 0.1034
F1 Score: 0.2036
Answer Recall: 0.1466



