In [1]:
import src.bm25_IR as bm25

# For models
import torch
from transformers import pipeline
#import nltk

### Set up the corpus

In [2]:
# Location of the development-set CSV (relative to the notebook)
dev_data_path = "./devset/dev_set.csv"

# Read the dev set through the project loader and report its shape
# (presumably a pandas DataFrame -- it exposes .shape and .dropna)
bm25_data = bm25.load_csv(dev_data_path)
print(bm25_data.shape)

# Columns that must all be present for a row to be usable
required_columns = ['text, str', 'question, str', 'answer, str']

# Discard every row with a missing value in any required column,
# keeping the raw frame untouched, then report the new shape
bm25_data_clean = bm25_data.dropna(subset=required_columns)
print(bm25_data_clean.shape)

(106, 5)
(105, 5)


### Tokenize the data 

In [3]:
# Pull the passages, questions and reference answers out of the cleaned data
# as plain Python lists (row order is preserved, so the three lists stay aligned)
corpus = list(bm25_data_clean['text, str'])
questions = list(bm25_data_clean['question, str'])
answers = list(bm25_data_clean['answer, str'])

# Run every string through the project's BM25 preprocessing step
# (presumably returns a token list per string -- see src.bm25_IR.preprocess)
tokenized_corpus = list(map(bm25.preprocess, corpus))
tokenized_questions = list(map(bm25.preprocess, questions))
tokenized_answers = list(map(bm25.preprocess, answers))

### Create the BM25 model

In [4]:
# Build the BM25 model from the preprocessed corpus. The returned object is
# queried per question via its get_scores() method in the retrieval cell.
bm25_model = bm25.init_bm25_corpus(tokenized_corpus)

In [12]:
# Retrieve documents for every question with BM25: print the best hit and
# record the top 1, 3 and 5 documents (with their scores) next to the
# reference answer. Scores are computed once per question and ranked once.
question_to_docs_top_1 = {}
question_to_docs_top_3 = {}
question_to_docs_top_5 = {}


def _docs_and_scores(doc_indices, scores):
    """Look up corpus documents and their scores for the given positions.

    Args:
        doc_indices: positions into `corpus` / `scores`, already in the
            order the results should be returned in.
        scores: per-document scores for one question, indexable by position.

    Returns:
        A `(documents, scores)` tuple of two lists aligned with `doc_indices`.
    """
    doc_indices = list(doc_indices)
    return [corpus[idx] for idx in doc_indices], [scores[idx] for idx in doc_indices]


# NOTE: the dictionaries are keyed by the raw question text, so a question
# that occurs more than once keeps only its last entry.
for tokenized_question, question, answer in zip(tokenized_questions, questions, answers):
    # Score all documents against this question exactly once
    scores = bm25_model.get_scores(tokenized_question)

    # Top 1 -- argmax is kept (rather than taking it from the ranking below)
    # so ties are broken the same way as before: the first maximum wins.
    best_doc_index = scores.argmax()
    best_doc = corpus[best_doc_index]
    best_doc_score = scores[best_doc_index]
    print(f"Question: {question}")
    print(f"Best Document: {best_doc}")
    print(f"Score: {best_doc_score}\n")
    question_to_docs_top_1[question] = {'docs':(best_doc, best_doc_score), 'answer':answer}

    # Rank all documents from best to worst once; the top-k sets are prefixes
    ranked_indices = scores.argsort()[::-1]

    # Top 3
    top_3_indices = ranked_indices[:3]
    top_3_docs, top_3_scores = _docs_and_scores(top_3_indices, scores)
    question_to_docs_top_3[question] = {'docs':(top_3_docs, top_3_scores), 'answer':answer}

    # Top 5
    top_5_indices = ranked_indices[:5]
    top_5_docs, top_5_scores = _docs_and_scores(top_5_indices, scores)
    question_to_docs_top_5[question] = {'docs':(top_5_docs, top_5_scores), 'answer':answer}

Question: Hvad har ejeren af en ejerlejlighed, sammen med andre ejere af lejligheder, ejendoms ret til?
Best Document: 'Ejeren af en ejerlejlighed har sammen med andre ejere af lejligheder ejendomsret til grunden, fælles bestanddele og tilbehør m.v. efter et fordelingstal, der fastsættes som en brøkdel. Til lejligheden hører i samme forhold rettigheder og forpligtelser for ejeren som medlem af ejerforeningen. De i stk. 1 og 2 omhandlede rettigheder og forpligtelser kan ikke adskilles fra ejendomsretten til lejligheden.'
Score: 24.461946248382397

Question: Hvem fastsætter eller aftaler bestemmelser om løn- og ansættelsesvilkår, herunder pensionsforhold for højskolens ansatte?
Best Document: 'Højskolen skal følge de af finansministeren fastsatte eller aftalte bestemmelser om løn- og ansættelsesvilkår, herunder om pensionsforhold for højskolens ansatte. Tjenestemandslovens § 21, stk. 4, gælder for medlemmer af højskolens bestyrelse.'
Score: 35.63539365300649

Question: Hvad skal Beskæfti

# RAG with KennethTM/gpt-neo-1.3B-danish

**Set up device with MPS**

In [6]:
# Choose the compute device once: the MPS backend when PyTorch reports it as
# available, otherwise the CPU. Always a torch.device, so later cells can rely
# on a single type regardless of which branch was taken.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Danish GPT-Neo text-generation pipeline on the selected device
# (the device chosen above is passed in rather than re-checking availability).
generator = pipeline(
    "text-generation",
    model="KennethTM/gpt-neo-1.3B-danish",
    device=device,
)

Using device: mps


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/854k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/513k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# Perform RAG with generator

def _with_trailing_question_mark(question_to_docs):
    """Return a copy of `question_to_docs` whose keys all end with '?'.

    A new dictionary is built instead of popping and re-inserting keys while
    iterating over the same dictionary: changing a dict's entries during
    iteration over one of its views may raise RuntimeError or skip entries.
    Rebuilding also keeps the original key order and makes the step safe to
    re-run.

    Args:
        question_to_docs: mapping from question text to its retrieval entry.

    Returns:
        A new dict with the same values; keys that did not end with '?'
        (including an empty key) get one appended, other keys are unchanged.
    """
    return {
        (question if question.endswith('?') else question + '?'): entry
        for question, entry in question_to_docs.items()
    }


# If a question doesn't already end with '?', add it and store the entry under the new key
question_to_docs_top_1 = _with_trailing_question_mark(question_to_docs_top_1)
question_to_docs_top_3 = _with_trailing_question_mark(question_to_docs_top_3)
question_to_docs_top_5 = _with_trailing_question_mark(question_to_docs_top_5)

In [None]:
# Jeg er kommet til at køre modellen