In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
from tqdm import tqdm
import regex as re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModel
import torch
from itertools import cycle
import os 

# For BM25
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/adamwagnerhoegh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Generate retrieval corpus

In [10]:
path_adam = '/Users/adamwagnerhoegh/Documents/Legal data/domsdatabasen.retsinformation_newer.json'
path_asger = "/Users/asgerkromand/Library/CloudStorage/OneDrive-UniversityofCopenhagen/0. SDS/1 deep learning and nlp/ANLPDP_exam/data/domsdatabasen.retsinformation_newer.json"
path_andreas = '' #missing

# Define a function that can cycle through paths the above paths try them out, and yield the path
def path():
    paths = cycle([path_adam, path_asger, path_andreas])
    for path in paths:
        if path != '':
            try:
                with open(path, 'r') as f:
                    data = json.load(f)
                return data
            except:
                pass
        else:
            raise FileNotFoundError('No path to data found')

retsinfo = path()
    
rag_list = []
idx = 0
for lov in tqdm(retsinfo):
    for kapitel in lov['kapitler']:
        lov_navn = lov['shortName']
        for paragraffer in kapitel['paragraffer']:
            temp_paragraf_dict = {}
            temp_paragraf_dict['paragraf_nr'] = paragraffer['nummer']
            temp_paragraf_dict['lovnavn'] = lov_navn
            temp_paragraf_list = []
            for styk in paragraffer['stk']:
                temp_paragraf_list.append(styk['tekst'])
            temp_paragraf_dict['text'] = ' '.join(temp_paragraf_list)
            rag_list.append(temp_paragraf_dict)

with open("rag_list.txt", "w") as file:
    for item in rag_list:
        file.write(f"{item}\n")

100%|██████████| 1637/1637 [00:00<00:00, 11455.97it/s]


## Generate dev set

In [11]:
# load excel files in dev set folder
import os

dev_set_folder = "devset"

dfs = []
for file in os.listdir(dev_set_folder):
    if file.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(dev_set_folder, file))
        dfs.append(df)

# merge all excel
dev_set = pd.concat(dfs, ignore_index=True)

# add csv
rag_batch_1_with_qa = pd.read_csv("devset/rag_batch_1_with_qa.csv", sep=",").iloc[:, 1:].dropna()
rag_batch_1_with_qa.columns = dev_set.columns
dev_set = pd.concat([dev_set, rag_batch_1_with_qa], ignore_index=True)

# output dev set
dev_set.to_csv("devset/dev_set.csv", index=False)

## Vectorize retrieval corpus

### Sparse retrieval TF-IDF

In [12]:
rag_list2 = rag_list

def preprocess(rag_list):
    # extract and preprocess text
    corpus = [item['text'] for item in rag_list]
    corpus = [re.sub('\\s{2,}', ' ', 
                     re.sub('\\W|[0-9]|§', ' ',
                           item.lower())) for item in corpus]

    # remove stopwords
    #nltk.download('punkt')
    stop_words = set(stopwords.words('danish'))
    corpus = [' '.join(word for word in text.split() 
                      if word not in stop_words) for text in tqdm(corpus)]
    
    return corpus

corpus = preprocess(rag_list2)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

100%|██████████| 42593/42593 [00:00<00:00, 115716.35it/s]


### Dense retrieval

In [13]:
## WRITE LATER

## RAG retriever

### Sparse retrieval pipeline for BM25


In [14]:
def sparse_retrieval(question, sparse_matrix, k=3):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """

    # preprocess and vectorize question
    question_processed = [re.sub('\\s{2,}', ' ', 
                               re.sub('\\W|[0-9]|§', ' ',
                                     question.lower()))]
    
    # remove stopwords
    stop_words = set(stopwords.words('danish'))
    question_processed = [' '.join(word for word in text.split() 
                                 if word not in stop_words) for text in question_processed]
    
    question_vector = vectorizer.transform(question_processed)

    # sparse retrieval (cosine similarity)
    sparse_retrieval = sparse_matrix.dot(question_vector.T).toarray()

    # get top k paragraphs
    top_k = np.argsort(sparse_retrieval.flatten())[-k:]

    return top_k

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(random_question, '\n')
top_k = sparse_retrieval(random_question, X)
for i in top_k:
    print(f'{rag_list2[i]["paragraf_nr"]}: {rag_list2[i]["text"]}')

Hvilke betingelser skal være opfyldt for at få udbetalt rejse- eller befordringsgodtgørelser efter ligningslovens § 9 A eller 9 B? 

§ 67 a.: De transaktioner, der på de betingelser, der er fastsat i § 67, stk. 1, udføres af rejsebureauet med henblik på gennemførelse af en rejse, anses som en enkelt ydelse.
§ 16.: Personer, som den 23. februar 2005 ikke var afskåret fra at anvende ligningslovens § 9 A, stk. 1-9, jf. ligningslovens § 9 A, stk. 11, og som kan foretage fradrag efter § 3, kan vælge fortsat at være omfattet af ligningslovens § 9 A, stk. 1-9.
§ 4.: Når der foretages fradrag efter § 3, kan der ikke samtidig foretages fradrag efter ligningslovens § 9, stk. 1, ligningslovens §§ 9 B-9 D og ligningslovens § 13 samt efter pensionsbeskatningslovens § 49, stk. 1. Personer, som kan foretage fradrag efter § 3, er ikke omfattet af ligningslovens § 9 A, stk. 1-9.


In [15]:
def bm25_retrieval(question, bm25_model, rag_list, k=3):
    """
    Function that takes a question and returns a list of the most relevant paragraphs based on BM25.
    """
    # Preprocess and tokenize the question
    question_processed = re.sub(r'\s{2,}', ' ', 
                                 re.sub(r'\W|[0-9]|§', ' ', question.lower()))
    stop_words = set(stopwords.words('danish'))
    question_tokens = [word for word in question_processed.split() if word not in stop_words]

    # Get BM25 scores for the query
    scores = bm25_model.get_scores(question_tokens)

    # Get the top k results
    top_k_indices = np.argsort(scores)[-k:][::-1]  # Sort scores in descending order

    # Return the top k paragraphs
    return [(rag_list[i]['paragraf_nr'], rag_list[i]['text'], scores[i]) for i in top_k_indices]

# Example Usage
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(f"Question: {random_question}\n")

# Assuming bm25 is the initialized BM25Okapi model
top_k_results = bm25_retrieval(random_question, bm25, rag_list2, k=3)

# Print top-k results
for paragraf_nr, text, score in top_k_results:
    print(f"{paragraf_nr}: {text} (Score: {score:.2f})")

Question: Hvad har Rådet for Dyreforsøg til enhver tid uden retskendelse mod behørig legitimation adgang til?



NameError: name 'bm25' is not defined

### Create embedding corpus

In [16]:

def create_embedding_matrix(pooling, save=True, save_folder=None):
    # initialise model
    bert_tokenizer = AutoTokenizer.from_pretrained("KennethTM/bert-base-uncased-danish")
    bert_model = AutoModel.from_pretrained("KennethTM/bert-base-uncased-danish")

    # define device
    device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"

    # move model to device
    bert_model.to(device)

    # create list of embedding vectors to concatenate into a torch tensor
    embeddings = []

    # index to track numer of mistakes
    idx = 0

    for item in tqdm(rag_list):
        # doing a try and except as some paragraphs may exceed the context window of the BERT (I believe)
        try:
            # tokenize texts
            input_ids = bert_tokenizer.encode(item['text'], return_tensors='pt').to(device)
            # run through BERT
            with torch.no_grad():  # disable gradient computation for inference
                outputs = bert_model(input_ids)
            
            # different kinds of pooling
            if pooling == 'cls':
                embedding_vector = outputs.last_hidden_state[:, 0, :]
            elif pooling == 'max':
                embedding_vector = torch.max(outputs, dim=1)[0]
            elif pooling == 'mean':
                embedding_vector = torch.mean(outputs, dim=1)
            else:
                raise ValueError(f"Unknown pooling method: {pooling}")
            
            # add cls-vector to list of embeddings
            embeddings.append(embedding_vector)
        except:
            # if error then count errors with this
            idx += 1

    print(f'{idx} no. of errors')

    # concatenate list into torch tensor
    embeddings_tensor = torch.cat(embeddings, dim=0)

    if save == True:
        # make sure that folder exists
        os.makedirs(save_folder, exist_ok=True)

        # save tensor 
        torch.save(embeddings_tensor, f'{save_folder}/{pooling}_embeddings_tensor.pt')

    return embeddings_tensor



In [None]:
#create_embedding_matrix(pooling='cls')

### Dense retrieval pipeline

In [17]:
bert_tokenizer = AutoTokenizer.from_pretrained("vesteinn/DanskBERT")
bert_model = AutoModel.from_pretrained("vesteinn/DanskBERT")

Some weights of XLMRobertaModel were not initialized from the model checkpoint at vesteinn/DanskBERT and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
embeddings_matrix = torch.load('/Users/adamwagnerhoegh/Documents/SODAS/sem3/nlp_itu/cls_embeddings_DanskBERT.pt')

In [24]:
def dense_retrieval(question, k=3):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """
    
    # Encode the input sentence
    input_ids = bert_tokenizer.encode(question, return_tensors="pt")  # Encode and add batch dimension
    # Pass the input through the model
    
    with torch.no_grad():  # disable gradient computation for inference
        outputs = bert_model(input_ids)

    # Extract the CLS token representation
    cls_vector = outputs.last_hidden_state[:, 0, :]  # CLS token is at position 0
    
    # sparse retrieval (cosine similarity)
    dense_retrieval = embeddings_matrix @ torch.transpose(cls_vector, 0, 1)
    
    # get top k paragraphs
    top_k_indices = torch.sort(dense_retrieval, descending=True, dim=0)[1][:k]

    return top_k_indices

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(random_question, '\n')
top_k = dense_retrieval(random_question, k=3)
for i in top_k:
    print(f'{rag_list[i]["text"]}')

Hvad skal en virksomhed, som har ansat minimum 10 personer, der gør tjeneste som besætningsmedlemmer på et luftfartøj? 

Hvis en virksomheds tilsvar af skatter og afgifter m.v., der opkræves efter reglerne i denne lov, for en afregningsperiode er negativt, udbetales beløbet til virksomheden. Såfremt angivelsen henholdsvis indberetningen af beløb omfattet af § 2, stk. 1, 4. pkt. er modtaget rettidigt, sker udbetaling efter stk. 1 senest 3 uger efter modtagelsen af angivelsen henholdsvis indberetningen for den pågældende periode. Kan told- og skatteforvaltningen på grund af virksomhedens forhold ikke foretage kontrol af angivelsen eller indberetningen af beløb omfattet af § 2, stk. 1, 4. pkt., afbrydes udbetalingsfristen, indtil virksomhedens forhold ikke længere hindrer kontrol. Beløb, der skulle have været udbetalt efter stk. 1, kan tilbageholdes, såfremt angivelser eller indberetningen af beløb omfattet af § 2, stk. 1, 4. pkt. vedrørende afsluttede afregningsperioder ikke er indgivet.