In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
from tqdm import tqdm
import regex as re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Generate retrieval corpus

In [3]:
path = '/Users/adamwagnerhoegh/Documents/Legal data/domsdatabasen.retsinformation_newer.json'

with open(path) as f:
    retsinfo = json.load(f)

rag_list = []
idx = 0
for lov in tqdm(retsinfo):
    for kapitel in lov['kapitler']:
        lov_navn = lov['shortName']
        for paragraffer in kapitel['paragraffer']:
            temp_paragraf_dict = {}
            temp_paragraf_dict['paragraf_nr'] = paragraffer['nummer']
            temp_paragraf_dict['lovnavn'] = lov_navn
            temp_paragraf_list = []
            for styk in paragraffer['stk']:
                temp_paragraf_list.append(styk['tekst'])
            temp_paragraf_dict['text'] = ' '.join(temp_paragraf_list)
            rag_list.append(temp_paragraf_dict)

with open("rag_list.txt", "w") as file:
    for item in rag_list:
        file.write(f"{item}\n")

100%|██████████| 1637/1637 [00:00<00:00, 11257.12it/s]


## Generate dev set

In [4]:
# load excel files in dev set folder
import os

dev_set_folder = "devset"

dfs = []
for file in os.listdir(dev_set_folder):
    if file.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(dev_set_folder, file))
        dfs.append(df)

# merge all excel
dev_set = pd.concat(dfs, ignore_index=True)

# add csv
rag_batch_1_with_qa = pd.read_csv("devset/rag_batch_1_with_qa.csv", sep=";").iloc[:, 1:].dropna()
rag_batch_1_with_qa.columns = dev_set.columns
dev_set = pd.concat([dev_set, rag_batch_1_with_qa], ignore_index=True)

dev_set

Unnamed: 0,"question, str","answer, str","text, str","pnumber, str","law number, str"
0,"Hvad har ejeren af en ejerlejlighed, sammen me...","Grunden, fælles bestanddele og tilbehør",'Ejeren af en ejerlejlighed har sammen med and...,3,LOV nr 908 af 18/06/2020
1,Hvem fastsætter eller aftaler bestemmelser om ...,Finansministeren fastsætter eller aftaler best...,'Højskolen skal følge de af finansministeren f...,30,LBK nr 780 af 08/08/2019
2,Hvad skal Beskæftigelsesministeriet og Finanst...,Den indsendte årsrapport skal i det mindste in...,'Uden ugrundet ophold efter repræsentantskabet...,25 l,LBK nr 1110 af 10/10/2014
3,Hvor mange procent må kapitalandele i og lån y...,Kapitalandele i og lån ydet til en virksomhed ...,'Følgende grænser for Arbejdsmarkedets Tillægs...,26 e,LBK nr 1110 af 10/10/2014
4,Hvad er en betingelse for retten til jobpræmie?,Det er en betingelse for retten til jobpræmie ...,'Det er en betingelse for retten til jobpræmie...,9,LOV nr 287 af 29/03/2017
...,...,...,...,...,...
101,Hvordan anføres kandidatlister på stemmesedler?,I særskilte felter.,Kandidatlisterne anføres på stemmesedlen i sær...,46,LBK nr 6 af 08/01/2024
102,Hvem iværksætter beslaglæggelse?,Politiet.,Politiet iværksætter beslaglæggelse. Politiet ...,807,LBK nr 250 af 04/03/2024
103,Hvis interesser skal foranstaltninger mod inte...,De forvaltede alternative investeringsfondes e...,En forvalter af alternative investeringsfonde ...,23,LBK nr 231 af 01/03/2024
104,Hvad skal valgstyrere eller tilforordnede vælg...,At stemmekasserne er tomme.,Afstemningen begynder kl. Inden stemmeafgivnin...,38,LBK nr 1432 af 01/12/2023


## Vectorize retrieval corpus

### Sparse retrieval

In [5]:
rag_list2 = rag_list

def preprocess(rag_list):
    # extract and preprocess text
    corpus = [item['text'] for item in rag_list]
    corpus = [re.sub('\\s{2,}', ' ', 
                     re.sub('\\W|[0-9]|§', ' ',
                           item.lower())) for item in corpus]

    # remove stopwords
    #nltk.download('punkt')
    stop_words = set(stopwords.words('danish'))
    corpus = [' '.join(word for word in text.split() 
                      if word not in stop_words) for text in tqdm(corpus)]
    
    return corpus

corpus = preprocess(rag_list2)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

100%|██████████| 42593/42593 [00:00<00:00, 103915.01it/s]


### Dense retrieval

In [6]:
## WRITE LATER

## RAG retriever

### Sparse retrieval pipeline


In [7]:
def sparse_retrieval(question, sparse_matrix, k=3):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """

    # preprocess and vectorize question
    question_processed = [re.sub('\\s{2,}', ' ', 
                               re.sub('\\W|[0-9]|§', ' ',
                                     question.lower()))]
    
    # remove stopwords
    stop_words = set(stopwords.words('danish'))
    question_processed = [' '.join(word for word in text.split() 
                                 if word not in stop_words) for text in question_processed]
    
    question_vector = vectorizer.transform(question_processed)

    # sparse retrieval (cosine similarity)
    sparse_retrieval = X.dot(question_vector.T).toarray()

    # get top k paragraphs
    top_k = np.argsort(sparse_retrieval.flatten())[-k:]

    return top_k

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(random_question, '\n')
top_k = sparse_retrieval(random_question, X)
for i in top_k:
    print(f'{rag_list2[i]["paragraf_nr"]}: {rag_list2[i]["text"]}')

Hvad skal sikringsmæssige foranstaltninger for nukleart materiale og nukleare anlæg, jf. § 37 a og § 37 c, stk. 2, skal fremgå af? 

§ 37 b.: Sikringsmæssige foranstaltninger for nukleart materiale og nukleare anlæg, jf. § 37 a og § 37 c, stk. 2, skal fremgå af en sikringsplan, som skal indsendes til og godkendes af Beredskabsstyrelsen forud for brug, opbevaring eller transport af nukleart materiale og forud for drift af et nukleart anlæg. Ændrer de forhold, der har ligget til grund for godkendelse af en sikringsplan, sig væsentligt efterfølgende, skal en revideret sikringsplan godkendes af Beredskabsstyrelsen. Beredskabsstyrelsen kan træffe afgørelse om, at godkendelse af en sikringsplan bortfalder, såfremt den, der bruger, opbevarer eller transporterer nukleart materiale, eller den, der driver et nukleart anlæg, ikke inden en nærmere angivet frist har indsendt og opnået godkendelse af en revideret sikringsplan.
§ 37 a.: Den, der bruger, opbevarer eller transporterer nukleart material

### Create embedding corpus

In [8]:
tokenizer = AutoTokenizer.from_pretrained("KennethTM/bert-base-uncased-danish")
model = AutoModel.from_pretrained("KennethTM/bert-base-uncased-danish")

Some weights of BertModel were not initialized from the model checkpoint at KennethTM/bert-base-uncased-danish and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# creating embedding corpus


# device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"

# cls_embeddings = []

# idx = 0

# for item in tqdm(rag_list):
#     # doing a try and except as some paragraphs may exceed the context window of the BERT (I believe)
#     try:
#         # tokenize texts
#         input_ids = tokenizer.encode(item['text'], return_tensors='pt')
#         # run through BERT
#         with torch.no_grad():  # disable gradient computation for inference
#             outputs = model(input_ids)
#         # extract cls-token
#         cls_vector = outputs.last_hidden_state[:, 0, :]
#         # add cls-vector to list of embeddings
#         cls_embeddings.append(cls_vector)
#     except:
#         # if error then count errors with this
#         idx += 1

# print(f'{idx} no. of errors')

# # concatenate list into torch tensor
# cls_embeddings_tensor = torch.cat(cls_embeddings, dim=0)

  0%|          | 87/42593 [00:04<29:04, 24.37it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 42593/42593 [30:10<00:00, 23.53it/s] 


955 no. of errors


In [None]:
# saving the tensor
#torch.save(cls_embeddings_tensor, '/Users/adamwagnerhoegh/Documents/SODAS/sem3/nlp_itu/cls_embeddings_tensor.pt')

In [52]:
input_ids = tokenizer.encode(rag_list[0]['text'], return_tensors='pt')
outputs = model(input_ids)
cls_vector = outputs.last_hidden_state[:, 0, :]

### Dense retrieval pipeline

In [None]:


def dense_retrieval(question, dense_matrix, k=3):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """

    # Encode the input sentence
    input_ids = tokenizer.encode(question, return_tensors="pt")  # Encode and add batch dimension
    # Pass the input through the model
    outputs = model(input_ids)

    # Extract the CLS token representation
    cls_vector = outputs.last_hidden_state[:, 0, :]  # CLS token is at position 0
    
    # sparse retrieval (cosine similarity)
    dense_retrieval = embedding_corpus.dot(cls_vector.T).toarray()

    # get top k paragraphs
    top_k = np.argsort(sparse_retrieval.flatten())[-k:]

    return top_k

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(random_question, '\n')
top_k = dense_retrieval(random_question, X)
for i in top_k:
    print(f'{rag_list2[i]["paragraf_nr"]}: {rag_list2[i]["text"]}')

Some weights of BertModel were not initialized from the model checkpoint at KennethTM/bert-base-uncased-danish and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sydslesvigudvalget sender for hvert finansår inden den 1. maj det følgende år et årsregnskab, der underskrives af hvem? 



NameError: name 'X' is not defined

In [7]:
# Encode the input sentence
input_ids = tokenizer.encode('Hvem fastsætter eller aftaler bestemmelser om løn- og ansættelsesvilkår, herunder pensionsforhold for højskolens ansatte?', return_tensors="pt")  # Encode and add batch dimension
# Pass the input through the model
outputs = model(input_ids)

# Extract the CLS token representation
cls_vector = outputs.last_hidden_state[:, 0, :]  # CLS token is at position 0

# sparse retrieval (cosine similarity)
sparse_retrieval = X.dot(cls_vector.T).toarray()

# get top k paragraphs
top_k = np.argsort(sparse_retrieval.flatten())[-3:]

NameError: name 'X' is not defined