In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
from tqdm import tqdm
import regex as re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, T5ForConditionalGeneration
import torch
from itertools import cycle
import os 
from transformers import pipeline

from rouge_score import rouge_scorer

# For BM25
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/adamwagnerhoegh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Generate retrieval corpus

In [3]:
path_adam = '/Users/adamwagnerhoegh/Documents/Legal data/domsdatabasen.retsinformation_newer.json'
path_asger = "/Users/asgerkromand/Library/CloudStorage/OneDrive-UniversityofCopenhagen/0. SDS/1 deep learning and nlp/ANLPDP_exam/data/domsdatabasen.retsinformation_newer.json"
path_andreas = '' #missing

# Define a function that can cycle through paths the above paths try them out, and yield the path
def path():
    paths = cycle([path_adam, path_asger, path_andreas])
    for path in paths:
        if path != '':
            try:
                with open(path, 'r') as f:
                    data = json.load(f)
                return data
            except:
                pass
        else:
            raise FileNotFoundError('No path to data found')

retsinfo = path()
    
rag_list = []
idx = 0
for lov in tqdm(retsinfo):
    for kapitel in lov['kapitler']:
        lov_navn = lov['shortName']
        for paragraffer in kapitel['paragraffer']:
            temp_paragraf_dict = {}
            temp_paragraf_dict['paragraf_nr'] = paragraffer['nummer']
            temp_paragraf_dict['lovnavn'] = lov_navn
            temp_paragraf_list = []
            for styk in paragraffer['stk']:
                temp_paragraf_list.append(styk['tekst'])
            temp_paragraf_dict['text'] = ' '.join(temp_paragraf_list)
            rag_list.append(temp_paragraf_dict)

with open("rag_list.txt", "w") as file:
    for item in rag_list:
        file.write(f"{item}\n")

100%|██████████| 1637/1637 [00:00<00:00, 9619.84it/s]


## Generate dev set

In [4]:
# load excel files in dev set folder
import os

dev_set_folder = "devset"

dfs = []
for file in os.listdir(dev_set_folder):
    if file.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(dev_set_folder, file))
        dfs.append(df)

# merge all excel
dev_set = pd.concat(dfs, ignore_index=True)

# add csv
rag_batch_1_with_qa = pd.read_csv("devset/rag_batch_1_with_qa.csv", sep=",").iloc[:, 1:].dropna()
rag_batch_1_with_qa.columns = dev_set.columns
dev_set = pd.concat([dev_set, rag_batch_1_with_qa], ignore_index=True)

# output dev set
dev_set.to_csv("devset/dev_set.csv", index=False)

## Vectorize retrieval corpus

### Sparse retrieval TF-IDF

In [5]:
rag_list2 = rag_list

def preprocess(rag_list):
    # extract and preprocess text
    corpus = [item['text'] for item in rag_list]
    corpus = [re.sub('\\s{2,}', ' ', 
                     re.sub('\\W|[0-9]|§', ' ',
                           item.lower())) for item in corpus]

    # remove stopwords
    #nltk.download('punkt')
    stop_words = set(stopwords.words('danish'))
    corpus = [' '.join(word for word in text.split() 
                      if word not in stop_words) for text in tqdm(corpus)]
    
    return corpus

corpus = preprocess(rag_list2)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

100%|██████████| 42593/42593 [00:00<00:00, 112679.32it/s]


### Dense retrieval

In [6]:
## WRITE LATER

## RAG retriever

### Sparse retrieval pipeline for BM25


In [7]:
def sparse_retrieval(question, sparse_matrix, k=3):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """

    # preprocess and vectorize question
    question_processed = [re.sub('\\s{2,}', ' ', 
                               re.sub('\\W|[0-9]|§', ' ',
                                     question.lower()))]
    
    # remove stopwords
    stop_words = set(stopwords.words('danish'))
    question_processed = [' '.join(word for word in text.split() 
                                 if word not in stop_words) for text in question_processed]
    
    question_vector = vectorizer.transform(question_processed)

    # sparse retrieval (cosine similarity)
    sparse_retrieval = sparse_matrix.dot(question_vector.T).toarray()

    # get top k paragraphs
    top_k = np.argsort(sparse_retrieval.flatten())[-k:]

    return top_k

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(random_question, '\n')
top_k = sparse_retrieval(random_question, X)
for i in top_k:
    print(f'{rag_list2[i]["paragraf_nr"]}: {rag_list2[i]["text"]}')

Hvilken minister kan fastsætte regler om naturbeskyttelse på dansk territorium vedrørende transport af olie i rørledninger? 

§ 1 a.: Klima-, energi- og forsyningsministeren kan fastsætte regler med henblik på at gennemføre eller anvende internationale konventioner og EU-regler om forhold, der er omfattet af denne lov, herunder forordninger, direktiver og beslutninger om naturbeskyttelse på dansk kontinentalsokkel, i dansk eksklusiv økonomisk zone og på dansk søterritorium.
§ 2 a.: Klima-, energi- og forsyningsministeren kan meddele tilladelse til forundersøgelser med henblik på nedlæggelse af nye elkabler eller rørledninger til transport af kulbrinter eller ændringer af eksisterende elkabler eller rørledninger til transport af kulbrinter på dansk kontinentalsokkelområde og på dansk søterritorium. Klima-, energi- og forsyningsministeren kan fastsætte vilkår for tilladelsen efter stk. 1, herunder om de forhold, der skal undersøges, om forundersøgelsernes forløb og tidsrum og om overhold

In [8]:
def bm25_retrieval(question, bm25_model, rag_list, k=3):
    """
    Function that takes a question and returns a list of the most relevant paragraphs based on BM25.
    """
    # Preprocess and tokenize the question
    question_processed = re.sub(r'\s{2,}', ' ', 
                                 re.sub(r'\W|[0-9]|§', ' ', question.lower()))
    stop_words = set(stopwords.words('danish'))
    question_tokens = [word for word in question_processed.split() if word not in stop_words]

    # Get BM25 scores for the query
    scores = bm25_model.get_scores(question_tokens)

    # Get the top k results
    top_k_indices = np.argsort(scores)[-k:][::-1]  # Sort scores in descending order

    # Return the top k paragraphs
    return [(rag_list[i]['paragraf_nr'], rag_list[i]['text'], scores[i]) for i in top_k_indices]

# Example Usage
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(f"Question: {random_question}\n")

# Assuming bm25 is the initialized BM25Okapi model
top_k_results = bm25_retrieval(random_question, bm25, rag_list2, k=3)

# Print top-k results
for paragraf_nr, text, score in top_k_results:
    print(f"{paragraf_nr}: {text} (Score: {score:.2f})")

Question: Hvad er den mindste selskabskapital, et anpartsselskab skal have?



NameError: name 'bm25' is not defined

### Create embedding corpus

In [9]:

def create_embedding_matrix(pooling, save=False, save_folder=None):
    # initialise model
    bert_tokenizer = AutoTokenizer.from_pretrained("KennethTM/bert-base-uncased-danish")
    bert_model = AutoModel.from_pretrained("KennethTM/bert-base-uncased-danish")

    # define device
    device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"

    # move model to device
    bert_model.to(device)

    # create list of embedding vectors to concatenate into a torch tensor
    embeddings = []

    # index to track numer of mistakes
    idx = 0

    for item in tqdm(rag_list):
        # doing a try and except as some paragraphs may exceed the context window of the BERT (I believe)
        try:
            # tokenize texts
            input_ids = bert_tokenizer.encode(item['text'], return_tensors='pt').to(device)
            # run through BERT
            with torch.no_grad():  # disable gradient computation for inference
                outputs = bert_model(input_ids)
            
            # different kinds of pooling
            if pooling == 'cls':
                embedding_vector = outputs.last_hidden_state[:, 0, :]
            elif pooling == 'max':
                embedding_vector = torch.max(outputs, dim=1)[0]
            elif pooling == 'mean':
                embedding_vector = torch.mean(outputs, dim=1)
            else:
                raise ValueError(f"Unknown pooling method: {pooling}")
            
            # add cls-vector to list of embeddings
            embeddings.append(embedding_vector)
        except:
            # if error then count errors with this
            embeddings.append(torch.zeros(1, 768))
            idx += 1

    print(f'{idx} no. of errors')

    # concatenate list into torch tensor
    embeddings_tensor = torch.cat(embeddings, dim=0)

    if save == True:
        # make sure that folder exists
        os.makedirs(save_folder, exist_ok=True)

        # save tensor 
        torch.save(embeddings_tensor, f'{save_folder}/{pooling}_embeddings_tensor.pt')

    return embeddings_tensor



In [None]:
#create_embedding_matrix(pooling='cls')

Some weights of BertModel were not initialized from the model checkpoint at KennethTM/bert-base-uncased-danish and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 87/42593 [00:06<51:36, 13.73it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors
  1%|          | 454/42593 [00:41<1:24:13,  8.34it/s]

### Dense retrieval pipeline

In [10]:
bert_tokenizer = AutoTokenizer.from_pretrained("vesteinn/DanskBERT")
bert_model = AutoModel.from_pretrained("vesteinn/DanskBERT")

Some weights of XLMRobertaModel were not initialized from the model checkpoint at vesteinn/DanskBERT and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# you can load cls or max respectively, mean still needs to be created
embeddings_matrix = torch.load('/Users/adamwagnerhoegh/Documents/SODAS/sem3/nlp_itu/cls_embeddings_DanskBERT.pt')

In [77]:
def dense_retrieval(question, pooling='cls', k=3):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    pooling = 'cls', 'max' or 'mean'
    """
    
    # Encode the input sentence
    input_ids = bert_tokenizer.encode(question, return_tensors="pt")  # Encode and add batch dimension
    # Pass the input through the model
    
    with torch.no_grad():  # disable gradient computation for inference
        outputs = bert_model(input_ids)
    
    if pooling == 'cls':
        # Extract the CLS token representation
        embedding_vector = outputs.last_hidden_state[:, 0, :]
    
    elif pooling == 'max':
        embedding_vector = torch.max(outputs.last_hidden_state, dim=1)[0]

    elif pooling == 'mean':
        embedding_vector = torch.mean(outputs.last_hidden_state, dim=1)
    
    # normalise the cls-embedding and the embedding matrix so that the dot product
    # below is now cosine similarity
    embedding_vector_normalised = embedding_vector / torch.norm(embedding_vector, dim=-1, keepdim=True)
    embeddings_matrix_normalised = embeddings_matrix / torch.norm(embeddings_matrix, dim=-1, keepdim=True)

    # finding most similar vectors with dot product
    dense_retrieval = embeddings_matrix_normalised @ torch.transpose(embedding_vector_normalised, 0, 1)
    
    # get top k paragraphs
    top_k_indices = torch.sort(dense_retrieval, descending=True, dim=0)[1][:k]

    return top_k_indices

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question, str']
print(random_question, '\n') 
top_k = dense_retrieval(random_question, pooling='cls', k=3)
for i in top_k:
    print(f'{rag_list[i]["text"]}')

Bedriftværn, der er etableret i henhold til den lovgivning, der er gældende indtil 1. januar 1993, opretholdes, medmindre at hvad? 

Uddannelsesparate og aktivitetsparate personer under 30 år uden en erhvervskompetencegivende uddannelse modtager uddannelseshjælp. Uddannelseshjælpen udgør et månedligt beløb på 11.505 kr. for personer, der forsørger eget barn i hjemmet og har erhvervet ret til ekstra børnetilskud efter lov om børnetilskud og forskudsvis udbetaling af børnebidrag, jf. dog stk. 5, 11.505 kr. for personer, der forsørger eget barn i hjemmet og har erhvervet ret til ekstra børnetilskud efter lov om børnetilskud og forskudsvis udbetaling af børnebidrag, jf. dog stk. 5, 8.051 kr. for personer, der forsørger eget barn i hjemmet og ikke har erhvervet ret til ekstra børnetilskud efter lov om børnetilskud og forskudsvis udbetaling af børnebidrag, 10.500 kr. for kvinder, der er gravide og har passeret 12. svangerskabsuge, 13.952 kr. for personer, der har en dokumenteret psykisk lide

In [16]:
# add retrieved paragraphs to dev_set

tf_idf_1 = []

for question in tqdm(dev_set['question, str'], desc='TF-IDF, k=1'):
    paragraphs = [rag_list[i]['text'] for i in sparse_retrieval(question, X, k=1)]
    # join list into long string
    paragraphs = ' '.join(paragraphs)
    tf_idf_1.append(paragraphs)


tf_idf_3 = []

for question in tqdm(dev_set['question, str'], desc='TF-IDF, k=3'):
    paragraphs = [rag_list[i]['text'] for i in sparse_retrieval(question, X, k=3)]
    # join list into long string
    paragraphs = ' '.join(paragraphs)
    tf_idf_3.append(paragraphs)


dev_set['tf_idf_1'] = tf_idf_1
dev_set['tf_idf_3'] = tf_idf_3

TF-IDF, k=1: 100%|██████████| 106/106 [00:00<00:00, 296.90it/s]
TF-IDF, k=3: 100%|██████████| 106/106 [00:00<00:00, 319.42it/s]


### Evaluation

In [17]:
def evaluate(model_name, retrieval_method, metric, k):
    """
    model_name = 'KennethTM/gpt-neo-1.3B-danish' or 'strombergnlp/dant5-large'
    retrieval_method = 'tf-idf', 'bm25' or 'dense'
    metric = 'bleu', 'rouge' or 'meteor'
    """
    # set device to mps
    device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"

    # load AutoTokenizer for model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # generate list of answers to fill
    answers = []

    # load neo
    # the loops are made for each model to not waste compute on loading models for each question

    if model_name == 'KennethTM/gpt-neo-1.3B-danish':
        model = AutoModelForCausalLM.from_pretrained("KennethTM/gpt-neo-1.3B-danish").to(device)
    
        for question in tqdm(dev_set['question, str'], desc='Answering questions with neo'):

            if retrieval_method == 'tf-idf':
                # create list of paragraphs by getting indexes for best hits with sparse_retrieval
                paragraphs = [rag_list[i]['text'] for i in sparse_retrieval(question, X, k=3)]
                # join list into long string
                paragraphs = ' '.join(paragraphs)

            elif retrieval_method == 'bm25':
                paragraphs = bm25_retrieval(rag_list=rag_list, k=3)
            
            elif retrieval_method == 'dense_retrieval':
                # create list of paragraphs by getting indexes for best hits with sparse_retrieval
                paragraphs = [rag_list[i]['text'] for i in dense_retrieval(question, k=3)]
                # join list into long string
                paragraphs = ' '.join(paragraphs)

            # assemble all in prompt
            prompt = f'Kontekst: {paragraphs} Spørgsmål: {question} Svar: '

            # tokenize
            input_ids = tokenizer(prompt, return_tensors="pt").to(device)

            # set max_length to no. of tokens in prompt + 100 (the 100 are thus for generation)
            max_length = int(input_ids['input_ids'].size(1)) + 100

            # generate answer with no_grad() to save compute
            with torch.no_grad():
                outputs = model.generate(
                    input_ids,
                    max_length=max_length,
                    pad_token_id=tokenizer.eos_token_id)
            
            # decode the generated answer
            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

            answers.append(answer)
    

    # load T5
    elif model_name == 'strombergnlp/dant5-large':
        model = T5ForConditionalGeneration.from_pretrained("strombergnlp/dant5-large").to(device)

        for question in tqdm(dev_set['question, str'], desc='Answering questions with T5'):

            if retrieval_method == 'tf-idf':
                # create list of paragraphs by getting indexes for best hits with sparse_retrieval
                paragraphs = [rag_list[i]['text'] for i in sparse_retrieval(question, X, k=3)]
                # join list into long string
                paragraphs = ' '.join(paragraphs)

            elif retrieval_method == 'bm25':
                paragraphs = bm25_retrieval(rag_list=rag_list, k=3)
            
            elif retrieval_method == 'dense_retrieval':
                # create list of paragraphs by getting indexes for best hits with sparse_retrieval
                paragraphs = [rag_list[i]['text'] for i in dense_retrieval(question, k=3)]
                # join list into long string
                paragraphs = ' '.join(paragraphs)

            # assemble all in input
            input_text = f"Spørgsmål: {question} Kontekst: {paragraphs} Svar:"

            # tokenize
            input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

            # generate answer with no_grad() to save compute
            with torch.no_grad():
                outputs = model.generate(
                    input_ids,
                    max_length=max_length,
                    pad_token_id=tokenizer.eos_token_id)
            
            with torch.no_grad():
                outputs = model.generate(input_ids, max_length=100)

            # Decode and print the generated answer
            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

            answers.append(answer)
        
    # choosing metric to evaluate answers
    if metric == 'bleu':
        scores = []
        idx = 0

        for pred_answer, true_answer in tqdm(zip(answers, list(dev_set['answer, str'])), desc='Evaluating with bleu'):
            try:
                scores.append(nltk.translate.bleu_score.sentence_bleu([true_answer], pred_answer))
            except:
                print(f'Error when computing bleu-score at index {idx}')
            idx += 1
    
    elif metric == 'meteor':
        scores = []
        idx = 0

        for pred_answer, true_answer in tqdm(zip(answers, list(dev_set['answer, str'])), desc='Evaluating with meteor'):
            try:
                scores.append(nltk.tranlsate.meteor_score([true_answer], pred_answer))
            except:
                print(f'Error when computing meteor-score at index {idx}')
            idx += 1

    elif metric == 'rouge':
        scores = []
        idx = 0

        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        for pred_answer, true_answer in tqdm(zip(answers, list(dev_set['answer, str'])), desc='Evaluating with rouge'):
            try:
                scores.append(score.score(true_answer, pred_answer)['rouge1'])
            except:
                print(f'Error when computing meteor-score at index {idx}')
            idx += 1

    print(f'{metric}-scores for {model_name} using {retrieval_method}: {np.mean(scores)}')
        