In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
import os
from tqdm import tqdm
import regex as re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import pipeline
import torch
import random
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import SmoothingFunction

## Generate retrieval corpus

In [9]:
with open('data/domsdatabasen.retsinformation_newer.json') as f:
    retsinfo = json.load(f)

rag_list = []
idx = 0
for lov in tqdm(retsinfo):
    for kapitel in lov['kapitler']:
        lov_navn = lov['shortName']
        for paragraffer in kapitel['paragraffer']:
            temp_paragraf_dict = {}
            temp_paragraf_dict['paragraf_nr'] = paragraffer['nummer']
            temp_paragraf_dict['lovnavn'] = lov_navn
            temp_paragraf_list = []
            for styk in paragraffer['stk']:
                temp_paragraf_list.append(styk['tekst'])
            temp_paragraf_dict['text'] = ' '.join(temp_paragraf_list)
            rag_list.append(temp_paragraf_dict)

with open("rag_list.txt", "w") as file:
    for item in rag_list:
        file.write(f"{item}\n")

100%|██████████| 1637/1637 [00:00<00:00, 8591.64it/s]


## Generate dev set

In [10]:
# load and merge excel files from devset folder
dfs = [pd.read_excel(os.path.join("devset", f)) for f in os.listdir("devset") if f.endswith(".xlsx")]
dev_set = pd.concat(dfs, ignore_index=True)

# add csv data
csv_data = pd.read_csv("devset/rag_batch_1_with_qa.csv", sep=";").iloc[:, 1:].dropna()
csv_data.columns = dev_set.columns
dev_set = pd.concat([dev_set, csv_data], ignore_index=True)

# change column names
dev_set.columns = ['question', 'answer', 'text', 'paragraph', 'law']

# write to csv
dev_set.to_csv("data/dev_set.csv", index=False)

### THIS CAN BE DELETED NOW

In [11]:
# load dev set
dev_set = pd.read_csv("data/dev_set.csv").astype(str)
dev_set

Unnamed: 0,question,answer,text,paragraph,law
0,"Hvad har ejeren af en ejerlejlighed, sammen me...","Grunden, fælles bestanddele og tilbehør",'Ejeren af en ejerlejlighed har sammen med and...,3,LOV nr 908 af 18/06/2020
1,Hvem fastsætter eller aftaler bestemmelser om ...,Finansministeren fastsætter eller aftaler best...,'Højskolen skal følge de af finansministeren f...,30,LBK nr 780 af 08/08/2019
2,Hvad skal Beskæftigelsesministeriet og Finanst...,Den indsendte årsrapport skal i det mindste in...,'Uden ugrundet ophold efter repræsentantskabet...,25 l,LBK nr 1110 af 10/10/2014
3,Hvor mange procent må kapitalandele i og lån y...,Kapitalandele i og lån ydet til en virksomhed ...,'Følgende grænser for Arbejdsmarkedets Tillægs...,26 e,LBK nr 1110 af 10/10/2014
4,Hvad er en betingelse for retten til jobpræmie?,Det er en betingelse for retten til jobpræmie ...,'Det er en betingelse for retten til jobpræmie...,9,LOV nr 287 af 29/03/2017
...,...,...,...,...,...
101,Hvordan anføres kandidatlister på stemmesedler?,I særskilte felter.,Kandidatlisterne anføres på stemmesedlen i sær...,46,LBK nr 6 af 08/01/2024
102,Hvem iværksætter beslaglæggelse?,Politiet.,Politiet iværksætter beslaglæggelse. Politiet ...,807,LBK nr 250 af 04/03/2024
103,Hvis interesser skal foranstaltninger mod inte...,De forvaltede alternative investeringsfondes e...,En forvalter af alternative investeringsfonde ...,23,LBK nr 231 af 01/03/2024
104,Hvad skal valgstyrere eller tilforordnede vælg...,At stemmekasserne er tomme.,Afstemningen begynder kl. Inden stemmeafgivnin...,38,LBK nr 1432 af 01/12/2023


## Vectorize retrieval corpus

### Sparse retrieval

In [12]:
def preprocess(rag_list):
    # extract and preprocess text
    corpus = [item['text'] for item in rag_list]
    corpus = [re.sub('\\s{2,}', ' ', 
                     re.sub('\\W|[0-9]|§', ' ',
                           item.lower())) for item in corpus]

    # remove stopwords
    #nltk.download('punkt')
    stop_words = set(stopwords.words('danish'))
    corpus = [' '.join(word for word in text.split() 
                      if word not in stop_words) for text in tqdm(corpus)]
    
    return corpus

corpus = preprocess(rag_list)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

100%|██████████| 42593/42593 [00:00<00:00, 139300.26it/s]


## RAG retriever

### Sparse retrieval pipeline


In [17]:
def sparse_retrieval(question, corpus_embeddings=X, corpus=rag_list, vectorizer=vectorizer, k=3, max_tokens=800):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """

    # preprocess and vectorize question
    question_processed = [re.sub('\\s{2,}', ' ', 
                               re.sub('\\W|[0-9]|§', ' ',
                                     question.lower()))]
    
    # remove stopwords
    stop_words = set(stopwords.words('danish'))
    question_processed = [' '.join(word for word in text.split() 
                                 if word not in stop_words) for text in question_processed]
    
    # embed question
    question_vector = vectorizer.transform(question_processed)

    # calculate cosine similarity
    sparse_retrieval = corpus_embeddings.dot(question_vector.T).toarray()

    # get top k paragraphs
    top_k = np.argsort(sparse_retrieval.flatten())[-k:]

    # truncate context to approximate token limit
    context = '\n'.join([corpus[i]['text'] for i in reversed(top_k)])
    context_words = context.split()[:max_tokens]
    return ' '.join(context_words)

# check if it works using a random question from the dev set
random_question = dev_set.iloc[np.random.randint(0, len(dev_set))]['question']
print(random_question, '\n')
sparse_retrieval(random_question, X)

Hvor overføres indeståender til i Særlig Pensionsopsparing (SP), som det ikke har været muligt at udbetale på grund af kontohaverens forhold? 



'Indeståender i Særlig Pensionsopsparing (SP), som det ikke har været muligt at udbetale på grund af kontohaverens forhold, overføres pr. 1. oktober 2010 til Arbejdsmarkedets Tillægspension til forvaltning. En kontohaver eller boet efter en afdød kontohaver, der kan dokumentere et krav på et indestående, kan i perioden fra den 1. maj 2010 til den 1. maj 2015 henvende sig til Arbejdsmarkedets Tillægspension og få udbetalt tilgodehavendet. Overførte indeståender som nævnt i stk. 1, som kontohaver eller boet efter en kontohaver ikke senest den 30. april 2015 har anmodet om at få udbetalt, tilfalder Arbejdsmarkedets Tillægspension. Beskæftigelsesministeren fastsætter regler om nedsættelse af fleksydelse på grund af pensionsopsparing. Arbejdsmarkedets Tillægspension fastsætter omkostningsprocenter og gebyrer i forbindelse med forvaltning og administration af SP.'

In [7]:
# inspect the dimensions in more detail to understand the sparse retrieval
question_processed = [re.sub('\\s{2,}', ' ', 
                               re.sub('\\W|[0-9]|§', ' ',
                                     random_question.lower()))]
question_vector = vectorizer.transform(question_processed)

print(f'corpus matrix shape: {X.toarray().shape}')
print(f'question vector shape: {question_vector.toarray().shape}')
print(f'question vector transpose shape: {question_vector.T.toarray().shape}')
print(f'sparse retrieval shape: {X.dot(question_vector.T).toarray().shape}')

corpus matrix shape: (42593, 107999)
question vector shape: (1, 107999)
question vector transpose shape: (107999, 1)
sparse retrieval shape: (42593, 1)


This makes sense. The corpus matrix, X, covers 42593 paragraphs (rows) with 107999 unique terms (columns). The query vector describes just one document with the same vocabulary (i.e., 107999 unique terms), and when transposed the unique terms become the rows. The sparse retrieval is the dot product between the corpus matrix and the query vector, which results in a 42593 by 1 vector, i.e., the cosine similarity between the query and each paragraph in the corpus.

In [8]:
# calculate L2 norms of each document vector
norms = np.sqrt((X.toarray()**2).sum(axis=1))
print(f"Mean norm: {norms.mean():.4f}")
print(f"Std norm: {norms.std():.4f}")

Mean norm: 0.9996
Std norm: 0.0200


This shows that the vectors in the corpus matrix are tf-idf normalized. Hence, the cosine similarity is equivalent to the dot product. 

## Test RAG retriever on dev set

In [13]:
device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load the pipeline and move the model to MPS
generator = pipeline(
    "text-generation",
    model="KennethTM/gpt-neo-1.3B-danish",
    device=device
)

Using device: mps


In [10]:
# test how the model performs on random questions from the dev set
for question in random.sample(list(dev_set['question']), 2):
    
    # get top k paragraphs
    top_k = sparse_retrieval(question, X, k=3)
    
    # get the context
    context = ' '.join([rag_list[i]['text'] for i in top_k])
    
    # query the model
    prompt = f"Relevante paragraffer: {context}\n\nSpørgsmål: {question} \n\nSvar: "
    generated_text = generator(prompt, max_new_tokens=100, pad_token_id=50256)
    print(generated_text[0]['generated_text'][len(prompt) + 1:], '\n\n')

 I aktieselskaber kan alle anpartshavere forlange indkaldelse til en ekstraordinær generalforsamling.
Forslag til vedtægtsændringer skal for at kunne behandles på en ekstraordinær generalforsamling være modtaget hos bestyrelsen senest 3 uger før, generalforsamlingen skal holdes.
Ethvert kapitalejer, der kan anmelde sin interesse ifølge §§ 5 og 6, skal anmelde sit krav senest 3 uger før afholdelse af den ekstraordinære generalforsamling, medmindre der foreligger særlige omstændigheder, herunder, men ikke begrænset til, tilfælde, hvor kapitalejeren i øvrigt er forhindret i at anmelde sit krav. 


 Kan ministeren fastsætte regler om beskyttelse af natur og miljø efter lov nr. 316 af 25. maj 2004 om naturbeskyttelse og lov nr. 316 af 25. maj 2004 om beskyttelse af havmiljøet i danske farvande?Det er muligt at tilkøbe en lang række ydelser fra den offentlige myndighed, fx hjælp til sagsbehandling, borgeroverdragelse, rådgivning af myndigheder og behandling af klager.
Vedligeholdelse af det 

In [16]:
answers = []

# run through the questions in the dev set and calculate a bleu score and a rouge score
for question, correct_answer in tqdm(zip(dev_set['question'], dev_set['answer']), total=len(dev_set), leave=False):

    # run through RAG pipeline and generate answer
    top_k = sparse_retrieval(question, X, k=3)
    context = ' '.join([rag_list[i]['text'] for i in top_k])
    prompt = f"Relevante paragraffer: {context}\n\nSpørgsmål: {question} \n\nSvar: "
    generated_text = generator(prompt, max_new_tokens=100, pad_token_id=50256)

    # store answer and scores
    generated_answer = generated_text[0]['generated_text'][len(prompt) + 1:]
    answers.append(generated_answer)

Processed question 1 of 106
Processed question 2 of 106


Token indices sequence length is longer than the specified maximum sequence length for this model (1267 > 1024). Running this sequence through the model will result in indexing errors


Processed question 3 of 106
Processed question 4 of 106
Processed question 5 of 106
Processed question 6 of 106
Processed question 7 of 106
Processed question 8 of 106
Processed question 9 of 106
Processed question 10 of 106
Processed question 11 of 106
Processed question 12 of 106
Processed question 13 of 106
Processed question 14 of 106
Processed question 15 of 106
Processed question 16 of 106
Processed question 17 of 106
Processed question 18 of 106
Processed question 19 of 106
Processed question 20 of 106
Processed question 21 of 106
Processed question 22 of 106
Processed question 23 of 106
Processed question 24 of 106
Processed question 25 of 106
Processed question 26 of 106
Processed question 27 of 106
Processed question 28 of 106
Processed question 29 of 106
Processed question 30 of 106
Processed question 31 of 106
Processed question 32 of 106
Processed question 33 of 106
Processed question 34 of 106
Processed question 35 of 106
Processed question 36 of 106
Processed question 37

TypeError: unsupported operand type(s) for +: 'RougeScorer' and 'RougeScorer'

In [17]:
len(bleu_scores), len(rouge_scores), len(answers)


(106, 106, 106)

In [18]:
print(f'Mean BLEU score: {np.mean(bleu_scores):.2f}')


Mean BLEU score: 0.02


In [None]:
def calculate_bleu_scores(answers, dev_set):
    bleu_scores = []
    for i in range(len(answers)):
        reference = str(dev_set['answer'].iloc[i]).split()
        hypothesis = answers[i].split()
        bleu_scores.append(sentence_bleu([reference], hypothesis, smoothing_function=SmoothingFunction().method1))
    return np.mean(bleu_scores)

In [19]:
def calculate_rouge_scores(answers, dev_set):
    # Calculate actual ROUGE scores from the scorer objects
    rouge_scores_dict = []
    for i in range(len(answers)):
        # Skip any entries where answers are not strings
        if isinstance(dev_set['answer'].iloc[i], float) or isinstance(answers[i], float):
            continue
            
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        scores = scorer.score(target=dev_set['answer'].iloc[i], prediction=answers[i])
        rouge_scores_dict.append({
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure
        })

    # get the average of the different rouge scores
    rouge1_scores = [score['rouge1'] for score in rouge_scores_dict]
    rouge2_scores = [score['rouge2'] for score in rouge_scores_dict]
    rougeL_scores = [score['rougeL'] for score in rouge_scores_dict]
    
    return rouge1_scores, rouge2_scores, rougeL_scores

[{'rouge1': 0.041666666666666664,
  'rouge2': 0.0,
  'rougeL': 0.041666666666666664},
 {'rouge1': 0.08333333333333334,
  'rouge2': 0.016949152542372885,
  'rougeL': 0.06666666666666667},
 {'rouge1': 0.288659793814433,
  'rouge2': 0.21052631578947367,
  'rougeL': 0.288659793814433},
 {'rouge1': 0.27722772277227725,
  'rouge2': 0.1414141414141414,
  'rougeL': 0.19801980198019803},
 {'rouge1': 0.17857142857142858,
  'rouge2': 0.07272727272727272,
  'rougeL': 0.16071428571428567},
 {'rouge1': 0.27419354838709675,
  'rouge2': 0.13114754098360656,
  'rougeL': 0.22580645161290322},
 {'rouge1': 0.23636363636363633,
  'rouge2': 0.09259259259259259,
  'rougeL': 0.1272727272727273},
 {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0},
 {'rouge1': 0.056603773584905655,
  'rouge2': 0.0,
  'rougeL': 0.03773584905660377},
 {'rouge1': 0.15267175572519084,
  'rouge2': 0.015503875968992248,
  'rougeL': 0.09160305343511449},
 {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0},
 {'rouge1': 0.27586206896551724,
  'r

In [20]:
# get the average of the different rouge scores
rouge1_scores = [score['rouge1'] for score in rouge_scores_dict]
rouge2_scores = [score['rouge2'] for score in rouge_scores_dict]
rougeL_scores = [score['rougeL'] for score in rouge_scores_dict]
print(f'Mean ROUGE1 score: {np.mean(rouge1_scores):.2f}')
print(f'Mean ROUGE2 score: {np.mean(rouge2_scores):.2f}')
print(f'Mean ROUGE-L score: {np.mean(rougeL_scores):.2f}')

Mean ROUGE1 score: 0.10
Mean ROUGE2 score: 0.04
Mean ROUGE-L score: 0.08


In [147]:
# let's just calculate the bleu score btw questions and answers to get it to work'
bleu_list = []
for question, answer in zip(dev_set['question'], dev_set['answer']):
    reference = answer.split()
    hypothesis = question.split()
    bleu_list.append(nltk.translate.bleu_score.sentence_bleu([reference], hypothesis))

print(f'Mean BLEU score: {np.mean(bleu_list):.2f}')

TypeError: Fraction.__new__() got an unexpected keyword argument '_normalize'