In [1]:
# For modules loading
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import json
from tqdm import tqdm
import regex as re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, T5ForConditionalGeneration
import torch
from itertools import cycle
import os 
from transformers import pipeline

from rouge_score import rouge_scorer

# For BM25
import src.bm25_IR

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
path_adam = '/Users/adamwagnerhoegh/Documents/Legal data/domsdatabasen.retsinformation_newer.json'
path_asger = "/Users/asgerkromand/Library/CloudStorage/OneDrive-UniversityofCopenhagen/0. SDS/1 deep learning and nlp/ANLPDP_exam/data/domsdatabasen.retsinformation_newer.json"
path_andreas = '' #missing

# Define a function that can cycle through paths the above paths try them out, and yield the path
def path():
    paths = cycle([path_adam, path_asger, path_andreas])
    for path in paths:
        if path != '':
            try:
                with open(path, 'r') as f:
                    data = json.load(f)
                return data
            except:
                pass
        else:
            raise FileNotFoundError('No path to data found')

retsinfo = path()
    
rag_list = []
idx = 0
for lov in tqdm(retsinfo):
    for kapitel in lov['kapitler']:
        lov_navn = lov['shortName']
        for paragraffer in kapitel['paragraffer']:
            temp_paragraf_dict = {}
            temp_paragraf_dict['paragraf_nr'] = paragraffer['nummer']
            temp_paragraf_dict['lovnavn'] = lov_navn
            temp_paragraf_list = []
            for styk in paragraffer['stk']:
                temp_paragraf_list.append(styk['tekst'])
            temp_paragraf_dict['text'] = ' '.join(temp_paragraf_list)
            rag_list.append(temp_paragraf_dict)

# Write jsonl
with open("rag_list.jsonl", "w") as file:
    for item in rag_list:
        file.write(json.dumps(item) + "\n")

100%|██████████| 1637/1637 [00:00<00:00, 28044.37it/s]


In [5]:
rag_list2 = rag_list

def preprocess(rag_list):
    # extract and preprocess text
    corpus = [item['text'] for item in rag_list]
    corpus = [re.sub('\\s{2,}', ' ', 
                     re.sub('\\W|[0-9]|§', ' ',
                           item.lower())) for item in corpus]

    # remove stopwords
    #nltk.download('punkt')
    stop_words = set(stopwords.words('danish'))
    corpus = [' '.join(word for word in text.split() 
                      if word not in stop_words) for text in tqdm(corpus)]
    
    return corpus

corpus = preprocess(rag_list2)
vectorizer = TfidfVectorizer()
tf_idf_corpus = vectorizer.fit_transform(corpus)

100%|██████████| 42593/42593 [00:00<00:00, 124756.88it/s]


In [6]:
dev_set = pd.read_csv('devset/dev_set.csv')

In [7]:
def sparse_retrieval(question, corpus_embeddings, corpus=rag_list, vectorizer=vectorizer, k=1, max_tokens=800):
    """
    Function that takes a question and returns a list of paragraphs that are most relevant to the question
    """

    # preprocess and vectorize question
    question_processed = [re.sub('\\s{2,}', ' ', 
                               re.sub('\\W|[0-9]|§', ' ',
                                     question.lower()))]
    
    # remove stopwords
    stop_words = set(stopwords.words('danish'))
    question_processed = [' '.join(word for word in text.split() 
                                 if word not in stop_words) for text in question_processed]
    
    # embed question
    question_vector = vectorizer.transform(question_processed)

    # calculate cosine similarity
    sparse_retrieval = corpus_embeddings.dot(question_vector.T).toarray()

    # get top k paragraphs
    top_k = np.argsort(sparse_retrieval.flatten())[-k:]

    # truncate context to approximate token limit
    context = '\n'.join([corpus[i]['text'] for i in top_k])

    return context

In [3]:
# Load the pretrained T5 model and tokenizer
model_name = "strombergnlp/dant5-large"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(DEVICE)




T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [None]:
#for question, documents in tqdm(zip(dev_set['question, str'], dev_set['tf_idf_k1']), desc='Answering questions'):

# Example question and context
question = dev_set.loc[0, 'question']
documents = sparse_retrieval(question, tf_idf_corpus, k=3)

# Format the input for T5
input_text = f"Relevante paragraffer: {documents}\nSpørgsmål: {question}\nIndsæt svar her baseret på de relevante paragraffer:"

# Tokenize the input and generate an answer
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(DEVICE)

max_length = len(input_ids[0]) + 100

with torch.no_grad():
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        # generation set to stop at ' Spørgsmål' as it otherwise just repeats itself (think it's because we don't sample)
        eos_token_id=tokenizer.encode(' Spørgsmål')[0]
    )

# Decode and print the generated answer
answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip(' Spørgsmål')
print(input_text, '\n')
print(answer)

Relevante paragraffer: I denne lov forstås ved: Ejerlejlighed: En lejlighed samt andre særskilt afgrænsede husrum, der er registreret som en ejerlejlighed efter reglerne i denne lov eller tidligere gældende lovgivning. Ejerlejlighed: En lejlighed samt andre særskilt afgrænsede husrum, der er registreret som en ejerlejlighed efter reglerne i denne lov eller tidligere gældende lovgivning. Ejerforening: Et obligatorisk fællesskab bestående af samtlige ejere af ejerlejligheder på en ejendom. Ejendom: En bestemt fast ejendom i tinglysningslovens forstand. Bygning: En fast konstruktion med vægge og tag beregnet til beboelse, erhverv eller opbevaring. Opdeling: Oprettelse af ejerlejligheder i bygninger på en ejendom. Videreopdeling: Oprettelse af flere ejerlejligheder i en eksisterende ejerlejlighed. Ændring: Anden ændring af en ejerlejligheds registrering end en videreopdeling.
Hver ejerlejlighed anses som en selvstændig fast ejendom. Hver ejerlejlighed er identificeret ved eget identifikati