In [None]:
import json
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import sentence_transformers
from dotenv import load_dotenv
from langdetect import DetectorFactory
from langdetect import detect
from numpy import log10
from tqdm import tqdm

In [None]:
# Load environment configuration before anything reads it.
# NOTE(review): presumably this pulls variables (e.g. RESULTS, read in the next
# cell) from a local .env file via python-dotenv — confirm the file location.
load_dotenv()

In [None]:
# Directory holding the result files, taken from the RESULTS environment
# variable; the value is None when the variable is not defined.
results_path = os.environ.get('RESULTS')

In [None]:
# Collect the summary JSONL files from the results directory.
#
# os.getenv returns None when RESULTS is unset. Passing None on to os.listdir
# would not fail here (it is expected to fall back to the current working
# directory), so the notebook would silently scan the wrong folder. Stop early
# with an explicit message instead.
if not results_path:
    raise RuntimeError(
        "Environment variable 'RESULTS' is not set; cannot locate the summaries directory."
    )

# os.listdir returns entries in arbitrary order. Sorting keeps the record order
# (and therefore positional look-ups in later cells) stable between runs.
files = sorted(
    name
    for name in os.listdir(results_path)
    if name.endswith('.jsonl') and 'summaries' in name
)

In [None]:
# Display the matched file names as a quick sanity check of the filter above.
files

In [None]:
# Read every matched JSONL file (one JSON object per line) into a single flat
# list of records.
Summaries = []
for file in files:
    print(file)
    with open(os.path.join(results_path, file), 'r', encoding='utf-8') as f:
        # Whitespace-only lines are not valid JSON and would make json.loads
        # raise, aborting the whole load; skip them.
        Summaries.extend(json.loads(line) for line in f if line.strip())

In [None]:
# Heading keywords that are removed wherever they occur as whole words:
# chapter paragraph hoofdstuk paragraaf summary samenvatting
re_replace = re.compile(r'\b(chapter|paragraph|hoofdstuk|paragraaf|summary|samenvatting)\b', re.IGNORECASE)
# Runs of two or more whitespace characters. \s also matches newlines, so blank
# lines between paragraphs are collapsed into a single space as well.
re_replace_m = re.compile(r'\s{2,}')
# Any remaining single tab.
re_replace_t = re.compile(r'\t')

# Leading words stripped from the very start of a summary (language labels and
# section titles such as english, dutch, engelse, nederlandse, references, ...).
re_replace_start = re.compile(r'^[\s\n\r]*(publications|references|english|dutch|engelse|nederlandse|appendix|appendices|dankwoord)\b', re.IGNORECASE)

# The two text fields that receive identical cleaning.
SUMMARY_FIELDS = ('summary_dutch', 'summary_english')


def clean_summary_text(text):
    """Return a cleaned copy of one summary string.

    Steps, in order: drop heading keywords, collapse whitespace runs, replace
    tabs by spaces, strip a leading label word, then discard every line that
    has three words or fewer.

    Parameters
    ----------
    text : str
        Raw summary text (must be a non-empty string).

    Returns
    -------
    str
        The cleaned text; may be empty if every line was filtered out.
    """
    text = re_replace.sub('', text).strip()
    text = re_replace_m.sub(' ', text)
    text = re_replace_t.sub(' ', text)
    text = re_replace_start.sub('', text).strip()
    # Keep only lines with more than 3 words; shorter ones are dropped.
    return '\n'.join(line for line in text.split('\n') if len(line.split()) > 3)


Cleaned_Summaries = []
for summ in tqdm(Summaries):
    # Work on a shallow copy so the raw records in `Summaries` stay untouched
    # and re-running this cell does not clean already-cleaned text.
    cleaned = dict(summ)
    for field in SUMMARY_FIELDS:
        # .get: a record may lack one of the keys (the pairing cell further down
        # explicitly tolerates that); empty/None values are left as they are.
        if cleaned.get(field):
            cleaned[field] = clean_summary_text(cleaned[field])
    Cleaned_Summaries.append(cleaned)


In [None]:
# Load the multilingual sentence-embedding model. It is used further down to
# encode the Dutch and English summaries of a record so their cosine
# similarity can be computed.
# NOTE(review): presumably the weights are downloaded on first use — confirm
# network access / cache location for a fresh environment.
model = sentence_transformers.SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Spot-check one cleaned record. The index 10 is hard-coded, so this raises
# IndexError when fewer than 11 records were loaded.
Cleaned_Summaries[10]

In [None]:
# Try language detection on the Dutch summary of the same record.
# NOTE(review): assumes record 10 has a non-empty 'summary_dutch' string — confirm.
detect(Cleaned_Summaries[10]['summary_dutch'])

In [None]:
# Sort every cleaned record into three lists:
#   Paired       - both summaries are long enough; stores their embedding cosine similarity
#   Dutch_Only   - every record with a long-enough summary that is detected as Dutch
#   English_Only - every record with a long-enough English summary
# Despite the "_Only" names, a record that lands in Paired is also added to the
# two single-language lists. `failed` collects diagnostics for records that
# could not be paired or whose "Dutch" text is not detected as Dutch.

# A summary must contain MORE than this many whitespace-separated words to be used.
MIN_WORDS = 25

# langdetect is non-deterministic unless seeded; pin the seed so the same text
# always yields the same language across runs.
DetectorFactory.seed = 0

Paired = []
Dutch_Only = []
English_Only = []
failed = []
for summ in tqdm(Cleaned_Summaries):
    # Records lacking either key are skipped entirely.
    if 'summary_dutch' not in summ or 'summary_english' not in summ:
        continue

    dutch_len = len(summ['summary_dutch'].split()) if summ['summary_dutch'] else 0
    english_len = len(summ['summary_english'].split()) if summ['summary_english'] else 0
    has_dutch = dutch_len > MIN_WORDS
    has_english = english_len > MIN_WORDS

    # Reset for every record. Previously `language` was only assigned when the
    # Dutch text was long enough, so the check further down read either an
    # undefined name (NameError on a fresh kernel) or the previous record's value.
    language = None
    if has_dutch:
        try:
            language = detect(summ['summary_dutch'])
        except Exception as e:
            print(f"Error detecting language for {summ['summary_dutch']}: {e}")
            # Treat an undetectable Dutch text as unusable, but keep going so a
            # valid English summary of the same record is not thrown away.
            has_dutch = False
        else:
            if language != 'nl':
                # NOTE(review): this stores the summary text, while the other
                # `failed` entries are file ids — confirm which is wanted.
                failed.append(summ['summary_dutch'])

    if has_dutch and has_english:
        # NOTE(review): pairs are kept even when the Dutch text was not
        # detected as 'nl' — confirm this is intended.
        try:
            encs = model.encode([summ['summary_dutch'], summ['summary_english']])
            sim = sentence_transformers.util.cos_sim(encs[0], encs[1])
            Paired.append({
                'id': summ['file'],
                'institute': summ['institute'],
                'summary_dutch': summ['summary_dutch'],
                'summary_english': summ['summary_english'],
                'dutch_len': dutch_len,
                'english_len': english_len,
                'similarity': float(sim[0][0])
            })
        except Exception as e:
            print(f"Error processing {summ['file']}: {e}")
            failed.append(summ['file'])
    else:
        print(f"Both summaries are less than 25 words or one is missing: {summ['file']}.")
        failed.append(summ['file'])

    # Short-circuiting `and` instead of `&`: the language comparison is only
    # meaningful when a usable Dutch summary exists.
    if has_dutch and language == 'nl':
        Dutch_Only.append({
            'id': summ['file'],
            'institute': summ['institute'],
            'summary_dutch': summ['summary_dutch'],
        })

    if has_english:
        English_Only.append({
            'id': summ['file'],
            'institute': summ['institute'],
            'summary_english': summ['summary_english'],
        })

In [None]:
# Number of records per list, in the order (English, Dutch, paired).
tuple(len(bucket) for bucket in (English_Only, Dutch_Only, Paired))

In [None]:
# One row per paired record; the dict keys stored in Paired become the columns.
df = pd.DataFrame(data=Paired)

In [None]:
# Distribution of the Dutch/English cosine similarities over all pairs.
df['similarity'].hist(bins=50)

In [None]:
# Relative length of the two summaries as log10(Dutch words / English words):
# 0 means equal length, positive means the Dutch summary is longer.
df['rel_len'] = log10(df['dutch_len'] / df['english_len'])