# Load all ARGUS documents (from 2019 and 2024):
* de-identify them
* store them as JSON

In [None]:
import deduce
import re
import os
import pandas as pd

# Single Deduce instance, reused by the cells below to de-identify report texts
# via deid.deidentify(...).deidentified_text.
deid = deduce.Deduce()

In [None]:
from dotenv import load_dotenv

In [None]:
# Read the .env file one directory up, then pick up ARGUS_PATH: the base
# directory from which every input and output path below is built.
# ARGUS_PATH is None when the variable is not set.
load_dotenv('../.env')
ARGUS_PATH = os.environ.get('ARGUS_PATH')

In [None]:
# Load the two 2019 CSV exports; both are semicolon-separated and read as latin1.
_csv_2019_jan = os.path.join(ARGUS_PATH, '20190118/st9_ct_radio_vrsl_18jan2019.csv')
_csv_2019_sep = os.path.join(ARGUS_PATH, '20190909/ST9_CT_RADIO_VRSL_09SEP2019.csv')

radio_2019 = pd.read_csv(_csv_2019_jan, sep=';', encoding='latin1')
radio_2019_2 = pd.read_csv(_csv_2019_sep, sep=';', encoding='latin1')

In [None]:
def _merge_text_columns(frame):
    """Collapse the three free-text columns of *frame* into a single 'TEXT' column.

    Keeps the metadata columns, drops rows in which all three text columns are
    missing, joins the non-missing text parts with a newline (in the order
    tekst, plattetext, reporttxt) and removes the three source columns.
    """
    text_columns = ['tekst', 'plattetext', 'reporttxt']
    kept_columns = ['pateventid', 'verr_datum', 'ONDERZDAT', 'MEMO'] + text_columns

    merged = frame[kept_columns].dropna(subset=text_columns, how='all')
    merged['TEXT'] = merged[text_columns].apply(
        lambda row: "\n".join([part for part in row if not pd.isna(part)]),
        axis=1,
    )
    return merged.drop(text_columns, axis=1)


radio_2019 = _merge_text_columns(radio_2019)
radio_2019_2 = _merge_text_columns(radio_2019_2)

In [None]:
def _clean_and_deidentify(frame):
    """Clean 'TEXT', add the de-identified 'TEXT_ID' column, and sort by 'verr_datum'.

    Every run of three or more consecutive non-word characters in 'TEXT' is
    replaced by a single space before the text is passed through Deduce.
    The columns are written onto *frame* itself; the sorted frame is returned.
    """
    frame['TEXT'] = frame['TEXT'].str.replace(r'[^\w]{3,}', ' ', regex=True)
    frame['TEXT_ID'] = frame['TEXT'].apply(
        lambda report: deid.deidentify(report).deidentified_text
    )
    return frame.sort_values(by='verr_datum')


radio_2019 = _clean_and_deidentify(radio_2019)
radio_2019_2 = _clean_and_deidentify(radio_2019_2)

In [None]:
# Stack both 2019 exports, keep only the identifier, the date and the
# de-identified text, and order the combined rows by verr_datum.
radio_old = pd.concat([radio_2019, radio_2019_2], axis=0)
radio_old = radio_old[['pateventid', 'verr_datum', 'TEXT_ID']]
radio_old = radio_old.sort_values(by='verr_datum').reset_index(drop=True)


In [None]:
# Load the 2024 parquet export, sort it by onderzoeks_dt and keep one row per
# (studyId_0771, report text) combination.
radio_new = pd.read_parquet(os.path.join(ARGUS_PATH, '20240909/parquet/radio_reports.parquet'))
radio_new = radio_new.sort_values(by='onderzoeks_dt')
radio_new = radio_new.drop_duplicates(subset=['studyId_0771', 'content_attachment1_plain_data'])
# Drop rows without report text. The filter uses the raw text column because
# the derived 'TEXT' column does not exist yet at this point (it is built from
# 'content_attachment1_plain_data' afterwards); filtering on 'TEXT' here raises
# a KeyError when the parquet file has no such column.
radio_new = radio_new.dropna(subset=['content_attachment1_plain_data'])

In [None]:
# Replace every run of three or more non-word characters in the raw report
# text by a single space, then de-identify the cleaned text with Deduce.
_raw_reports = radio_new['content_attachment1_plain_data']
radio_new['TEXT'] = _raw_reports.str.replace(r'[^\w]{3,}', ' ', regex=True)


def _deidentify(report):
    """Return the Deduce de-identified version of *report*."""
    return deid.deidentify(report).deidentified_text


radio_new['TEXT_ID'] = radio_new['TEXT'].apply(_deidentify)

In [None]:
# Bring the 2024 frame into the same column layout as the 2019 data
# (pateventid, verr_datum, TEXT_ID).
radio_new = radio_new[['studyId_0771', 'onderzoeks_dt', 'TEXT_ID']].reset_index(drop=True)
radio_new['onderzoeks_dt'] = radio_new['onderzoeks_dt'].dt.date  # keep the date part only
radio_new = radio_new.rename(columns={'onderzoeks_dt': 'verr_datum', 'studyId_0771': 'pateventid'})

# A pateventid can occur on several rows; append a 1-based running number per
# pateventid ("<id>_<n>") so that every row gets its own identifier.
_occurrence = radio_new.groupby('pateventid').cumcount() + 1
radio_new['pateventid'] = radio_new['pateventid'].astype(str) + "_" + _occurrence.astype(str)

In [None]:
# Index the reports by (pateventid, verr_datum) and remove duplicate rows.
# NOTE(review): drop_duplicates() compares column values only, not the index,
# so rows with identical TEXT_ID are collapsed even when their identifiers
# differ - confirm that this is intended.
radio_new = radio_new.set_index(['pateventid', 'verr_datum']).drop_duplicates()

In [None]:
def _join_reports(texts):
    """Concatenate the texts of one group, separated by a blank line."""
    return "\n\n".join(texts)


# One row per (pateventid, verr_datum): all de-identified texts that share the
# same identifier and date are merged into a single document.
_texts_per_event = radio_old.groupby(['pateventid', 'verr_datum'])['TEXT_ID']
radio_old = _texts_per_event.apply(_join_reports).to_frame()

In [None]:
# Persist both de-identified collections as JSON files directly under ARGUS_PATH.
for _frame, _filename in ((radio_old, 'radio_old.json'), (radio_new, 'radio_new.json')):
    _frame.to_json(os.path.join(ARGUS_PATH, _filename))

In [None]:
import json
from random import sample, shuffle
from collections import defaultdict

In [None]:
# Reload the 2024 reports from disk as a plain dict. The file is opened in a
# `with` block so the handle is closed as soon as parsing is done.
# NOTE(review): the code below treats this dict as {key: report text}; confirm
# that the layout of radio_new.json matches that expectation.
with open(os.path.join(ARGUS_PATH, 'radio_new.json')) as _radio_new_file:
    radio_new = json.load(_radio_new_file)

In [None]:
# Goal: 7 sets of 50 documents in which every document appears twice,
# i.e. 350 document slots filled with 175 unique documents.
#
# Step 1 (this cell): draw 175 keys at random.
# Step 2 (next cell): assign every key to 2 sets until each set holds 50 documents.

KEYS = list(radio_new)
shuffle(KEYS)
random_selection = sample(KEYS, k=175)

In [None]:
# Distribute the selected keys over N_GROUPS groups so that every placed key
# sits in exactly GROUPS_PER_KEY distinct groups and no group holds more than
# GROUP_SIZE keys.
#
# Each key goes to the open groups that currently hold the fewest keys (ties
# are broken at random). Filling the emptiest groups first keeps the group
# sizes within one of each other, so with 175 keys, 7 groups of 50 and 2
# groups per key every group ends at exactly 50. Picking the two groups purely
# at random can strand a single open group that is still below 50, leaving it
# short and leaving the remaining keys unassigned.
N_GROUPS = 7
GROUP_SIZE = 50
GROUPS_PER_KEY = 2

GroupsDict = defaultdict(set)
AvailableGroups = set(range(N_GROUPS))  # groups that can still take keys
for key in random_selection:
    if len(AvailableGroups) < GROUPS_PER_KEY:
        # Too few open groups to place this key GROUPS_PER_KEY times; stop
        # instead of placing it only partially.
        break
    candidates = list(AvailableGroups)
    shuffle(candidates)  # random order first, so the stable sort breaks ties randomly
    candidates.sort(key=lambda group: len(GroupsDict[group]))
    for group in candidates[:GROUPS_PER_KEY]:
        GroupsDict[group].add(key)
        if len(GroupsDict[group]) == GROUP_SIZE:
            AvailableGroups.discard(group)

In [None]:
# Sanity check: count in how many groups each key ended up.
key_counter = defaultdict(int)

for group_keys in GroupsDict.values():
    for assigned_key in group_keys:
        key_counter[assigned_key] += 1

In [None]:
# Write every group to its own folder, one UTF-8 text file per document:
#   ARGUS_PATH/A_W_W/Groep<group number, 1-based>/<cleaned key>.txt
re_clean = re.compile(r'[()\, \']')
for group_id, group_keys in GroupsDict.items():
    print(f'Group {group_id}: {len(group_keys)}')
    group_dir = os.path.join(ARGUS_PATH, 'A_W_W', f'Groep{group_id + 1}')
    os.makedirs(group_dir, exist_ok=True)

    for key in group_keys:
        report_text = radio_new[key]
        # Build the file name from the key: "datetime.date" becomes "-", and
        # parentheses, commas, spaces and single quotes are removed.
        file_stem = re_clean.sub("", key.replace("datetime.date", "-"))
        target_path = os.path.join(group_dir, f'{file_stem}.txt')
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(report_text)


In [75]:
# Subset of radio_new restricted to the randomly selected keys, in radio_new's
# order. The selection is turned into a set first so each membership test is a
# constant-time lookup instead of a scan through the whole selection list.
_selected_keys = set(random_selection)
AWW = {k: v for k, v in radio_new.items() if k in _selected_keys}

In [None]:
# Store the selected documents as pretty-printed JSON. The `with` block makes
# sure the file is flushed and closed once the dump is complete.
# NOTE(review): with ensure_ascii=False, any character that latin1 cannot
# represent raises UnicodeEncodeError here; the per-document .txt files are
# written as UTF-8 - confirm that latin1 is really required for this file.
with open(os.path.join(ARGUS_PATH, 'A_W_W', 'AWW.json'), 'w', encoding='latin1') as _aww_file:
    json.dump(AWW, _aww_file, indent=4, ensure_ascii=False)

  json.dump(AWW, open(os.path.join(ARGUS_PATH, 'A_W_W', 'AWW.json'), 'w', encoding='latin1'), indent=4, ensure_ascii=False)
