In [None]:
# add autoreload
%load_ext autoreload
%autoreload 2

from pubscience.anonymise import deduce
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [2]:
# Work from the cardiology export folder; the parquet files read further down
# are addressed relative to this directory.
CARDIOLOGY_DIR = r'L:\lab_research\RES-Folder-UPOD\CarTeksten\G_Output\2_Data\Clinical\CARDIOLOGY'
os.chdir(CARDIOLOGY_DIR)

In [None]:
# Display the entries of the current working directory as the cell output.
os.listdir(os.curdir)

In [72]:
# One row per source: (parquet file stem, column holding the free text,
# optional list of metadata columns or None when there are none).
_SOURCE_COLUMNS = [
    ('ARGUS_DisCharge2020', 'TEXT', None),
    ('ARGUS_DisCharge2024', 'text', ['type2_display_original']),
    ('ARGUS_echo', 'FINAL', None),
    ('ARGUS_RADIO2020', 'TEXT', None),
    ('ARGUS_RADIO2024', 'TEXT', ['status_display_original2', 'isy_category_display']),
    ('ARGUS_Referal_2024', 'text', None),
    ('CARQ', 'TEXT', None),
    ('CARQ_REFERALS', 'TEXT', None),
    ('CCN_consults', 'MH_EXTRA', None),
    ('CCN_decursus', 'text', None),
    ('DIGIN_echo', 'TEXT', None),
    ('DIGIN_POLI', 'TEXT', None),
    ('dt4h_echo', 'conclusion', None),
    ('HMC_decursus', 'ReportTxt', None),
    ('HMC_radio', 'ReportTxt', None),
    ('HMC_radio_cardio', 'ReportTxt', None),
    ('INKBrieven_PoBTriage', 'reporttxt', None),
    ('ontslagbrieven_PoBTriage', 'text', ['docOmsch']),
    ('SCAD', 'text', None),
    ('SMART', 'text', None),
    ('verslagen_PoBTriage', 'text', None),
    ('ZDBrieven_PoBTriage', 'InkomendeBriefTekst_DOC', None),
]

# Mapping used by the loading loop: file stem -> {'text': <column>, 'meta': <columns or None>}.
name_dict = {
    stem: {'text': text_col, 'meta': meta_cols}
    for stem, text_col, meta_cols in _SOURCE_COLUMNS
}

In [76]:
# Build one combined frame with the free text (and optional metadata) of every
# source listed in `name_dict`. Each source is reduced to the two columns
# 'text' and 'meta' before stacking.
source_frames = []
for fname, fdict in name_dict.items():
    text_col = fdict['text']
    meta_cols = fdict['meta'] or []

    # Only the text and metadata columns are used, so only those are read.
    df = pd.read_parquet(f"{fname}.parquet", columns=[text_col, *meta_cols])

    if meta_cols:
        # Collapse the metadata columns into one comma-separated string per row.
        meta = df[meta_cols].apply(lambda x: ",".join(x), axis=1)
    else:
        # Sources without metadata get an all-NaN 'meta' column.
        meta = np.nan

    source_frames.append(
        df.rename(columns={text_col: 'text'}).assign(meta=meta)[['text', 'meta']]
    )

# Concatenate once instead of growing the frame inside the loop, which would
# re-copy all previously loaded rows for every additional source.
# The index is deliberately NOT reset: a later cell assigns `concat_df['meta']`
# onto the de-identified frame, so the per-source labels are kept unchanged.
concat_df = pd.concat(source_frames, axis=0)

In [None]:
# Persist the combined corpus (before de-identification) in the working directory.
COMBINED_PATH = 'COMBINED.parquet'
concat_df.to_parquet(COMBINED_PATH)

In [97]:
# De-identifier used for the combined corpus.
# NOTE(review): text_cols is 'TEXT' while the frame passed to fit_transform
# below only has a 'text' column -- confirm how Deidentify resolves this.
deid_options = dict(
    n_jobs=1,
    backend='multiprocessing',
    to_dataframe=True,
    text_cols='TEXT',
    data_index=None,
    custom_list=None,
    kwargs=None,
)
# All boolean check/replace flags passed to Deidentify are set to False.
disabled_flags = (
    'bsn_check',
    'date_check',
    'phone_check',
    'pid_check',
    'number_replace',
    'clear_brackets',
)
DeID = deduce.Deidentify(**deid_options, **{flag: False for flag in disabled_flags})

In [98]:
# De-identify the combined corpus in fixed-size batches so tqdm can report progress.
batch_size = 1_000

num_rows = concat_df.shape[0]
# Select the text column once; doing it inside the loop would rebuild the
# full single-column frame for every batch.
text_frame = concat_df[['text']]

# Iterate over batch start offsets. The previous `num_rows // batch_size + 1`
# produced an extra, empty trailing batch whenever num_rows was an exact
# multiple of batch_size. `max(num_rows, 1)` keeps a single (empty) call for an
# empty corpus, as before, so pd.concat still receives one frame.
df_list = []
for start_i in tqdm(range(0, max(num_rows, 1), batch_size)):
    end_i = min(start_i + batch_size, num_rows)
    df_list.append(DeID.fit_transform(text_frame.iloc[start_i:end_i]))
deid_df = pd.concat(df_list, axis=0)

100%|██████████| 3584/3584 [3:26:33<00:00,  3.46s/it]  


In [99]:
# Attach the metadata by row position rather than by index label.
# `concat_df` is stacked from several sources without resetting the index, so
# its labels can repeat; label-based alignment then either raises a
# duplicate-label reindex error or depends on both indexes being identical.
# Assumes fit_transform returns one row per input row, in input order
# (batches are appended in order above) -- TODO confirm against Deidentify.
if len(deid_df) != len(concat_df):
    raise ValueError(
        f"Row count mismatch: deid_df has {len(deid_df)} rows, "
        f"concat_df has {len(concat_df)}; cannot attach 'meta' by position."
    )
deid_df['meta'] = concat_df['meta'].to_numpy()

In [100]:
# Write the de-identified corpus. Create the target folder first so a missing
# directory cannot make the write fail after the long de-identification run.
DEID_OUTPUT_PATH = 'deidentified/COMBINED_DEID.parquet'
os.makedirs(os.path.dirname(DEID_OUTPUT_PATH), exist_ok=True)
deid_df.to_parquet(DEID_OUTPUT_PATH)

In [101]:
import ftfy

In [103]:
# Quick check of ftfy on mojibake: the "Ã«" below is a mis-decoded "ë".
text = "Some random text with a patiÃ«nt"

# The repaired string is displayed as the cell output.
ftfy.fix_encoding(text)

'Some random text with a patiënt'

In [105]:
# Re-create the de-identifier for the referral letters; the arguments are the
# same as for the instance created earlier in this notebook.
# NOTE(review): text_cols is 'TEXT' while the frame passed to fit_transform
# below only has a 'stripped_text' column -- confirm how Deidentify resolves this.
DeID = deduce.Deidentify(
    n_jobs=1, backend='multiprocessing',
    to_dataframe=True, data_index=None, text_cols='TEXT',
    bsn_check=False, date_check=False, phone_check=False, pid_check=False,
    number_replace=False, clear_brackets=False,
    custom_list=None, kwargs=None,
)

In [106]:
# Load the referral letters from the research-data snapshot folder.
REFLETTERS_PATH = r'L:\lab_research\RES-Folder-UPOD\ODIN-UC4\E_ResearchData\2_ResearchData\20241118\refletters.parquet'
refletters = pd.read_parquet(REFLETTERS_PATH)

In [123]:
# Run the de-identifier on a single-column frame holding only 'stripped_text'.
refletters_text = refletters[['stripped_text']]
refletters_anom = DeID.fit_transform(refletters_text)

In [124]:
# Copy the 'studyId_0831' and 'created' columns onto the de-identified frame.
# NOTE(review): this assignment aligns on the index, so it assumes
# fit_transform keeps the index of `refletters` -- confirm.
carried_columns = ['studyId_0831', 'created']
refletters_anom[carried_columns] = refletters[carried_columns]

In [125]:
# Store the de-identified referral letters in the same snapshot folder as the input file.
REFLETTERS_DEID_PATH = r'L:\lab_research\RES-Folder-UPOD\ODIN-UC4\E_ResearchData\2_ResearchData\20241118\refletters_deid.parquet'
refletters_anom.to_parquet(REFLETTERS_DEID_PATH)