We extract statistics for the following corpora:

# EHRs
**Cardiology**
* [x] CarTeksten
* [x] SCAD
* [x] DataTools4Heart
* [x] DIGIN
* [x] PobTriage
* ARGUS
* [x] CCN
* [x] HMC

**IBD**
* MDL_hackathon $\rightarrow$ prefix with 'IBD_'

**Kidney**
* [x] ODIN  $\rightarrow$ prefix with 'NEFRO_'

**RA**
* D2TRA hackathon  $\rightarrow$ prefix with 'RA_'
* STRATAFIT $\rightarrow$ prefix with 'RA_'

# Other

* NtvG  $\rightarrow$ prefix with 'GEN_'
* HenW $\rightarrow$ prefix with 'GEN_'
* NHG $\rightarrow$ prefix with 'GEN_'
* Pubmed-abstract translation  $\rightarrow$ prefix with 'GEN_'

**Translations**
* MIMIC III/IV 
* eICU
* Apollo corpus
* Meditron guidelines

In [87]:
%load_ext autoreload
%autoreload 2

import numpy as np
import re
import gc
import os
import pandas as pd
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
def striprtf(text, simple=True):
    r"""Remove RTF markup from ``text`` and return the remaining plain text.

    Parameters
    ----------
    text : str
        RTF-encoded input.
    simple : bool, default True
        If True, delete ``{\...}`` groups, stray braces and control words with
        a single regular-expression substitution; nothing is translated.
        If False, tokenise the input and walk it with a small state machine
        that skips destination groups, maps special control words (``\par``,
        ``\tab``, quotes, dashes, ...) to characters and decodes ``\uN`` and
        ``\'xx`` escapes.

    Returns
    -------
    str
        The text with the RTF markup removed.
    """
    if simple:
        remove_re = re.compile(r'\{\*?\\[^{}]+}|[{}]|\\\n?[A-Za-z]+\n?(?:-?\d+)?[ ]?')
        return remove_re.sub('', text)

    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # Control words which specify a "destination"; their group content is skipped.
    destinations = frozenset((
        'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
        'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
        'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
        'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
        'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
        'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
        'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
        'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
        'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
        'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
        'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
        'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
        'listoverridetable','listpicture','liststylename','listtable','listtext',
        'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
        'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
        'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
        'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
        'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
        'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
        'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
        'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
        'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
        'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
        'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
        'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
        'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
        'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
        'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
        'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
        'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
        'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
        'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
        'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
        'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
        'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
        'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
        'svb','tc','template','themedata','title','txe','ud','upr','userprops',
        'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
        'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
        'xmlopen',
    ))
    # Translation of some special characters.
    specialchars = {
        'par': '\n',
        'sect': '\n\n',
        'page': '\n\n',
        'line': '\n',
        'tab': '\t',
        'emdash': '\u2014',
        'endash': '\u2013',
        'emspace': '\u2003',
        'enspace': '\u2002',
        'qmspace': '\u2005',
        'bullet': '\u2022',
        'lquote': '\u2018',
        'rquote': '\u2019',
        # Was u'\201C': an octal escape followed by "C", not a left double quote.
        'ldblquote': '\u201C',
        'rdblquote': '\u201D',
    }
    stack = []
    ignorable = False       # Whether this group (and all inside it) are "ignorable".
    ucskip = 1              # Number of ASCII characters to skip after a unicode character.
    curskip = 0             # Number of ASCII characters left to skip
    out = []                # Output buffer.
    for match in pattern.finditer(text):
        word, arg, hexval, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == '{':
                # Push state
                stack.append((ucskip, ignorable))
            elif stack:
                # Pop state; an unbalanced '}' in malformed input is ignored
                # instead of raising IndexError on the empty stack.
                ucskip, ignorable = stack.pop()
        elif char:  # \x (not a letter)
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append('\xA0')
            elif char in '{}\\':
                if not ignorable:
                    out.append(char)
            elif char == '*':
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            elif ignorable:
                pass
            elif word in specialchars:
                out.append(specialchars[word])
            elif word == 'uc':
                # A \uc without a numeric argument leaves the skip count unchanged.
                if arg is not None:
                    ucskip = int(arg)
            elif word == 'u':
                # A \u without a numeric argument carries no character.
                if arg is not None:
                    c = int(arg)
                    if c < 0:
                        c += 0x10000
                    # chr() covers the full Unicode range in Python 3
                    # (unichr no longer exists).
                    out.append(chr(c))
                    curskip = ucskip
        elif hexval:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                # The byte value is used directly as the code point.
                out.append(chr(int(hexval, 16)))
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    return ''.join(out)

In [3]:
# Pattern used for every word count in this notebook: one non-word character.
re_split = re.compile(r"\W")


def splitter(x):
    """Split ``x`` on each non-word character; empty strings between adjacent separators are kept."""
    return re_split.split(x)

In [4]:
output_folder = r'L:\lab_research\RES-Folder-UPOD\CarTeksten\G_Output\2_Data'

# [x] SCAD

4Million tokens

In [None]:
folder = r'T:\lab_research\RES-Folder-UPOD\SCAD\E_ResearchData\2_ResearchData\20240312'
SCAD = pd.read_sas(folder+"/carokverslag_20240312.sas7bdat")

In [None]:
SCAD['QRI_Stelling'] = SCAD['QRI_Stelling'].str.decode(encoding='latin1')
SCAD['QRI_ValueAnswerTxt'] = SCAD['QRI_ValueAnswerTxt'].str.decode(encoding='latin1')
SCAD = SCAD.loc[SCAD.QRI_ValueAnswerTxt.notna()]
SCAD['Age'] = SCAD['Age'].astype(str)
SCAD['gender'] = SCAD['gender'].str.decode(encoding='latin1')
SCAD['gender'] = SCAD['gender'].map({'male': 'man', 'female': 'vrouw'})
SCAD['gender'] = "Geslacht:"+ SCAD['gender']

In [None]:
# Build one "<statement>:<answer>, <gender>" string per answered questionnaire item.
# The columns are combined by label: positional access (x[0], x[1], ...) on a
# label-indexed row is deprecated in pandas 2.x.
_scad_answered = SCAD.loc[SCAD['QRI_ValueAnswerTxt'].notna(), ['QRI_Stelling', 'QRI_ValueAnswerTxt', 'gender']]
SCAD_TEXT = (
    _scad_answered['QRI_Stelling'] + ":" + _scad_answered['QRI_ValueAnswerTxt'] + ", " + _scad_answered['gender']
).to_frame('text')

In [None]:
NEW_SCAD = SCAD[['studyId_0818', 'index1_date']].join(SCAD_TEXT, how='inner')

In [None]:
TEXT_GROUPED = NEW_SCAD.groupby(['studyId_0818', 'index1_date']).text.apply(lambda x: '\n'.join(x)).reset_index()

In [None]:
TEXT_GROUPED['text_words'] = TEXT_GROUPED['text'].apply(splitter)
TEXT_GROUPED['text_len'] = TEXT_GROUPED['text_words'].apply(len)

In [None]:
TEXT_GROUPED['text_len'].hist(bins=30)

In [None]:
print(f"Number of words: {TEXT_GROUPED.text_len.sum()}")

In [None]:
# Vocabulary of the SCAD corpus: every distinct token over all grouped texts.
unique_words = {token for tokens in TEXT_GROUPED['text_words'].values for token in tokens}

In [None]:
print(f"Number of unique words: {len(unique_words)}")

In [None]:
# Write date + text only. The path is built with os.path.join because the
# original literal '\SCAD.parquet' contains the invalid escape sequence '\S'.
TEXT_GROUPED[['index1_date', 'text']].to_parquet(os.path.join(output_folder, 'SCAD.parquet'))

In [None]:
# Release every SCAD intermediate before loading the next corpus; NEW_SCAD
# (the join of SCAD and SCAD_TEXT) was previously left alive.
del SCAD, SCAD_TEXT, NEW_SCAD, TEXT_GROUPED

In [None]:
gc.collect()

# [x] CarTeksten

400Million tokens

In [11]:
folder = r'L:\lab_research\RES-Folder-UPOD\CarTeksten\E_ResearchData\2_ResearchData\20240321'
CarTeksten = pd.read_parquet(folder+"/vrglijsten_20240321.parquet")

In [None]:
CarTeksten = CarTeksten[CarTeksten['QRI_ValueAnswerTxt'].apply(lambda x: x is not None)]

In [None]:
# Combine the questionnaire metadata and the answer into one labelled text per row.
# The f-strings use double quotes outside and single quotes for the column keys:
# re-using the same quote type inside the replacement field is a SyntaxError
# before Python 3.12.
CarTeksten = CarTeksten.assign(
    TEXT=CarTeksten[['QRE_description', 'QRE_category_display', 'QRE_name', 'QRI_Stelling', 'QRI_ValueAnswerTxt']].apply(
        lambda x: (
            f"Omschrijving:{x['QRE_description']}"
            f", Categorie:{x['QRE_category_display']}"
            f", Thema:{x['QRE_name']}"
            f", Stelling:{x['QRI_Stelling']}"
            f", Antwoord:{x['QRI_ValueAnswerTxt']}"
        ),
        axis=1,
    )
)

In [None]:
CarTeksten['text_words'] = CarTeksten['TEXT'].apply(splitter)
CarTeksten['text_len'] = CarTeksten['text_words'].apply(len)

In [None]:
# Corpus size: total token count and vocabulary size for CarTeksten.
print(f"Number of words: {CarTeksten.text_len.sum()}")

unique_words = {token for tokens in CarTeksten['text_words'].values for token in tokens}

print(f"Number of unique words: {len(unique_words)}")

In [None]:
CarTeksten['text_len'].hist(bins=30)


In [None]:
CarTeksten[['QRR_created', 'TEXT']].to_parquet(output_folder+'/CARQ.parquet')

In [None]:
del CarTeksten
gc.collect()

# [x] DIGIN

145Million tokens

In [14]:
DIGIN_ECHO = pd.read_sas(r'L:\lab_research\RES-Folder-UPOD\DIGIN\E_ResearchData\2_ResearchData\pat_echo_brief_20231018.sas7bdat')

In [None]:
DIGIN_ECHO['DOCOMSCHR'] = DIGIN_ECHO['DOCOMSCHR'].str.decode('latin1')
DIGIN_ECHO['Brief_txt'] = DIGIN_ECHO['Brief_txt'].str.decode('latin1')
DIGIN_ECHO['Conclusions_ECHO'] = DIGIN_ECHO['Conclusions_ECHO'].str.decode('latin1')
DIGIN_ECHO['gender'] = DIGIN_ECHO['gender'].str.decode('latin1')

In [None]:
def _digin_echo_row_text(row):
    """Render one DIGIN echo record as labelled text (document type, letter, echo conclusion, gender, age)."""
    return (
        f"Document type:{row['DOCOMSCHR']}\n"
        f"Brief:{row['Brief_txt']}\n"
        f"Echo:{row['Conclusions_ECHO']}\n"
        f"Geslacht:{row['gender']}, Leeftijd:{str(row['age'])}"
    )


_digin_echo_columns = ['DOCOMSCHR', 'Brief_txt', 'Conclusions_ECHO', 'gender', 'age']
DIGIN_ECHO = DIGIN_ECHO.assign(TEXT=DIGIN_ECHO[_digin_echo_columns].apply(_digin_echo_row_text, axis=1))

In [None]:
DIGIN_ECHO.head()

In [None]:
DIGIN_ECHO['text_words'] = DIGIN_ECHO['TEXT'].apply(splitter)
DIGIN_ECHO['text_len'] = DIGIN_ECHO['text_words'].apply(len)

In [None]:
print(f"Number of words: {DIGIN_ECHO.text_len.sum()}")
unique_words = set()
for l in DIGIN_ECHO['text_words'].values:
    for w in l:
        unique_words.add(w)
print(f"Number of unique words: {len(unique_words)}")

In [None]:
DIGIN_ECHO[['create_dt', 'TEXT']].to_parquet(output_folder+"/DIGIN_echo.parquet")
del DIGIN_ECHO
gc.collect()

In [None]:
DIGIN_POLI = pd.read_sas(r'L:\lab_research\RES-Folder-UPOD\DIGIN\E_ResearchData\2_ResearchData\pat_poli_brief_20231018.sas7bdat')

In [None]:
# poli_date, age, gender
# pd.read_sas returns these text columns as bytes; decode them all in one assign.
_digin_poli_byte_columns = [
    'DOCOMSCHR',
    'SPECIALISM',
    'Brief_txt',
    'OMSCHR_AGENDA',
    'NAAM_AGENDA',
    'gender',
    'OMSCHR_Afspraak',
]
DIGIN_POLI = DIGIN_POLI.assign(
    **{column: DIGIN_POLI[column].str.decode('latin1') for column in _digin_poli_byte_columns}
)

In [None]:
# Render each outpatient record as labelled text.
# NAAM_AGENDA was labelled "Specialisme" as well, duplicating the label of the
# SPECIALISM line above; it is now labelled "Agenda".
# NOTE(review): confirm "Agenda" is the wanted label for NAAM_AGENDA.
DIGIN_POLI = DIGIN_POLI.assign(
    TEXT=DIGIN_POLI[['DOCOMSCHR', 'SPECIALISM', 'NAAM_AGENDA', 'OMSCHR_Afspraak', 'gender', 'age', 'Brief_txt']].apply(
        lambda x: (
            f"Document: {x['DOCOMSCHR']}\n"
            f"Specialisme: {x['SPECIALISM']}\n"
            f"Agenda: {x['NAAM_AGENDA']}\n"
            f"Afspraak: {x['OMSCHR_Afspraak']}\n"
            f"Geslacht: {x['gender']}, Leeftijd: {str(x['age'])}\n"
            f"Brief: {x['Brief_txt']}"
        ),
        axis=1,
    )
)

In [None]:
DIGIN_POLI['text_words'] = DIGIN_POLI['TEXT'].apply(splitter)
DIGIN_POLI['text_len'] = DIGIN_POLI['text_words'].apply(len)

In [None]:
print(f"Number of words: {DIGIN_POLI.text_len.sum()}")
unique_words = set()
for l in DIGIN_POLI['text_words'].values:
    for w in l:
        unique_words.add(w)
print(f"Number of unique words: {len(unique_words)}")

In [None]:
DIGIN_POLI[['poli_date', 'TEXT']].to_parquet(output_folder+"/DIGIN_POLI.parquet")
del DIGIN_POLI
gc.collect()

# [x] DataTools4Heart

150.000 tokens

In [None]:
folder = r'T:\lab_research\RES-Folder-UPOD\DataTools4Heart\E_ResearchData\2_ResearchData\20240709'
echo_conc = pd.read_sas(folder+"/echo_concl_20240709.sas7bdat")

folder = r'T:\lab_research\RES-Folder-UPOD\DataTools4Heart\E_ResearchData\2_ResearchData\20240715'
echo = pd.read_sas(folder+"/echo_20240715.sas7bdat")

#ecg = pd.read_sas(folder+'/ecg_20240610.sas7bdat')
#vital = pd.read_sas(folder+'/vitalsign_20240610.sas7bdat')

In [None]:
echo.groupby(['studyId_0763', 'identifier_value']).size().hist(bins=20)

In [None]:
echo_conc = echo_conc.assign(datum=echo_conc.effectiveDateTime.dt.date)
echo_conc = echo_conc.assign(conclusion=echo_conc.conclusion.str.decode('latin1'))

In [None]:
echo_conc['text_words'] = echo_conc['conclusion'].apply(splitter)
echo_conc['text_len'] = echo_conc['text_words'].apply(len)

In [None]:
echo_conc.text_len.sum()

In [None]:
echo_conc[['datum', 'conclusion']].to_parquet(output_folder+"/dt4h_echo.parquet")

# [x] PobTriage

In [None]:
zdbrieven = r'T:\lab_research\RES-Folder-UPOD\PobTriage\E_ResearchData\2_ResearchData\20230130\ZD_brieven_20230130.parquet'
inkbrieven = r'T:\lab_research\RES-Folder-UPOD\PobTriage\E_ResearchData\2_ResearchData\Old\inkomendebrieven.sas7bdat'
verslagen = r'T:\lab_research\RES-Folder-UPOD\PobTriage\E_ResearchData\2_ResearchData\20230216\verslagen_20230216.sas7bdat'
ontslagbrieven = r'T:\lab_research\RES-Folder-UPOD\PobTriage\E_ResearchData\2_ResearchData\20230216\ontslagbrieven_20230216.sas7bdat'

zdbrieven_df = pd.read_parquet(zdbrieven)
inkbrieven_df = pd.read_sas(inkbrieven)
verslagen_df = pd.read_sas(verslagen)
ontslagbrieven_df = pd.read_sas(ontslagbrieven)

### ZDbrieven

6.4Million tokens

In [None]:
zdbrieven_df['text_words'] = zdbrieven_df['InkomendeBriefTekst_DOC'].apply(splitter)
zdbrieven_df['text_len'] = zdbrieven_df['text_words'].apply(len)

In [None]:
zdbrieven_df['text_len'].sum()

In [None]:
zdbrieven_df.text_len.hist(bins=30)

In [None]:
zdbrieven_df[['jaar', 'InkomendeBriefTekst_DOC']].to_parquet(output_folder+"/ZDBrieven_PoBTriage.parquet")

### Inkomende brieven

190.000 tokens

In [None]:
inkbrieven_df = inkbrieven_df.assign(reporttxt=inkbrieven_df.reporttxt.str.decode('latin1'))

In [None]:
inkbrieven_df = inkbrieven_df.dropna(subset=['reporttxt'])

In [None]:
inkbrieven_df = inkbrieven_df.assign(text_words=inkbrieven_df['reporttxt'].apply(splitter))

inkbrieven_df  = inkbrieven_df.assign(text_len = inkbrieven_df['text_words'].apply(len))

In [None]:
inkbrieven_df.text_len.sum()

In [None]:
inkbrieven_df[['reporttxt']].to_parquet(output_folder+"/INKBrieven_PoBTriage.parquet")

### Verslagen

1.75Million tokens

In [None]:
verslagen_df = verslagen_df.assign(indicatie=verslagen_df.indicatie.str.decode('latin1'))

verslagen_df = verslagen_df.assign(transtext=verslagen_df.transtext.str.decode('latin1'))

verslagen_df = verslagen_df.assign(plattetext=verslagen_df.plattetext.str.decode('latin1'))

verslagen_df = verslagen_df.assign(tekst=verslagen_df.tekst.str.decode('latin1'))


In [None]:
def stitch_text(x):
    """Join the string-valued report fields of ``x`` into one labelled text.

    Fields that are not ``str`` (e.g. missing values) are skipped.
    """
    labelled_fields = (
        ("Indicatie: ", 'indicatie'),
        ("\nTransText: ", 'transtext'),
        ("\nPlatteText: ", 'plattetext'),
    )
    return "".join(
        prefix + x[field] for prefix, field in labelled_fields if isinstance(x[field], str)
    )

In [None]:
verslagen_df = verslagen_df.assign(text=verslagen_df[['indicatie', 'transtext', 'plattetext']].apply(stitch_text, axis=1))

In [None]:
verslagen_df = verslagen_df.assign(text_words=verslagen_df['text'].apply(splitter))

verslagen_df  = verslagen_df.assign(text_len = verslagen_df['text_words'].apply(len))

In [None]:
verslagen_df.text_len.sum()

In [None]:
verslagen_df[['indexdate', 'text']].to_parquet(output_folder+"/verslagen_PoBTriage.parquet")

### Ontslagbrieven

15Million tokens

In [None]:
ontslagbrieven_df = ontslagbrieven_df.assign(docOmsch=ontslagbrieven_df.docOmsch.str.decode('latin1'))

ontslagbrieven_df = ontslagbrieven_df.assign(plattetext=ontslagbrieven_df.plattetext.str.decode('latin1'))


In [None]:
def stitch_text(x):
    """Join the document description and plain text of ``x`` into one labelled text.

    Fields that are not ``str`` (e.g. missing values) are skipped.
    """
    labelled_fields = (
        ("Specialisme: ", 'docOmsch'),
        ("\nPlatteText: ", 'plattetext'),
    )
    return "".join(
        prefix + x[field] for prefix, field in labelled_fields if isinstance(x[field], str)
    )

In [None]:
ontslagbrieven_df = ontslagbrieven_df.assign(text=ontslagbrieven_df[['docOmsch', 'plattetext']].apply(stitch_text, axis=1))

In [None]:
ontslagbrieven_df = ontslagbrieven_df.assign(text_words=ontslagbrieven_df['text'].apply(splitter))

ontslagbrieven_df  = ontslagbrieven_df.assign(text_len = ontslagbrieven_df['text_words'].apply(len))

In [None]:
ontslagbrieven_df.text_len.sum()

In [None]:
# Export without the stats-only helper columns (token lists and token counts).
# errors='ignore' keeps the cell runnable when the stats cells were not executed.
# NOTE(review): all remaining columns are still written; restrict to the text
# (and a date column) if the other exports' date+text layout is intended here.
ontslagbrieven_df.drop(columns=['text_words', 'text_len'], errors='ignore').to_parquet(output_folder+"/ontslagbrieven_PoBTriage.parquet")

# [x] CCN

### Decursus

26Million tokens

In [37]:
ccn_decursus_df = pd.read_csv(r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\ccn\C_Data\4 Final_data\FINAL_CSV\RAW_2\decursus_15jan2019.csv', sep=';', encoding='latin1', low_memory=False)

In [None]:
# TODO: PROCESS
# Load every raw "Decursus" CSV export from the February 2018 folder and stack
# the selected columns into one frame.
folderCheck = r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\ccn\C_Data\1 Raw_data\CSV datafiles february 2018'

_decursus_columns = ['PATIENT_NUMBER', 'REGISTRATION_REASON', 'APPOINTMENT_DATE', 'REDEN_VAN_VERWIJZING', 'DECURSUS', 'BELEID', 'DIAGNOSE']
decursus_extra_list = []
for f in os.listdir(folderCheck):
    if not f.endswith(".csv") or 'ExamData_Zorgtraject_Keuring_Decursus' not in f:
        continue
    print(f"Loading {f}...")
    df = pd.read_csv(os.path.join(folderCheck, f), sep=',', encoding='latin1', low_memory=False)
    decursus_extra_list.append(df[_decursus_columns])
decursus_extra_df = pd.concat(decursus_extra_list)


In [None]:
# TODO: PROCESS
# Load every raw "Consult" CSV export from folderCheck and stack them unchanged.
consult_extra_list = []
for f in os.listdir(folderCheck):
    if not f.endswith(".csv") or 'ExamData_Zorgtraject_Keuring_Consult' not in f:
        continue
    print(f"Loading {f}...")
    df = pd.read_csv(os.path.join(folderCheck, f), sep=',', encoding='latin1', low_memory=False)
    consult_extra_list.append(df)
consult_extra_df = pd.concat(consult_extra_list)

In [40]:
# TODO: merge decursus_extra_df and consult_extra_df

In [41]:
ccn_external = pd.read_csv(r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\ccn\C_Data\4 Final_data\FINAL_CSV\RAW_2\external_15jan2019.csv', sep=';', encoding='latin1', low_memory=False)

ccn_external = ccn_external.dropna(subset=['EXT_REPORT'])


In [42]:
def stitch_text(x):
    """Join the external procedure name and its report into one labelled text.

    Fields that are not ``str`` (e.g. missing values) are skipped.
    """
    procedure, report = x['EXT_PROCEDURE'], x['EXT_REPORT']
    head = "Procedure: " + procedure if isinstance(procedure, str) else ""
    tail = " Uitslag: " + report if isinstance(report, str) else ""
    return head + tail

In [43]:
ccn_external = ccn_external.assign(ProcedureText=ccn_external[['EXT_PROCEDURE', 'EXT_REPORT']].apply(stitch_text, axis=1))

In [44]:
# Attach to each decursus entry the external report of the SAME patient that
# lies closest in time.
#
# The previous call used the patient id as the ordered `on` key and the dates
# as exact-match `by` keys, i.e. it paired rows with an identical date and the
# *nearest study id* -- which can attach another patient's report. merge_asof
# needs the ordered (date) key in `left_on`/`right_on` and the grouping key
# (patient) in `by`.
#
# NOTE(review): the dates come from CSV and are parsed with pd.to_datetime
# defaults here; pass an explicit `format=`/`dayfirst=` once the date format
# is confirmed.
_ccn_left = ccn_decursus_df.assign(
    _appointment_dt=pd.to_datetime(ccn_decursus_df['APPOINTMENT_DATE'], errors='coerce')
)
_ccn_right = ccn_external[['studyId_CCN_Sophie', 'ProcedureText', 'EXT_DATE']]
_ccn_right = _ccn_right.assign(
    _ext_dt=pd.to_datetime(_ccn_right['EXT_DATE'], errors='coerce')
).dropna(subset=['_ext_dt'])

# merge_asof rejects null keys and requires both sides sorted on the key, so
# rows without a parsable appointment date are set aside and re-appended
# (without a matched report) afterwards.
_ccn_has_date = _ccn_left['_appointment_dt'].notna()
_ccn_matched = pd.merge_asof(
    _ccn_left[_ccn_has_date].sort_values('_appointment_dt'),
    _ccn_right.sort_values('_ext_dt'),
    left_on='_appointment_dt',
    right_on='_ext_dt',
    by='studyId_CCN_Sophie',
    allow_exact_matches=True,
    direction='nearest',
)
ccn_decursus_df = (
    pd.concat([_ccn_matched, _ccn_left[~_ccn_has_date]], ignore_index=True)
    .drop(columns=['_appointment_dt', '_ext_dt'])
)



In [45]:
def stitch_text(x):
    """Join decursus, matched external report and treatment plan into one text.

    Each available (``str``) field becomes its own line. ``ProcedureText`` is
    inserted as-is because it already carries its own "Procedure: ... Uitslag: ..."
    labels; prefixing it again produced "Procedure: Procedure: ..." glued
    directly onto the decursus text.

    Parameters
    ----------
    x : mapping
        Row with the keys ``DECURSUS``, ``ProcedureText`` and ``TREATMENT_PLAN``.

    Returns
    -------
    str
        The newline-joined text; empty when no field holds a string.
    """
    parts = []
    if isinstance(x['DECURSUS'], str):
        parts.append("Decursus: " + x['DECURSUS'])
    if isinstance(x['ProcedureText'], str):
        procedure_text = x['ProcedureText'].strip()
        if procedure_text:
            parts.append(procedure_text)
    if isinstance(x['TREATMENT_PLAN'], str):
        parts.append("Behandelplan: " + x['TREATMENT_PLAN'])
    return "\n".join(parts)

In [46]:
ccn_decursus_df = ccn_decursus_df.assign(text=ccn_decursus_df[['DECURSUS', 'TREATMENT_PLAN', 'ProcedureText']].apply(stitch_text, axis=1))
ccn_decursus_df = ccn_decursus_df.dropna(subset=['DECURSUS', 'TREATMENT_PLAN'], how='all')

In [47]:
ccn_decursus_df = ccn_decursus_df.assign(text_words=ccn_decursus_df['text'].apply(splitter))
ccn_decursus_df  = ccn_decursus_df.assign(text_len = ccn_decursus_df['text_words'].apply(len))
ccn_decursus_df.text_len.sum()

In [None]:
ccn_decursus_df[['APPOINTMENT_DATE', 'text']].to_parquet(output_folder+"/CCN_decursus.parquet")

### Consult

1.5Million words

In [4]:
# TODO: add to ccn_decursus_df with merge_asof
ccn_consult_df = pd.read_csv(r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\ccn\C_Data\4 Final_data\FINAL_CSV\PROCESSED\20210601_consult.csv', sep=";", encoding='latin1', low_memory=False)

In [8]:
# TODO: process the contents of the consult columns into text.

In [34]:
# MH_EXTRA
# One text per patient: the non-null MH_EXTRA notes, ordered by appointment
# date and joined with newlines.
_consult_notes = ccn_consult_df.sort_values(by='APPOINTMENT_DATE')[['PATIENT_NUMBER', 'MH_EXTRA']].dropna()
ConsultTexts = _consult_notes.groupby('PATIENT_NUMBER')['MH_EXTRA'].apply("\n".join).to_frame()

In [36]:
ConsultTexts.to_parquet(output_folder+"/CCN_consults.parquet")

In [None]:
ConsultTexts = ConsultTexts.assign(text_words=ConsultTexts['MH_EXTRA'].apply(splitter))
ConsultTexts  = ConsultTexts.assign(text_len = ConsultTexts['text_words'].apply(len))
ConsultTexts.text_len.sum()

## [x] HMC 

In [None]:
HMC_RAD_df = pd.read_feather(r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\hmc\radiology_txt.feather')
HMC_RAD_CARDIO_df = pd.read_feather(r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\hmc\radiology_cardio_txt.feather')
HMC_Decursus_df = pd.read_feather(r'T:\laupodteam\AIOS\Bram\data\Argus\external_validation\hmc\decursus_txt.feather')

### Radiology

44Million tokens

In [None]:
HMC_RAD_df = HMC_RAD_df.assign(text_words=HMC_RAD_df['ReportTxt'].apply(splitter))
HMC_RAD_df  = HMC_RAD_df.assign(text_len = HMC_RAD_df['text_words'].apply(len))

In [None]:
HMC_RAD_df.text_len.sum()

In [None]:
HMC_RAD_df[['Onderz_dt', 'ReportTxt']].to_parquet(output_folder+"/HMC_radio.parquet")

### Radiology cardio

14Million tokens

In [None]:
HMC_RAD_CARDIO_df = HMC_RAD_CARDIO_df.assign(text_words=HMC_RAD_CARDIO_df['ReportTxt'].apply(splitter))
HMC_RAD_CARDIO_df  = HMC_RAD_CARDIO_df.assign(text_len = HMC_RAD_CARDIO_df['text_words'].apply(len))

In [None]:
HMC_RAD_CARDIO_df.text_len.sum()

In [None]:
HMC_RAD_CARDIO_df[['Onderz_dt', 'ReportTxt']].to_parquet(output_folder+"/HMC_radio_cardio.parquet")

### Decursus

110Million tokens

In [None]:
HMC_Decursus_df = HMC_Decursus_df.assign(text_words=HMC_Decursus_df['ReportTxt'].apply(splitter))
HMC_Decursus_df  = HMC_Decursus_df.assign(text_len = HMC_Decursus_df['text_words'].apply(len))

In [None]:
HMC_Decursus_df.text_len.sum()

In [None]:
HMC_Decursus_df[['create_dt', 'ReportTxt']].to_parquet(output_folder+"/HMC_decursus.parquet")

## ARGUS NOT FINISHED

* Radio
* Echo
* Decursus
* Inkomende brieven

# [x] SMART

9.3Million tokens

In [None]:
# Load the SMART corpus and compute token statistics.
smart_df = pd.read_csv(r"T:\laupodteam\AIOS\Bram\data\Argus\text_data\SMART\output_dataset_cardio.csv", sep=";", encoding='latin1')

# Replace the literal ".." marker with a newline. regex=False is explicit
# because pandas < 2.0 treats the pattern as a regular expression by default,
# where ".." matches ANY two characters and would wipe out the text.
smart_df = smart_df.assign(text=smart_df.text.str.replace("..", "\n", regex=False))

smart_df = smart_df.assign(text_words=smart_df['text'].apply(splitter))
smart_df  = smart_df.assign(text_len = smart_df['text_words'].apply(len))

In [None]:
smart_df.text_len.sum()

In [None]:
smart_df.to_parquet(output_folder+"/SMART.parquet")

# [x] Kidney

19.000.000

In [14]:
consults  = pd.read_sas(r'T:\lab_research\RES-Folder-UPOD\ODIN-UC4\E_ResearchData\2_ResearchData\20240826\consult_20240826.sas7bdat', encoding='latin1')

In [15]:
consults = consults.assign(date=consults['date'].dt.date)

In [16]:
# 'Geriatrie' title, section_title_display_original, section_text, date
def stitch_text(x):
    """Render one consult section as text: a fixed header followed by the string-valued fields.

    Fields that are not ``str`` (e.g. missing values) are skipped.
    """
    pieces = ["Geriatrie. \n"]
    labelled_fields = (
        ("Betreft: ", 'title'),
        ("\nBetreft: ", 'section_title_display_original'),
        ("\nInhoud: ", 'section_text'),
    )
    for prefix, field in labelled_fields:
        value = x[field]
        if isinstance(value, str):
            pieces.append(prefix + value)
    return "".join(pieces)

consults['text'] = consults[['title', 'section_title_display_original', 'section_text']].apply(stitch_text, axis=1)
consults = consults.assign(txtlen=consults.text.str.len())

In [17]:
consults['text_words'] = consults['text'].apply(splitter)
consults['text_len'] = consults['text_words'].apply(len)

In [None]:
consults.text_len.sum()

In [None]:
consults.txtlen.hist(bins=100)

In [33]:
consults.loc[consults.txtlen>200,['date', 'text']].to_parquet(output_folder+"/ODIN_CONSULTS.parquet")

# IBD NOT FINISHED

In [22]:
decursus_hmc = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\hmc_decursus.feather')
decursus_umcu = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\umcu_decursus.feather')

In [25]:
endoscopy_hmc = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\hmc_endoscopy.feather')
endoscopy_umcu = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\umcu_endoscopy.feather')

In [30]:
pathology_hmc = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\hmc_pathology.feather')
pathology_umcu = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\umcu_pathology.feather')

In [31]:
# Radiology reports. The previous version re-read the *pathology* files into
# the radiology variables (copy-paste of the cell above).
# NOTE(review): file names assumed to follow the hmc_/umcu_<type>.feather
# pattern of the sibling files -- confirm they exist in 4_Prepped.
radiology_hmc = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\hmc_radiology.feather')
radiology_umcu = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\umcu_radiology.feather')

In [73]:
decursus_hmc = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\hmc_decursus.feather')
decursus_umcu = pd.read_feather(r'L:\lab_research\RES-Folder-UPOD\MDL_hackathon\E_ResearchData\4_Prepped\umcu_decursus.feather')

decursus_umcu['text_words'] = decursus_umcu['ANTWOORD_def'].apply(splitter)
decursus_umcu['text_len'] = decursus_umcu['text_words'].apply(len)
decursus_umcu = decursus_umcu.rename(columns={'DATUM':'date', 'ANTWOORD_def':'text'})
decursus_umcu = decursus_umcu.assign(text=decursus_umcu[['STELLING','text']].apply(lambda x: x['STELLING']+":"+x['text'], axis=1))
decursus_umcu = decursus_umcu.groupby(['StudyID', 'date'])['text'].apply(lambda x: '\n'.join(x)).reset_index()

In [75]:
# Drop telephone notes (and rows without a STELLING), then build one
# "<STELLING>:<answer>" text per patient and date.
_drop_row = decursus_hmc['STELLING'].str.contains('telefoon', case=False, na=True)
decursus_hmc = decursus_hmc[~_drop_row]
decursus_hmc = decursus_hmc.rename(columns={'DATUM': 'date', 'ANTWOORD_def': 'text'})
_hmc_labelled = decursus_hmc[['STELLING', 'text']].apply(lambda row: row['STELLING'] + ":" + row['text'], axis=1)
decursus_hmc = decursus_hmc.assign(text=_hmc_labelled)
decursus_hmc = decursus_hmc.groupby(['studyID', 'date'])['text'].apply('\n'.join).reset_index()

In [None]:
decursus_umcu

# RA NOT FINISHED

**40.000.000**

In [51]:
base_dir_d2tra = '//ds/Data/IGD/Onderzoek/Reumatologie/20-724_D2TRA_Hackaton/E_ResearchData/2_ResearchData'

ra_brieven = pd.read_sas(base_dir_d2tra+'/ra_brieven_uncompressed.sas7bdat', encoding='latin1')
ra_brieven = ra_brieven[ra_brieven['OMSCH'].notna()]
ra_brieven = ra_brieven[ra_brieven['plattetext_new'].notna()]

ra_brieven = ra_brieven.assign(text=ra_brieven[['DOCOMSCHR', 'OMSCH', 'plattetext_new']].apply(lambda x: x['DOCOMSCHR']+":"+x['plattetext_new'], axis=1))
ra_brieven.reset_index(drop=True)[['text']].to_parquet(output_folder+"/RA_brieven_fromSAS.parquet")
ra_brieven = ra_brieven.assign(text_words=ra_brieven['text'].apply(splitter))
ra_brieven  = ra_brieven.assign(text_len = ra_brieven['text_words'].apply(len))
print(ra_brieven.text_len.sum())

10943883


In [57]:
ra_vragen = pd.read_sas(base_dir_d2tra+'/vragenlijsten_new_uncompressed.sas7bdat', encoding='latin1')
ra_vragen = ra_vragen.assign(STELLING=ra_vragen['STELLING'].apply(lambda x: 'Reden van komst' if 'reden' in x.lower() else x))
ra_vragen = ra_vragen.assign(ANTWOORD=ra_vragen.ANTWOORD.apply(striprtf))
ra_vragen = ra_vragen[['studyId_RA_hackathon', 'create_dt', 'STELLING', 'ANTWOORD']]
ra_vragen = ra_vragen.assign(text_words=ra_vragen['ANTWOORD'].apply(splitter))
ra_vragen  = ra_vragen.assign(text_len = ra_vragen['text_words'].apply(len))

In [90]:
ra_vragen = ra_vragen.assign(ANTWOORD=ra_vragen.ANTWOORD.apply(striprtf))

In [92]:
# Frequency of question labels, ranks 40-59. Only STELLING is needed; the
# previous column selection also asked for 'CATEGORIE' and 'omschr', which
# ra_vragen no longer has after it was reduced to four columns, so the cell
# raised KeyError when the notebook was run top to bottom.
ra_vragen['STELLING'].value_counts().iloc[40:60]

STELLING
Toelichting slokdarm                          2711
VAS arts (mm)                                 2630
Opdrachten aan verpleegkundigen - Algemeen    2538
Protheses / arthrodeses                       2414
Algemeen                                      2264
Toelichting                                   2195
VAS arts                                      2105
DAS 28 Gezwollen gewrichten                   2034
DAS 28 Pijnlijke gewrichten                   2034
Diagnose                                      1986
Zijn er beperkingen t.a.v. mobiliseren?       1960
Bloeddruk                                     1932
VRC 30 min afspraak over                      1843
Lichamelijk onderzoek - interne breed (+)     1658
Opmerking                                     1654
Pols                                          1652
Is het gewrichtsonderzoek uitgevoerd?         1590
Medisch relevante gegevens                    1478
Rookt de patiënt                              1381
ICC moet plaats vinden

In [None]:
#  Voorgeschiedenis, Algemeen, Toelichting, Toelichting slokdarm, Reden van komst, Overdracht, Lichamelijk onderzoek, Aanvullend onderzoek, Anamnese, Tractus anamnese, Samenvatting, Conclusie, Beleid
# Only STELLING is needed for the counts; selecting 'CATEGORIE' and 'omschr'
# raised KeyError once ra_vragen had been reduced to four columns.
ra_vragen['STELLING'].value_counts().iloc[40:60]

In [52]:
# STRATAFIT RA corpus: one document per line of the cleaned text file.
# Only the read happens inside the `with` block, so the file is closed before
# the frame is built, exported and measured.
with open(r'T:\laupodteam\AIOS\Bram\language_modeling\MEDICAL_TEXT\CLEANED\RA.txt', 'r') as f:
    ra_stratafit = [{'text': line.strip()} for line in f]

ra_stratafit_df = pd.DataFrame(ra_stratafit)
ra_stratafit_df.to_parquet(os.path.join(output_folder, 'RA.parquet'))

_ra_tokens = ra_stratafit_df['text'].apply(splitter)
ra_stratafit_df = ra_stratafit_df.assign(text_words=_ra_tokens, text_len=_ra_tokens.apply(len))
print(ra_stratafit_df.text_len.sum())

11682461


In [53]:
with open(r'T:\laupodteam\AIOS\Bram\language_modeling\MEDICAL_TEXT\CLEANED\RA_radio.txt', 'r') as f:
    ra_stratafit_radio = f.readlines()
    ra_stratafit_radio = [{'text': x.strip()} for x in ra_stratafit_radio]
    ra_stratafit_radio_df = pd.DataFrame(ra_stratafit_radio)
    ra_stratafit_radio_df.to_parquet(os.path.join(output_folder, 'RA_radio.parquet'))
    ra_stratafit_radio_df = ra_stratafit_radio_df.assign(text_words=ra_stratafit_radio_df['text'].apply(splitter))
    ra_stratafit_radio_df  = ra_stratafit_radio_df.assign(text_len = ra_stratafit_radio_df['text_words'].apply(len))
    print(ra_stratafit_radio_df.text_len.sum())

10547930


In [54]:
with open(r'T:\laupodteam\AIOS\Bram\language_modeling\MEDICAL_TEXT\CLEANED\RA_SMK.txt', 'r') as f:
    ra_smk = f.readlines()
    ra_smk = [{'text': x.strip()} for x in ra_smk]
    ra_smk_df = pd.DataFrame(ra_smk)
    ra_smk_df.to_parquet(os.path.join(output_folder, 'RA_OTHER.parquet'))
    ra_smk_df = ra_smk_df.assign(text_words=ra_smk_df['text'].apply(splitter))
    ra_smk_df  = ra_smk_df.assign(text_len = ra_smk_df['text_words'].apply(len))
    print(ra_smk_df.text_len.sum())

19649210


# MDS NOT FINISHED

In [None]:
base_dir = 'L://lab_research/RES-Folder-UPOD/DiagHematoMal_22-751/E_ResearchData/2_ResearchData'

# GENERIC DUTCH MEDICAL CORPORA

## NtvG

**33M words**

In [61]:
ntvg = pd.read_feather(r'T:\laupodteam\AIOS\Bram\language_modeling\MEDICAL_TEXT\RAW\NtvG\ntvg_articles.feather')

In [62]:
def concat_text(x):
    """Flatten the array of body fragments of one article into a single string.

    Parameters
    ----------
    x : numpy.ndarray or any other value
        Array of text fragments. Any non-array value is returned unchanged.

    Returns
    -------
    str or the unchanged input
        * 0 fragments: ``""``
        * 1 fragment: that fragment
        * 2 fragments: ``x[1] + ":" + x[0]``
        * 3 fragments: ``x[1] + ":" + x[0] + "\\n" + x[2]``
        * more: all fragments joined with newlines

    Arrays with fewer than two fragments used to be returned as arrays, which
    a later ``astype('str')`` turned into their repr (``"['...']"``) instead
    of the text itself.
    """
    if not isinstance(x, np.ndarray):
        return x
    n_fragments = len(x)
    if n_fragments == 0:
        return ""
    if n_fragments == 1:
        return x[0]
    if n_fragments == 2:
        return x[1] + ":" + x[0]
    if n_fragments == 3:
        return x[1] + ":" + x[0] + "\n" + x[2]
    return "\n".join(x)
ntvg = ntvg.assign(text=ntvg.body_clean.apply(concat_text))
ntvg = ntvg.assign(category = ntvg.category.replace('Diagnose in beeld', 'Diagnose'))


In [63]:
ntvg = ntvg.assign(category = ntvg.category.astype('str'))
ntvg = ntvg.assign(domain = ntvg.domain.astype('str'))
ntvg = ntvg.assign(text = ntvg.text.astype('str'))

In [64]:
def concat_summaries(x):
    """Join the summary fragments in ``x`` with newlines; return "" for anything that is not an ndarray (including None)."""
    return "\n".join(x) if isinstance(x, np.ndarray) else ""
ntvg = ntvg.assign(summary_text = "")
ntvg = ntvg.assign(summary_text = ntvg.summary_clean.apply(concat_summaries))

In [69]:
ntvg = ntvg.assign(total_text = ntvg[['text', 'summary_text']].apply(lambda x: "\n".join(x), axis=1))

In [70]:
ntvg[['category', 'domain', 'total_text']].to_parquet(output_folder+"/ntvg.parquet")
ntvg = ntvg.assign(num_contents = ntvg.body_clean.apply(lambda x: len(x)))
ntvg = ntvg.assign(txtlen=ntvg.total_text.str.split().apply(lambda x: len(x)))

In [None]:
ntvg.txtlen.sum()

In [None]:
# concat with summary_clean, if available

## NHG NOT FINISHED

## HenW NOT FINISHED

## Medical Wiki NOT FINISHED