In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter
import re

In [2]:
import spacy
# Dutch spaCy pipeline, loaded once at module level; clean_text() calls it
# for tokenisation, stop-word flags and lemmas.
nlp = spacy.load("nl_core_news_sm")  # Make sure to install it with: python -m spacy download nl_core_news_sm


In [None]:
def remove_extreme_frequencies(texts, n_percent):
    """
    Remove the top n% and bottom n% of the vocabulary, ranked by frequency.

    Variables:
        texts: List of strings; each string is split on whitespace into words
        n_percent: Percentage of the distinct words to remove from each end
            of the frequency ranking. Values <= 0 remove nothing.

    Returns:
        List of strings (one per input text) holding the remaining words
        joined by single spaces
    """
    # Split every text once and reuse the result for counting and filtering
    tokenized = [text.split() for text in texts]
    word_freq = Counter(word for tokens in tokenized for word in tokens)

    # Number of distinct words to drop from each end of the ranking.
    # Clamped at zero: a negative count would turn the slices below into
    # "everything except ..." selections and silently remove the wrong words.
    n_words = max(0, int(n_percent / 100 * len(word_freq)))
    print('n_words:', n_words)

    # Rank the vocabulary once (most frequent first) and take both ends
    ranked = [word for word, _ in word_freq.most_common()]
    extremes = set(ranked[:n_words])
    if n_words:
        # Guarded because ranked[-0:] would select the whole vocabulary
        extremes.update(ranked[-n_words:])

    # Filter out extreme frequency words
    return [' '.join(word for word in tokens if word not in extremes)
            for tokens in tokenized]

def preprocess_news_data(data_path, extreme_freq_threshold=1):
    """
    Load a reports CSV and add cleaned, lemmatised text columns.

    Variables:
        data_path: Path of the reports CSV. It must contain the columns
            'verslagen_report_content' and 'verslagen_report_start_date'.
        extreme_freq_threshold: Percentage of most/least frequent words to
            remove from the cleaned text. Values <= 0 skip this step.

    Returns:
        DataFrame with the rows that have both a content and a start-date
        value, extended with:
            date    - 'verslagen_report_start_date' parsed with
                      pd.to_datetime (unparseable values become NaT)
            alltext - lowercased, punctuation-stripped, lemmatised text
            tokens  - 'alltext' split on whitespace
    """
    # NOTE(review): these corpora look unused now that lemmatisation and
    # stop-word removal go through spaCy; kept to preserve existing behaviour.
    nltk.download(['stopwords', 'wordnet'], quiet=True)

    reports_df = pd.read_csv(data_path)

    # Drop rows lacking text or a start date before any string operations
    reports_df = reports_df.dropna(
        subset=['verslagen_report_content', 'verslagen_report_start_date']
    ).copy()

    # Convert dates
    reports_df['date'] = pd.to_datetime(
        reports_df['verslagen_report_start_date'], errors='coerce'
    )

    # Built once instead of per report: deletes every ASCII punctuation char
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def clean_text(text):
        """Lowercase, strip punctuation and lemmatise one report."""
        doc = nlp(text.lower().translate(strip_punctuation))
        # Keep lemmas of alphabetic, non-stop-word tokens longer than 1 char
        return ' '.join(
            token.lemma_
            for token in doc
            if token.is_alpha and not token.is_stop and len(token) > 1
        )

    # Apply initial text preprocessing
    processed_df = reports_df.assign(
        alltext=lambda x: x['verslagen_report_content'].apply(clean_text)
    )

    # Remove extreme frequency words
    if extreme_freq_threshold > 0:
        # Filter the cleaned text. Feeding the raw report content here would
        # overwrite 'alltext' with uncleaned, unlemmatised words.
        processed_df['alltext'] = remove_extreme_frequencies(
            processed_df['alltext'].tolist(), extreme_freq_threshold
        )

    # Create final features
    return processed_df.assign(tokens=lambda x: x['alltext'].str.split())


if __name__ == "__main__":
    # Build the cleaned report table; a threshold of 0 skips the
    # frequency-based word removal.
    cleaned_df = preprocess_news_data(
        'a:/bloeding-met-patientenlijst-2/bloeding-met-patientenlijst-4-verslagen.csv',
        extreme_freq_threshold=0
    )
    # Persist the result next to the source data (no index column)
    cleaned_df.to_csv('a:/df_cleaned.csv', index=False)

# `processed_df` only exists inside preprocess_news_data(); referencing it at
# cell level raised NameError, so the stray expression was removed.


In [4]:
# Display the cleaned report table returned by preprocess_news_data().
# NOTE(review): the rendered output includes report text - confirm it is
# de-identified before sharing the notebook.
cleaned_df

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[dhr, aj, dingemans, huisarts, streetnaam, cit..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"[samenvatting, rectaal, bloedverlie, obvn, div..."
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"[coloscopie, betreffen, mw, initials, lastname..."
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, mw, initials, lastna..."
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"[samenvatting, rectaal, bloedverlie, eenmalig,..."
...,...,...,...,...,...,...,...
9572,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,Consult,Samenvatting: \n1e consult\r\n-Type 1e consult...,2015-03-20 08:13:00,2015-03-20 08:13:00,samenvatting consult type consult uitbreiden a...,"[samenvatting, consult, type, consult, uitbrei..."
9573,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Type decursus: De...,2015-01-14 15:39:00,2015-01-14 15:39:00,samenvatting decursus type decursus decursus s...,"[samenvatting, decursus, type, decursus, decur..."
9574,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nVerpleegkundige verslaglegging...,2014-12-21 09:31:00,2014-12-21 09:31:00,samenvatting verpleegkundig verslaglegging ver...,"[samenvatting, verpleegkundig, verslaglegging,..."
9575,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,2010-11-10 21:03:00,2010-11-10 21:03:00,samenvatting medisch dossier vk sputovamo leef...,"[samenvatting, medisch, dossier, vk, sputovamo..."


In [5]:
# Spot-check the cleaned, lemmatised text of the first report
cleaned_df['alltext'].iloc[0]

'dhr aj dingemans huisarts streetnaam city datum Kenmerk patientid bsn bsn betreffen mevrouw initials lastname geb birthdate streetnaam zip city tel phonenumber geacht collega bovengenoemde patiënte opnemen afdeling maag darm leverziekt verband melaena rectaal bloedverlie voorgeschiedenis diep veneuaz trombose longembolie cholecystectomie diverticulitis atriumfibrilleren spontaan conversie sinusritme melena waarvoor verklaring vinden verband stabiel hb overleg patiënt expectatief beleid vermoeidheid sinusbradycardie waarvoor stop metoprolol tambocor anamnees vanmiddag fors Helderrood bloedverlie stolsel vermengen ontlasting zwart kleur dag zeuren pijn bovenbuik maagpijn waarvoor stoppen koffie drinken vet eten dag ontlasting intaak bloed zwart verkleuring bemerken tractus anamnees bijdragen mn lwklachten all penicilline urticaria lichamelijk onderzoek controle hr bpm nibp mmhg temp alg acuut ziek duidelijk anemisch hh pearl lymfadenopathie Cor souffle pulm vag beiderzijds bijgeluiod ab

In [6]:
# Corpus overview for the lemmatised `alltext` column: size, vocabulary and
# the most frequent lemmas.
num_verslagen = cleaned_df.shape[0]

# Words per report, counted on whitespace-separated tokens
cleaned_df['word_count'] = cleaned_df['alltext'].map(lambda report: len(report.split()))
avg_words_per_article = cleaned_df['word_count'].mean()

# Every token of every report in one flat list, then the distinct vocabulary
all_words = [token for report in cleaned_df['alltext'] for token in report.split()]
unique_words = set(all_words)
num_unique_words = len(unique_words)
# Type/token ratio: distinct words divided by total words
lexical_variation = num_unique_words / len(all_words)

print(f"Number of articles: {num_verslagen}")
print(f"Total number of words: {len(all_words)}")
print(f"Average words per article: {avg_words_per_article}")
print(f"Number of unique words: {num_unique_words}")
print(f"Lexical richness: {lexical_variation}")

# Refresh the per-report token lists and count word frequencies from them
cleaned_df['tokens'] = cleaned_df['alltext'].map(str.split)
all_tokens = [token for report_tokens in cleaned_df['tokens'] for token in report_tokens]
word_counter = Counter(all_tokens)
most_common_100 = word_counter.most_common(100)

print("Most common 100 words:")
for word, freq in most_common_100:
    print(f"{word}: {freq}")

Number of articles: 9577
Total number of words: 1424063
Average words per article: 148.69614701889944
Number of unique words: 31602
Lexical richness: 0.022191433946391417
Most common 100 words:
mg: 15130
dag: 12216
per: 11328
beleid: 10702
samenvatting: 9790
onderzoek: 8156
oraal: 8144
stuk: 7478
anemie: 7474
dd: 7161
ivm: 7112
tablet: 6266
conclusie: 6213
patiënt: 5880
hb: 5851
opname: 5798
goed: 5775
controle: 5626
anamnees: 5519
poli: 5191
opdracht: 5183
waarvoor: 5158
rechts: 5078
zien: 4906
lab: 4746
bekend: 4734
Mmoll: 4574
voorgeschiedenis: 4573
medicatie: 4314
afspraak: 4268
hypertensie: 4211
week: 4207
bloedverlie: 4170
normaal: 4159
aanvullen: 3776
beloop: 3683
links: 3647
starten: 3416
dr: 3393
klacht: 3353
gaan: 3294
laat: 3254
pijn: 3243
wv: 3176
seh: 3161
intern: 3068
lichamelijk: 3065
voltooien: 2940
type: 2827
ul: 2798
bloeding: 2788
gastroscopie: 2702
volgen: 2698
huisarts: 2596
afwijking: 2582
ontlasting: 2516
datum: 2503
chronisch: 2453
nee: 2450
overig: 2414
mdl: 24

In [7]:
# Keep the unfiltered text under its own column name so it stays available
# after the word filter below has been applied.
cleaned_df['alltext_unfiltered'] = cleaned_df['alltext']

# Hand-picked words to delete from the text (matched as whole words,
# case-insensitively, by remove_words).
# NOTE(review): the text is lemmatised, yet several entries are inflected
# forms ('opdrachten', 'anamnese', 'bekende', 'voltooid', 'aanvullend',
# 'overige'); the corresponding lemmas ('opdracht', 'anamnees', ...) are still
# in the most-common list after filtering. Likewise 'streetname' vs. the
# 'streetnaam' seen in the text. Confirm whether the lemma forms should be
# listed instead.
unwanted_words = [
    'mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie',
    'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 'voorgeschiedenis',
    'opdrachten',
    'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien',
    'city', 'bsn', 'nodig', 'firstname', 'streetname', 'lastname',
    'postcode',
    'anamnese',
    'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts',
    'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 'patiënte',
    'overige',
    'linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden',
    'uur', 'verdenking', 'ontslag', 'stop', 'tijd',
    'patiënt', 'onderzoek',
]

# Function to remove unwanted words
def remove_words(text, words=None):
    """
    Delete whole-word, case-insensitive occurrences of unwanted words.

    Variables:
        text: String to filter
        words: Iterable of words to delete; defaults to the module-level
            `unwanted_words` list

    Returns:
        The filtered text with the remaining words separated by single
        spaces (all whitespace runs are collapsed)
    """
    if words is None:
        words = unwanted_words

    # re.escape keeps entries containing regex metacharacters literal;
    # empty entries are skipped because they would match at every boundary.
    alternatives = '|'.join(re.escape(word) for word in words if word)
    if alternatives:
        pattern = r'\b(?:' + alternatives + r')\b'
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Deleting a word leaves its surrounding spaces behind; collapse them so
    # the result has no doubled, leading or trailing spaces.
    return ' '.join(text.split())


# Run every unfiltered report through remove_words and keep the result in its
# own column, then preview the first rows.
cleaned_df['alltext_filtered'] = cleaned_df['alltext_unfiltered'].map(remove_words)
cleaned_df.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,word_count,alltext_unfiltered,alltext_filtered
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[dhr, aj, dingemans, huisarts, streetnaam, cit...",414,dhr aj dingemans huisarts streetnaam city datu...,dhr aj dingemans streetnaam Kenmerk patient...
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"[samenvatting, rectaal, bloedverlie, obvn, div...",98,samenvatting rectaal bloedverlie obvn divertik...,rectaal bloedverlie obvn divertikelbloeding ac...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"[coloscopie, betreffen, mw, initials, lastname...",139,coloscopie betreffen mw initials lastname adre...,coloscopie betreffen mw initials adresgegeven...
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, mw, initials, lastna...",80,gastroscopie betreffen mw initials lastname ad...,gastroscopie betreffen mw initials adresgegev...
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"[samenvatting, rectaal, bloedverlie, eenmalig,...",128,samenvatting rectaal bloedverlie eenmalig hd h...,rectaal bloedverlie eenmalig hd hbstabiel inr ...


In [8]:
# Same corpus overview as before, now on `alltext_filtered` (after the
# hand-picked unwanted words were removed). Note that this overwrites the
# `word_count` and `tokens` columns with values for the filtered text.
num_verslagen = cleaned_df.shape[0]

# Words per report, counted on whitespace-separated tokens
cleaned_df['word_count'] = cleaned_df['alltext_filtered'].map(lambda report: len(report.split()))
avg_words_per_article = cleaned_df['word_count'].mean()

# Every token of every report in one flat list, then the distinct vocabulary
all_words = [token for report in cleaned_df['alltext_filtered'] for token in report.split()]
unique_words = set(all_words)
num_unique_words = len(unique_words)
# Type/token ratio: distinct words divided by total words
lexical_variation = num_unique_words / len(all_words)

print(f"Number of articles: {num_verslagen}")
print(f"Total number of words: {len(all_words)}")
print(f"Average words per article: {avg_words_per_article}")
print(f"Number of unique words: {num_unique_words}")
print(f"Lexical richness: {lexical_variation}")

# Per-report token lists for the filtered text and their frequency counts
cleaned_df['tokens'] = cleaned_df['alltext_filtered'].map(str.split)
all_tokens = [token for report_tokens in cleaned_df['tokens'] for token in report_tokens]
word_counter = Counter(all_tokens)
most_common_100 = word_counter.most_common(100)

print("Most common 100 words:")
for word, freq in most_common_100:
    print(f"{word}: {freq}")

Number of articles: 9577
Total number of words: 1243856
Average words per article: 129.87950297587972
Number of unique words: 31543
Lexical richness: 0.025359044776887357
Most common 100 words:
oraal: 8144
anemie: 7474
tablet: 6266
hb: 5851
opname: 5798
controle: 5626
anamnees: 5519
poli: 5191
opdracht: 5183
zien: 4906
lab: 4746
bekend: 4734
medicatie: 4314
afspraak: 4268
hypertensie: 4211
bloedverlie: 4170
normaal: 4159
aanvullen: 3776
starten: 3416
klacht: 3353
gaan: 3294
laat: 3254
pijn: 3243
wv: 3176
seh: 3161
intern: 3068
lichamelijk: 3065
voltooien: 2940
type: 2827
ul: 2798
bloeding: 2788
gastroscopie: 2702
volgen: 2698
afwijking: 2582
ontlasting: 2516
chronisch: 2453
nee: 2450
overig: 2414
mdl: 2408
stabiel: 2364
knoppen: 2345
stoppen: 2315
laboratorium: 2314
morgen: 2307
coloscopie: 2294
maand: 2253
hemoglobine: 2250
diabete: 2180
bloed: 2169
decursus: 2154
rr: 2152
mild: 2111
iv: 2099
atriumfibrilleren: 2062
consult: 2013
mcv: 1979
int: 1966
via: 1964
recidief: 1902
rectaal: 1