## Text cleaning and extraction of 'clean' Dutch text

### Libraries for regular expressions and file operations

In [21]:
import re
import os
import shutil

In [22]:
# Folder that holds the raw input text files. File names are appended to it with
# plain string concatenation further down, so it has to end with a slash.

path = '/Users/anix/KPMG_NLP_project/texts/'

### Removal of other languages to extract Dutch text

In [23]:
# Collect the file names of the Dutch legal texts: every file in the data folder
# is read, and files whose content mentions a German translation are left out

texts = []
for name in os.listdir(path):
    with open(path + name, 'r') as handle:
        content = handle.read()
    # Keep only the files that do not carry the German-translation marker
    if 'Duitse vertaling' not in content:
        texts.append(name)

In [18]:
# Creating a list of Dutch legal texts (from the metadata) while ignoring translations
#
# This cell rebuilds `texts` from the full directory listing, so checking only the
# English marker here would bring back the German translations that the previous
# cell filtered out. Both markers are therefore checked in one pass.

translation_markers = ('Duitse vertaling', 'in de Engelse taal')

texts = []
for name in os.listdir(path):
    with open(path+name, 'r') as file:
        text = file.read()
    # Skip the file as soon as any translation marker occurs in its content
    if any(marker in text for marker in translation_markers):
        continue
    texts.append(name)

### Initial cleaning

In [24]:
# Spacing rules used by initial_clean, applied strictly in this order.
# Each entry is (compiled pattern, replacement).
_SPACING_RULES = (
    # Put spaces after every comma, colon and semicolon
    (re.compile(r'(,|:|;)'), r'\1  '),
    # "<whitespace>word.Next" -> put spaces between the period and the following letters
    (re.compile(r'([\s][\w-]+\.)([A-Za-z]+)'), r'\1  \2'),
    # Put a space after a closing quote followed by a period ...
    (re.compile(r'([\"][\.])'), r'\1 '),
    # ... and after a period followed by a quote
    (re.compile(r'([\.][\"])'), r'\1 '),
    # Separate a letter from a digit that directly follows it
    (re.compile(r'([A-Za-z])([0-9])'), r'\1 \2'),
    # Separate a digit from a letter that directly follows it
    (re.compile(r'([0-9])([A-Za-z])'), r'\1 \2'),
    # Surround runs of two or more underscores with spaces
    (re.compile(r'(__+)'), r' \1  '),
    # Collapse the runs of spaces produced by the rules above into a single space
    (re.compile(r' +'), ' '),
    # Split fused words in front of every "Xy" (capital + lowercase) that follows a letter.
    # A zero-width match is used on purpose: a consuming, greedy '[A-Za-z]+' prefix would
    # swallow the whole run of letters and split only the LAST boundary
    # ("eersteTweedeDerde" -> "eersteTweede Derde").
    (re.compile(r'(?<=[A-Za-z])(?=[A-Z][a-z])'), ' '),
)


def initial_clean(text:str) ->  str:
    '''
    Function that cleans articles and tries to add spaces where they are missing

    The rules in _SPACING_RULES are applied one after the other: spaces are added
    after ',', ':' and ';', after a period that is directly followed by letters,
    around quote/period pairs, between letters and digits, around runs of
    underscores, and in front of a capitalised word that is glued to the
    preceding letters. Runs of spaces are collapsed to a single space before the
    last rule is applied.

    Only ASCII letters (A-Z, a-z) are recognised by the letter-based rules.

    :params text: str that is the text

    return a str with the missing spaces inserted
    '''
    new_text = text
    for pat, replacement in _SPACING_RULES:
        new_text = pat.sub(replacement, new_text)

    return new_text

### Saving cleaned version of the files

In [25]:
# Destination folder for the cleaned files

new_path = '/Users/anix/KPMG_NLP_project/clean_texts/'

# Start from an empty destination folder: anything already stored there is removed
# (errors during removal are ignored) and the folder is created again

shutil.rmtree(new_path, ignore_errors=True)
os.makedirs(new_path, exist_ok=True)

# Write a cleaned copy of every selected text under the same file name
for name in texts:
    with open(path + name, 'r') as source:
        raw = source.read()
    with open(new_path + name, 'w+') as target:
        target.write(initial_clean(raw))