# Document preprocessing

In [None]:

import os
import re
import shutil


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by this notebook (lemmatizer, stop-word
# list and tokenizer imported above).
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

## Set path to data

In [None]:

# Raw documents are read from input_dir; cleaned documents are written
# under output_dir.
input_dir = './data/raw/'
output_dir = './data/cleaned/'

# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the directory already exists, so no prior isdir() check
# is needed.
os.makedirs(output_dir, exist_ok=True)


## Compile Regular expressions

Apart from removing special characters, some regular expressions specific to PubMed documents have been defined:
- removal of roman numerals, frequently used for book chapter titles
- detection of missing content (<sub><sup>_[Not Available]_</sup></sub>)
- resolution of scientific abbreviations


In [None]:
# NOTE: all patterns are raw strings so that regex escapes such as \w, \s and
# \[ reach the regex engine untouched instead of being treated as (invalid)
# Python string escapes.

# special character filter: any run of characters that are not word characters
strip_special_chars = re.compile(r"[^\w_]+")
# roman number filter: a run of roman-numeral letters at the start of the
# string, followed by whitespace or a period
strip_roman_numbers = re.compile(r"^[MDCLXVI]+[\s\.\n]")
# missing data detector: the literal text "[Not Available]."
missing_data_detector = re.compile(r"\[Not Available\]\.")

# atopic dermatitis abbreviation: upper-case "AD" as a standalone word
atopic_dermatitis_abbr = re.compile(r"\bAD\b")



## Create Lemmatizer and stop words list

In [None]:
# Single WordNet lemmatizer instance, reused for every document by the
# cleaning loop further down.
lemmatizer = WordNetLemmatizer()

def addStopwords(f):
    """Read additional stop words from a text file, one word per line.

    Args:
        f: Path of the stop-word file.

    Returns:
        A list with the stripped content of every non-blank line, in file
        order.
    """
    # Explicit encoding keeps the result independent of the platform default.
    with open(f, "r", encoding="utf-8") as fin:
        stripped_lines = (line.strip() for line in fin)
        # Blank lines are skipped so that "" never ends up in the stop list.
        return [word for word in stripped_lines if word]




## Clean text documents

- remove roman number headings
- remove numerical values
- remove special characters
- remove words of one or two letters (these include many units, which are usually not defined in stop-word lists)
- remove stop words (NLTK English stop-word list)
- lower case
- tokenize
- lemmatize

processing specific to the study :
- resolve atopic dermatitis abbreviation
- resolve multiple-wording for atopic dermatitis and atopic eczema
- lemmatize atopic dermatitis

In [None]:

def _clean_document(text, stop_words):
    """Turn the raw text of one document into a list of lemmas.

    Steps, in order: flatten newlines, strip a leading roman numeral, remove
    standalone numbers and special characters, remove the "AD" abbreviation,
    lower-case, remove the query terms (eczema / atopic / dermatitis), drop
    words of one or two characters, tokenize, drop stop words, lemmatize.

    Args:
        text: Raw document content.
        stop_words: Container of lower-case stop words (membership-tested).

    Returns:
        The list of lemmas; empty when nothing survives the cleaning.
    """
    content = text.replace('\n', ' ')

    # remove roman number headings
    # NOTE(review): strip_roman_numbers is anchored with ^ and compiled
    # without re.MULTILINE, and newlines are already flattened here, so only
    # a numeral at the very start of the document is removed. Confirm whether
    # per-line removal was intended before changing this.
    content = strip_roman_numbers.sub(" ", content)

    # remove standalone numbers
    content = re.sub(r"\b\d+\b", " ", content)

    # remove non alphabetical characters
    content = strip_special_chars.sub(" ", content)

    # QUERY SPECIFIC PREPROCESSING
    # - resolve atopic dermatitis abbreviation. The pattern is upper-case
    #   only, so it has to run BEFORE lower-casing to be able to match.
    content = atopic_dermatitis_abbr.sub("", content)

    # switch to lower case
    content = content.lower()

    # - resolve multiple-wording for atopic dermatitis / atopic eczema
    content = re.sub(r"eczemas?(\b|\s)", "", content, flags=re.IGNORECASE)
    content = re.sub("atopic", "", content, flags=re.IGNORECASE)
    content = re.sub("dermatiti(s|des|ses)", "", content, flags=re.IGNORECASE)

    # remove words of one or two characters
    content = re.sub(r"\b\w{1,2}\b", '', content)

    # tokenize, remove stop words, lemmatize
    tokens = word_tokenize(content)
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]


# Built once for the whole run; a set gives constant-time membership tests.
# To extend it with custom terms: stop_words.update(addStopwords("stopwords.txt"))
stop_words = set(stopwords.words("english"))

vocs = set()
for label in os.listdir(input_dir):
    path_in = os.path.join(input_dir, label)
    # Only sub-directories are label folders; stray files such as .DS_Store
    # are skipped instead of getting an output directory and crashing listdir.
    if not os.path.isdir(path_in):
        continue

    # Start from an empty output directory for this label.
    path_out = os.path.join(output_dir, label)
    if os.path.isdir(path_out):
        shutil.rmtree(path_out)
    os.makedirs(path_out)

    for fileName in os.listdir(path_in):
        if not fileName.endswith(".txt"):
            continue

        with open(os.path.join(path_in, fileName), "r", encoding="utf-8") as fileContent:
            raw_text = fileContent.read()

        lemmas = _clean_document(raw_text, stop_words)
        vocs.update(lemmas)

        # Documents with no remaining content produce no output file.
        if lemmas:
            with open(os.path.join(path_out, fileName), "w", encoding="utf-8") as output:
                output.write(" ".join(lemmas))

# Sorted so that vocab.txt is identical from one run to the next.
with open("vocab.txt", "w", encoding="utf-8") as fout:
    fout.writelines(w + "\n" for w in sorted(vocs))

