In [5]:
# IPython line magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in the cells shown here -- confirm it is still needed.
%matplotlib inline

# Rx thorax report preprocessor

In [1]:
import pandas as pd
import re
# Load the raw report sentences (ISO-8859-1 encoded CSV) and preview the first rows.
df = pd.read_csv(
    'report_sentences.csv',
    encoding="ISO-8859-1",
)
print(df.head())

   Unnamed: 0  codigoinforme                             v
0           0        2442821             Sin alteraciones.
1           1        2442822             Sin alteraciones.
2           2        2442820            Sin  alteraciones.
3           3        2442823  Signos radiológicos de EPOC.
4           4        2442824             Sin alteraciones.


In [2]:
# Normalise the report text column 'v' to lowercase.
# Series.str.lower() leaves missing values (NaN) untouched.
report_text = df['v']
df['v'] = report_text.str.lower()

In [3]:
# Transliterate accented characters to plain ASCII (e.g. "radiológicos" -> "radiologicos")
# so that the regular expressions applied to column 'v' can be written without accents.
import unidecode

# Missing reports are replaced by '' first: str(NaN) would otherwise turn them into the
# literal text 'nan', which every later step would handle as if it were a real report.
# Series.map transforms the single column directly instead of building a row object
# per record with DataFrame.apply(axis=1).
df['v'] = df['v'].fillna('').astype(str).map(unidecode.unidecode)

The clinical evaluation field "v" contains not only the Rx thorax description and the clinical judgment; it may also include the reason for the Rx order as well as descriptions of other Rx types, such as sinus and abdominal radiographs, in an unstructured format. The following regex approaches are intended to extract the Rx thorax description and the clinical judgment.

In [4]:
# Extract the text that follows "Impresion diagnostica" into the new column 'v_clean'.
# The patterns are raw strings: "\s" and "\w" inside a normal string literal are invalid
# escape sequences (DeprecationWarning, SyntaxWarning from Python 3.12 on).


def _first_group(pattern, text):
    """Return capture group 1 of the first match of `pattern` in `text`, or None."""
    hit = pattern.search(text)  # one search per row instead of two
    return hit.group(1) if hit else None


def _fill_if_empty(row):
    """Keep a non-empty 'v_clean'; otherwise use the capture of the current pattern `p`."""
    current = row['v_clean']
    if current != '':
        return current
    captured = _first_group(p, str(row['v']))
    return current if captured is None else captured


# First attempt: everything after "Impresion diagnostica"; rows without a match get ''.
p = re.compile(r"Impresion\s+diagnostica(.*)", re.IGNORECASE)
df["v_clean"] = df['v'].map(lambda text: _first_group(p, str(text)) or '')

# Second attempt, only for rows still empty: "Impresion <one word>:" followed by the text.
p = re.compile(r"Impresion\s+\w+\s?:(.*)", re.IGNORECASE)
df["v_clean"] = df.apply(_fill_if_empty, axis=1)

# Number of reports with a non-empty extraction so far
print(len(df[df['v_clean'] != '']))

105809


In [5]:
# Otherwise use "torax:(*):" -- the text between the first ':' after "torax" and the
# next run of letters that is immediately followed by ':'.
p = re.compile("torax.*?:(.*?)[a-zA-Z]*:", re.IGNORECASE)


def _torax_section_if_empty(row):
    """Keep a non-empty 'v_clean'; otherwise return the captured section when `p` matches."""
    current = row['v_clean']
    if current != '':
        return current
    found = p.search(str(row['v']))
    return found.group(1) if found else current


df["v_clean"] = df.apply(_torax_section_if_empty, axis=1)
filled = df[df['v_clean'] != '']
print(len(filled))

109253


In [6]:
# Otherwise use "torax:(*)" -- everything after the first ':' that follows "torax".
p = re.compile("torax.*?:(.*)", re.IGNORECASE)


def _torax_rest_if_empty(row):
    """Keep a non-empty 'v_clean'; otherwise return the captured text when `p` matches."""
    current = row['v_clean']
    if current != '':
        return current
    found = p.search(str(row['v']))
    return found.group(1) if found else current


df["v_clean"] = df.apply(_torax_rest_if_empty, axis=1)
filled = df[df['v_clean'] != '']
print(len(filled))

184671


In [7]:
# Otherwise use "Motivo de consulta:*Comentario:(*)" -- everything after the first
# "comentario" that follows "motivo de consulta".
p = re.compile("motivo de consulta.*?comentario(.*)", re.IGNORECASE)


def _comentario_if_empty(row):
    """Keep a non-empty 'v_clean'; otherwise return the captured text when `p` matches."""
    current = row['v_clean']
    if current != '':
        return current
    found = p.search(str(row['v']))
    return found.group(1) if found else current


df["v_clean"] = df.apply(_comentario_if_empty, axis=1)
filled = df[df['v_clean'] != '']
print(len(filled))

184673


In [13]:
# Otherwise use ":(*)" -- everything after the first ':' in the report.
p = re.compile(":(.*)", re.IGNORECASE)


def _after_colon_if_empty(row):
    """Keep a non-empty 'v_clean'; otherwise return the captured text when `p` matches."""
    current = row['v_clean']
    if current != '':
        return current
    found = p.search(str(row['v']))
    return found.group(1) if found else current


df["v_clean"] = df.apply(_after_colon_if_empty, axis=1)
filled = df[df['v_clean'] != '']
print(len(filled))

189361


In [8]:
# Last resort "(*)": rows that are still empty receive the whole report text.
def _full_text_if_empty(row):
    """Return the existing extraction, or the complete report text when it is ''."""
    extracted = row['v_clean']
    if extracted == '':
        return str(row['v'])
    return extracted


df["v_clean"] = df.apply(_full_text_if_empty, axis=1)
filled = df[df['v_clean'] != '']
print(len(filled))

206369


In [9]:
# Persist the report id together with the selected text as an ISO-8859-1 encoded CSV.
df.to_csv(
    "report_sentences_selected.csv",
    columns=['codigoinforme', 'v_clean'],
    encoding="ISO-8859-1",
)

Apply a common preprocessing pipeline: normalization, tokenization, stopword removal and stemming (lemmatization and POS tagging do not work for Spanish using NLTK, but those tasks are not necessary for the present problem).

In [10]:
# Reload the selected text; na_filter=False keeps empty cells as '' instead of NaN.
df = pd.read_csv('report_sentences_selected.csv', encoding="ISO-8859-1", na_filter=False)
# Replace every character other than ASCII letters, digits and '.' with a space;
# '.' is kept because it delimits sentences. The previous character class
# "[^a-zA-Z0-9\\.]" also listed the backslash, so backslashes were silently kept.
df['v_clean'] = df['v_clean'].str.replace(r"[^a-zA-Z0-9.]", " ", regex=True)
# Surround each '.' with spaces so that it becomes a token of its own when splitting
# on whitespace (regex=False: '.' is matched literally).
df['v_clean'] = df['v_clean'].str.replace('.', ' . ', regex=False)

In [11]:
# Persist the report id together with the punctuation-cleaned text (ISO-8859-1 CSV).
df.to_csv(
    "report_sentences_cleaned.csv",
    columns=['codigoinforme', 'v_clean'],
    encoding="ISO-8859-1",
)

In [18]:
# Stem every token and remove Spanish stopwords.
# The corpus reader is imported under an alias: binding the name `stopwords` to a set
# (below) used to shadow the imported object, so re-running this cell failed with
# AttributeError ('set' object has no attribute 'words').
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.snowball import SpanishStemmer

# 'sin', 'no', 'ni' and 'con' are taken out of the stopword list, so they stay in the text.
stopwords = set(nltk_stopwords.words("spanish")) - {'sin', 'no', 'ni', 'con'}
stemmer = SpanishStemmer()
# Avoid 'masa' and 'masas' being stemmed to 'mas': they are passed through unchanged.
# Built once here instead of once per token.
UNSTEMMED_WORDS = frozenset(['masa', 'masas'])


def _preprocess_report(report):
    """Drop stopwords from `report` and stem the remaining whitespace-separated tokens.

    Every kept token is prefixed with a single space, so a non-empty result starts
    with a blank and a report without kept tokens yields ''.
    """
    kept = (
        token if token in UNSTEMMED_WORDS else stemmer.stem(token)
        for token in report.split()
        if token not in stopwords
    )
    return ''.join(' ' + token for token in kept)


reports = [_preprocess_report(report) for report in df['v_clean']]
# Assigning the list is positional, so it does not depend on df's index labels.
df['v_preprocessed'] = reports
df.to_csv("report_sentences_preprocessed.csv", columns=['codigoinforme','v_preprocessed'], encoding="ISO-8859-1")


In [None]:
# Write all cleaned reports as one space-separated text with the '.' characters removed.
# The context manager closes the file even if building or writing the text raises.
with open("text.txt", "w") as fo:
    fo.write(df['v_clean'].str.cat(sep=' ').replace('.', ''))