In [1]:
import pandas as pd
import os
from docx.api import Document
from simplify_docx import simplify
from tqdm import tqdm
from datetime import date, datetime
import re

In [2]:
path = 'Source'
base = os.listdir(path)
files = {dir: os.listdir(f'{path}/{dir}') for dir in base}
base[:3]

['Adenildo Victor Teles', 'Ana Maria de Queiroz Fernandes', 'Antonieta Souza']

In [3]:
# Functions
def get_X_posotion(row):
    try:
        x_pos = 0
        try:
            x_pos = row.index('x')
        except:
            x_pos = row.index('X')
        finally:
            return row[x_pos-1].replace(':', '')
    except:
        return 'Ñ encontrado'

def get_date(val):
    matched = re.match(r"[0-9]{2}[\/.][0-9]{2}[\/.][0-9]{2,4}", val)
    if bool(matched):
       return matched.group().replace('.', '/')

    return val.replace(' anos', '')

def get_age(val):
    try:
        today = date.today()
        try:
            return today.year - datetime.strptime(val, '%d/%m/%Y').year
        except:
            return today.year - datetime.strptime(val, '%d/%m/%y').year
    except:
        return val

In [4]:
# Main code
# 0 - Temporary variables
# Store files where erros occurred during Document
error_list_document = []
# Store files where erros occurred during Simplifly
error_list_simply = []
# Store all files names and paths
all_files = []
# Store all fines and paths that were successfully processed
processed_all_files = []
# Store data from each file
final_list = []

# 1 - Iterate through each folder
for dir_name, document_list in tqdm(files.items()):
    # 2 - Iterate each document in the folder
    for doc_name in document_list:
        # Variables
        this_file_tables = []
        # 3 - Open the document
        target_path = f'{path}/{dir_name}/{doc_name}'
        all_files.append(target_path)
        if 'pdf' not in doc_name:
            # 4 - read Docx
            try:
                document = Document(target_path)
            except:
                error_list_document.append(target_path)

            # 5 - Read document and apply Simplify, return a JSON
            try:
                # my_doc_as_json = simplify(document)
                my_doc_as_json = simplify(document,{"remove-leading-white-space":False})
            except:
                error_list_simply.append(target_path)

            # 6 - Select all json data
            data_chunk = my_doc_as_json['VALUE'][0]['VALUE']
            
            # 7 - Select tables
            for item in data_chunk:
                if item['TYPE'] == 'table':
                    this_file_tables.append(item)

            # 8 - Iterate over the first table rows and store a cell list in the list
            # Each cell list correspond to a row
            temp_rows_store = []
            if len(this_file_tables) > 0:
                for row in this_file_tables[0]['VALUE']:
                    temp_rows_store.append(row['VALUE'])

                # 9 - Recover content from each cell
                temp_rows = []
                for row_idx, row in enumerate(temp_rows_store):
                    temp_row = []
                    for cell in row:
                        cell_with_information = cell['VALUE']
                        # 10 - Save the content of each cell in a list
                        if len(cell_with_information) > 0:
                            temp_row.append(cell_with_information[0]['VALUE'][0]['VALUE'])
                    # This list contains a idex to identify the row an a list with the content of each cell in that row
                    temp_rows.append((row_idx, temp_row))

                # 11 - Store data in a list to consume and build report
                # File name e data
                processed_all_files.append(target_path)
                final_list.append((target_path, temp_rows))

print(f'Erros during document: {len(error_list_document)}')
print(f'Erros during simplify: {len(error_list_simply)}')
print(f'Total files: {len(all_files)}')
print(f'Total processed files: {len(processed_all_files)}')

  warn("Skipping unexpected tag: %s" % (current.tag),
  warn("Skipping unexpected tag: %s" % (current.tag),
  warn("Skipping unexpected tag: %s" % (current.tag),
  warn("Skipping unexpected tag: %s" % (current.tag),
100%|██████████| 101/101 [00:16<00:00,  6.08it/s]

Erros during document: 0
Erros during simplify: 16
Total files: 282
Total processed files: 242





In [5]:
not_processed_files = list(filter(lambda x: 'pdf' not in x, set(all_files) - set(processed_all_files)))

In [7]:
desired_columns =  ['Arquivo', 'Sexo', 'Paciente / Cuidador', 'Data nascimento', 'Idade', 'Resumo', 'Uso de Riluzol', 'Outros medicamentos']
result_df = pd.DataFrame(columns=desired_columns)

for item in final_list:
    len_df = len(result_df)
    file_path = item[0].split('/', 1)[1]
    sex = 'Ñ encontrado'
    idade = 'Ñ encontrado'
    miss_placed_age = (False, '')
    for row in item[1]:
        # Is pacient
        if any(('Paciente / Cuidador' in x for x in row[1])):
            try:
                pacient = row[1][1]
                if 'anos' in pacient:
                    miss_placed_age = (True, pacient)
            except:
                pacient = 'Ñ encontrado'
        # Age
        if any(('nascimento' in x for x in row[1])):
            try:
                birth = row[1][1]
            except:
                birth = 'Ñ encontrado'
        # Resume
        if any(('Resumo' in x for x in row[1])):
            try:
                resume = row[1][1]
                if 'anos' in resume:
                    miss_placed_age = (True, resume)
            except:
                resume = 'Ñ encontrado'
        # Riluzol
        if any(('Riluzol' in x for x in row[1])):
            riluzol = get_X_posotion(row[1])
        # Outros medicamentos
        if any(('Outros medicamentos' in x for x in row[1])):
            try:
                other = row[1][1]
                if 'anos' in other:
                    miss_placed_age = (True, other)
            except:
                other = 'Ñ encontrado'

    if miss_placed_age[0] and ( birth == 'Ñ encontrado' or '/' not in birth):
            temp_holder = miss_placed_age[1].replace('.', '').split(' ')
            temp_holder = temp_holder[temp_holder.index('anos')-1]
            idade = temp_holder
            
    result_df.loc[len_df] = [file_path, sex, pacient, birth, idade, resume, riluzol, other]

result_df['Data nascimento'] = result_df['Data nascimento'].apply(lambda x: get_date(x))

for idx, item in result_df.iterrows():
    if item.Idade == 'Ñ encontrado' and '/' in item['Data nascimento']:
        result_df.loc[idx].Idade = get_age(item['Data nascimento'])
        
result_df.head(5)

Unnamed: 0,Arquivo,Sexo,Paciente / Cuidador,Data nascimento,Idade,Resumo,Uso de Riluzol,Outros medicamentos
0,Adenildo Victor Teles/(1) ADENILDO_VICTOR_TELE...,Ñ encontrado,Adenildo Victor Teles,20/07/1941,81,Ñ encontrado,Não,Lantus 100UI/mL(20 unid antes do café da manhã...
1,Ana Maria de Queiroz Fernandes/Ana Maria de Qu...,Ñ encontrado,Ana Maria de Queiroz Fernandes/ Geriane,20/07/1941,81,62 anos. 1ª consulta. Sem dificuldade de deglu...,Sim,Duloxetina (60 mg).
2,Ana Maria de Queiroz Fernandes/Ana Maria Queir...,Ñ encontrado,Ana Maria de Queiroz Fernandes/ Geriane,20/07/1941,81,62 anos. 1ª consulta. Sem dificuldade de deglu...,Sim,Gabapentina; Fluoxentina
3,Ana Maria de Queiroz Fernandes/Ana Maria Queir...,Ñ encontrado,Ana Maria de Queiroz Fernandes/ Geriane,09/05/1959,63,62 anos. Sem dificuldade de deglutição. Não ap...,Sim,Sertralina 25mg
4,Antonieta Souza/Antonieta Souza (1)_31_07_20.docx,Ñ encontrado,Antonieta Souza,09/05/1959,63,Ainda em suspeita se ELA. Evolução muito lenta...,Não,"aprazolan 1 mg, nitarzatina,..."
