### 1) Reading the configuration file

In [1]:
import os

# Paths collected from the configuration file:
#   read_files - XML inputs named by "LEIA" lines (resolved under ../data/CysticFibrosis)
#   write_file - output path built from the "ESCREVA" line (resolved under ../data/)
read_files = []
write_file = "../data/"

# Each configuration line has the form INSTRUCTION=filename.
with open("../data/gli.cfg", "r") as config_file:
    for line in config_file:
        # Blank lines (e.g. a trailing newline at the end of the file) and lines
        # without "=" carry no instruction; skipping them avoids the ValueError
        # that unpacking the result of split("=") would raise.
        if "=" not in line:
            continue

        # partition splits on the first "=" only, so a filename that itself
        # contains "=" is kept whole.
        instruction, _, filename = line.partition("=")
        instruction = instruction.strip()
        filename = filename.strip()

        if instruction == "LEIA":
            file_path = os.path.join("../data/CysticFibrosis", filename)
            read_files.append(file_path)
        elif instruction == "ESCREVA":
            write_file += filename

### 2) Reading the XML files

In [2]:
from xml.etree import ElementTree as ET

def get_recordnum_text(file):
    """Parse one XML file and map each record number to its normalized text.

    For every child of the XML root, the RECORDNUM element supplies the integer
    key and the ABSTRACT or EXTRACT element supplies the text. The text is
    upper-cased, newlines become single spaces (so words on adjacent lines are
    not fused together) and semicolons are removed.

    Args:
        file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        dict mapping ``int`` record number to ``str`` text. A record with no
        ABSTRACT/EXTRACT content maps to ``""``. Records without a RECORDNUM
        are skipped.

    Raises:
        ValueError: If a RECORDNUM element holds non-numeric text.
    """
    xml_root = ET.parse(file).getroot()
    recordnum_text = {}

    for record in xml_root:
        # Reset per record so a record lacking RECORDNUM can never reuse (and
        # overwrite) the number of the record before it.
        record_num = None
        text = ""
        for element in record:
            if element.tag == "RECORDNUM":
                # element.text is None for an empty tag.
                if element.text is not None and element.text.strip():
                    record_num = int(element.text)
            elif element.tag in ("ABSTRACT", "EXTRACT"):
                # Empty tags have text None; keep whatever text was already found.
                if element.text:
                    text = element.text.upper().replace("\n", " ").replace(";", "")

        if record_num is None:
            continue
        recordnum_text[record_num] = text

    # Dictionary of the form RecordNum: Text for the given file
    return recordnum_text

In [3]:
import nltk
# Download the NLTK data packages up front; the preprocessing cell below calls
# word_tokenize and stopwords.words, which presumably rely on these two
# packages being installed locally.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/casalecchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/casalecchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preproccess_text(text):
    """Tokenize ``text`` and drop English stopwords.

    Args:
        text: The string to tokenize.

    Returns:
        list of tokens, in their original order and casing, excluding every
        token whose lower-cased form is in NLTK's English stopword list.
    """
    tokens = word_tokenize(text)
    # A set gives constant-time membership tests; the stopword list would
    # otherwise be scanned linearly once per token.
    stop_en = set(stopwords.words("english"))
    # Compare in lower case because the stopword list is lower-case while the
    # incoming text may not be.
    filtered_text = [w for w in tokens if w.lower() not in stop_en]

    return filtered_text

In [5]:
def word_frequency(text, record_num):
    """Build the per-record postings for one text.

    Args:
        text: Text of a single record; it is tokenized with ``preproccess_text``.
        record_num: Identifier of the record the text belongs to.

    Returns:
        dict of the form ``word: [record_num, record_num, ...]`` where
        ``record_num`` is repeated once per occurrence of the word, so the
        list length is the word's frequency in this record.
    """
    frequency_dict = {}
    for word in preproccess_text(text):
        # setdefault does a single hash lookup; no need to rebuild a list of
        # the keys on every iteration to test membership.
        frequency_dict.setdefault(word, []).append(record_num)

    return frequency_dict

In [6]:
def get_inverted_list(read_files):
    """Build the inverted list covering every file in ``read_files``.

    Each file is turned into a ``record_num: text`` mapping by
    ``get_recordnum_text``; each record's text is turned into per-word
    postings by ``word_frequency``; the postings are then merged, in file and
    record order, into one dictionary.

    Args:
        read_files: Iterable of XML file paths.

    Returns:
        dict of the form ``token: [record_num, ...]`` accumulated over all
        records of all files.
    """
    inverted_list = {}

    for path in read_files:
        # record_num -> text for the current file
        records = get_recordnum_text(path)

        for record_num, text in records.items():
            # token -> postings for this single record
            record_postings = word_frequency(text, record_num)

            # Merge this record's postings into the global dictionary
            for token, postings in record_postings.items():
                if token in inverted_list:
                    # Token seen before: append the new postings
                    inverted_list[token] += postings
                else:
                    # First time this token appears
                    inverted_list[token] = postings

    return inverted_list

In [7]:
# Build the inverted list BEFORE opening the output file: opening with "w"
# truncates the file immediately, so a failure while parsing or tokenizing
# would otherwise leave an empty output and destroy any previous result.
inverted_list = get_inverted_list(read_files)
tokens = list(inverted_list.keys())

# Explicit UTF-8 so non-ASCII tokens are written the same way on every platform
# instead of depending on the locale's default encoding.
with open(write_file, "w", encoding="utf-8") as w_file:
    # Semicolon-separated output: a header row, then one "token;[record_num, ...]" row per token.
    w_file.write("Token;Appearance\n")
    for token in tokens:
        w_file.write(f"{token};{inverted_list[token]}\n")