### 1) Reading the configuration file

In [1]:
import os

read_files = []
write_file = "../data/"

with open("../data/gli.cfg", "r") as config_file:
    for line in config_file.readlines():
        instruction, filename = line.split("=")
        filename = filename.strip()
        
        if instruction == "LEIA":
            file_path = os.path.join("../data/CysticFibrosis", filename)
            read_files.append(file_path)
        elif instruction == "ESCREVA":
            write_file += filename

### 2) Read the XML files

In [2]:
from xml.etree import ElementTree as ET

def get_recordnum_text(file):
    xml_file = ET.parse(file)
    xml_root = xml_file.getroot()
    recordnum_text = {}
    
    for record in xml_root:
        text = ""
        for element in record:
            if element.tag == "RECORDNUM":
                record_num = int(element.text)
            elif element.tag == "ABSTRACT" or element.tag == "EXTRACT":
                text = element.text.upper()
        recordnum_text[record_num] = text
    
    # Retorna dicionário na forma RecordNum: Texto para cada arquivo passado
    return recordnum_text

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/casalecchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/casalecchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def preproccess_text(text):
    tokens = wordpunct_tokenize(text)
    stop_en = stopwords.words("english")
    
    # Remove as stopwords
    # filtered_text = [w for w in tokens if not w.lower() in stop_en]

    # Remove os símbolos
    filtered_text = []
    for word in tokens:
        # Nas queries é usada essa abreviação
        if word.upper() == "CF":
            tokens.extend(["CYSTIC", "FIBROSIS"])
        
        # Remove stopwords
        if word.lower() in stop_en:
            continue
        # Remove palavras que contenham caracteres além de letras
        elif not word.isalpha():
            continue
        # Remova palavras com tamanho menor que 3 caracteres
        elif len(word) < 3:
            continue
        # Palavra vai "entrar" mas antes vamos aplicar um stemming
        else:
            stemmer = PorterStemmer()
            word_stemmed = stemmer.stem(word)
            print(word_stemmed.upper())
            filtered_text.append(word_stemmed.upper())
    
    # Pega um texto, tokeniza e remove as stopwords
    return filtered_text

In [16]:
def word_frequency(text, record_num):
    tokenized_text = preproccess_text(text)
    frequency_dict = {}
    for word in tokenized_text:
        keys = list(frequency_dict.keys())
        if word in keys:
            frequency_dict[word].append(record_num)
        else:
            frequency_dict[word] = [record_num]
            
    # Retorna um dicionário para o texto passado, com a frequência das palavras
    # Palavra: [record_num, record_num, ...]
    return frequency_dict

In [17]:
def get_inverted_list(read_files):
    inverted_list = {}
    
    for file in read_files:
        # Faz primeiro o dicionário do arquivo, com os record_num: textos
        file_records = get_recordnum_text(file)
        file_record_nums = list(file_records.keys())
        
        # Depois, pega cada record
        for record_num in file_record_nums:
            # Faz o dicionário de frequência de um record
            record_dict = word_frequency(file_records[record_num], record_num)
            
            # Pegar o dicionário de frequência e juntar no geral
            used_tokens = list(record_dict.keys())
            
            # Atualiza ou cria os tokens presentes no record no dicionário geral
            for token in used_tokens:
                # Pega o que já tinha, ou [] caso não exista
                previous_records = inverted_list.get(token, [])
                if previous_records == []:
                    # Cria
                    inverted_list[token] = record_dict[token]
                else:
                    #Atualiza
                    inverted_list[token] += record_dict[token]
    return inverted_list

In [18]:
with open(write_file, 'w') as w_file:
    inverted_list = get_inverted_list(read_files)
    tokens = list(inverted_list.keys())
    w_file.write("Token;Appearance\n")
    for token in tokens:
        w_file.write(f"{token};{inverted_list[token]}\n")

SIGNIFIC
PSEUDOMONA
AERUGINOSA
INFECT
RESPIRATORI
TRACT
CYSTIC
FIBROSI
PATIENT
STUDI
MEAN
IMMUNOELECTROPHORET
ANALYSI
PATIENT
SERA
NUMBER
PRECIPITIN
PSEUDOMONA
AERUGINOSA
CONCENTR
SERUM
PROTEIN
ADDIT
CLINIC
RADIOGRAPH
STATU
LUNG
EVALU
USE
SCORE
SYSTEM
PRECIPITIN
PSEUDOMONA
AERUGINOSA
DEMONSTR
SERA
MAXIMUM
NUMBER
ONE
SERUM
CONCENTR
SERUM
PROTEIN
SIGNIFICANTLI
CHANG
COMPAR
MATCH
CONTROL
PERSON
NOTABL
IGG
IGA
ELEV
ACUT
PHASE
PROTEIN
CHANG
LATTER
SUGGEST
ACTIV
TISSU
DAMAG
CONCENTR
ACUT
PHASE
PROTEIN
NOTABL
HAPTOGLOBIN
CORREL
NUMBER
PRECIPITIN
SUGGEST
RESPIRATORI
TRACT
INFECT
PATIENT
MANI
PRECIPITIN
ACCOMPANI
TISSU
DAMAG
INFECT
PATIENT
PRECIPITIN
RESULT
INDIC
PROTECT
VALU
MANI
PRECIPITIN
TISSU
RESPIRATORI
TRACT
SALIVARI
AMYLAS
LEVEL
DETERMIN
NORMAL
SUBJECT
BIRTH
ADULT
LIFE
CHILDREN
CONDIT
SOMETIM
ASSOCI
LOW
PANCREAT
AMYLAS
MALNUTRIT
COELIAC
DISEAS
CYSTIC
FIBROSI
MIX
SALIVA
COLLECT
CARE
STANDARDIS
CONDIT
AMYLAS
MEASUR
METHOD
DAHLQVIST
WIDE
SCATTER
VALU
NORMAL
SUBJECT
CONCENTR
ROSE
LOW
LEVEL


ASSESS
NUTRIT
STATU
PATIENT
CYSTIC
FIBROSI
PANCREA
CFP
SHOW
POOR
GROWTH
ASSOCI
LOW
CONCENTR
ALBUMIN
UREA
NITROGEN
CHOLESTEROL
SERUM
ELEV
WHITE
BLOOD
CELL
WBC
COUNT
PATIENT
CFP
MAINTAIN
WEIGHT
APPROXIM
STANDARD
DEVIAT
MEAN
YEAR
PROGRESS
DECLIN
GROWTH
RATE
COMPAR
NORMAL
COMPLET
DIETARI
SUPPLEMENT
CONSIST
BEEF
SERUM
HYDROLYS
GLUCOS
POLYM
MEDIUM
CHAIN
TRIGLYCERID
GIVEN
PATIENT
YEAR
PATIENT
RECEIV
DIET
SHOW
SIGNIFIC
GAIN
WEIGHT
SIGNIFIC
INCREAS
CLINIC
SCORE
SIGNIFIC
INCREAS
SERUM
ALBUMIN
LEVEL
SIGNIFIC
DROP
WBC
COUNT
COMPAR
CONTROL
PATIENT
RECEIV
SUPPLEMENT
PATIENT
CYSTIC
FIBROSI
FOUND
PNEUMATOSI
COLI
ASSOCIT
RECTAL
PROLAPS
CYSTIC
FIBROSI
SEVER
FACTOR
PREDISPOS
PNEUMATOSI
YET
RARE
REPORT
SYMPTOM
NONSPECIF
DIAGNOSI
CONSID
PATIENT
CYSTIC
FIBROSI
CHRONIC
LUNG
DISEAS
VAGU
ABDOMIN
COMPLAINT
THREE
PATIENT
CYSTIC
FIBROSI
NOTE
SWELL
KNEE
ANKL
JOINT
EXACERB
LUNG
DISEAS
SYNOVI
FLUID
ANALYZ
ONE
PATIENT
SYNOVIUM
UNDERW
BIOPSI
ANOTH
STUDI
EXCLUD
CAUS
ARTHRITI
CONTRIBUT
NEW
INFORM
NATUR
SECONDARI
HYPERTR

INFERTIL
ABNORM
MESONEPHR
DERIV
COMMON
PATIENT
CYSTIC
FIBROSI
UNIFORM
FIND
TAUSSIG
REPORT
PATIENT
CYSTIC
FIBROSI
PROVE
FERTIL
ABNORM
VA
COULD
DOCUMENT
PATHOLOG
STUDI
VA
FOUND
PRESENT
PATIENT
DIE
CYSTIC
FIBROSI
MANI
PATIENT
ABSENC
COULD
RULE
PHYSIC
EXAMIN
HEALTHI
YOUNG
CHILDREN
VA
EASILI
PALPABL
EVEN
DEGRE
CERTAINTI
IDENTIF
LESS
POST
PUBERT
MALE
CYSTIC
FIBROSI
DISEAS
PROTEAN
MANIFEST
WOULD
MISTAK
RULE
CYSTIC
FIBROSI
VA
PALPAT
VICTIM
CYSTIC
FIBROSI
LABEL
INFERTIL
CASE
CALCIFI
SWELL
PARATESTICULAR
TISSU
DISCOV
INFANT
DIFFER
AGE
DESCRIB
EXACT
CAUS
OBSCUR
THREE
PATHOLOG
CONDIT
SUGGEST
POSSIBL
DUE
OLD
REACTION
MECONIUM
PERITON
PATENT
PROCESSU
VAGINALI
SURVIV
STUDI
DONE
CHILDREN
CYSTIC
FIBROSI
HEART
FAILUR
THIRTI
PERCENT
SURVIV
FIRST
FOUR
WEEK
MEDIAN
SURVIV
GROUP
TWO
THREE
MONTH
END
FIRST
YEAR
ONSET
FAILUR
PERCENT
DIE
MONTH
PERCENT
DIE
CHILDREN
CYSTIC
FIBROSI
ISOENZYM
ALKALIN
PHOSPHATAS
DETERMIN
MICROELECTROPHORET
POLYACRYLAMID
STARCH
GEL
STUDI
DONE
EVALU
CLINIC
SIGNIFIC
ADDIT
DATA
DIAGNOSI
L

PRESENT
STUDI
USE
IMMUNOLOG
METHODOLOG
CONFIRM
PREVIOU
OBSERV
LABORATORI
ABSENC
PROTEAS
COMPON
ARGININ
ESTERAS
ACTIV
PLASMA
PATIENT
CYSTIC
FIBROSI
STUDI
POOL
PLASMA
CONTROL
INDIVIDU
ACTIV
PARTIAL
PURIFI
ADSORPT
COLUMN
SOYBEAN
TRYPSIN
INHIBITOR
CONJUG
SEPHAROS
FOLLOW
ELUT
BENZAMIDIN
FRACTION
PURIFI
ISOELECTROFOCUS
POLYACRYLAMID
GEL
PROTEIN
AROUND
RANG
ELUT
UTIL
PREPAR
ANTISERUM
IMMUNOELECTROPHORESI
ACTIV
PLASMA
SAMPL
CONTROL
SUBJECT
PATIENT
CYSTIC
FIBROSI
PERFORM
UTIL
ANTISERUM
CONTROL
FOUR
PRECIPITIN
ARC
RESIDU
ESTERAS
ACTIV
OBSERV
WHEREA
THREE
SEEN
PLASMA
PATIENT
CYSTIC
FIBROSI
DOUBL
GEL
DIFFUS
EXPERI
USE
SPECIF
ANTISERA
RULE
PRESENC
TRYPSIN
CHYMOTRYPSIN
PLASMINOGEN
PROTHROMBIN
ESTERAS
ALPHA
ONE
TRYPSIN
INHIBITOR
INTER
ALPHA
TRYPSIN
INHIBITOR
CONCENTR
BENZAMIDIN
ELUAT
ANTISERA
ALPHA
TWO
MACROGLOBULIN
GAVE
IMMUNOPRECIPIT
READILI
STAIN
PROTEOLYT
ACTIV
IMMUNOELECTROPHORESI
ALPHA
TWO
MACROGLOBULIN
PRECIPITIN
BAND
CORRESPOND
BAND
ABSENT
PLASMA
PATIENT
CYSTIC
FIBROSI
CONTRAST
ALPHA
TWO
MACR

MICROBI
TRANSFORM
BILE
ACID
INCUB
STOOL
HOMOGEN
CHILDREN
CYSTIC
FIBROSI
DECREAS
STUDI
UNDERTAKEN
REPORT
MARK
INCREAS
FECAL
BILE
ACID
EXCRET
CHILDREN
CYSTIC
FIBROSI
ATTEMPT
CONFIRM
FIND
PERFORM
CHOLYLGLYCIN
BREATH
TEST
MEASUR
FECAL
BILE
ACID
FAT
EXCRET
PATIENT
CYSTIC
FIBROSI
ACQUIR
PANCREAT
INSUFFICI
STUDI
DONE
PATIENT
TAKE
PANCREAT
ENZYM
COTAZYM
ALSO
WITHOUT
MEDIC
EXCRET
BREATH
NORMAL
PATIENT
ACQUIR
PANCREAT
INSUFFICI
EVEN
LOWER
CYSTIC
FIBROSI
WITHOUT
COTAZYM
THERAPI
FECAL
BILE
ACID
EXCRET
SLIGHTLI
ELEV
GROUP
WITHOUT
COTAZYM
BECAM
NORMAL
COTAZYM
PATIENT
ACQUIR
PANCREAT
INSUFFICI
STEATORRHEA
PRESENT
PATIENT
GROUP
IMPROV
COTAZYM
THERAPI
BILE
ACID
MALABSORPT
CYSTIC
FIBROSI
ACQUIR
PANCREAT
INSUFFICI
MINIM
PROBABL
CLINIC
IMPORT
ROLLER
KERN
REPORT
BILE
ACID
MALABSORPT
PATIENT
CYSTIC
FIBROSI
ACQUIR
PANCREAT
INSUFFICI
MINIM
FIND
VARIANC
STUDI
DESERV
COMMENT
ABSENC
FECAL
BILE
ACID
VALU
CONTROL
AUTHOR
CANNOT
STATE
LEVEL
TWICE
NORMAL
VALU
ESPECI
SINC
PAPER
CITE
NORMAL
BILE
ACID
EXCRET
REFER
MEASU

CYSTIC
FIBROSI
AUTOSOM
RECESS
DISORD
REMAIN
COMMON
LETHAL
GENET
DISEAS
CAUCASIAN
PEOPL
DIAGNOSI
USUAL
MADE
BASE
UPON
CLINIC
PRESENT
CHRONIC
RESPIRATORI
GASTROINTESTIN
SYMPTOM
POSIT
SWEAT
ELECTROLYT
TEST
OFTEN
FAMILI
HISTORI
DISORD
MANIFEST
CHRONIC
COUGH
RECURR
PNEUMONIA
BRONCHIECTASI
CLUB
PNEUMOTHORAX
HEMOPTYSI
MALABSORPT
SECONDARI
PANCREAT
INSUFFICI
WELL
ASSOCI
CYSTIC
FIBROSI
DIAGNOSI
USUAL
DELAY
HOWEV
OFTEN
APPRECI
PHYSICIAN
MAY
ASSOCI
BROAD
SPECTRUM
LESS
COMMON
MANIFEST
WELL
EXTREM
VARIABL
SEVER
ILL
LACK
FAMILIAR
MANIFEST
MAY
DELAY
DIAGNOSI
ADOLESC
ADULTHOOD
FIRST
SEEM
MILD
CLINIC
EXPRESS
DISORD
PATIENT
GENET
VARIABL
SECOND
SUBTL
MANIFEST
WIDE
APPRECI
PHYSICIAN
PRACTIC
CYSTIC
FIBROSI
CYSTIC
FIBROSI
CYSTIC
FIBROSI
FIVE
CHILDREN
CYSTIC
FIBROSI
GIVEN
REPEAT
INTRAVEN
INFUS
SOYA
OIL
EMULS
INTRALIPID
MONTH
EFFECT
GROWTH
HEALTH
SWEAT
CHLORID
PANCREAT
ENZYM
ACTIV
COMPAR
CONTROL
GROUP
BENEFICI
EFFECT
DEMONSTR
RECOMMEND
FORM
THERAPI
PLACE
ROUTIN
TREATMENT
CHILDREN
CYSTIC
FIBROSI
POSSIBL
TREAT

CYSTIC
FIBROSI
COMMON
CAUS
CHRONIC
OBSTRUCT
PULMONARI
DISEAS
COPD
PANCREAT
INSUFFICI
FIRST
THREE
DECAD
LIFE
UNIT
STATE
REPORT
DESCRIB
PATIENT
CYSTIC
FIBROSI
AGE
YEAR
REVIEW
ANOTH
CASE
REPORT
LITERATUR
PATIENT
ELEV
SWEAT
CHLORID
SODIUM
LEVEL
PROVE
EXCEL
DISCRIMIN
CYSTIC
FIBROSI
EVEN
PATIENT
OLDER
AGE
GROUP
COPD
PRESENT
PER
CENT
MAJOR
CAUS
MORBID
MORTAL
DIFFER
COPD
ETIOLOG
PROGRESS
DOWNHIL
COURS
PATIENT
PUNCTUAT
RECURR
SYMPTOMAT
EXACERB
CHRONIC
BACTERI
BRONCHITI
CAUS
PSEUDOMONA
AERUGINOSA
STAPHYLOCCOCU
AUREU
TERMIN
PULMONARI
INSUFFICI
COR
PULMONAL
DEATH
COPD
COMPLIC
MINOR
HEMOPTYSI
PER
CENT
MASSIV
HEMOPTYSI
PER
CENT
PNEUMOTHORAX
PER
CENT
PROBLEM
RARE
CHILDREN
SINUS
PRESENT
EXAMIN
ROENTGENOGRAPH
PER
CENT
NASAL
POLYPOSI
PANCREAT
INSUFFICI
PRESENT
PER
CENT
PATIENT
CONTRAST
YOUNGER
PATIENT
SELDOM
SYMPTOMAT
ALTHOUGH
STEATORRHEA
AZOTORRHEA
STILL
MASSIV
INTUSSUSCEPT
MECONIUM
ILEU
EQUIVAL
FECAL
ACCUMUL
FREQUENT
ADULT
PER
CENT
RARE
CHILDREN
REQUIR
IMMEDI
DIAGNOST
THERAPEUT
INTERVENT
ENEMA
DIATRIZ

PURPOS
ARTICL
REVIEW
BRIEFLI
TECHNIQU
APPLIC
ELECTRON
PROBE
RAY
ANALYSI
STUDI
INTEREST
PATHOLOGIST
DEALT
PRINCIPL
RAY
FLUORESC
SPECTROMETRI
CAPABL
LIMIT
ENERGI
DISPERS
RAY
ANALYSI
EQUIP
SPECIMEN
PREPAR
METHOD
ELECTRON
PROBE
RAY
ANALYSI
DISCUSS
APPLIC
ELECTRON
PROBE
RAY
ANALYSI
INCLUD
MINER
DEPOSIT
LUNG
INTRANUCLEAR
BISMUTH
INCLUS
GOLD
DEPOSIT
TISSU
NATUR
HAEMOSIDERIN
COPPER
WILSON
DISEAS
CALCIUM
MITOCHONDRIA
SURGIC
IMPLANT
FOREIGN
BODI
EYE
WOUND
SPECIF
KOSSA
METHOD
CALCIUM
CYTOCHEM
LOCAL
ACID
PHOSPHATAS
CHROMAFFIN
ARGENTAFFIN
REACTION
TRANSFER
ANTIGEN
MATERI
MACROPHAG
LYMPHOCYT
DIAGNOSI
CYSTIC
FIBROSI
ELECTRON
PROBE
RAY
ANALYSI
NAIL
SEVER
GENET
DISEAS
DESCRIB
AFFECT
LUNG
USUAL
LEAST
CASE
PEDIATR
AGE
GROUP
FOUR
DISORD
DISCUSS
SELECT
WITHER
HIGH
FREQUENC
DEMONSTR
VARIETI
WAY
GENET
FACTOR
INFLUENC
DISEAS
CYSTIC
FIBROSI
ONE
COMMON
METABOL
DISORD
INHERIT
AUTOSOM
RECESS
TRAIT
IMMOTIL
CILIA
SYNDROM
INCLUD
KARTAGEN
SYNDROM
SINGL
GENE
DISORD
INVOLV
STRUCTUR
ABNORM
CILIA
ASTHMA
COMMON
DISORD
CHI