In [2]:
#Python code to illustrate parsing of XML files
# importing the required modules
import csv
import requests
import xml.etree.ElementTree
import pickle
import glob

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/carlosm/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/carlosm/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
def GetString(loc):
    strg = []
    e = xml.etree.ElementTree.parse(loc)
    root =e.getroot()
    dom = e.findall('uima.cas.Sofa')
    for atype in dom:
        strg.append(atype.get('sofaString'))
    return strg


def preprocess(text):

    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()
    tokens = []
#     Stop Words removal
    words = [w for w in words if not w in stop_words] 
    for word in words:
#         Word Lemmatizer 
        word = wordnet_lemmatizer.lemmatize(word)
        try:
            float(word)
            pass
        except:
            flag = 0
            for letters in word:
                if letters.lower() in '0123456789@#$%^&*(){}[];:=- /':
                    flag = 1
                    break
                    
            if flag == 0:
                tokens.append(word.lower())
    
    return tokens

loc = "Data/*.xml"
processed_loc = "Processed/"
All_xml = glob.iglob(loc, recursive=True)
LstOfXMLTokens = []
LstOfFile = []
for j, filename in enumerate(All_xml):
    LstOfFile.append(filename)
    strg = GetString(filename)[0]
    LstOfXMLTokens.append(preprocess(strg))
    


In [4]:
tokens_final = []

for tokens_list in LstOfXMLTokens:
    strg = ''
    for tokens in tokens_list:
        tokens = " " + tokens
        strg += tokens
    tokens_final.append(strg.lower())
print (len(tokens_final))
# print (tokens_final)

1


## Idf score tells in how many documents has a word occoured

## tf score tells in how many times occours in a single document

## Therefore TFIDF ranks a word by how many times it occours in a perticular document and how many times is the word found in all documents


## For more detailed description please refer : https://github.com/anujgupta82/Representation-Learning-for-NLP/blob/master/module1/TF-IDF.ipynb

### Advantages : It helps in capturing short forms, for eg. If a doctor writes lung cancer as lg_cancer and uses the same multiple times in the same document 


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tokenize = lambda doc: doc.lower().split(" ")

vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
X = vectorizer.fit_transform(tokens_final)
idf = vectorizer.idf_


In [7]:
print (dict(zip(vectorizer.get_feature_names(), idf)))

{'': 1.0, '<comma>': 1.0, '<period>': 1.0, 'adenopathy': 1.0, 'along': 1.0, 'also': 1.0, 'ap': 1.0, 'appear': 1.0, 'bone': 1.0, 'bronchus': 1.0, 'ca': 1.0, 'cannot': 1.0, 'case': 1.0, 'cect': 1.0, 'centilobular': 1.0, 'change': 1.0, 'clear': 1.0, 'collapse': 1.0, 'compared': 1.0, 'comparison': 1.0, 'consolidation': 1.0, 'contrast': 1.0, 'cool': 1.0, 'ct': 1.0, 'dated': 1.0, 'destructive': 1.0, 'differentiated': 1.0, 'distal': 1.0, 'done': 1.0, 'emphysema': 1.0, 'emphysematous': 1.0, 'enhanced': 1.0, 'enlarged': 1.0, 'follow': 1.0, 'great': 1.0, 'heart': 1.0, 'hemithorax': 1.0, 'hilar': 1.0, 'however': 1.0, 'hypodense': 1.0, 'increased': 1.0, 'ipsilateral': 1.0, 'largest': 1.0, 'left': 1.0, 'lesion': 1.0, 'liver': 1.0, 'lobe': 1.0, 'loss': 1.0, 'lower': 1.0, 'lung': 1.0, 'lymphatic': 1.0, 'main': 1.0, 'mass': 1.0, 'measuring': 1.0, 'mediastinal': 1.0, 'mediastinum': 1.0, 'node': 1.0, 'nodule': 1.0, 'normal': 1.0, 'noted': 1.0, 'paraseptal': 1.0, 'paratracheal': 1.0, 'pedal': 1.0, 'perfo

In [8]:
import pandas as pd
feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(X.T.todense(), index=feature_names, columns=LstOfFile)

In [9]:
df.to_csv('Results/ranking.csv', sep=',', encoding='utf-8')
df

Unnamed: 0,Data/radlexsnomedrxnormtest.xml
,0.065063
<comma>,0.214874
<period>,0.256635
adenopathy,0.065063
along,0.110160
also,0.110160
ap,0.065063
appear,0.136541
bone,0.065063
bronchus,0.065063
