In [2]:
#Python code to illustrate parsing of XML files
# importing the required modules
import csv
import requests
import xml.etree.ElementTree
import pickle
import glob

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/carlosm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/carlosm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def GetString(loc):
    strg = []
    e = xml.etree.ElementTree.parse(loc)
    root =e.getroot()
    dom = e.findall('uima.cas.Sofa')
    for atype in dom:
        strg.append(atype.get('sofaString'))
    return strg


def preprocess(text):

    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()
    tokens = []
#     Stop Words removal
    words = [w for w in words if not w in stop_words] 
    for word in words:
#         Word Lemmatizer 
        word = wordnet_lemmatizer.lemmatize(word)
#         Word Stemming
        word = snowball_stemmer.stem(word)
        try:
            float(word)
            pass
        except:
            flag = 0
            for letters in word:
                if letters.lower() in '0123456789@#$%^&*(){}[];:=- /':
                    flag = 1
                    break
                    
            if flag == 0:
                tokens.append(word.lower())
    
    return tokens

loc = "../Data/*.xml"
processed_loc = "Processed/"
All_xml = glob.iglob(loc, recursive=True)
LstOfXMLTokens = []
LstOfFile = []
for j, filename in enumerate(All_xml):
    LstOfFile.append(filename)
    strg = GetString(filename)[0]
    LstOfXMLTokens.append(preprocess(strg))
    


In [5]:
tokens_final = []

for tokens_list in LstOfXMLTokens:
    strg = ''
    for tokens in tokens_list:
        tokens = " " + tokens
        strg += tokens
    tokens_final.append(strg.lower())
print (len(tokens_final))
print (tokens_final)

1
[' cect scan thorax <period> <period> contrast enhanc scan thorax perform <period> case ca lung <comma> follow <comma> comparison done previous ct date <period> volum loss note right hemithorax collaps upper lobe ipsilater shift mediastinum seen previous seen right hilar mass distal collaps consolid slight increas previous scan <period> howev mass cannot well differenti <period> subcm nodul right lower appear unchang <period> sever centilobular emphysemat chang also seen lung <comma> predomin upper lobe along parasept emphysema unchang enlarg node seen superior mediastinum <comma> right paratrach <comma> ap window <comma> prevascular region <comma> largest measur <period> x <period> unchang compar previous scan <period> pleural space clear <period> trachea left main stem bronchus normal <period> heart mediastin great vessel appear normal <period> subcm hypodens lesion seen liver unchang <period> destruct bone lesion <period> case ca lung <comma> follow <comma> comparison done previou

## Idf score tells in how many documents has a word occoured

## tf score tells in how many times occours in a single document

## Therefore TFIDF ranks a word by how many times it occours in a perticular document and how many times is the word found in all documents


## For more detailed description please refer : https://github.com/anujgupta82/Representation-Learning-for-NLP/blob/master/module1/TF-IDF.ipynb

### Advantages : It helps in capturing short forms, for eg. If a doctor writes lung cancer as lg_cancer and uses the same multiple times in the same document 


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tokenize = lambda doc: doc.lower().split(" ")

vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
X = vectorizer.fit_transform(tokens_final)
idf = vectorizer.idf_


In [7]:
print (dict(zip(vectorizer.get_feature_names(), idf)))

{'': 1.0, '<comma>': 1.0, '<period>': 1.0, 'adenopathi': 1.0, 'along': 1.0, 'also': 1.0, 'ap': 1.0, 'appear': 1.0, 'bone': 1.0, 'bronchus': 1.0, 'ca': 1.0, 'cannot': 1.0, 'case': 1.0, 'cect': 1.0, 'centilobular': 1.0, 'chang': 1.0, 'clear': 1.0, 'collaps': 1.0, 'compar': 1.0, 'comparison': 1.0, 'consolid': 1.0, 'contrast': 1.0, 'cool': 1.0, 'ct': 1.0, 'date': 1.0, 'destruct': 1.0, 'differenti': 1.0, 'distal': 1.0, 'done': 1.0, 'emphysema': 1.0, 'emphysemat': 1.0, 'enhanc': 1.0, 'enlarg': 1.0, 'follow': 1.0, 'great': 1.0, 'heart': 1.0, 'hemithorax': 1.0, 'hilar': 1.0, 'howev': 1.0, 'hypodens': 1.0, 'increas': 1.0, 'ipsilater': 1.0, 'largest': 1.0, 'left': 1.0, 'lesion': 1.0, 'liver': 1.0, 'lobe': 1.0, 'loss': 1.0, 'lower': 1.0, 'lung': 1.0, 'lymphat': 1.0, 'main': 1.0, 'mass': 1.0, 'measur': 1.0, 'mediastin': 1.0, 'mediastinum': 1.0, 'node': 1.0, 'nodul': 1.0, 'normal': 1.0, 'note': 1.0, 'parasept': 1.0, 'paratrach': 1.0, 'pedal': 1.0, 'perform': 1.0, 'pleural': 1.0, 'predomin': 1.0, 'p

In [8]:
import pandas as pd
feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(X.T.todense(), index=feature_names, columns=LstOfFile)

In [9]:
df.to_csv('../Results/ranking.csv', sep=',', encoding='utf-8')
df

Unnamed: 0,../Data/radlexsnomedrxnormtest.xml
,0.065200
<comma>,0.215330
<period>,0.257179
adenopathi,0.065200
along,0.110394
also,0.110394
ap,0.065200
appear,0.136830
bone,0.065200
bronchus,0.065200
