# Individual Task: Information Retrieval

#### General Functions 

- parse the xml files 
- pre-process the xml files


In [1]:
import os
from xml.dom import minidom

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\antho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\antho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The Following Functions will get a filename which will then parse that particular file and return the data from it. 

In [37]:
docsPathName = "./WES-Dataset/docs/"

def parseDocument(filename):
    content = processDocument(filename)
    data = tokenizeDocuments(content)
    data = casefoldDocuments(data)
    data = stopwordRemoval(data)
    data = stemDocument(data)

    return data

def processDocument(filename):
   
    #doc = minidom.parse(pathName + '/' + filename)
    doc = minidom.parse(filename)
    name = doc.getElementsByTagName("raw")[0]
    content = name.firstChild.data

    return content 

def tokenizeDocuments(content):

    tokens = word_tokenize(content)

    return tokens

def casefoldDocuments(tokens):

    caseFold = []

    for token in tokens:
        caseFold.append(token.casefold())

    return caseFold

def stopwordRemoval(caseFold):

    stop_words = set(stopwords.words('english'))

    stopWordsRemoved = []

    for word in caseFold: 
        if word not in stop_words:
            stopWordsRemoved.append(word)

    return stopWordsRemoved
    
def stemDocument(stopWordsRemoved):

    porter = PorterStemmer()

    stemmed = []

    for term in stopWordsRemoved:
        stemmed.append(porter.stem(term))   
    
    return stemmed


#### Building the TF IDF 

- TFIDF = TF(term in a document) * IDF(term)
- TF() = Number of times a term appeared in a document / total terms in that document
- IDF() = Log base 2 (Total Number of Documents / Number of Documents with the term in it)

#### To Start Off we need to prepare the following:
1. Word Count - In how many documents the word is.
2. TF - We will pass the Document and the Word and return the Occurance / N 
3. IDF - We will pass a word and return np.log(total number of docs / word occurance)
4. TFIDF - Combining. we will pass the documents then we will call the TF and IDF 

For Testing Purposes, we will be using 2 Documents


### Let's set the Dictionary which will contain all of the unique words. 

In [38]:
# Parsed Documents - This inclues tokenization, case-folding, stop-word removal and stemming
documents = []

# Unique Terms
dictionary = [] 

# This is the Dataset - Set to two docs for debuging purposing as faster processing times. 
docs = os.listdir(docsPathName)

for doc in docs: 
    filename = "" 
    filename = docsPathName + doc
    tcss = parseDocument(filename)
    documents.append(tcss)

    # Loop to get the Unique Terms
    for term in tcss:
        if term not in dictionary:
            dictionary.append(term)

glossary = {}

termNum = 0

for word in dictionary:
    glossary[word] = termNum
    termNum += 1

Now that we have retrieved all the information neccessary, we can proceed with building the TFIDF

DF will get the amount of documents that a particular word is found in x amount of docs

In [8]:
def getDF():
    docFreq = {}

    for term in dictionary:
        docFreq[term] = 0

        for doc in documents:
            if term in doc:
                docFreq[term] += 1
    
    return docFreq

DF = getDF()

def getTF(doc, term):

    numOfWords = len(doc)

    occurance = len([token for token in doc if token == term])

    TF = occurance/numOfWords

    return TF

def getIDF(term):

    try: 
        occurance = DF[term] + 1
    except:
        occurance = 1
    
    IDF = np.log2(len(documents)/float(occurance))

    #print("Term: ", term, " Doc: ", DF[term], " IDF: ", IDF)
    
    return IDF



In [43]:
def computeTFIDF(doc):

    TFIDF = np.zeros((len(dictionary),))

    #print(TFIDF)

    for term in doc:
        vecTF = getTF(doc, term)
        vecIDF = getIDF(term)
        TFIDF[glossary[term]] = vecTF * vecIDF

    return TFIDF

# Storing the DOC TFIDF 

docTFIDF = []

for doc in documents:
    vector = computeTFIDF(doc)
    docTFIDF.append(vector)

# Part 2: Cosine Similarity

We are to take 2 inputs: Vector A and Vector B which will work out the dot product and return the cosine similarity

In [55]:
from numpy import linalg 

def getCS(vecA, vecB):
    
    CS = np.dot(vecA, vecB) / np.linalg.norm(vecA) * np.linalg.norm(vecA) 

    return CS


In [56]:
def part2(queryPath):

    query = parseDocument(queryPath)
    print("Query: ", query)

    queryTFIDF = computeTFIDF(query)

    return queryTFIDF




In [97]:
queryPathName = "./WES-Dataset"
query = queryPathName + "/queries/wes2015.q02.naf"

vectorTFIDF = part2(query)

#print(len(vectorTFIDF))
#for i in vectorTFIDF:
    #print(i, end = ' ')

cosSim = getCS(docTFIDF, vectorTFIDF)
#print(len(cosSim))
#print(cosSim)
#print(type(cosSim))

cosSimDict = cosSim.tolist()

key_value = {}

for i in range(len(cosSim)):
    key_value[i] = cosSim[i]

def docRank(docNum):
    for i in range(len(docs)):
        if i == docNum-1:
            print(processDocument(docsPathName + docs[i]) + "\n")

sorted_kv = sorted(key_value.items(), key = lambda kv:(kv[1], kv[0]), reverse=True) 

topThree = list(sorted_kv)[:3]

for i in range(3):
    for x in range(1):
        docRank(topThree[i][x])


Query:  ['famou', 'german', 'poetri']
The First American to walk in Space – Edward White.

Edward White during Gemini 4 performing EVA.  On June, 3, 1965, Edward Higgins White becomes the first American to “walk” in space in the course of the Gemini 4 space mission. White is one of the three U.S. astronauts, who died along with his fellow astronauts Virgil “Gus” Grissom and Roger B. Chaffee during prelaunch testing for the first manned Apollo mission at Cape Canaveral. Edward White earned his Bachelor degree at the U.S. Military Academy and he was commissioned as a 2nd Lieutenant in the Air Force. After he had some experience in flight, he spent over three years in West Germany flying in the defense of NATO. White’s education in aeronautical engineering began in 1958 at the University of Michigan and he was appointed test pilot shortly after, gaining over 3000 flight hours with the Air Force. Project Gemini 4 became the very first multi-day space flight by the United States. The main o