In [2]:
import re
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
#perform preprocessing

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: the stop-word list, the
# tokenizer model, the WordNet data and the part-of-speech tagger.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Maps the first letter of a POS tag to the WordNet part-of-speech constant
# passed to the lemmatizer.
tag_dict = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}

[nltk_data] Downloading package stopwords to C:\Users\Shripad
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Shripad
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Shripad
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shripad Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [195]:
# Collecting all the documents for preprocessing

In [3]:
import os
# Root folder of the Telegraph documents. Built with os.path.join so the
# separator is correct on every OS; on Windows this yields the same string
# as the previous hard-coded 'FIRE_Dataset_EN_2010\\TELEGRAPH_UTF8'.
path = os.path.join('FIRE_Dataset_EN_2010', 'TELEGRAPH_UTF8')

# Walk the whole tree below `path` and keep the full path of every file
# whose name contains '.utf8'.
files = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk(path)
    for file_name in file_names
    if '.utf8' in file_name
]



In [4]:
# Number of document paths collected in `files`.
len(files)

125586

In [5]:
#Function to parse the xml documents
def parseXML(xmlFile):
    """Read one document file and return its id and body text.

    Only the direct children of the root element are inspected: the text of a
    ``DOCNO`` child becomes the id and the text of a ``TEXT`` child becomes
    the body. If a tag occurs more than once, the last occurrence wins.

    Args:
        xmlFile: path or file object accepted by ``ET.parse``.

    Returns:
        Tuple ``(document_id, document_text)``. Both are always strings; a
        missing or empty element yields ``""``.

    Raises:
        Whatever ``ET.parse`` raises for unreadable or malformed input.
    """
    root = ET.parse(xmlFile).getroot()
    document_id = ""
    document_text = ""
    for elem in root:
        if elem.tag == "DOCNO":
            # elem.text is None for an empty element; fall back to "" so
            # callers can always treat the result as a string.
            document_id = elem.text or ""
        elif elem.tag == "TEXT":
            document_text = elem.text or ""

    return document_id, document_text

In [6]:
    def removeTagsPunctuation(text):
        """Replace punctuation, newlines and digits with spaces.

        Every occurrence of one of the characters
        . ? : , / ( ) ! < $ % * - + > [ ] ; and both quote characters,
        every newline and every digit 0-9 is replaced by a single space, so
        the result has the same length as the input. All other characters
        are left untouched.

        Args:
            text: string to clean.

        Returns:
            The cleaned string.
        """
        # Raw string literal: the previous non-raw pattern relied on invalid
        # string escapes (backslash-dot, backslash-question-mark, ...), which
        # newer Python versions warn about. Digits are folded into the same
        # character class, which gives the same result as a second pass.
        return re.sub(r"[.?:,\n/()!<$%*\-+>\"'\[\];0-9]", " ", text)
    
    def decontracted(phrase):
        """Expand common English contractions in ``phrase``.

        The patterns are lowercase, so the text should be lowercased before
        this is called (the preprocessing loops in this notebook do that).
        Note that the apostrophe-s rule also rewrites possessives
        ("john's" becomes "john is").

        Args:
            phrase: text that may contain contractions.

        Returns:
            The text with contractions replaced by their expanded forms.
        """
        # Irregular negations must be handled before the generic "n't" rule,
        # which would otherwise produce "wo not", "ca not" and "sha not".
        irregular = (
            (r"won\'t", "will not"),
            (r"can\'t", "can not"),
            (r"shan\'t", "shall not"),
        )
        # Generic suffix rules, applied in this order.
        generic = (
            (r"n\'t", " not"),
            (r"\'re", " are"),
            (r"\'s", " is"),
            (r"\'d", " would"),
            (r"\'ll", " will"),
            (r"\'t", " not"),
            (r"\'ve", " have"),
            (r"\'m", " am"),
        )
        for pattern, expansion in irregular + generic:
            phrase = re.sub(pattern, expansion, phrase)
        return phrase
    
    def removeStopWordsAndLemmatize(text):
        """Lemmatize every token of ``text`` and drop English stop words.

        The text is tokenized and POS-tagged. Each token is lemmatized with
        the WordNet part of speech looked up in ``tag_dict`` from the first
        letter of its tag (noun when the letter is not in the table). Lemmas
        that appear in NLTK's English stop-word list are then removed.

        Args:
            text: string to process.

        Returns:
            The remaining lemmas joined by single spaces.
        """
        english_stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)

        # Lemmatize using the POS tag attached to each token.
        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for token, pos_tag in nltk.pos_tag(tokens):
            wordnet_pos = tag_dict.get(pos_tag[0].upper(), wordnet.NOUN)
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

        # Stop words are filtered after lemmatization.
        kept = [lemma for lemma in lemmas if lemma not in english_stop_words]
        return " ".join(kept)

In [None]:
# Parse every collected file into two parallel lists: document ids and texts.
doc_ids = []
doc_texts = []

for fileName in files:
    try:
        doc_id, doc_text = parseXML(fileName)
    except Exception as error:
        # Skip files that cannot be parsed and say which one failed.
        # Previously execution fell through to the appends below, which
        # re-added the values left over from the previous file (or raised
        # NameError when the very first file failed).
        print("error", fileName, error)
        continue
    doc_ids.append(doc_id)
    doc_texts.append(doc_text)

In [None]:
# Build the raw document frame: one row per parsed file, pairing each DOCNO
# with its unprocessed text.
document_rows = list(zip(doc_ids, doc_texts))
data = pd.DataFrame(document_rows, columns=['DOCNO', 'TEXT'])

In [None]:
# Preview the first rows of the raw document frame.
data.head()

In [9]:
# Preprocess every document. Per text the pipeline is:
# lowercase -> expand contractions -> blank out punctuation/digits ->
# lemmatize and drop stop words.
preprocessed_text_column = [
    removeStopWordsAndLemmatize(
        removeTagsPunctuation(decontracted(document_text.lower()))
    )
    for document_text in data['TEXT']
]

In [10]:
# Frame of cleaned documents: the document ids paired with the preprocessed text.
preprocessed_rows = list(zip(doc_ids, preprocessed_text_column))
preprocessed_data = pd.DataFrame(preprocessed_rows, columns=['DOCNO', 'TEXT'])

In [33]:
#Function to parse all the queries
def parseQueryXML(xml_file):
    """Collect query numbers and descriptions from a topics file.

    For every ``top`` element directly under the root, the text of each
    ``num`` child is appended to the list of numbers and the text of each
    ``desc`` child to the list of descriptions. Other children are ignored.

    Args:
        xml_file: path or file object accepted by ``ET.parse``.

    Returns:
        Tuple ``(query_numbers, query_descriptions)`` of two lists, filled in
        document order.
    """
    query_numbers = []
    query_descriptions = []
    # Route each interesting child tag to the list that collects it.
    collectors = {"num": query_numbers, "desc": query_descriptions}

    for topic in ET.parse(xml_file).getroot().findall("top"):
        for field in topic:
            target = collectors.get(field.tag)
            if target is not None:
                target.append(field.text)

    return query_numbers, query_descriptions

In [34]:
# Parse the topics file. os.path.join keeps the path valid on every OS; on
# Windows it produces the same string as the previous hard-coded separator.
query_numbers, query_descriptions = parseQueryXML(
    os.path.join('FIRE_Dataset_EN_2010', 'en.topics.76-125.2010.txt')
)

In [37]:
# Preprocess the query descriptions with the same pipeline as the documents:
# lowercase -> expand contractions -> blank out punctuation/digits ->
# lemmatize and drop stop words.
query_desc_preprocessed = [
    removeStopWordsAndLemmatize(
        removeTagsPunctuation(decontracted(description.lower()))
    )
    for description in query_descriptions
]
    

In [14]:
# (rows, columns) of the preprocessed document frame.
preprocessed_data.shape

(125586, 2)

In [38]:
# Frame of queries: each query number paired with its preprocessed description.
query_rows = list(zip(query_numbers, query_desc_preprocessed))
query_data = pd.DataFrame(query_rows, columns=['query_no', 'query_desc'])

In [39]:
# Preview the first rows of the query frame.
query_data.head()

Unnamed: 0,query_no,query_desc
0,76,reason behind protest meena leader inclusion g...
1,77,attack hezbollah guerrilla indian israeli force
2,78,conflict ashok singhal president vishwa hindu ...
3,79,plan build road china mount everest
4,80,initiation legal proceeding advani involvement...


In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Creating tfidf matrix for all the documents
tfIdfVectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and IDF weights and builds the document
# matrix in a single pass; the separate fit() + transform() calls processed
# the whole corpus twice for the same result. The fitted vectorizer is kept
# so the queries can be projected into the same term space.
tfidf_matrix = tfIdfVectorizer.fit_transform(preprocessed_data['TEXT'].values)

In [103]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
tfidf_matrix.shape

(125586, 221939)

In [234]:
#Performing truncated SVD decomposition with only 250 components
from scipy.sparse.linalg import svds
# Decompose the transposed TF-IDF matrix (terms as rows, documents as columns),
# keeping only the requested number of singular triplets.
N_SVD_COMPONENTS = 250
U, s, V = svds(tfidf_matrix.T, k=N_SVD_COMPONENTS)

In [235]:
# Shapes of the three SVD factors: term vectors, singular values, document vectors.
U.shape,s.shape,V.shape

((221939, 250), (250,), (250, 125586))

In [104]:
# Vectorise the first five query descriptions with the vectorizer fitted on
# the documents, so queries and documents share the same term space.
first_five_descriptions = query_data.head(5)['query_desc']
query_tfidf = tfIdfVectorizer.transform(first_five_descriptions.values)

In [105]:
# (number of queries, vocabulary size) of the query TF-IDF matrix.
query_tfidf.shape

(5, 221939)

In [236]:
from numpy.linalg import inv
# Diagonal matrix of the singular values.
s_diag = np.diag(s)
# The inverse of a diagonal matrix is the diagonal matrix of reciprocals, so
# it is built directly instead of running a general dense inversion on s_diag.
s_inverse = np.diag(1.0 / s)

In [237]:
# Fold each query into the reduced space: q_k = q . U . S^-1.
# Every entry is a dense 2-D array with a single row.
truncated_query = [
    np.dot(np.dot(query_row.toarray(), U), s_inverse)
    for query_row in query_tfidf
]
    

In [238]:
# Transpose V so that each row holds the reduced vector of one document.
v_T = V.transpose()

In [239]:
# (number of documents, number of SVD components) after the transpose.
v_T.shape

(125586, 250)

In [240]:
from sklearn.metrics.pairwise import cosine_similarity

#performing similarity check between queries and documents and finding top 5 relevant documents

TOP_K = 5  # number of best-scoring documents kept per query

# Maps query number -> [preprocessed query text, retrieved DOCNOs, retrieved texts].
query_relevant_documents = dict()
for k, query in enumerate(truncated_query):
    # One call scores the query against every row of v_T; [0] flattens the
    # single-row result into a score vector. The previous version called
    # cosine_similarity once per document inside a Python loop and sorted
    # lists holding 1x1 arrays.
    scores = cosine_similarity(query, v_T)[0]

    # Stable ascending argsort, reversed: highest score first, and equal
    # scores ordered by descending row number, which is how the previous
    # sort(reverse=True) over [score, index] pairs ordered ties.
    doc_indices = np.argsort(scores, kind='stable')[::-1][:TOP_K]

    query_text = query_desc_preprocessed[k]
    query_num = query_data['query_no'][k]
    # Row positions in v_T follow the row order of `data`, so select by position.
    relevant_docs = data['TEXT'].iloc[doc_indices]
    relevant_doc_ids = data['DOCNO'].iloc[doc_indices]
    query_relevant_documents[query_num] = [query_text, relevant_doc_ids, relevant_docs]

In [187]:
# Read every line of the relevance-judgement (qrels) file. The with-statement
# closes the handle once the lines are in memory (it was left open before),
# and os.path.join keeps the path valid on every OS.
with open(os.path.join("FIRE_Dataset_EN_2010", "en.qrels.76-125.2010.txt"), "r") as f:
    l = f.readlines()

In [188]:
# Actual relevant documents for each of the queries

# Highest query number to keep; judgements for later queries are ignored.
LAST_EVALUATED_QUERY = 80

# Maps query number (as a string, as read from the file) -> list of document
# ids judged relevant. Each line is split on whitespace: field 0 is the query
# number, field 2 the document id and the last field the relevance flag.
query_rel_doc = dict()
for line in l:
    q_list = line.split()
    if len(q_list) < 3:
        # Blank or truncated line: nothing to record.
        continue
    # Compare numerically; the previous string comparison against '80' only
    # worked while every query number had exactly two digits.
    if int(q_list[0]) > LAST_EVALUATED_QUERY:
        continue
    # Read the flag from the last field rather than from a fixed character
    # position, which broke on a final line without a trailing newline.
    if q_list[-1] == '1':
        query_rel_doc.setdefault(q_list[0], []).append(q_list[2])
    
    

In [241]:

# Report, per query, the documents LSI retrieved, the documents judged
# relevant, and the overlap between the two.
for query_no, retrieved_entry in query_relevant_documents.items():
    retrieved_ids = retrieved_entry[1]

    print("Query Number: ",query_no)
    print("Documents retrieved using LSI: ")
    print(retrieved_ids)

    print("\nRelevant documents: ")
    judged_relevant = query_rel_doc[query_no]
    for relevant_id in judged_relevant:
        print(relevant_id)

    # Documents that are both retrieved and judged relevant.
    hits = set(retrieved_ids).intersection(set(judged_relevant))
    print("\nNumber of relevant documents retrieved: ",len(hits))
    print("Relevant documents retrieved using LSI:")
    for hit in hits:
        print(hit)

    print("\n")
    print("*"*100)
    

Query Number:  76
Documents retrieved using LSI: 
79779        1060703_nation_story_6430226.utf8
81176        1060922_nation_story_6778378.utf8
79019        1060521_nation_story_6250723.utf8
79589        1060622_nation_story_6385113.utf8
110663    1070604_frontpage_story_7872841.utf8
Name: DOCNO, dtype: object

Relevant documents: 
1070530_nation_story_7849973.utf8
1070602_nation_story_7865940.utf8
1070603_nation_story_7869357.utf8
1070611_nation_story_7906812.utf8

Number of relevant documents retrieved:  0
Relevant documents retrieved using LSI:


****************************************************************************************************
Query Number:  77
Documents retrieved using LSI: 
75502    1060720_frontpage_story_6501523.utf8
84389      1060724_opinion_story_6493371.utf8
35093      1051229_foreign_story_5657775.utf8
73002      1060714_foreign_story_6477224.utf8
73027      1060717_foreign_story_6488684.utf8
Name: DOCNO, dtype: object

Relevant documents: 
1050110_foreig

# Query No. 77: 3 relevant documents were retrieved.
# Query No. 79: 1 relevant document was retrieved.
# These results were obtained with LSI implemented as a truncated SVD of the TF-IDF matrix with 250 components.