In [71]:
import os
import glob

# Collecting dataset

In [72]:
all_text = glob.glob('/home/deepcompute/ann_mary_shaju/IR/Entertainment/*.txt')

In [73]:
print(len(all_text))

1053


In [74]:
data = {}
i = 0
for text in all_text:
    with open(text, "r", encoding = "latin1") as file:
        data[i] = file.readlines()
        i = i + 1
    data[i-1] = [" ".join(data[i-1]), i]

In [75]:
import pandas

In [76]:
data = pandas.DataFrame(data).T
data.columns = ['Text', 'Doc ID']

In [77]:
data.shape

(1053, 2)

In [78]:
data.head(10)

Unnamed: 0,Text,Doc ID
0,\n In article <artmel.735538777@well.sf.ca.us>...,1
1,Distribution: na\n Message-ID: <1r77ph$66i@acc...,2
2,"\n In article <C61rDq.5v5@chinet.chi.il.us>, s...",3
3,\n In article <strnlghtC5t3K6.InF@netcom.com> ...,4
4,\n : There are chips which perform the voice c...,5
5,\n In <C5x2xs.EF0@lerami.lerctr.org> merlin@le...,6
6,"\n For example, I don't own a cordless phon...",7
7,Distribution: world\n Message-ID: <1r1om5$c5m@...,8
8,\n In article <1qnmnp$db8@sol.TIS.COM> mjr@tis...,9
9,\n \n In article <strnlghtC5yBKA.Dp5@netcom.co...,10


In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053 entries, 0 to 1052
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    1053 non-null   object
 1   Doc ID  1053 non-null   object
dtypes: object(2)
memory usage: 24.7+ KB


# Data Processing

In [80]:
unique = list(data.Text.unique())
len(unique)

1048

In [81]:
data.sort_values("Text", inplace = True)
data.drop_duplicates(subset ="Text", keep = "first", inplace = True)
data.shape

(1048, 2)

In [82]:
data.head(10)

Unnamed: 0,Text,Doc ID
188,,189
606,\n,607
66,\n \t\tI think I should also point out that th...,67
646,\n \t The points raised about checking what i...,647
419,"\n \tActually, many of us have noted this. We ...",420
518,\n \tDoes anyone out there know of any ftp sit...,519
264,\n \tEven more interesting: the SMTP server at...,265
232,"\n \tFrom: ""dan mckinnon"" <dan.mckinnon@canrem...",233
978,\n \tFrom: Marc VanHeyningen <mvanheyn@cs.indi...,979
313,\n \tFrom: andersom@spot.Colorado.EDU (Marc An...,314


In [83]:
# Removing html tags
from bs4 import BeautifulSoup
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

data['Text'] = data['Text'].apply(lambda x: strip_html(x))
data.head(10)

Unnamed: 0,Text,Doc ID
188,,189
606,\n,607
66,\n \t\tI think I should also point out that th...,67
646,\n \t The points raised about checking what i...,647
419,"\n \tActually, many of us have noted this. We ...",420
518,\n \tDoes anyone out there know of any ftp sit...,519
264,\n \tEven more interesting: the SMTP server at...,265
232,"\n \tFrom: ""dan mckinnon"" \n \n \t I have lu...",233
978,\n \tFrom: Marc VanHeyningen \n \n \tThe major...,979
313,\n \tFrom: andersom@spot.Colorado.EDU (Marc An...,314


In [84]:
# Removing numbers
import re
def remove_numbers(text):
  text = re.sub(r'\d+', '', text)
  return text

data['Text'] = data['Text'].apply(lambda x: remove_numbers(x))
data.head(10)

Unnamed: 0,Text,Doc ID
188,,189
606,\n,607
66,\n \t\tI think I should also point out that th...,67
646,\n \t The points raised about checking what i...,647
419,"\n \tActually, many of us have noted this. We ...",420
518,\n \tDoes anyone out there know of any ftp sit...,519
264,\n \tEven more interesting: the SMTP server at...,265
232,"\n \tFrom: ""dan mckinnon"" \n \n \t I have lu...",233
978,\n \tFrom: Marc VanHeyningen \n \n \tThe major...,979
313,\n \tFrom: andersom@spot.Colorado.EDU (Marc An...,314


In [85]:
# converting text to lower
data['Text'] = data.apply(lambda row: row['Text'].lower(), axis=1) 

In [86]:
import nltk
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')  
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords 
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/deepcompute/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/deepcompute/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/deepcompute/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [87]:
# Tokenization of data
data['Text'] = data.apply(lambda row: word_tokenize(row['Text']), axis=1) 

In [88]:
# Removing non-ASCII characters from list of tokenized words
import unicodedata

def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

data['Text'] = data.apply(lambda row: remove_non_ascii(row['Text']), axis=1)
data.head(10)

Unnamed: 0,Text,Doc ID
188,[],189
606,[],607
66,"[i, think, i, should, also, point, out, that, ...",67
646,"[the, points, raised, about, checking, what, i...",647
419,"[actually, ,, many, of, us, have, noted, this,...",420
518,"[does, anyone, out, there, know, of, any, ftp,...",519
264,"[even, more, interesting, :, the, smtp, server...",265
232,"[from, :, ``, dan, mckinnon, '', i, have, lurk...",233
978,"[from, :, marc, vanheyningen, the, majority, o...",979
313,"[from, :, andersom, @, spot.colorado.edu, (, m...",314


In [89]:
# Removing punctuations and special characters

import re
def remove_punctuations(words):
    """Remove punctuations from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


data['Text'] = data.apply(lambda row: remove_punctuations(row['Text']), axis=1)
data.head(10)

Unnamed: 0,Text,Doc ID
188,[],189
606,[],607
66,"[i, think, i, should, also, point, out, that, ...",67
646,"[the, points, raised, about, checking, what, i...",647
419,"[actually, many, of, us, have, noted, this, we...",420
518,"[does, anyone, out, there, know, of, any, ftp,...",519
264,"[even, more, interesting, the, smtp, server, a...",265
232,"[from, dan, mckinnon, i, have, lurked, here, a...",233
978,"[from, marc, vanheyningen, the, majority, of, ...",979
313,"[from, andersom, spotcoloradoedu, marc, anders...",314


In [90]:
# Removing stop words
def remove_stopwords(words):
    new_words = []
    for word in words:
        if word not in stop:
            new_words.append(word)
    return new_words

data['Text'] = data.apply(lambda row: remove_stopwords(row['Text']), axis=1)
data.head(10)

Unnamed: 0,Text,Doc ID
188,[],189
606,[],607
66,"[think, also, point, mystical, des, engines, k...",67
646,"[points, raised, checking, actually, chip, opp...",647
419,"[actually, many, us, noted, noted, program, st...",420
518,"[anyone, know, ftp, sites, deal, electronics, ...",519
264,"[even, interesting, smtp, server, csrcncslnist...",265
232,"[dan, mckinnon, lurked, bit, lately, though, m...",233
978,"[marc, vanheyningen, majority, discussion, inv...",979
313,"[andersom, spotcoloradoedu, marc, anderson, al...",314


In [91]:
# Lemmatizing data

from nltk.stem.wordnet import WordNetLemmatizer         
lemmatizer = WordNetLemmatizer()
def lemmatize_list(words):
    new_words = []
    for word in words:
        new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return new_words

data['Text'] = data.apply(lambda row: lemmatize_list(row['Text']), axis=1)
data.head(10)

Unnamed: 0,Text,Doc ID
188,[],189
606,[],607
66,"[think, also, point, mystical, des, engines, k...",67
646,"[point, raise, check, actually, chip, oppose, ...",647
419,"[actually, many, us, note, note, program, star...",420
518,"[anyone, know, ftp, sit, deal, electronics, pr...",519
264,"[even, interest, smtp, server, csrcncslnistgov...",265
232,"[dan, mckinnon, lurk, bite, lately, though, ma...",233
978,"[marc, vanheyningen, majority, discussion, inv...",979
313,"[andersom, spotcoloradoedu, marc, anderson, al...",314


In [93]:
# Removing empty texts
for i in range(0,len(data['Text'])-1):
    if len(data['Text'].iloc[i]) == 0:
        data = data.drop(data.index[i])

data.head(10)

Unnamed: 0,Text,Doc ID
66,"[think, also, point, mystical, des, engines, k...",67
646,"[point, raise, check, actually, chip, oppose, ...",647
419,"[actually, many, us, note, note, program, star...",420
518,"[anyone, know, ftp, sit, deal, electronics, pr...",519
264,"[even, interest, smtp, server, csrcncslnistgov...",265
232,"[dan, mckinnon, lurk, bite, lately, though, ma...",233
978,"[marc, vanheyningen, majority, discussion, inv...",979
313,"[andersom, spotcoloradoedu, marc, anderson, al...",314
452,"[brad, clarinetcom, brad, templeton, let, assu...",453
365,"[pmetzger, snarkshearsoncom, perry, e, metzger...",366


In [94]:
# Normalizing the words using porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemming(words):
    new_words = []
    for word in words:
        new_words.append(ps.stem(word))
        new_words.append(word)
    return new_words

data['Text'] = data.apply(lambda row: stemming(row['Text']), axis=1)
data.head(10)

Unnamed: 0,Text,Doc ID
66,"[think, think, also, also, point, point, mysti...",67
646,"[point, point, rais, raise, check, check, actu...",647
419,"[actual, actually, mani, many, us, us, note, n...",420
518,"[anyon, anyone, know, know, ftp, ftp, sit, sit...",519
264,"[even, even, interest, interest, smtp, smtp, s...",265
232,"[dan, dan, mckinnon, mckinnon, lurk, lurk, bit...",233
978,"[marc, marc, vanheyningen, vanheyningen, major...",979
313,"[andersom, andersom, spotcoloradoedu, spotcolo...",314
452,"[brad, brad, clarinetcom, clarinetcom, brad, b...",453
365,"[pmetzger, pmetzger, snarkshearsoncom, snarksh...",366


# Boolean Retrieval

## Inverted index construction

In [95]:
inverted_index = {}

for index,row in data.iterrows():
    unique_terms_in_document = set(row['Text'])
    for term in unique_terms_in_document:
        if term not in inverted_index.keys():
            inverted_index[term] = [row['Doc ID']]
        else:
            inverted_index[term].append(row['Doc ID'])

In [96]:
# Method to process the given query
def processing_query(query):
    infix_tokens=word_tokenize(query)
    precedence = {}
    precedence['NOT'] = 3
    precedence['AND'] = 3
    precedence['OR'] = 3
    precedence['('] = 2
    precedence[')'] = 1
              
    output = []
    operator_stack = []
 
    for token in infix_tokens:
        if (token.upper() == '('):
            operator_stack.append(token)
        elif (token.upper() == ')'):
            operator = operator_stack.pop()
            while operator != '(':
                output.append(operator)
                operator = operator_stack.pop()
 
        elif (token.upper() in precedence):
            if (operator_stack):
                current_operator = operator_stack[-1]
                while (operator_stack and precedence[current_operator.upper()] > precedence[token.upper()]):
                    output.append(operator_stack.pop())
                    if (operator_stack):
                        current_operator = operator_stack[-1]
            operator_stack.append(token)
        else:
            output.append(token.lower())
    while (operator_stack):
        output.append(operator_stack.pop())
    return output

In [97]:
# Boolean retrieval implementation
def boolean_retrieval(postfix):
    boolean_op = ['AND', 'OR', 'NOT']
    doc_list = []
    query_list = []
    for exp in postfix:
        if exp in boolean_op:
            if query_list or exp == 'NOT':
                output = []
                if exp == 'NOT':
                    if query_list:
                        query = query_list.pop()
                        output = list(set(data['Doc ID'].tolist())-set(inverted_index[query]))
                    else:
                        doc_list = list(set(data['Doc ID'].tolist())-set(doc_list))
                else:
                    query1 = query_list.pop()
                    if query_list:
                        query2 = query_list.pop()
                        if exp == 'AND':
                            output = list(set(inverted_index[query1]) & set(inverted_index[query2]))
                        if exp == 'OR':
                            output = list(set(inverted_index[query1]) | set(inverted_index[query2]))
                        doc_list = doc_list + output
                    else:
                        if exp == 'AND':
                            doc_list = list(set(doc_list) & set(inverted_index[query1]))
                        if exp == 'OR':
                            doc_list = list(set(doc_list) | set(inverted_index[query1]))            
        else:
            query_list.append(exp)
    doc_list.sort()
    return doc_list

## Boolean retrieval Implementation

In [98]:
query = input('Enter the query (Please mention the boolean queries as AND, OR, NOT): ')
processed_query = processing_query(query) 
print("\nGiven query: ", query)
print("\nDocuments in which the given query is present is")
boolean_retrieval(processed_query)

Enter the query (Please mention the boolean queries as AND, OR, NOT): NOT(MARCUS OR anderson) AND (NOT POINT AND NOT ALSO)

Given query:  NOT(MARCUS OR anderson) AND (NOT POINT AND NOT ALSO)

Documents in which the given query is present is


[715, 788, 828]

# Phrase queries and proximity queries

## Positional Index Construction

In [99]:
positional_index = {}

for index,row in data.iterrows():
    index = 0
    for term in row['Text']:
        if term not in positional_index.keys():
            positional_index[term] = {}
            positional_index[term][row['Doc ID']] = [i]
        else:
            if row['Doc ID'] in positional_index[term].keys():
                positional_index[term][row['Doc ID']].append(i)
            else:
                positional_index[term][row['Doc ID']] = [i]
        i = i + 1


## PROXIMITY QUERY IMPLEMENTATION

In [100]:
query1 = (input("Enter input query1: ")).lower()
query2 = (input("Enter input query2: ")).lower()
proximity = int(input("Enter the proximity: "))

Enter input query1: SENDER
Enter input query2: news
Enter the proximity: 5


In [101]:
# Finding the documents in which both queries are present

if query1 in positional_index.keys():
    query1_doc = set(positional_index[query1].keys())
if query2 in positional_index.keys():
    query2_doc = set(positional_index[query2].keys())

# Documents in which query1 and query2 is present
query1_and_query2_document = query1_doc & query2_doc

resultant_dict = {}
for document in query1_and_query2_document:
    q1_positions, q1_position_len = positional_index[query1][document], len(positional_index[query1][document])
    q2_positions, q2_position_len = positional_index[query2][document], len(positional_index[query1][document])
    if q1_position_len <= q2_position_len:
        for position in q1_positions:
            proximity_list = []
            proximity_list = [x for x in range(position-proximity,position+proximity+1)]
            for query_2_position in q2_positions:
                if query_2_position in proximity_list:
                    if document not in resultant_dict.keys():
                        resultant_dict[document] = {}
                    if position in resultant_dict.keys():
                        resultant_dict[document][position].append(query_2_position)
                    else:
                        resultant_dict[document][position] = [query_2_position]  
    else:
        for position in q2_positions:
            proximity_list = []
            proximity_list = [x for x in range(position-proximity,position+proximity+1)]
            for query_1_position in q1_positions:
                if query_1_position in proximity_list:
                    if document not in resultant_dict.keys():
                        resultant_dict[document] = {}
                    if query_1_position in resultant_dict.keys():
                        resultant_dict[document][query_1_position].append(position)
                    else:
                        resultant_dict[document][query_1_position] = [position]  

In [102]:
print("Documents having {} and {} within {} words of each other and there positions are,".format(query1,query2,proximity))
print("\n")
for key, value in resultant_dict.items():
    print("Document ID:",key)
    for k, v in value.items():
        print("\t{} position - {} and {} position - {} ".format(query1,k,query2,v))
    print('\n')

Documents having sender and news within 5 words of each other and there positions are,


Document ID: 899
	sender position - 344037 and news position - [344040] 
	sender position - 344038 and news position - [344043] 


Document ID: 838
	sender position - 340561 and news position - [340564] 
	sender position - 340562 and news position - [340567] 


Document ID: 808
	sender position - 138375 and news position - [138378] 
	sender position - 138376 and news position - [138378] 


Document ID: 711
	sender position - 344563 and news position - [344566] 
	sender position - 344564 and news position - [344569] 


Document ID: 345
	sender position - 344263 and news position - [344266] 
	sender position - 344264 and news position - [344269] 


Document ID: 108
	sender position - 343855 and news position - [343858] 
	sender position - 343856 and news position - [343861] 


Document ID: 496
	sender position - 344435 and news position - [344438] 
	sender position - 344436 and news position - [34444

Recall = (# relevant documents retrieved/ total # of relevant documents) 


Precision = (# relevant documents retrieved/ total # of documents retrieved)