In [38]:
#Name: Aikya Banerjee 
#UIN : 675064035 
#Course : CS494 Information Retrieval and Web Search
#HW2


from os import listdir
from os.path import isfile, join

import os
import math

from pathlib import Path

import re
from collections import defaultdict
from collections import Counter
import nltk
from nltk.stem import PorterStemmer

# Make sure the NLTK stop-word corpus is available before any tokenizing runs.
nltk.download('stopwords')

# Directory the script was started from. The data files (cranfieldDocs/,
# queries.txt, relevance.txt) are looked up relative to it by the functions
# below, which chdir back here before opening them.
my_path = os.getcwd()


def stemmerStopEliminator(word_list):
    """Drop English stop words from *word_list* and Porter-stem the rest.

    A word is discarded when it is a stop word itself, or when its stem is a
    stop word. Surviving words are returned as their stems, in input order
    (duplicates are kept).
    """
    stop_set = set(nltk.corpus.stopwords.words("english"))
    stemmer = PorterStemmer()

    kept_stems = []
    for token in word_list:
        if token in stop_set:
            continue
        stem = stemmer.stem(token)
        # Stemming can turn a content word into a stop word; filter again.
        if stem not in stop_set:
            kept_stems.append(stem)
    return kept_stems

def _extract_section(lines, tag):
    """Return the lines strictly between the ``<tag>`` and ``</tag>`` lines, joined."""
    opening = "<" + tag + ">"
    closing = "</" + tag + ">"
    inside = False
    collected = []
    for line in lines:
        if opening in line:
            # The line carrying the opening tag is never part of the section.
            inside = True
            continue
        if closing in line:
            inside = False
        if inside:
            collected.append(line)
    return "".join(collected)


def tokenize_documents():
    """Tokenize every file in ``<my_path>/cranfieldDocs``.

    For each regular file the TITLE and TEXT sections are extracted, split
    into alphabetic tokens (digits and punctuation are separators), tokens of
    one or two characters are dropped, the rest are lower-cased and passed
    through ``stemmerStopEliminator``.

    Returns:
        dict mapping the integer found in the file name (first run of digits)
        to the list of processed tokens, title tokens first.

    Files whose name contains no digits (for example ``.DS_Store``) are
    skipped instead of crashing the whole run.

    Side effect: the working directory is changed to the documents folder.
    """
    doc_folder = my_path+"/cranfieldDocs"
    # File names below are opened relative to this directory.
    os.chdir(doc_folder)

    word_pattern = re.compile(r'[^\W\d]+')
    id_pattern = re.compile(r'\d+')

    text_corpus = {}
    for name in listdir(doc_folder):
        if not isfile(join(doc_folder, name)):
            continue
        id_match = id_pattern.search(name)
        if id_match is None:
            # No document number in the name: not a corpus file.
            continue

        with open(name, encoding="latin-1") as f:
            lines = f.readlines()

        title = _extract_section(lines, "TITLE")
        body = _extract_section(lines, "TEXT")

        raw_words = word_pattern.findall(title) + word_pattern.findall(body)
        # Tokens of length 1 or 2 are discarded before stemming.
        lowered = [word.lower() for word in raw_words if len(word) > 2]
        text_corpus[int(id_match.group())] = stemmerStopEliminator(lowered)
    return text_corpus




def tokenize_queries():
    """Tokenize ``queries.txt`` (one query per line) found in ``my_path``.

    Each line is split into alphabetic tokens, tokens of one or two
    characters are dropped, the rest are lower-cased and passed through
    ``stemmerStopEliminator``.

    Returns:
        dict mapping the 1-based line number to that line's processed tokens.

    Side effect: the working directory is changed to ``my_path``.
    """
    os.chdir(my_path)
    token_pattern = re.compile(r'[^\W\d]+')
    with open("queries.txt",encoding="latin-1") as f:
        queries = {}
        for number, line in enumerate(f.readlines(), start=1):
            candidates = [
                token.lower()
                for token in token_pattern.findall(line)
                if len(token) > 2
            ]
            queries[number] = stemmerStopEliminator(candidates)
    return queries

# Build the processed token lists once, at import time:
#   doc_collection   - document id -> tokens
#   query_collection - query number (line in queries.txt) -> tokens
# cosine_similarity() reads both as module-level globals.
doc_collection = tokenize_documents()
query_collection = tokenize_queries()



def df_calc(doc_collection):
    """Build the posting lists used for document frequency.

    Args:
        doc_collection: mapping of document id -> list of tokens.

    Returns:
        ``defaultdict(list)`` mapping each term to the ids of the documents
        containing it, each id listed once, in the iteration order of
        *doc_collection*. ``len(result[term])`` is the document frequency;
        looking up an unseen term yields an empty list.
    """
    df = defaultdict(list)
    for doc_id, tokens in doc_collection.items():
        # De-duplicate per document (keeping first-seen order) so each id is
        # appended once without scanning the posting list for membership.
        for term in dict.fromkeys(tokens):
            df[term].append(doc_id)
    return df

                
# Posting lists: term -> ids of the documents containing it.
# cosine_similarity() reads this as a module-level global.
df = df_calc(doc_collection)

def cosine_similarity(top_n, documents=None, query_tokens=None, doc_freq=None):
    """Rank documents against every query with a TF-IDF cosine-style score.

    Weights are ``tf * idf`` with ``idf = log2(N / df)``. A document's score
    for a query is the sum, over every query token occurrence, of
    ``tf_in_doc * idf * idf``, divided by the document's vector length. The
    score is not divided by the query length, which is constant per query and
    so does not affect the ranking.

    Args:
        top_n: number of document ids to keep per query; a negative value
            keeps the full ranking.
        documents: document id -> token list. Defaults to the module-level
            ``doc_collection``.
        query_tokens: query id -> token list. Defaults to the module-level
            ``query_collection``.
        doc_freq: term -> list of ids of documents containing it. Defaults to
            the module-level ``df``.

    Returns:
        dict mapping query id -> list of document ids, best score first.
        Documents with equal scores keep the iteration order of *documents*.

    Documents whose vector length is zero (no tokens, or only terms present
    in every document) score 0.0 instead of raising ZeroDivisionError. Query
    terms missing from *doc_freq* are ignored, and *doc_freq* is never
    modified by the lookup.
    """
    documents = doc_collection if documents is None else documents
    query_tokens = query_collection if query_tokens is None else query_tokens
    doc_freq = df if doc_freq is None else doc_freq

    total_docs = len(documents)
    idf_cache = {}

    def idf_of(term):
        # None marks a term that occurs in no document.
        if term not in idf_cache:
            postings = doc_freq.get(term)
            if postings:
                idf_cache[term] = math.log(total_docs / len(postings), 2)
            else:
                idf_cache[term] = None
        return idf_cache[term]

    # Term counts and vector length of every document, computed once.
    term_counts = {}
    doc_length = {}
    for doc_id, tokens in documents.items():
        counts = Counter(tokens)
        term_counts[doc_id] = counts
        squared = 0
        for term, freq in counts.items():
            idf = idf_of(term)
            if idf is None:
                continue
            weight = freq * idf
            squared += weight * weight
        doc_length[doc_id] = math.sqrt(squared)

    cosine_sim = {}
    for query_id, terms in query_tokens.items():
        # Repeated query tokens are kept: each occurrence contributes again.
        weighted_terms = [
            (term, idf_of(term)) for term in terms if idf_of(term) is not None
        ]
        scored = []
        for doc_id, counts in term_counts.items():
            dot = 0
            for term, idf in weighted_terms:
                # Counter returns 0 for absent terms without inserting them.
                dot += counts[term] * idf * idf
            length = doc_length[doc_id]
            scored.append((doc_id, dot / length if length else 0.0))

        # list.sort is stable, also with reverse=True, so ties keep doc order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ranked = [doc_id for doc_id, _ in scored]
        cosine_sim[query_id] = ranked if top_n < 0 else ranked[:top_n]
    return cosine_sim


def process_rel(path=None):
    """Read the relevance judgments file.

    Each useful line carries at least two integers: the first is taken as the
    query id and the second as a relevant document id. Any further numbers on
    the line are ignored. Lines with fewer than two integers (blank or
    malformed lines) are skipped instead of raising IndexError.

    Args:
        path: file to read. When omitted, the working directory is changed to
            ``my_path`` and ``relevance.txt`` is read from there.

    Returns:
        dict mapping query id -> list of relevant document ids, in file order.
    """
    if path is None:
        os.chdir(my_path)
        path = "relevance.txt"

    number_pattern = re.compile(r"\d+")
    rel = {}
    with open(path, "r") as f:
        for line in f:
            numbers = number_pattern.findall(line)
            if len(numbers) < 2:
                continue
            query_id = int(numbers[0])
            doc_id = int(numbers[1])
            rel.setdefault(query_id, []).append(doc_id)
    return rel
            

def common_docs(l1,l2):
    """Count the items of *l1* that also occur in *l2*.

    Args:
        l1: iterable of hashable items; every occurrence is tested, so a
            value repeated in *l1* is counted each time it appears.
        l2: iterable of hashable items to test membership against.

    Returns:
        int, the number of entries of *l1* found in *l2*.
    """
    present = set(l2)
    return sum(1 for item in l1 if item in present)

# Relevance judgments, despite the name: query id -> list of relevant
# document ids. Read as a module-level global by the evaluation functions.
queries = process_rel()

def calc_recall(top_n):
    """Compute recall per query for the top *top_n* ranked documents.

    Recall is the number of relevant documents present in the ranking divided
    by the total number of relevant documents for that query. Query ids are
    taken to be 1..len(queries).

    Returns:
        dict mapping query id -> recall.
    """
    rankings = cosine_similarity(top_n)
    return {
        query_id: common_docs(queries[query_id], rankings[query_id])
        / len(queries[query_id])
        for query_id in range(1, len(queries) + 1)
    }

def calc_precision(top_n):
    """Compute precision per query for the top *top_n* ranked documents.

    For a positive *top_n* the denominator is *top_n* itself, as before. A
    negative *top_n* asks ``cosine_similarity`` for the full ranking, so the
    denominator is the number of documents actually returned rather than the
    negative number. When nothing is retrieved (``top_n == 0`` or an empty
    ranking) precision is reported as 0.0 instead of raising
    ZeroDivisionError. Query ids are taken to be 1..len(queries).

    Returns:
        dict mapping query id -> precision.
    """
    cos_sim = cosine_similarity(top_n)
    precision = {}
    for query_id in range(1, len(queries) + 1):
        relevant_docs = queries[query_id]
        retrieved_docs = cos_sim[query_id]
        hits = common_docs(relevant_docs, retrieved_docs)
        denominator = top_n if top_n > 0 else len(retrieved_docs)
        precision[query_id] = hits / denominator if denominator else 0.0
    return precision

# Heading for the full ranking that is printed once list_retrieved() has run.
print("Query Id's and document Id's in pairs (of descending order) :\n")

# Accumulates (query id, document id) pairs; filled in place by list_retrieved().
final_list = []

def list_retrieved():
    """Append a (query id, document id) pair to ``final_list`` for every ranked document.

    The full ranking is requested (``top_n = -1``). Pairs are added query by
    query, in ranking order within each query. Nothing is returned; the
    module-level ``final_list`` is extended in place.
    """
    for query_id, ranked_docs in cosine_similarity(-1).items():
        final_list.extend((query_id, doc_id) for doc_id in ranked_docs)
    
# Fill final_list with the complete ranking of every query, then print it.
list_retrieved()

print(final_list,"\n")
    


def precision_recall_output():
    """Print per-query and average precision/recall at cutoffs 10, 50, 100 and 500.

    For every cutoff a small table is printed with one row per query id in
    1..len(queries), followed by the mean precision and mean recall over the
    computed values.
    """
    for cutoff in (10, 50, 100, 500):
        print("For top",cutoff,"documents, we have :\n")
        print("Queries\t\tPrecision\tRecall\n")
        print("----------------------------------------------\n")

        precision_by_query = calc_precision(cutoff)
        recall_by_query = calc_recall(cutoff)

        query_id = 1
        while query_id <= len(queries):
            print("Query",query_id,"\t",precision_by_query[query_id],"\t\t",recall_by_query[query_id],"\n")
            query_id += 1

        mean_precision = sum(precision_by_query.values())/len(precision_by_query)
        print("Average Precision (across all",len(queries),"queries): ",mean_precision,"\n")
        mean_recall = sum(recall_by_query.values())/len(recall_by_query)
        print("Average Recall (across all",len(queries),"queries): ",mean_recall,"\n\n")
    
# Run the evaluation and print the precision/recall tables.
precision_recall_output()


                
                
                
                
            

        
        



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aikyab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query Id's and document Id's in pairs (of descending order) :

For top 10 documents, we have :

Queries		Precision	Recall

----------------------------------------------

Query 1 	 0.0 		 0.0 

Query 2 	 0.2 		 0.13333333333333333 

Query 3 	 0.3 		 0.2 

Query 4 	 0.1 		 0.05555555555555555 

Query 5 	 0.2 		 0.10526315789473684 

Query 6 	 0.2 		 0.1111111111111111 

Query 7 	 0.4 		 0.4444444444444444 

Query 8 	 0.1 		 0.25 

Query 9 	 0.1 		 0.125 

Query 10 	 0.2 		 0.08333333333333333 

Average Precision (across all 10 queries):  0.18 

Average Recall (across all 10 queries):  0.15080409356725147 


For top 50 documents, we have :

Queries		Precision	Recall

----------------------------------------------

Query 1 	 0.0 		 0.0 

Query 2 	 0.12 		 0.4 

Query 3 	 0.12 		 0.4 

Query 4 	 0.04 		 0.1111111111111111 

Query 5 	 0.18 		 0.47368421052631576 

Query 6 	 0.1 		 0.2777777777777778 

Query 7 	 0.1 		 0.5555555555555556 

Query 8 	 0.06 		 0.75 

Query 9 	 0.12 		 0.75 

Qu

Query Id's and document Id's in pairs (of descending order) :

[(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (5, 2), (6, 1), (6, 2), (7, 1), (7, 2), (8, 1), (8, 2), (9, 1), (9, 2), (10, 1), (10, 2)] 

For top 10 documents, we have :

Queries		Precision	Recall

----------------------------------------------

Query 1 	 0.0 		 0.0 

Query 2 	 0.0 		 0.0 

Query 3 	 0.0 		 0.0 

Query 4 	 0.0 		 0.0 

Query 5 	 0.0 		 0.0 

Query 6 	 0.0 		 0.0 

Query 7 	 0.0 		 0.0 

Query 8 	 0.0 		 0.0 

Query 9 	 0.0 		 0.0 

Query 10 	 0.0 		 0.0 

Average Precision (across all 10 queries):  0.0 

Average Recall (across all 10 queries):  0.0 


For top 50 documents, we have :

Queries		Precision	Recall

----------------------------------------------

Query 1 	 0.0 		 0.0 

Query 2 	 0.0 		 0.0 

Query 3 	 0.0 		 0.0 

Query 4 	 0.0 		 0.0 

Query 5 	 0.0 		 0.0 

Query 6 	 0.0 		 0.0 

Query 7 	 0.0 		 0.0 

Query 8 	 0.0 		 0.0 

Query 9 	 0.0 		 0.0 

Query 10 	 0.0 		 0.