In [1]:
import numpy as np
from itertools import chain
from string import digits
from indexer import *
# from search import *

# #Load the indexed file into a list of dictionary items________________

In [16]:

# Read every line of the index dump into memory. The context manager closes
# the file handle as soon as the lines have been read.
with open('index.txt', 'r') as index_handle:
    indexed_file = index_handle.readlines()

# Document numbers (kept as strings) appended by format_txt_file(), one per posting.
docnumbers = []

# Rebuilds the index from the text dump as a list of single-posting dictionaries:
# [{term: {document: [positions]}}, ...]
def format_txt_file():
    """Parse the module-level ``indexed_file`` lines into a list of postings.

    A line ending in ``":\\n"`` names the current term. A line starting with a
    tab holds ``docno: pos,pos,...`` for that term and yields one
    ``{term: {int(docno): [int, ...]}}`` entry.

    Side effect: the raw (string) document number of every posting line is
    appended to the module-level ``docnumbers`` list.

    Returns:
        list of dict: one single-key dictionary per posting line, in file order.
    """
    postings = []

    for raw_line in indexed_file:
        # Term header line: remember the term for the posting lines that follow.
        if raw_line.endswith(':\n'):
            term = raw_line.replace(':', '').strip()

        # Posting line: strip layout characters, then split "docno:positions".
        if raw_line.startswith('\t'):
            compact = raw_line
            for filler in ('\t', '\n', ' '):
                compact = compact.replace(filler, '')
            fields = compact.split(':')
            docno, offsets_text = fields[0], fields[1]

            offsets = [int(piece) for piece in offsets_text.split(',')]
            doc_id = int(docno)
            # Save the document number in docnumbers (as the original string).
            docnumbers.append(docno)

            postings.append({term: {doc_id: offsets}})

    return postings


# Build the in-memory inverted index from the lines previously read into indexed_file.
inverted_index = format_txt_file()
# Bare expression: the notebook renders the index as this cell's output.
inverted_index

# docnumbers
# N = len(list(set(docnumbers)))
# print(N)


[{'000bn': {21: [165, 214]}},
 {'000bn': {135: [297]}},
 {'000bn': {258: [21]}},
 {'000bn': {314: [65]}},
 {'000bn': {329: [87, 122]}},
 {'000bn': {357: [29]}},
 {'000bn': {3438: [195]}},
 {'000bn': {3551: [121]}},
 {'000bn': {3602: [262]}},
 {'000bn': {3660: [62]}},
 {'000bn': {3931: [376]}},
 {'000gwh': {3411: [439]}},
 {'000m': {157: [64]}},
 {'000mw': {3414: [220, 225]}},
 {'000mw': {3417: [221, 293, 305, 476]}},
 {'000mw': {3434: [454]}},
 {'000mw': {3520: [41]}},
 {'000mw': {3602: [33]}},
 {'000mw': {3649: [38, 77]}},
 {'009m': {3503: [852]}},
 {'011bn': {3323: [75]}},
 {'01m': {170: [83]}},
 {'01m': {3612: [22]}},
 {'01m': {3829: [45]}},
 {'025p': {3331: [205]}},
 {'025p': {3618: [69]}},
 {'02m': {3480: [73]}},
 {'02m': {3609: [40]}},
 {'02m': {3616: [35]}},
 {'02m': {3841: [38]}},
 {'031bn': {3929: [155]}},
 {'03bn': {39: [55, 98]}},
 {'03bn': {3324: [46]}},
 {'03bn': {3333: [128]}},
 {'03bn': {3821: [32]}},
 {'03m': {3450: [266]}},
 {'03m': {3484: [226]}},
 {'03m': {3616: [29]

# #Functions1 ___________________________________________________

In [3]:
def preprocess_query(query):
    """Turn a free-text query into a list of normalised terms.

    The query is split on single spaces. Tokens found in the list returned by
    ``sort_stopwords()`` are dropped; the rest are lowercased, stemmed and
    stripped of non-word characters.

    NOTE(review): the stopword test runs on the raw token, before lowercasing.
    Confirm whether capitalised stopwords are meant to pass through.

    Returns:
        list of str: the processed terms, in query order.
    """
    stopwords = sort_stopwords()
    return [
        re.sub(r'\W+', '', stem(token.lower()))
        for token in query.split(' ')
        if token not in stopwords
    ]


def preprocess_term(term):
    """Normalise one term: lowercase it, stem it, then remove non-word characters."""
    stemmed = stem(term.lower())
    return re.sub(r'\W+', '', stemmed)


# Collects every posting stored for a term in the inverted index.
def getpositions(term):
    """Return the ``{doc: [positions]}`` dictionaries recorded for ``term``.

    Scans the module-level ``inverted_index`` and keeps, in index order, the
    value of every entry that has ``term`` as a key. Returns an empty list
    when the term is not indexed.
    """
    return [entry[term] for entry in inverted_index if term in entry]


# Takes a list of documents and returns all documents in the collection except those in the list.
def getnot(lst):
    """Return the complement of ``lst`` within the known document numbers.

    Args:
        lst: iterable of integer document numbers to exclude.

    Returns:
        list of int: every distinct document number seen in the module-level
        ``docnumbers`` that is not in ``lst``, in ascending numeric order.
    """
    # A set makes each exclusion test O(1) instead of a scan over lst.
    excluded = set(lst)
    # docnumbers holds strings; convert before sorting so the order is numeric
    # rather than lexicographic ('10' would otherwise sort before '2').
    all_docs = sorted({int(docno) for docno in docnumbers})
    return [docno for docno in all_docs if docno not in excluded]


# Flattens a list of {doc: [positions]} dictionaries into their document keys.
def get_docs(position_list):
    """Return the keys of every dictionary in ``position_list``.

    Keys are emitted in list order, then in each dictionary's own key order.
    Duplicates are kept.
    """
    return [doc for posting in position_list for doc in posting.keys()]


# Phrase_Search ___________________________________________

In [4]:

def phrasesearch(i, phrase):
    """Find postings where two terms occur within ``i`` positions of each other.

    Used for both phrase search (callers pass ``i=1``) and proximity search
    (the distance comes from the query).

    Args:
        i: maximum allowed absolute distance between the two term positions.
        phrase: two terms separated by whitespace; double quotes are removed.

    Returns:
        list of dict: for every document where the terms are close enough, the
        ``{doc: [positions]}`` posting of the first term followed by that of
        the second term. Each matching pair is reported once.

    Raises:
        ValueError: if ``phrase`` does not contain exactly two terms.

    NOTE(review): the distance test is unordered (``abs``), so with ``i=1`` a
    phrase also matches when the second term directly precedes the first.
    Confirm whether phrase queries should require term order.
    """
    phrase = re.sub('"', '', phrase)

    # split() without an argument tolerates repeated or surrounding whitespace.
    terms = phrase.split()
    if len(terms) != 2:
        raise ValueError(
            "phrasesearch expects exactly two terms, got {!r}".format(phrase))
    term1, term2 = terms

    term1_positions = getpositions(preprocess_term(term1))
    term2_positions = getpositions(preprocess_term(term2))

    # Group the second term's postings by document so each posting of the first
    # term only visits postings from the same document.
    term2_by_doc = {}
    for posting2 in term2_positions:
        for doc2, offsets2 in posting2.items():
            term2_by_doc.setdefault(doc2, []).append((posting2, offsets2))

    results = []
    for posting1 in term1_positions:
        for doc1, offsets1 in posting1.items():
            for posting2, offsets2 in term2_by_doc.get(doc1, ()):
                # any() stops at the first close-enough pair, so the postings
                # are appended once rather than once per matching pair.
                if any(abs(p1 - p2) <= i for p1 in offsets1 for p2 in offsets2):
                    results.append(posting1)
                    results.append(posting2)

    return results  # list of postings

# Proximity_Search ___________________________________________

In [5]:
def proximitysearch(query):
    """Answer a proximity query of the form ``#N(term1, term2)``.

    The ``#`` is removed, the text before ``(`` is read as the distance, and
    punctuation is stripped from the rest before it is passed to
    ``phrasesearch`` together with that distance.

    Returns:
        list: the distinct document numbers of the postings phrasesearch
        returned (order is unspecified because it comes from a set).
    """
    without_hash = query.replace('#', '')
    distance_text, terms_text = without_hash.split('(')
    terms_text = re.sub(r'([^\s\w]|_)+', '', terms_text)

    postings = phrasesearch(int(distance_text), terms_text)
    unique_docs = set(get_docs(postings))
    return list(unique_docs)

# Boolean_Search ___________________________________________

In [6]:

def boolean_search(query):
    """Evaluate a two-operand boolean query and return matching document numbers.

    Supported forms: ``A AND B``, ``A OR B``, ``A AND NOT B``, ``A OR NOT B``.
    Either operand may be a double-quoted two-word phrase; otherwise it is
    treated as a single term.

    The operator must be a standalone upper-case word surrounded by
    whitespace. Words that merely contain the letters (``FORD``, ``SANDY``,
    ``NOTE``) are no longer mistaken for operators. The first operator in the
    query is used.

    NOTE(review): an operator word inside a quoted phrase would still be
    picked up as the operator.

    Args:
        query: the raw boolean query string.

    Returns:
        list: distinct document numbers satisfying the query (unordered), or
        an empty list when no standalone operator is present.
    """
    import re  # local import so this function does not depend on a star-import for ``re``

    operator_match = re.search(r'\s+(AND|OR)(\s+NOT)?\s+', query)
    if operator_match is None:
        return []

    operator = operator_match.group(1)
    negate_second = operator_match.group(2) is not None
    term1 = query[:operator_match.start()].strip()
    term2 = query[operator_match.end():].strip()

    def docs_for(operand):
        # A quoted operand is a phrase search; anything else is a single term.
        if operand.startswith('"') and operand.endswith('"'):
            postings = phrasesearch(1, operand)
        else:
            postings = getpositions(preprocess_term(operand))
        # Keep only the document numbers, not the positions.
        return get_docs(postings)

    term1_docs = set(docs_for(term1))
    term2_docs = docs_for(term2)

    if negate_second:
        # Replace the second operand by every document that does NOT contain it.
        term2_docs = getnot(term2_docs)
    term2_docs = set(term2_docs)

    if operator == 'AND':
        return list(term1_docs & term2_docs)
    return list(term1_docs | term2_docs)


# RankedIR_Search ___________________________________________

In [19]:
def rankedir_search(query):
    """Score documents against a free-text query with TF-IDF.

    The query is split on single spaces and each token is normalised with
    ``preprocess_term``. For every posting of a term the weight
    ``(1 + log10(tf)) * log10(N / df)`` is added to that document's score,
    where ``tf`` is the number of positions in the posting, ``df`` the number
    of postings for the term and ``N`` the number of distinct entries in the
    module-level ``docnumbers``.

    Returns:
        dict: ``{docnumber: accumulated score}``.
    """
    raw_terms = query.split(' ')
    collection_size = len(list(set(docnumbers)))
    scores = {}

    def tfidf(tf, df):
        return (1 + np.log10(tf)) * (np.log10(collection_size / df))

    for raw_term in raw_terms:
        postings = getpositions(preprocess_term(raw_term))  # [{docnumber: [positions]}, ...]
        docfreq = len(postings)

        for posting in postings:
            for doc, offsets in posting.items():
                weight = tfidf(len(offsets), docfreq)
                # First sighting stores the weight; later ones accumulate.
                if doc in scores:
                    scores[doc] = scores[doc] + weight
                else:
                    scores[doc] = weight

    return scores

# Ranked-IR example: score a free-text query; the bare name below displays the result.
Query_ex = rankedir_search("8 the education with computers")
Query_ex


{49: 1.4436974992327127,
 70: 1.4436974992327127,
 106: 1.4436974992327127,
 135: 3.67500133768554,
 171: 2.816957203708436,
 194: 1.4436974992327127,
 216: 2.1325162615022633,
 241: 1.4436974992327127,
 261: 1.4436974992327127,
 265: 1.4436974992327127,
 275: 1.4436974992327127,
 336: 1.4436974992327127,
 351: 1.8782937511668367,
 371: 2.1325162615022633,
 3360: 1.4436974992327127,
 3399: 1.4436974992327127,
 3403: 1.4436974992327127,
 3409: 1.4436974992327127,
 3449: 1.4436974992327127,
 3543: 1.4436974992327127,
 3553: 1.4436974992327127,
 3554: 1.4436974992327127,
 3636: 1.8782937511668367,
 3662: 1.8782937511668367,
 3674: 2.821335023771814,
 3724: 1.4436974992327127,
 3749: 1.4436974992327127,
 3750: 1.8782937511668367,
 3793: 2.3128900031009607,
 3826: 1.4436974992327127,
 3856: 3.437420830858725,
 3866: 1.4436974992327127,
 3914: 3.3205669555504924,
 3915: 3.5547321549323754,
 3928: 5.629426908439597,
 3932: 2.816957203708436,
 14: 1.0555173278498313,
 16: 1.0555173278498313,
 

# #Functions2__Display (output formatting) _____________________________________

In [8]:

# Renders boolean, phrase and proximity results as TREC-format lines.
def print_results(queryno, results):
    """Format each document number as ``"<queryno> 0 <doc> 0 1 0"``.

    Args:
        queryno: query identifier placed in the first column.
        results: sized collection of document numbers.

    Returns:
        list of str: one line per document, in input order; empty when
        ``results`` is empty.
    """
    if len(results) == 0:
        return []
    return ["{} 0 {} 0 1 0".format(queryno, doc) for doc in results]

# Formats the results of a ranked query as TREC-format lines.
def print_results_IR(queryno, results):
    """Format ranked results as ``"<queryno> 0 <doc> 0 <score> 0"`` lines.

    Documents whose score equals 0.0 are omitted. The rest are ordered by
    descending score, with ties keeping their order in ``results``, and each
    score is rounded to three decimals.

    Args:
        queryno: query identifier placed in the first column.
        results: mapping ``{docnumber: score}``. It is not modified.

    Returns:
        list of str: one formatted line per retained document.
    """
    # Filter into a new list rather than popping from ``results`` so the
    # caller's dictionary is left untouched.
    scored = [(doc, score) for doc, score in results.items() if score != 0.0]
    scored.sort(key=lambda kv: kv[1], reverse=True)

    return [
        "{} 0 {} 0 {} 0".format(queryno, doc, round(score, 3))
        for doc, score in scored
    ]


# Routes one query string to the matching search method and formats its results.
def parsequery(queryno, query):
    """Answer a single query and return its TREC-format result lines.

    Routing, in order:
      * a standalone ``AND`` / ``OR`` token   -> boolean search
      * ``#N(...)``                           -> proximity search
      * ``"two words"``                       -> phrase search (distance 1)
      * a single word                         -> direct index lookup
      * anything else                         -> ranked TF-IDF search

    Args:
        queryno: query identifier used in the output lines.
        query: the query text, expected without surrounding whitespace.

    Returns:
        list of str: formatted result lines (empty when nothing matches).
    """
    tokens = query.split()
    if not tokens:
        return []

    # Operators must be whole tokens. A substring test would send words such
    # as "WORLD" or "ANDROID" to the boolean parser.
    if 'AND' in tokens or 'OR' in tokens:
        return print_results(queryno, boolean_search(query))

    if query.startswith('#') and query.endswith(")"):
        return print_results(queryno, proximitysearch(query))

    if query.startswith('"') and query.endswith('"'):
        # phrasesearch returns postings; reduce them to distinct documents.
        phrase_docs = list(set(get_docs(phrasesearch(1, query))))
        return print_results(queryno, phrase_docs)

    if len(tokens) == 1:
        # Normalise the word like every other search path does, since the
        # index is queried with lowercased, stemmed terms everywhere else.
        term_docs = get_docs(getpositions(preprocess_term(tokens[0])))
        return print_results(queryno, term_docs)

    return print_results_IR(queryno, rankedir_search(query))



# #Results ________________________________________________

In [10]:

# build_index("collections/trec.sample.txt")
# Read all queries up front; the context manager closes the file once they are loaded.
with open("queries/queries.lab3.txt", 'r') as query_handle:
    query_file = query_handle.readlines()
# query_file = open("queries/queries.lab2.txt", 'r').readlines()


if __name__=='__main__':

    print("\nANSWERING QUERIES\n...")

    output = []

    for query in query_file:
        # Blank lines carry no query number; skip them instead of crashing.
        if not query.strip():
            continue

        queryno = int(query.split()[0])
        # Strip surrounding whitespace first so leading spaces cannot stop
        # lstrip(digits) from removing the query number.
        query = query.strip().lstrip(digits).strip()

        # Keep at most the first 1000 results per query.
        output.extend(parsequery(queryno, query)[:1000])

    # Save to file; the context manager closes it even if a write fails.
    with open('results.txt', 'w') as f:
        for line in output:
            f.write(line + "\n")

    print("QUERYING COMPLETE\n")



ANSWERING QUERIES
...
QUERYING COMPLETE

