# Positional Indexing

## Retrieving corpus

In [41]:
lis = []

for i in range(1,1000):
    with open(f"../text_files/file{i}_1e.txt","r") as file:
        lis.append(file.read())

In [42]:
import nltk
from collections import defaultdict

def cpi(docs):
    pi = defaultdict(dict)
    
    ids = 0
    while ids < len(docs):
        d = docs[ids]
        tokens = nltk.word_tokenize(d.lower())  # Tokenize d and convert to lowercase
        position = 0
        while position < len(tokens):
            token = tokens[position]
            if ids not in pi[token].keys():
                pi[token][ids] = [position]
            else:
                pi[token][ids].append(position)
            position += 1
        ids += 1

    piwf = dict()
    
    for token in pi.keys():
        piwf[token] = list()
        tdf = len(set(pi[token].keys()))
        piwf[token].append(tdf)
        piwf[token].append(pi[token])
    return piwf

pi = cpi(lis)

# Print positional index
for term, postings in pi.items():
    print(f"{term}: {postings}")

loving: [4, {0: [0], 253: [31], 390: [2], 722: [6]}]
vintage: [18, {0: [1, 3], 50: [27], 149: [11], 196: [7, 42], 277: [4], 421: [8], 438: [3, 32], 493: [10], 596: [28], 637: [62], 673: [27], 724: [17], 736: [9], 826: [34], 846: [11, 56], 894: [15], 906: [1], 935: [27]}]
springs: [5, {0: [2, 12], 271: [2, 13], 468: [27], 805: [4], 936: [24, 50]}]
strat: [36, {0: [4], 24: [2, 5], 89: [3], 148: [4], 162: [51, 65], 196: [17], 240: [4], 244: [1], 252: [22], 344: [9], 352: [54], 379: [7, 16, 19, 49], 395: [5, 45, 59], 399: [60], 421: [12], 439: [2, 35], 454: [28], 456: [3], 468: [7], 518: [7, 51], 528: [29], 558: [19, 22], 564: [5], 578: [12], 610: [72], 625: [13, 17], 649: [7, 16], 651: [25], 690: [3, 52], 724: [23], 800: [5], 837: [5], 852: [3, 7], 939: [3], 977: [11], 992: [33]}]
good: [204, {0: [5], 1: [23], 3: [2], 7: [11], 8: [8], 12: [29, 35], 15: [3], 17: [36], 18: [2], 27: [88], 28: [63], 29: [25], 36: [21], 39: [25], 42: [16, 51], 43: [19], 45: [36], 57: [5], 64: [46], 71: [1], 76

## Save and load pickle

In [43]:
import pickle

with open("pi.pkl", "wb") as f:
    pickle.dump(pi, f)

In [44]:
# Load positional index from the .pkl file
with open("pi.pkl", "rb") as f:
    positional_index_loaded = pickle.load(f)

# Print loaded positional index
for term, postings in positional_index_loaded.items():
    print(f"{term}: {postings}")

loving: [4, {0: [0], 253: [31], 390: [2], 722: [6]}]
vintage: [18, {0: [1, 3], 50: [27], 149: [11], 196: [7, 42], 277: [4], 421: [8], 438: [3, 32], 493: [10], 596: [28], 637: [62], 673: [27], 724: [17], 736: [9], 826: [34], 846: [11, 56], 894: [15], 906: [1], 935: [27]}]
springs: [5, {0: [2, 12], 271: [2, 13], 468: [27], 805: [4], 936: [24, 50]}]
strat: [36, {0: [4], 24: [2, 5], 89: [3], 148: [4], 162: [51, 65], 196: [17], 240: [4], 244: [1], 252: [22], 344: [9], 352: [54], 379: [7, 16, 19, 49], 395: [5, 45, 59], 399: [60], 421: [12], 439: [2, 35], 454: [28], 456: [3], 468: [7], 518: [7, 51], 528: [29], 558: [19, 22], 564: [5], 578: [12], 610: [72], 625: [13, 17], 649: [7, 16], 651: [25], 690: [3, 52], 724: [23], 800: [5], 837: [5], 852: [3, 7], 939: [3], 977: [11], 992: [33]}]
good: [204, {0: [5], 1: [23], 3: [2], 7: [11], 8: [8], 12: [29, 35], 15: [3], 17: [36], 18: [2], 27: [88], 28: [63], 29: [25], 36: [21], 39: [25], 42: [16, 51], 43: [19], 45: [36], 57: [5], 64: [46], 71: [1], 76

## Phrase Queries

In [45]:
def retdoc(pi, query):
    if len(query) == 0:
        return set()

    valid = set(pi.get(query[0], [0, {}])[1].keys())
    i = 1
    while i < len(query):
        valid = valid & set(pi.get(query[i], [0, {}])[1].keys())
        i += 1

    if not valid:
        return valid

    init_word = query[0]
    fin_v = []

    for d in valid:
        pos_v = set(pi[init_word][1][d])
        i = 1
        while i < len(query):
            cur_positions = set([position - i for position in pi[query[i]][1][d]])
            pos_v = pos_v & cur_positions
            i += 1

        if pos_v:
            fin_v.append(d)

    return fin_v

In [46]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

N = int(input("Enter the number of queries:"))

i = 0
while i < N:
    query = input("Enter query:")

    query_lower = query.lower()  # Convert to lowercase

    words = word_tokenize(query_lower)

    stopwords_english = stopwords.words('english')

    words_stopwords = []

    # Remove stop words
    j = 0
    while j < len(words):
        if words[j] not in stopwords_english:
            words_stopwords.append(words[j])
        j += 1

    # Remove punctuation
    wc = []
    j = 0
    while j < len(words_stopwords):
        if words_stopwords[j] not in string.punctuation:
            wc.append(words_stopwords[j])
        j += 1

    wf = []

    j = 0
    while j < len(wc):
        if wc[j].strip() != '':
            wf.append(wc[j])
        j += 1

    doc_ret = retdoc(pi, wf)
    print("Number of documents retrieved:", len(doc_ret))
    print("Name of the documents retrieved:", end=" ")

    j = 0
    while j < len(doc_ret):
        print(f"file{doc_ret[j] + 1}.txt", end="")
        if j != len(doc_ret) - 1:
            print(", ", end="")
        j += 1

    print()
    i += 1

Number of documents retrieved: 18
Name of the documents retrieved: file1.txt, file674.txt, file737.txt, file197.txt, file422.txt, file936.txt, file907.txt, file494.txt, file847.txt, file51.txt, file597.txt, file278.txt, file150.txt, file439.txt, file725.txt, file827.txt, file638.txt, file895.txt
Number of documents retrieved: 6
Name of the documents retrieved: file166.txt, file264.txt, file746.txt, file174.txt, file886.txt, file542.txt
