<h1>Part 1 : Data Pre-processing</h1>
<div>In this section we build a function that accepts raw text extracted from a doc or a query and after applying normalization, tokenization, stopword removal and stemming returns an array of tokens (token stream).</div>

In [229]:
import pandas as pd
from xml.dom import minidom
import xml.etree.ElementTree as ET

persian_words_count = {}
mydoc = minidom.parse('persian.xml')
texts = mydoc.getElementsByTagName('text')
titles = mydoc.getElementsByTagName('title')
for i in range(len(texts)):
    A = titles[i].firstChild.data.split(' ')
    B = texts[i].firstChild.data.split(' ')
    for term in A:
        if(term in persian_words_count):
            persian_words_count[term] += 1
        else:
            persian_words_count[term] = 1
    for term in B:
        if(term in persian_words_count):
            persian_words_count[term] += 1
        else:
            persian_words_count[term] = 1

375


In [242]:
persian_stopwords_count = []
for term in persian_words_count:
    persian_stopwords_count.append((term, persian_words_count[term]))
persian_stopwords_count.sort(key=lambda x:x[1], reverse=True)

In [244]:
persian_stopwords = []
for i in range(11):
    persian_stopwords.append(persian_stopwords_count[i][0])
print(persian_stopwords)

['', 'و', 'در', '=', 'به', 'از', 'که', 'این', 'را', 'با', '|']


In [246]:
english_words_count = {}
data = pd.read_csv('ted_talks.csv')
descs = data.iloc[:, 1].values
titles = data.iloc[:, 14].values
for i in range(len(titles)):
    A = titles[i].split(' ')
    B = descs[i].split(' ')
    for term in A:
        if(term in english_words_count):
            english_words_count[term] += 1
        else:
            english_words_count[term] = 1
    for term in B:
        if(term in english_words_count):
            english_words_count[term] += 1
        else:
            english_words_count[term] = 1

In [248]:
english_stopwords_count = []
for term in english_words_count:
    english_stopwords_count.append((term, english_words_count[term]))
english_stopwords_count.sort(key=lambda x:x[1], reverse=True)

In [250]:
english_stopwords = []
for i in range(7):
    english_stopwords.append(english_stopwords_count[i][0])
print(english_stopwords)

['the', 'and', 'of', 'to', 'a', 'in', '--']


In [266]:
from __future__ import unicode_literals
from hazm import *
from PersianStemmer import PersianStemmer
import nltk
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem import PorterStemmer

def prepare_text(raw_text, lang = 'fa', stem = True):
    if(lang == 'en'):
        tokens = word_tokenize(raw_text)
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [word.lower() for word in tokens]
        porter = PorterStemmer()
        prepared_text = []
        for word in tokens:
            if(word not in english_stopwords):
                if(stem):
                    prepared_text.append(porter.stem(word))
                else:
                    prepared_text.append(word)
        return prepared_text
    
    elif(lang == 'fa'):
        normalizer = Normalizer()
        normalized_text = normalizer.normalize(raw_text)
        tokenizer = WordTokenizer()
        tokenized_text = tokenizer.tokenize(normalized_text)
        ps = PersianStemmer()
        prepared_text = []
        for word in tokenized_text:
            if(word[0] >= "آ" and word[0] <= "ی" and word not in persian_stopwords):
                if(stem):
                    prepared_text.append(ps.run(word))
                else:
                    prepared_text.append(word)
        return prepared_text

raw_text = input()
print(prepare_text(raw_text, 'en'))

KeyboardInterrupt: Interrupted by user

<h1>Part 2 : Indexing</h1>
<div>In this section we build a function for positional indexing and biword indexing. We then save this indexes and also we have functions for adding and deleting docs in a dynamic way meaning that you don't need to repeat the indexing process from the beginning.</div>

In [65]:
positional_index = {}
def add_positional(term, docid, position, t):
    if(term not in positional_index.keys()):
        positional_index[term] = {}
        positional_index[term][int(docid)] = {}
        positional_index[term][int(docid)][t] = [position]
    else:
        if(int(docid) not in positional_index[term].keys()):
            positional_index[term][int(docid)] = {}
            positional_index[term][int(docid)][t] = [position]
        else:
            if(t not in positional_index[term][int(docid)]):
                positional_index[term][int(docid)][t] = [position]
            else:
                positional_index[term][int(docid)][t].append(position)

In [66]:
docids = []
def construct_positional_indexes(docs_path, data_format = 'xml'):
    if(data_format == 'xml'):
        mydoc = minidom.parse(docs_path)
        texts = mydoc.getElementsByTagName('text')
        titles = mydoc.getElementsByTagName('title')
        ids = mydoc.getElementsByTagName('id')
        tree = ET.parse(docs_path)
        root = tree.getroot()
        for child in root:
            for c in child:
                if(c.tag == '{http://www.mediawiki.org/xml/export-0.10/}id'):
                    docids.append(int(c.text))
        for i in range(len(texts)):
            A = prepare_text(titles[i].firstChild.data)
            B = prepare_text(texts[i].firstChild.data)
            for j in range(len(A)):
                add_positional(A[j], docids[i], j, 'title')
            for j in range(len(B)):
                add_positional(B[j], docids[i], j, 'text')
    
    elif(data_format == 'csv'):
        data = pd.read_csv(docs_path)
        descs = data.iloc[:, 1].values
        titles = data.iloc[:, 14].values
        for i in range(len(descs)):
            docids.append(i)
            A = prepare_text(titles[i], 'en')
            B = prepare_text(descs[i], 'en')
            for j in range(len(A)):
                add_positional(A[j], i, j, 'title')
            for j in range(len(B)):
                add_positional(B[j], i, j, 'text')

In [67]:
construct_positional_indexes('persian.xml')
construct_positional_indexes('ted_talks.csv', 'csv')

In [252]:
def get_posting_list(term):
    return sorted(list(positional_index[term].keys()))

In [253]:
term = input()
print(get_posting_list(term))

بازی
[3014, 4388, 5403, 5786, 6647]


In [257]:
def get_position(term):
    return positional_index['term']

In [258]:
term = input()
print(get_position(term))

بازی
{166: {'text': [8]}, 869: {'text': [1]}, 884: {'text': [29]}, 1106: {'text': [34]}, 1861: {'text': [1]}, 1899: {'text': [40]}, 2325: {'text': [33]}, 2426: {'title': [7], 'text': [36, 39]}, 2517: {'text': [31]}}


In [210]:
bigram_index = {}
def add_bigram(word):
    new_word = "$" + word + "$"
    for i in range(len(new_word) - 1):
        bi = new_word[i] + new_word[i + 1]
        if(bi not in bigram_index.keys()):
            bigram_index[bi] = [word]
        else:
            if(word not in bigram_index[bi]):
                bigram_index[bi].append(word)

In [211]:
def construct_bigram_indexes(docs_path, data_format = 'xml'):
    if(data_format == 'xml'):
        mydoc = minidom.parse(docs_path)
        texts = mydoc.getElementsByTagName('text')
        titles = mydoc.getElementsByTagName('title')
        for i in range(len(texts)):
            A = prepare_text(titles[i].firstChild.data, 'fa', False)
            B = prepare_text(texts[i].firstChild.data, 'fa', False)
            for j in range(len(A)):
                add_bigram(A[j])
            for j in range(len(B)):
                add_bigram(B[j])
                
    elif(data_format == 'csv'):
        data = pd.read_csv(docs_path)
        descs = data.iloc[:, 1].values
        titles = data.iloc[:, 14].values
        for i in range(len(descs)):
            A = prepare_text(titles[i], 'en', False)
            B = prepare_text(descs[i], 'en', False)
            for j in range(len(A)):
                add_bigram(A[j])
            for j in range(len(B)):
                add_bigram(B[j])

In [212]:
construct_bigram_indexes('persian.xml')
construct_bigram_indexes('ted_talks.csv', 'csv')

In [259]:
def find_bigram(bigram):
    return bigram_index[bigram]

In [261]:
bigram = input()
print(find_bigram(bigram))

با
['بازی', 'مهاباد', 'زبان', 'بار', 'داشته_باشد', 'بادغیس', 'بامیان', 'باستان', 'آلبانی', 'الباسان', 'بانتو', 'با', 'باورها', 'رده|باور', 'انبار-رده|Belief', 'زبان\u200cشناسی', 'بیابانگرد', 'بیابان', 'جبالیه', 'زبان\u200cهای', 'بمباران', 'باجلانی', 'باتنه', 'باغ', 'باده', 'صهبا', 'انبار', 'باقی', 'جلال\u200cآباد', 'سبالو', 'بابازار', 'ضرباهنگ\u200cهای', 'بالیکسیر', 'بارتین', 'بایبورد', 'بابل', 'باشگاه', 'بازی\u200cها', 'باستانی', 'راوبال', 'ویکی\u200cانبار-رده|Geography', 'کشور|آلبانی', 'انبار-رده|Albania', 'فرانسه\u200cزبانی', 'بالکان', 'آلبانیایی\u200cزبان', 'انبار-رده|Country', 'انبار-رده|Maps', 'انبار-رده|Countries', 'انبار-رده|Provinces', 'انبار-رده|Asia', 'انبار-رده|Europe', 'انبار|Category', 'انبار-رده|Caucasus', 'انبار-رده|Middle', 'بادن-وورتمبرگ', 'بایرن', 'انبار-رده|Germany', 'آلمانی\u200cزبان', 'ژرمنی\u200cزبان', 'انبار-رده|Members', 'ویکی\u200cانبار-رده|Rivers', 'انبار-رده|Geography', 'انبار-رده', 'دربارهٔ', 'باغ\u200cهای', 'الفبا', 'درباره', 'کردی\u200cزبان', 'ارمنی\u200c

In [None]:
def add_document_to_indexes(docs_path, doc_num, file_format = 'xml'):
    if(file_format == 'xml'):
        mydoc = minidom.parse(docs_path)
        texts = mydoc.getElementsByTagName('text')
        titles = mydoc.getElementsByTagName('title')
        if(doc_num not in docids):
            docs = []
            tree = ET.parse(docs_path)
            root = tree.getroot()
            for child in root:
                for c in child:
                    if(c.tag == '{http://www.mediawiki.org/xml/export-0.10/}id'):
                        docs.append(int(c.text))
            i = docs.index(doc_num)
            A = prepare_text(titles[i].firstChild.data)
            B = prepare_text(texts[i].firstChild.data)
            for j in range(len(A)):
                add_positional(A[j], doc_num, j, 'title')

            for j in range(len(B)):
                add_positional(B[j], doc_num, j, 'text')
            docids.append(doc_num)
            
    elif(file_format == 'csv'):
        data = pd.read_csv(docs_path)
        descs = data.iloc[:, 1].values
        titles = data.iloc[:, 14].values
        if(doc_num not in docids):
            docs = []
            for i in range(len(descs)):
                docs.append(i)
            i = docs.index(doc_num)
            A = prepare_text(titles[i])
            B = prepare_text(descs[i])
            for j in range(len(A)):
                add_positional(A[j], doc_num, j, 'title')

            for j in range(len(B)):
                add_positional(B[j], doc_num, j, 'text')
            docids.append(doc_num)

In [None]:
add_document_to_indexes('persian.xml', 3022)

In [96]:
def delete_document_from_indexes(docs_path, doc_num, file_format = 'xml'):
    if(file_format == 'xml'):
        mydoc = minidom.parse(docs_path)
        texts = mydoc.getElementsByTagName('text')
        titles = mydoc.getElementsByTagName('title')
        if(doc_num in docids):
            docs = []
            tree = ET.parse(docs_path)
            root = tree.getroot()
            for child in root:
                for c in child:
                    if(c.tag == '{http://www.mediawiki.org/xml/export-0.10/}id'):
                        docs.append(int(c.text))
            i = docs.index(doc_num)
            A = prepare_text(titles[i].firstChild.data)
            B = prepare_text(texts[i].firstChild.data)
            for j in range(len(A)):
                if(A[j] in positional_index.keys()):
                    if(doc_num in positional_index[A[j]]):
                        del positional_index[A[j]][doc_num]
                    if(len(positional_index[A[j]].keys()) == 0):
                        del positional_index[A[j]]

            for j in range(len(B)):
                if(B[j] in positional_index.keys()):
                    if(doc_num in positional_index[B[j]]):
                        del positional_index[B[j]][doc_num]
                    if(len(positional_index[B[j]].keys()) == 0):
                        del positional_index[B[j]]
            docids.remove(doc_num)
            
    elif(file_format == 'csv'):
        data = pd.read_csv(docs_path)
        descs = data.iloc[:, 1].values
        titles = data.iloc[:, 14].values
        if(doc_num in docids):
            docs = []
            for i in range(len(descs)):
                docs.append(i)
            i = docs.index(doc_num)
            A = prepare_text(titles[i])
            B = prepare_text(descs[i])
            for j in range(len(A)):
                if(A[j] in positional_index.keys()):
                    if(doc_num in positional_index[A[j]]):
                        del positional_index[A[j]][doc_num]
                    if(len(positional_index[A[j]].keys()) == 0):
                        del positional_index[A[j]]

            for j in range(len(B)):
                if(B[j] in positional_index.keys()):
                    if(doc_num in positional_index[B[j]]):
                        del positional_index[B[j]][doc_num]
                    if(len(positional_index[B[j]].keys()) == 0):
                        del positional_index[B[j]]
            docids.remove(doc_num)

In [None]:
delete_document_from_indexes('persian.xml', 3022)

In [273]:
import json
    
def save_index(destination, index):
    j = json.dumps(index)
    f = open(destination,"w")
    f.write(j)
    f.close()

In [213]:
save_positional_index('positional.json', positional_index)
save_bigram_index('bigram.json', bigram_index)

In [None]:
def load_index(source):
    with open(source) as json_file:
        return json.load(json_file)

In [None]:
load_positional_index('positional.json')
load_bigram_index('bigram.json')

<h1>Part 3 : Index compression</h1>
<div>In this section we are going to use variable byte coding and gamma coding to compress our positional index postiong lists and compare the index size in all 3 methods. We also need to write a decode function for both variable byte coding and gamma coding technique.</div>

In [267]:
from sys import getsizeof
from objsize import get_deep_size

print(get_deep_size(positional_index))

45674344


In [269]:
from math import log

def log2(x):
    return log(x, 2)

def unary(N):
    return N*"1" + "0"

def binary(x, l = 1): 
    s = '{0:0%db}' % l 
    return s.format(x)

def gamma(N):
    if(N == 0):
        return '2'
    if(N == 1):
        return '0'
    n = int(log2(N)) 
    b = N - 2**(int(log2(N))) 
    l = int(log2(N)) 
    return unary(n) + binary(b, l) 
    
def gamma_coding(index):
    result = {}
    for term in index:
        for doc in index[term]:
            for t in index[term][doc]:
                posting_list = index[term][doc][t]
                temp = gamma(posting_list[0])
                for i in range(1, len(posting_list)):
                    temp += gamma(posting_list[i] - posting_list[i - 1])
                if(term in result):
                    if(doc in result[term]):
                        result[term][doc][t] = temp
                    else:
                        result[term][doc] = {}
                        result[term][doc][t] = temp
                else:
                    result[term] = {}
                    result[term][doc] = {}
                    result[term][doc][t] = temp
    return result

In [271]:
gamma_coding_positional_index = gamma_coding(positional_index)
print(get_deep_size(gamma_coding_positional_index))

42771238


In [297]:
save_index('gamma_encoded.json', gamma_coding_positional_index)

In [275]:
def variable_byte(N):
    b = binary(N)
    temp = ""
    k = 0
    for i in range(1, len(b) + 1):
        if(k == 7):
            temp = '0' + temp
            k = 0
        temp = b[-i] + temp
        k += 1
    for j in range(8 - k):
        temp = '0' + temp
    temp = temp[:-8] + '1' + temp[-7:]
    return temp

def variable_byte_code(index):
    result = {}
    for term in index:
        for doc in index[term]:
            for t in index[term][doc]:
                posting_list = index[term][doc][t]
                temp = variable_byte(posting_list[0])
                for i in range(1, len(posting_list)):
                    temp += variable_byte(posting_list[i] - posting_list[i - 1])
                if(term in result):
                    if(doc in result[term]):
                        result[term][doc][t] = temp
                    else:
                        result[term][doc] = {}
                        result[term][doc][t] = temp
                else:
                    result[term] = {}
                    result[term][doc] = {}
                    result[term][doc][t] = temp
    return result

In [276]:
variable_byte_coding_positional_index = variable_byte_code(positional_index)
print(get_deep_size(variable_byte_coding_positional_index))

{3014: {'title': '10000000'}, 4388: {'title': '10000001'}, 5403: {'text': '10011110'}, 5786: {'text': '100100101000010110001000'}, 6647: {'title': '10000000', 'text': '10000000'}}
43431906


In [296]:
save_index('variable_byte_encoded.json', variable_byte_coding_positional_index)

In [281]:
def decode_gamma(code):
    if(code[0] == 0):
        return 0
    k = 0
    for i in range(len(code)):
        if(code[i] == '1'):
            k += 1
        elif(code[i] == '0'):
            break
    return int('1' + code[k + 1:], 2)
    
def decode_gamma_coding(encoded):
    result = {}
    for term in encoded:
        for doc in encoded[term]:
            for t in encoded[term][doc]:
                a = 0
                posting_list = []
                code = encoded[term][doc][t]
                while(True):
                    if(len(code) == 0):
                        break
                    if(code[0] == '0'):
                        a += 1
                        posting_list.append(a)
                        code = code[1:]
                    elif(code[0] == '2'):
                        posting_list.append(a)
                        code = code[1:]
                    else:
                        k = 0
                        for i in range(len(code)):
                            if(code[i] == '1'):
                                k += 1
                            else:
                                break
                        temp = code[0: 2*k + 1]
                        code = code[2*k + 1:]
                        a = a + decode_gamma(temp)
                        posting_list.append(a)
                if(term in result):
                    if(doc in result[term]):
                        result[term][doc][t] = posting_list
                    else:
                        result[term][doc] = {}
                        result[term][doc][t] = posting_list
                else:
                    result[term] = {}
                    result[term][doc] = {}
                    result[term][doc][t] = posting_list
    return result

In [286]:
decoded_gamma = decode_gamma_coding(gamma_coding_positional_index)

{3014: {'title': [0]}, 4388: {'title': [1]}, 5403: {'text': [30]}, 5786: {'text': [18, 23, 31]}, 6647: {'title': [0], 'text': [0]}}


In [287]:
def compare_index(index1, index2):
    if(list(index1.keys()) != list(index2.keys())):
        return False
    flag = True
    for key in index1:
        if(index1[key] != index2[key]):
            flag = False
            break
    return flag

In [289]:
print(compare_index(positional_index, decoded_gamma))

True


In [292]:
def decode_variable_byte(index):
    result = {}
    for term in index:
        for doc in index[term]:
            for t in index[term][doc]:
                code = index[term][doc][t]
                posting_list = []
                a = 0
                r = ""
                while(True):
                    if(len(code) == 0):
                        break
                    temp = code[0:8]
                    code = code[8:]
                    r += temp[1:8]
                    if(temp[0] == '1'):
                        a += int(r, 2)
                        posting_list.append(a)
                        r = ""
                if(term in result):
                    if(doc in result[term]):
                        result[term][doc][t] = posting_list
                    else:
                        result[term][doc] = {}
                        result[term][doc][t] = posting_list
                else:
                    result[term] = {}
                    result[term][doc] = {}
                    result[term][doc][t] = posting_list
    return result

In [293]:
decoded_variable_byte = decode_variable_byte(variable_byte_coding_positional_index)

In [295]:
print(compare_index(positional_index, decoded_variable_byte))

True


<h1>Part 4 : Query correction</h1>
<div>In this section we use the bigram indexing to correct spelling errors in the query. We first use the jaccard distance to find the most likely cases of spelling correction and from that set we use the edit distant measure to find the best answer.</div>

In [214]:
def jakard_distance(word1, word2):
    w1 = "$" + word1 + "$"
    w2 = "$" + word2 + "$"
    bi1 = []
    bi2 = []
    for i in range(len(w1) - 1):
        temp = w1[i] + w1[i + 1]
        if(temp not in bi1):
            bi1.append(temp)
            
    for i in range(len(w2) - 1):
        temp = w2[i] + w2[i + 1]
        if(temp not in bi2):
            bi2.append(temp)
    
    U = []
    M = []
    for bi in (bi1 + bi2):
        if(bi not in U):
            U.append(bi)
    
    for bi in bi1:
        if(bi in bi2):
            M.append(bi)
    jakard = len(M) / len(U)
    return jakard

def edit_distance(word1, word2):
    minimum = min(len(word1), len(word2))
    maximum = max(len(word1), len(word2))
    distance = 0
    for i in range(minimum):
        if(word1[i] != word2[i]):
            distance += 1
    distance += (maximum - minimum)
    return distance

In [306]:
print(jakard_distance('information', 'informant'))
print(edit_distance('modern', 'mortem'))

0.4666666666666667
4


In [218]:
def correct_query(query):
    words = query.split(" ")
    correct_query = ""
    for word in words:
        if word in positional_index.keys():
            correct_query += word
            correct_query += " "
        else:
            all1 = []
            temp = "$" + word + "$"
            for i in range(len(temp) - 1):
                bi = temp[i] + temp[i + 1]
                if(bi in bigram_index):
                    for bigram in bigram_index[bi]:
                        if(bigram not in all1):
                            all1.append(bigram)
            jakard = []
            for a in all1:
                jakard.append(jakard_distance(a, word))
            J = sorted(jakard, reverse = True)
            maximum = J[9]
            candidates = []
            for i in range(len(jakard)):
                if(jakard[i] >= maximum):
                    candidates.append(all1[i])
            distance = []
            for c in candidates:
                distance.append(edit_distance(c, word))
            minimum = min(distance)
            for i in range(len(distance)):
                if(distance[i] == minimum):
                    result = candidates[i]
                    break
            correct_query += result
            correct_query += " "
                
    return correct_query[0:-1]

In [226]:
print(correct_query('باظیابی اتلاعاط'))
print(correct_query('اشتان شیشتان'))
print(correct_query('moderm infornation retreval'))
print(correct_query('susspicious spac ship in afkhanistan'))

بازی اطلاعات
استان شیروان
modern information retreat
suspicious space ship in afghanistan


<h1>Part 5 : Query search</h1>
<div>In this section we implement a weighted search method based on tf-idf and also an approximate search function with a specified window size.<div>

In [340]:
import math
import numpy as np

def search(query, lang = 'fa', weight=2, returned = 10, all_docs = None):
    relevant_docs = []
    N = len(docids)
    t = len(positional_index.keys())

    idf = []
    words = prepare_text(query, lang)
#     essential = prune_text(query)
    essential = []
            
    q = []
    all1 = []
    for word in positional_index.keys():
        all1.append(word)
        

    query_terms = []
    for word in words:
        if(word not in query_terms):
            query_terms.append(word)

    q1 = []
    for term in query_terms:
        q1.append(1 + math.log(words.count(term), 10))
    

    for term in query_terms:

        temp1 = 0
        for d in docids:
            if(d in positional_index[term].keys()):
                temp1 += 1

        idf.append(math.log(N / (temp1), 10))
        
    score = []
    docids1 = docids
    if(all_docs != None):
        docids1 = all_docs
    
    for i in docids1:
        if(lang == 'en' and i > 3000):
            continue
        if(lang == 'fa' and i < 3000):
            continue
        flag = False
        f = True
            
        if(len(essential) != 0):
            for word in essential:
                if(i not in positional_index[word]):
                    flag = True
                    break
                else:
                    if('text' not in positional_index[word][i].keys()):
                        flag = True
                        break
            if(flag == True):
                continue
            A = positional_index[essential[0]][i]['text'] 
            f = False
            for j in A:
                temp = 0
                for k in range(len(essential)):
                    if((j + k) in positional_index[essential[k]][i]['text']):
                        temp += 1
                if(temp == len(essential)):
                    f = True
                    break

        if(flag == True):
            continue
        
        if(f == False):
            continue
                
        s = 0
        length_title = 0
        length_text = 0
        for term in query_terms:
            if(i not in positional_index[term]):
                length_title += 0
                length_text += 0
            else:
                if('title' not in positional_index[term][i].keys()):
                    length_title += 0
                else:
                    length_title += (len(positional_index[term][i]['title'])) ** 2

                if('text' not in positional_index[term][i].keys()):
                    length_text += 0
                else:
                    length_text += (len(positional_index[term][i]['text'])) ** 2
        length_title = math.sqrt(length_title)
        length_text = math.sqrt(length_text)
        v = 0
        for term in query_terms:
            if(i not in positional_index[term]):
                s += 0
            else:
                if('title' not in positional_index[term][i].keys()):
                    s += 0
                else:
                    temp = len(positional_index[term][i]['title'])
                    s += weight * ((1 + math.log(temp, 10)) * idf[v] * q1[v]) / length_title

                if('text' not in positional_index[term][i].keys()):
                    s += 0
                else:
                    temp = len(positional_index[term][i]['text'])
                    s += ((1 + math.log(temp, 10)) * idf[v] * q1[v]) / length_text
            v += 1
        score.append([i, s])

    result = sorted(score, key = lambda x:x[1], reverse = True)
    z = min(returned, len(result))
    for i in range(z):
        relevant_docs.append(result[i][0])
        
    return relevant_docs

In [341]:
print(search("walking a dog", 'en'))
print(search("بازی فوتبال"))

[277, 278, 1771, 2498, 2455, 1805, 1489, 331, 822, 1398]
[6647, 6417, 6418, 6753, 3014, 4388, 5403, 7085, 5786, 3016]


In [342]:
def proximate_search(query, window, lang = 'fa'):
    words = prepare_text(query, lang)
    docs = []
    relevant_docs = []
    for word in words:
        docs.append(set(positional_index[word].keys()))
    all_docs = set.intersection(*docs)
    for doc in all_docs:
        posting_lists = []
        flag = False
        for word in words:
            if('text' in positional_index[word][doc]):
                posting_lists.append(positional_index[word][doc]['text'])
            else:
                flag = True
                break
        if(flag):
            continue
        for p in posting_lists[0]:
            flag2 = False
            for i in range(1, len(posting_lists)):
                if(len(list(filter(lambda x : x <= (p + window) and x >= (p - window), posting_lists[i]))) == 0):
                    flag2 = True
                    break
            if(flag2):
                continue
            else:
                relevant_docs.append(doc)
                break
    return search(query, lang, all_docs = relevant_docs)

In [345]:
print(proximate_search("demonic lyrics", 2, 'en'))
print(proximate_search("rest world", 4, 'en'))

[43]
[1697, 2231, 1342, 63]
