<h1>Part 1 : Data Pre-processing</h1>
<div>In this section we build a function that accepts raw text extracted from a document or a query and, after applying normalization, tokenization, stopword removal, and stemming, returns a list of tokens (a token stream).</div>

In [30]:
from __future__ import unicode_literals
from hazm import *
from PersianStemmer import PersianStemmer
from langdetect import detect
import nltk
# nltk.download('punkt')
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

def prepare_text(raw_text, lang = 'fa'):
    """Turn raw document or query text into a list of stemmed tokens.

    English (``lang == 'en'``): tokenize with ``word_tokenize``, keep purely
    alphabetic tokens, lowercase them, drop over-frequent tokens and stem
    the rest with ``PorterStemmer``.

    Persian (``lang == 'fa'``): normalize with ``Normalizer``, tokenize with
    ``WordTokenizer`` (both presumably supplied by the ``hazm`` star-import),
    keep tokens whose first character lies between "آ" and "ی", drop
    over-frequent tokens and stem the rest with ``PersianStemmer``.

    A token counts as over-frequent (a stopword) when the text has at least
    120 tokens and the token makes up 1/15 or more of them. Shorter texts
    are never filtered.

    Args:
        raw_text: The text to process.
        lang: ``'en'`` or ``'fa'`` (default).

    Returns:
        A list of stemmed tokens in their original order, or ``None`` when
        ``lang`` is neither ``'en'`` nor ``'fa'``.
    """
    from collections import Counter

    stopword_divisor = 15     # a token is a stopword at >= 1/15 of the text
    min_filter_length = 120   # texts shorter than this are not filtered

    def is_kept(word, counts, total):
        # Compare the token's share of the text, not its raw count: a raw
        # count is always >= 1 and can never be below 1/15. Integer
        # arithmetic avoids any dependence on the division semantics.
        return total < min_filter_length or counts[word] * stopword_divisor < total

    if lang == 'en':
        tokens = [word.lower() for word in word_tokenize(raw_text) if word.isalpha()]
        counts = Counter(tokens)
        total = len(tokens)
        porter = PorterStemmer()
        return [porter.stem(word) for word in tokens if is_kept(word, counts, total)]

    if lang == 'fa':
        normalized_text = Normalizer().normalize(raw_text)
        tokens = WordTokenizer().tokenize(normalized_text)
        # Frequencies are measured over every token, punctuation included.
        counts = Counter(tokens)
        total = len(tokens)
        ps = PersianStemmer()
        return [
            ps.run(word)
            for word in tokens
            # ``word and`` protects the first-character test from empty tokens.
            if word and "آ" <= word[0] <= "ی" and is_kept(word, counts, total)
        ]

    return None

# Manual check: read one line from standard input and print the token list
# produced by the English pipeline.
raw_text = input()
print(prepare_text(raw_text, 'en'))

KeyboardInterrupt: Interrupted by user

<h1>Part 2 : Indexing</h1>
<div>In this section we build functions for positional indexing and (character) bigram indexing. We then save these indexes, and we also provide functions for adding and deleting documents dynamically, meaning that you don't need to repeat the indexing process from the beginning.</div>

In [31]:
# Nested mapping: term -> docid -> field name -> list of positions.
positional_index = {}


def add_positional(term, docid, position, t):
    """Record one occurrence of ``term`` in ``positional_index``.

    Args:
        term: The token being indexed.
        docid: Identifier of the document containing the token.
        position: Position of the token inside the field.
        t: Name of the field the token came from (callers in this file
            pass ``'title'`` or ``'text'``).

    Any missing level of the nested mapping is created on demand, and the
    position is appended to the list for ``(term, docid, t)``.
    """
    postings_for_term = positional_index.setdefault(term, {})
    fields_for_doc = postings_for_term.setdefault(docid, {})
    fields_for_doc.setdefault(t, []).append(position)

In [32]:
# Ids of the documents currently held in the positional index.
docids = []


def construct_positional_indexes(docs_path):
    """Index every page of an XML dump into ``positional_index``.

    The file is expected to use the MediaWiki export-0.10 namespace. Each
    child of the root that has a direct ``id`` child is treated as one
    document. Its first ``title`` and first ``text`` descendants are run
    through ``prepare_text`` and every resulting token is stored with
    ``add_positional`` under the field name ``'title'`` or ``'text'``.
    The document id is appended to ``docids``.

    The id, title and text are read from the same page element, so they
    cannot drift apart. Pairing them by list index through the global
    ``docids`` breaks as soon as ``docids`` is non-empty before the call.

    Args:
        docs_path: Path of the XML file to index.
    """
    import xml.etree.ElementTree as ET

    ns = '{http://www.mediawiki.org/xml/export-0.10/}'

    def field_text(page, tag):
        # Empty elements such as <text/> have ``.text`` set to None.
        node = page.find('.//' + ns + tag)
        if node is None or not node.text:
            return ''
        return node.text

    for page in ET.parse(docs_path).getroot():
        id_node = page.find(ns + 'id')
        if id_node is None:
            # Not a document (for example the site-info header element).
            continue
        doc_id = id_node.text
        docids.append(doc_id)
        for field in ('title', 'text'):
            raw = field_text(page, field)
            tokens = prepare_text(raw) if raw else []
            for position, term in enumerate(tokens):
                add_positional(term, doc_id, position, field)


# Build the positional index from the Persian dump.
construct_positional_indexes('persian.xml')

In [36]:
# Inspect the result: list the indexed document ids
# (len(positional_index) gives the vocabulary size instead).
print(docids)

['3014', '3016', '3017', '3021', '3022', '3023', '3026', '3027', '3029', '3030', '3033', '3036', '3037', '3039', '3041', '3043', '3046', '3047', '3049', '3055', '3056', '3058', '3059', '3060', '3061', '3065', '3068', '3069', '3070', '3071', '3072', '3073', '3074', '3076', '3091', '3095', '3098', '3099', '3100', '3101', '3103', '3111', '3117', '3118', '3119', '3120', '3121', '3128', '3129', '3130', '3141', '3157', '3179', '3197', '3199', '3205', '3217', '3219', '3220', '3227', '3229', '3233', '3243', '3248', '3251', '3252', '3260', '3261', '3263', '3273', '3274', '3276', '3277', '3280', '3282', '3283', '3284', '3285', '3286', '3287', '3288', '3289', '3290', '3291', '3292', '3293', '3294', '3295', '3296', '3297', '3298', '3300', '3301', '3302', '3303', '3304', '3305', '3307', '3308', '3309', '3310', '3311', '3312', '3313', '3314', '3315', '3316', '3317', '3318', '3319', '3320', '3324', '3325', '3329', '3331', '3332', '3333', '3334', '3335', '3339', '3341', '3342', '3343', '3345', '3347',

In [None]:
# Maps a two-character sequence to the list of words containing it.
bigram_index = {}


def add_bigram(word):
    """Register ``word`` under each of its character bigrams.

    The word is wrapped in ``$`` markers first, so its first and last
    characters produce boundary bigrams. A word is listed at most once per
    bigram, in order of first insertion.
    """
    padded = "$" + word + "$"
    for left, right in zip(padded, padded[1:]):
        words = bigram_index.setdefault(left + right, [])
        if word not in words:
            words.append(word)

In [None]:
def construct_bigram_indexes(docs_path):
    """Feed every token of every title and text in the file to ``add_bigram``.

    All ``title`` and ``text`` elements of the XML file are collected, and
    the i-th title is processed together with the i-th text. Each is run
    through ``prepare_text`` and every resulting token is passed to
    ``add_bigram``.

    Args:
        docs_path: Path of the XML file to read.
    """
    from xml.dom import minidom

    def node_text(node):
        # An empty element such as <text/> has no child node at all, so
        # ``firstChild`` is None and must not be dereferenced.
        child = node.firstChild
        return child.data if child is not None else ''

    document = minidom.parse(docs_path)
    texts = document.getElementsByTagName('text')
    titles = document.getElementsByTagName('title')
    for i, text_node in enumerate(texts):
        for raw in (node_text(titles[i]), node_text(text_node)):
            tokens = prepare_text(raw) if raw else []
            for token in tokens:
                add_bigram(token)

# Build the bigram index from the Persian dump.
construct_bigram_indexes('persian.xml')

In [None]:
def add_document_to_indexes(docs_path, doc_num):
    """Add a single document from an XML dump to the positional index.

    Does nothing when ``doc_num`` is already listed in ``docids``. Otherwise
    the first child of the root that has a direct ``id`` child equal to
    ``doc_num`` is located (MediaWiki export-0.10 namespace). Its first
    ``title`` and ``text`` descendants are run through ``prepare_text`` and
    stored with ``add_positional``, and ``doc_num`` is appended to
    ``docids``.

    Args:
        docs_path: Path of the XML file containing the document.
        doc_num: Id of the document, as the string found in the file.

    Raises:
        ValueError: If no document with id ``doc_num`` exists in the file.
    """
    # Check first so an already-indexed document costs no parsing at all.
    if doc_num in docids:
        return

    import xml.etree.ElementTree as ET

    ns = '{http://www.mediawiki.org/xml/export-0.10/}'
    id_tag = ns + 'id'

    page = None
    for candidate in ET.parse(docs_path).getroot():
        if any(node.tag == id_tag and node.text == doc_num for node in candidate):
            page = candidate
            break
    if page is None:
        raise ValueError('document id %r not found in %r' % (doc_num, docs_path))

    for field in ('title', 'text'):
        node = page.find('.//' + ns + field)
        # Empty elements such as <text/> have ``.text`` set to None.
        raw = node.text if node is not None and node.text else ''
        tokens = prepare_text(raw) if raw else []
        for position, term in enumerate(tokens):
            add_positional(term, doc_num, position, field)

    # NOTE(review): only the positional index is updated here; confirm
    # whether the new terms should also be passed to add_bigram.
    docids.append(doc_num)

# Add document '3022' from the Persian dump (no effect if its id is already in docids).
add_document_to_indexes('persian.xml', '3022')

In [None]:
def delete_document_from_indexes(docs_path, doc_num):
    """Remove every posting of a document from the positional index.

    Does nothing when ``doc_num`` is not listed in ``docids``. Otherwise the
    document's entry is deleted from every term of ``positional_index``,
    terms left without any document are dropped, and ``doc_num`` is removed
    from ``docids``.

    The postings are removed straight from the index, so the source file is
    not re-parsed or re-tokenized. Deletion therefore still works when the
    file has moved, changed, or no longer contains the document.

    Args:
        docs_path: Kept for backward compatibility; the file is not read.
        doc_num: Id of the document to remove.
    """
    if doc_num not in docids:
        return

    # Iterate over a snapshot of the keys because terms may be deleted.
    for term in list(positional_index):
        postings = positional_index[term]
        if doc_num in postings:
            del postings[doc_num]
            if not postings:
                del positional_index[term]

    docids.remove(doc_num)


        

# Remove document '3022' from the positional index (no effect if its id is not in docids).
delete_document_from_indexes('persian.xml', '3022')

In [37]:
import json

def save_index(destination):
    """Write ``positional_index`` to ``destination`` as JSON.

    Args:
        destination: Path of the file to create or overwrite.
    """
    # Serialize before opening so a serialization error cannot leave a
    # truncated file behind.
    payload = json.dumps(positional_index)
    # The context manager closes the file even if the write fails.
    with open(destination, "w") as index_file:
        index_file.write(payload)

# Persist the current positional index to positional.json.
save_index('positional.json')

In [None]:
import json

def load_index(source):
    """Replace the module-level ``positional_index`` with the JSON in ``source``.

    Args:
        source: Path of a JSON file, such as one written by ``save_index``.
    """
    # Without this declaration the assignment below only creates a local
    # variable and the loaded index is discarded when the function returns.
    global positional_index
    with open(source) as json_file:
        positional_index = json.load(json_file)
    # NOTE(review): ``docids`` is not rebuilt from the loaded index; confirm
    # whether callers expect it to be restored as well.

# Read the saved index back from positional.json.
load_index('positional.json')