In [1]:
#Done
from hazm import *

In [2]:
#Done
import os
import math

In [3]:
#Done
def get_stopwords(directory):
    stopwords = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r') as f:
            for stopword in f.read().splitlines():
                if stopword not in stopwords:
                    stopwords.append(stopword)
    return stopwords        

In [4]:
#Done
def prepare_text(raw_text):
    prepared_text = []
    normalizer = Normalizer()
    normalized_text = normalizer.normalize(raw_text)
    tokenized_text = word_tokenize(normalized_text)
    stopwords = get_stopwords('stopwords')
    stemmer = Stemmer()
    
    for token in tokenized_text:
        if token not in stopwords:
            stemmed_text = stemmer.stem(token)
            if stemmed_text != "":
                prepared_text.append(stemmed_text)

    return prepared_text

In [5]:
#Done
def prepare_text(raw_text):
    prepared_text = []
    stopwords = get_stopwords('stopwords')
    
    normalizer = Normalizer()
    normalized_text = normalizer.normalize(raw_text)
    for stop in stopwords:
        normalized_text = normalized_text.replace(stop, " ")
    tokenized_text = word_tokenize(normalized_text)
    stemmer = Stemmer()
    
    for token in tokenized_text:
        if token not in stopwords:
            stemmed_text = stemmer.stem(token)
            if stemmed_text != "":
                prepared_text.append(stemmed_text)

    return prepared_text

In [6]:
#Done
from xml.dom.minidom import parse

In [7]:
#Done
# Reading Data (XML file) and store it in dom variable
dom = parse('project1_data/data/Persian.xml')

In [8]:
#Done
def get_title(raw_title):
    title = raw_title[0].childNodes[0].data
    return title

In [9]:
#Done
def get_text(raw_text):
    text = raw_text[0].childNodes[0].data
    return text

In [10]:
#Done
def get_id(raw_id):
    doc_id = raw_id[0].childNodes[0].data
    return doc_id

In [11]:
#Done
import json

In [12]:
#Done
positional_index = {}
doc_length = {} #number of words for each documents

def document_length(title_words_list, text_words_list, doc_id):
    all_words = []
    for word in title_words_list:
        if word not in all_words:
            all_words.append(word)
    
    for word in text_words_list:
        if word not in all_words:
            all_words.append(word)
    doc_length[doc_id] = len(all_words)
    

def construct_positional_indexes(docs_path):
    dom = parse(docs_path)
    pages = dom.getElementsByTagName('page')    
    for page in pages:
        title = get_title(page.getElementsByTagName('title'))
        text = get_text(page.getElementsByTagName('text'))
        doc_id = get_id(page.getElementsByTagName('id'))
        
        text_words_list = prepare_text(text)
        title_words_list = prepare_text(title)
        
        document_length(title_words_list, text_words_list, doc_id)
        
        for index, text_word in enumerate(text_words_list):
            if text_word not in positional_index:
                initializer = {doc_id: {'text': []}}
                positional_index[text_word] = initializer
            elif doc_id not in positional_index[text_word]:
                positional_index[text_word][doc_id] = {'text': []}
            elif 'text' not in positional_index[text_word][doc_id]:
                positional_index[text_word][doc_id]['text'] = []
            positional_index[text_word][doc_id]['text'].append(index)

            
        for index, title_word in enumerate(title_words_list):
            if title_word not in positional_index:
                initializer = {doc_id: {'title': []}}
                positional_index[title_word] = initializer
            elif doc_id not in positional_index[title_word]:
                positional_index[title_word][doc_id] = {'title': []}
            elif 'title' not in positional_index[title_word][doc_id]:
                positional_index[title_word][doc_id]['title'] = []
            positional_index[title_word][doc_id]['title'].append(index)
    

construct_positional_indexes('project1_data/data/Persian.xml')

In [13]:
#Done
def get_posting_list(word):
    # Make sure how to get posting list! From reading file or as an argument
    return positional_index[word]

In [14]:
def get_words_with_bigram(bigram):
    # WARNING: not based on slides     
    words = []
    for word in positional_index:
        if bigram in word:
            words.append(word)
    return words
# get_words_with_bigram('لا')

In [15]:
def get_words_with_bigram(bigram):
    # WARNING: based on slides    
    inverted_index = {}
    for word in positional_index:
        word_length = len(word)
        if word_length >= 2:
            for i in range(len(word) - 2):
                bi = word[i: i + 2]
                if bi not in inverted_index:
                    inverted_index[bi] = []
                if word not in inverted_index[bi]:
                    inverted_index[bi].append(word)
    return inverted_index[bigram]
                    
# get_words_with_bigram('لا')

In [17]:
def add_document_to_indexes(docs_path, doc_num):
    address = docs_path + '/' + doc_num + '.xml'
    construct_positional_indexes(address)

# add_document_to_indexes('data/wiki', 20)

In [18]:
def delete_document_from_indexes(docs_path, doc_num):
    address = docs_path + '/' + doc_num + '.xml'
    dom = parse(address)
    pages = dom.getElementsByTagName('page')    
    for page in pages:
        title = get_title(page.getElementsByTagName('title'))
        text = get_text(page.getElementsByTagName('text'))
        doc_id = get_id(page.getElementsByTagName('id'))
        
        text_words_list = prepare_text(text)
        title_words_list = prepare_text(title)
        
        for index, text_word in enumerate(text_words_list):
            if text_word not in positional_index:
                print('کلمه پیدا نشد')
            elif doc_id not in positional_index[text_word]:
                print('چنین کلمه‌ای با این شماره سند موجود نیست')
            else:
                positional_index[text_word].remove(doc_id)
            if len(positional_index[text_word]) == 0:
                del positional_index[text_word]
                
        for index, title_word in enumerate(title_words_list):
            if title_word not in positional_index:
                print('کلمه پیدا نشد')
            elif doc_id not in positional_index[title_word]:
                print('چنین کلمه‌ای با این شماره سند موجود نیست')
            else:
                positional_index[title_word].remove(doc_id)
            if len(positional_index[title_word]) == 0:
                del positional_index[title_word]

# delete_document_from_indexes('data/wiki', 10)

In [19]:
def save_index(destination):
    full_destination = destination + '.json'
    with open(full_destination, 'w') as f:
        json.dump(positional_index, f, ensure_ascii=False)

# save_index('storage/index_backup')

In [20]:
import re

In [110]:
def get_all_queries(directory):
    queries = []
    for filename in os.listdir(directory):
        with open (os.path.join(directory, filename), 'r') as f:
            query = f.readline()
            queries.append(query)
    return queries
queries = get_all_queries('project1_data/data/queries/')
for query in queries:
    print(parsing_query(query))

([], ['طبیعت', 'دامنه', 'کوه', 'ایرانی'])
(['علوم اجتماعی'], ['مطالعه', 'در', 'دانشگاه'])
([], ['هیتلر', 'در', 'جنگ', 'جهانی', 'اول'])
(['منظومه شمسی'], ['سیاره', 'های', 'بزرگ', '\n'])
([], ['جنگل', 'های', 'بلوط', 'ایران'])
([], ['زندگی', 'حیوانات', 'وحشی'])
([], ['مسابقات', 'فوتبال', 'المپیک'])
([], ['کشورهای', 'عضو', 'اتحادیه', 'آفریقا'])
([], ['کتاب', 'های', 'برگزیده', 'کودک', 'و', 'نوجوان'])
([], ['برنده', 'جایزه', 'بهترین', 'فیلم', 'در', 'جشنواره'])
([], ['ابزار', 'های', 'فضایی', 'و', 'پیشرفته', 'ناسا'])
([], ['سواحل', 'دریای', 'سرخ'])
(['خلیج فارس'], ['پایتخت', 'کشورهای', 'حوزه', ''])
([], ['کشورهای', 'دارای', 'نفت', 'در', 'خاورمینا'])
([], ['انتخابات', 'نمایندگان', 'مجلس', 'ایالتی', 'در', 'آمریکا'])
([], ['تاریخچه', 'هنر', 'نمایشی', 'در', 'ایران'])
(['باشگاه فوتبال'], ['', 'اروپایی'])
([], ['تاریخ', 'علوم', 'اجتماعی', 'در', 'اروپا'])
([], ['جاذبه', 'گردشگری', 'در', 'استان', 'کردستان'])
([], ['درمان', 'بیماری', 'افسردگی'])


In [199]:
def positional_intersect(phrase):
    phrases = prepare_text(phrase)
    
    if len(phrase) == 1:
        return get_posting_list(phrases[0])
    
    phrase_touples = []
    
    for i in range(len(phrases) - 1):
        phrase_touples.append((phrases[i], phrases[i + 1]))
    
    answer = {}
    
    for index, (phrase1, phrase2) in enumerate(phrase_touples):
        doc_list1 = {}
        if index == 0:
            doc_list1 = positional_index[phrase1]
            
        elif not answer:
            return answer
        
        else:
            doc_list1 = answer
            answer = {}

        try:
            doc_list_len1 = len(doc_list1)
            doc_list2 = positional_index[phrase2]
            doc_list_len2 = len(doc_list2)

        except:
            return answer
        i, j = 0, 0
        while i < doc_list_len1 and j < doc_list_len2:
            doc1_keys = list(doc_list1.keys())
            doc2_keys = list(doc_list2.keys())


            doc1_key = doc1_keys[i]
            doc2_key = doc2_keys[j]


            if doc1_key == doc2_key:
                doc_id = doc1_key
                contexts = ['title', 'text']
                for context in contexts:
                    try:
                        context_indices1 = doc_list1[doc1_key][context]
                        context_indices2 = doc_list2[doc2_key][context]
                    except:
                        continue


                    m, n = 0, 0
                    while m < len(context_indices1) and n < len(context_indices2):
                        first_phrase_index = context_indices1[m]
                        second_phrase_index = context_indices2[n]

                        if second_phrase_index - first_phrase_index == 1:
                            if doc_id not in answer:
                                answer[doc_id] = {}

                            if context not in answer[doc_id]:
                                answer[doc_id][context] = []

                            answer[doc_id][context].append(second_phrase_index)

                            m += 1
                            n += 1
                        elif first_phrase_index < second_phrase_index:
                            m += 1
                        else:
                            n += 1
                    i += 1
                    j += 1

            elif doc1_key < doc2_key:
                i += 1
            else:
                j += 1
                
#     print(answer)
    return answer


In [108]:
def get_phrase_occurence(phrase): #count the number of occurence of phrase in docuemnts
    documents = positional_intersect(phrase)
    document_occurence = len(list(documents.keys()))
    return document_occurence
print(get_phrase_occurence('خلیج فارس'))

58


In [152]:
def phrase_normalizer(double_quoted):
    new_double_quotes = []
    for quote in double_quoted:
        quote_list = prepare_text(quote)
        phrase = ""
        for quotes in quote_list:
            phrase += quotes + " "
        new_double_quotes.append(phrase.strip())
#         print(phrase)
    return new_double_quotes

def parsing_query(query):
    all_quotations = re.findall('"([^"]*)"', query)
    query = query.replace('"', "")
    for quoted in all_quotations:
        query = query.replace(quoted, "")
        query = query.replace('  ', " ")
    normalized_phrases = phrase_normalizer(all_quotations[:])
    not_sequential = prepare_text(query)
    return normalized_phrases, not_sequential

# double_quoted, not_sequential = parsing_query('سلام مردم "عزیز ایران" به برنامه "خندوانه هندوانه" خوش آمدید')

In [208]:
N = len(doc_length)

def get_quoted_posting_list(phrases):
    first_double_quote = phrases.pop()
    first_double_quote_info = positional_intersect(first_double_quote)
    first_double_quote_docs = list(first_double_quote_info.keys()) #docs containing the first double quote
    answer = first_double_quote_docs
    
    for double_quote in phrases:
        double_quote_info = positional_intersect(double_quote)
        if double_quote_info:            
            other_double_quote_docs = list(double_quote_info.keys())
            new_answer = []
            for ans_doc_id in answer:
                for doc_id_other in other_double_quote_docs:
                    if ans_doc_id == doc_id_other:
                        new_answer.append(ans_doc_id)
            answer = new_answer
        else:
            return []

    return answer


def get_dictionary(query_list):
    print('query_list', query_list)
    dic = {}
    for term in query_list:
        if term not in dic:
            dic[term] = 1
        else:
            dic[term] += 1
    return dic


def consine_score(query_list, method, weight, quoted=False):
    scores = {} #score of documents
    query_terms_occurence = get_dictionary(query_list) #terms occurence in query
    terms = list(query_terms_occurence.keys()) #all distinct terms in query
    print('terms', terms)
    for term in terms:
        doc_normalization = 0     
        if term in positional_index:
            docs_list = positional_index[term]
            tf_q = query_terms_occurence[term]
            document_occurence = len(docs_list)            
            idf = math.log(N / document_occurence)
            w_tq = tf_q * idf

            for doc in docs_list:
                tf = 0
                if 'title' in docs_list[doc]:
                    title_length = len(docs_list[doc]['title'])
                    tf += weight * (1 + math.log(title_length))
                elif 'text' in docs_list[doc]:
                    text_length = len(docs_list[doc]['text'])
                    tf += 1 + math.log(text_length)    
                w_td = tf
                doc_normalization += w_td * w_td
                if doc in scores:
                    scores[doc] += w_tq * w_td
                else:
                    scores[doc] = w_tq * w_td
    if method == 'ltc-lnc':
        doc_normalization = math.sqrt(doc_normalization)
        for doc in scores:
            scores[doc] /= doc_normalization
            
    scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return scores



def search(query, method="ltn-lnn", weight=2):
    double_quoted, not_sequential = parsing_query(query)
    if not double_quoted:
        prepared_query = prepare_text(query)
        return consine_score(prepared_query, method, weight)
    
    quoted = True #delete after refactoring
    if quoted:
        all_words = [] #new
        all_words.extend(double_quoted + not_sequential) #new
        docs_have_quoted_list = get_quoted_posting_list(double_quoted[:]) #new
        
        scores = {}
        query_terms_occurence = get_dictionary(all_words)
        terms = list(query_terms_occurence.keys()) #all distinct terms in query
        
        for term in terms:
            doc_normalization = 0
            if term in positional_index:
                docs_list = positional_index[term]
                tf_q = query_terms_occurence[term]
                document_occurence = len(docs_list)            
                idf = math.log(N / document_occurence)
                w_tq = tf_q * idf

                for doc in docs_list:
                    if doc in docs_have_quoted_list and quoted: #new
                        tf = 0
                        if 'title' in docs_list[doc]:
                            title_length = len(docs_list[doc]['title'])
                            tf += weight * (1 + math.log(title_length))
                        elif 'text' in docs_list[doc]:
                            text_length = len(docs_list[doc]['text'])
                            tf += 1 + math.log(text_length)    
                        w_td = tf
                        doc_normalization += w_td * w_td
                        if doc in scores:
                            scores[doc] += w_tq * w_td
                        else:
                            scores[doc] = w_tq * w_td
                        
            elif term in double_quoted: #new
                docs_list = positional_intersect(term)
                tf_q = query_terms_occurence[term]
                document_occurence = len(docs_list)
                idf = math.log(N / document_occurence)
                w_tq = tf_q * idf
                
                for doc in docs_list:
                    if doc in docs_have_quoted_list and quoted:
                        tf = 0
                        if 'title' in docs_list[doc]:
                            title_length = len(docs_list[doc]['title'])
                            tf += weight * (1 + math.log(title_length))
                        elif 'text' in docs_list[doc]:
                            text_length = len(docs_list[doc]['text'])
                            tf += 1 + math.log(text_length)    
                        w_td = tf
                        doc_normalization += w_td * w_td
                        if doc in scores:
                            scores[doc] += w_tq * w_td
                        else:
                            scores[doc] = w_tq * w_td
                            
        if method == 'ltc-lnc':
            doc_normalization = math.sqrt(doc_normalization)
            for doc in scores:
                scores[doc] /= doc_normalization

        scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1])}
        return scores

            



all_quotations []
query_list ['طبیع', 'دامنه', 'کوه', 'ایران']
terms ['طبیع', 'دامنه', 'کوه', 'ایران']
all_quotations ['علوم اجتماعی']
query_list ['علو اجتماع', 'مطالعه', 'در', 'دانشگاه']
all_quotations []
query_list ['هیتلر', 'در', 'جنگ', 'جهان', 'اول']
terms ['هیتلر', 'در', 'جنگ', 'جهان', 'اول']
all_quotations ['منظومه شمسی']
query_list ['منظومه شمس', 'سیاره', 'بزرگ']
all_quotations []
query_list ['جنگل', 'بلوط', 'ایر']
terms ['جنگل', 'بلوط', 'ایر']
all_quotations []
query_list ['زندگ', 'حیو', 'وحش']
terms ['زندگ', 'حیو', 'وحش']
all_quotations []
query_list ['مسابق', 'فوتبال', 'المپیک']
terms ['مسابق', 'فوتبال', 'المپیک']
all_quotations []
query_list ['کشور', 'عضو', 'اتحادیه', 'آفریقا']
terms ['کشور', 'عضو', 'اتحادیه', 'آفریقا']
all_quotations []
query_list ['کتاب', 'برگزیده', 'کودک', 'و', 'نوجو']
terms ['کتاب', 'برگزیده', 'کودک', 'و', 'نوجو']
all_quotations []
query_list ['برنده', 'جایزه', 'به', 'فیل', 'در', 'جشنواره']
terms ['برنده', 'جایزه', 'به', 'فیل', 'در', 'جشنواره']
all_quota

In [214]:
result = []
for i in range(len(queries)):
    result.append(search(queries[i]))
# search(queries[2], 'ltc-lnc')

{'3016': 1.641623965121526,
 '3017': 0.7822426152681052,
 '3022': 1.3244518785550305,
 '3023': 1.641623965121526,
 '3026': 3.274812137801675,
 '3027': 25.91943414154788,
 '3029': 5.767381660335244,
 '3030': 24.574279236958137,
 '3033': 6.261500596429558,
 '3036': 1.8666611418419556,
 '3037': 2.0412135370021955,
 '3039': 5.323972529886772,
 '3047': 1.3244518785550305,
 '3049': 1.641623965121526,
 '3059': 1.641623965121526,
 '3061': 1.3244518785550305,
 '3065': 0.7822426152681052,
 '3068': 1.8666611418419556,
 '3069': 2.9005995731976144,
 '3070': 0.7822426152681052,
 '3072': 1.3244518785550305,
 '3073': 1.3244518785550305,
 '3091': 0.7822426152681052,
 '3099': 13.081418709117564,
 '3103': 15.214493247379705,
 '3119': 19.3546949593205,
 '3120': 23.009015542816602,
 '3129': 1.8666611418419556,
 '3130': 0.7822426152681052,
 '3197': 7.736558544039125,
 '3199': 3.4684537469017758,
 '3205': 2.9005995731976144,
 '3220': 10.543073601545453,
 '3233': 1.641623965121526,
 '3248': 6.312271085538419,