In [2]:
!pip install ir-evaluation-py



In [1]:
import os
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
import operator
from ir_evaluation.effectiveness import effectiveness
import time
# nltk.download('stopwords')

C:\Users\Bap_Bap\anaconda3\lib\site-packages\numpy\.libs\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll
C:\Users\Bap_Bap\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
def serialize_sets(obj):
    if isinstance(obj, set):
        return list(obj)

    return obj

# 1. Process dataset

In [5]:
def text_process(content):
    stop_list = stopwords.words('english')
    stem = PorterStemmer()

    content = content.lstrip()
    content = re.sub('[^A-Za-z]+', ' ', content)
    tokens = nltk.word_tokenize(content)
  
    stem_tokens = [stem.stem(word) for word in tokens if word not in stop_list]
    clean_stem_tokens = ' '.join(map(str,  stem_tokens))

    shortword = re.compile(r'\W*\b\w{1,2}\b')
    clean_stem_tokens = shortword.sub('', clean_stem_tokens)

    return clean_stem_tokens

Create two folders to stores files of Cranfield dataset (cran/) and NFCorpus test (nfcorpus/)
For each dataset, create folders to store processed data:
- docs_files: a folder to store each raw doc in separated file
- processed_docs: a folder to store processed doc in separated file
- query_files: a folder to store each raw query in separated file
- vsm_result: a folder to store ranked list of VSM model of each doc in separated file
- prob_result: a folder to store ranked list of VSM model of each doc in separated file

## 1.1. Cranfield 1400

In [6]:
# store each doc in a seperated file
with open('./corpus/cran/cran.all.1400', 'r') as f:
    content = f.readlines()
    for line in content:
      if line.startswith('.I'):
          f_ = line.split('.I')[1]
          name = './cran/doc_files/' + f_[:-1].strip() + '.txt'
          f1 = open(name, 'w')
          f1.write(line)
          f1.close()
      else:
          with open(name, 'a') as f1:
              f1.write(line)
          f1.close()
    f.close()

In [7]:
# save processed doc
dir = os.listdir('./cran/doc_files/')
for d in dir:
    with open('./cran/doc_files/' + d, 'r') as f:
        content = f.readlines()
        flag = False
        content_ = ''
        for line in content:
            if line.startswith('.W'):
                flag = True
            if flag == True:
                if line.startswith('.W'):
                    continue
                else:
                    content_ = content_ + line
        new_content = text_process(content_)
        with open('./cran/processed_doc/' + d, 'w') as f1:
            f1.write(new_content)
            f1.close()
    f.close()

## 1.2. NFCorpus-test

In [8]:
# store each doc in a seperated file
with open('./corpus/nfcorpus/test.docs', 'r') as f:
    lines = f.readlines()
    for line in lines:
        id, doc = line.split('\t')
        f1 = open('./nfcorpus/doc_files/' + id + '.txt', 'w')
        f1.write(doc)
        f1.close()
    f.close()

In [10]:
# save processed doc
dir = os.listdir('./nfcorpus/doc_files/')
for d in dir:
    with open('./nfcorpus/doc_files/' + d, 'r') as f:
        content = f.read()
        new_content = text_process(content)
        with open('./nfcorpus/processed_doc/' + d, 'w') as f1:
            f1.write(new_content)
            f1.close()
    f.close()

# 2. Indexing

In [121]:
def create_vector_index(path):
    inverted_index = {} 
    dir = os.listdir(path)
    no_docs = len(dir)
    for d in dir:
        id = d.split('.')[0]
        f = open(path + d, 'r')
        content = f.read()
        f.close()
        tokens = nltk.word_tokenize(content)
        for token in tokens:
            if token not in inverted_index:
                inverted_index[token] = {}
            if id not in inverted_index[token]:
                inverted_index[token][id]= 1
            else:
                inverted_index[token][id] += 1
    doc = {}
    for term in inverted_index:
        idf = np.log10(no_docs/len(inverted_index[term]))
        for id in inverted_index[term]:
            if id not in doc:
                doc[id] = []
            inverted_index[term][id] = (1 + np.log10(inverted_index[term][id]))*idf
            doc[id].append(inverted_index[term][id])
    doc_norm = {}
    for id in doc:
        sqr = np.square(doc[id])
        sum_sqr = np.sum(sqr)
        norm = np.sqrt(sum_sqr)
        doc_norm[id] = norm
    for term in inverted_index:
        for id in inverted_index[term]:
            inverted_index[term][id] = inverted_index[term][id]/doc_norm[id]
    return inverted_index

In [122]:
def create_prob_index(path):
    inverted_index = {} 
    doc_len = {}
    dir = os.listdir(path)
    L_c = 0
    for d in dir:
        id = d.split('.')[0]
        f = open(path + d, 'r')
        content = f.read()
        f.close()
        tokens = nltk.word_tokenize(content)
        doc_length = len(tokens)
        doc_len[id] = doc_length
        L_c += doc_length
        for token in tokens:
            if token not in inverted_index:
                inverted_index[token] = {}
                inverted_index[token]['all'] = 0
            if id not in inverted_index[token]:
                inverted_index[token][id] = 1
            else: inverted_index[token][id] += 1
            inverted_index[token]['all'] += 1
    for term in inverted_index:
        inverted_index[term]['all'] =  inverted_index[term]['all']/L_c
        for id in inverted_index[term]:
            if id == 'all':
                continue
            inverted_index[term][id] = inverted_index[term][id]/doc_len[id]
    return inverted_index

## 2.1. Cranfield 1400

In [123]:
start_time = time.time()
index = create_vector_index('./cran/processed_doc/')
f = open('./cran/inverted_index_vect.txt', 'w')
f.write(json.dumps(index))
f.close()
print(time.time() - start_time)

1.255183219909668


In [124]:
start_time = time.time()
index= create_prob_index('./cran/processed_doc/')
f = open('./cran/inverted_index_prob.txt', 'w')
f.write(json.dumps(index))
f.close()
print(time.time() - start_time)

1.3352742195129395


## 2.2. NFCorpus-test

In [173]:
start_time = time.time()
index = create_vector_index('./nfcorpus/processed_doc/')
f = open('./nfcorpus/inverted_index_vect.txt', 'w')
f.write(json.dumps(index))
f.close()
print(time.time() - start_time)

4.070080757141113


In [172]:
start_time = time.time()
index = create_prob_index('./nfcorpus/processed_doc/')
f = open('./nfcorpus/inverted_index_prob.txt', 'w')
f.write(json.dumps(index))
f.close()
print(time.time() - start_time)

4.028642654418945


# 3. Retrieval

In [127]:
def text_process(content):
    stop_list = stopwords.words('english')
    stem = PorterStemmer()

    content = content.lstrip()
    content = re.sub('[^A-Za-z]+', ' ', content)
    tokens = nltk.word_tokenize(content)
  
    stem_tokens = [stem.stem(word) for word in tokens if word not in stop_list]
    clean_stem_tokens = ' '.join(map(str,  stem_tokens))

    shortword = re.compile(r'\W*\b\w{1,2}\b')
    clean_stem_tokens = shortword.sub('', clean_stem_tokens)

    return clean_stem_tokens

In [128]:
def process_query(qry):
    return text_process(qry)

In [129]:
def read_index(index_path):
    f = open(index_path, 'r')
    data = f.read()
    index = json.loads(data)
    f.close()

    return index

In [130]:
def vsm_retrieve(qry, index):
    rst = {}
    tokens = nltk.word_tokenize(qry)
    for token in tokens:
        if token not in index:
            continue
        for doc_id in index[token]:
            if doc_id not in rst:
                rst[doc_id] = index[token][doc_id]
            else:
                rst[doc_id] += index[token][doc_id]
    ranked_list = dict(sorted(rst.items(), key=operator.itemgetter(1),reverse=True))
    return list(ranked_list.keys())

In [145]:
def query_likelihood_retrieve(qry, index, l=0.5):
    rst = {}
    tokens = nltk.word_tokenize(qry)
    token_in = []
    for token in tokens:
        if token not in index:
            continue
        token_in.append(token)
        for doc_id in index[token]:
            if doc_id == 'all':
                continue
            rst[doc_id] = 0
    for id in rst:
        for token in token_in:
            if index[token].get(id) is None:
                rst[id] += np.log10((1-l)*index[token]['all'])
            else: rst[id] += np.log10(l*index[token][id] + (1-l)*index[token]['all'])
    ranked_list = dict(sorted(rst.items(), key=operator.itemgetter(1),reverse=True))
    return list(ranked_list.keys())

## 3.1. Cranfield 1400

In [132]:
#save each query in seperate file
f = open('./corpus/cran/cran.qry', 'r')
lines = f.readlines()
f.close()
flag = False
count = 1
for line in lines:
    if line.startswith('.I'):
        if flag == True:
            f = open('./cran/query_files/' + qry_id + '.txt', 'w')
            f.write(qry)
            f.close()
        qry_id = str(count)
        qry = ''
        count += 1
        flag = False
    elif line.startswith('.W'):
        flag = True
    else: qry += line
f = open('./cran/query_files/' + qry_id + '.txt', 'w')
f.write(qry)
f.close()

In [133]:
start_time = time.time()
index = read_index('./cran/inverted_index_vect.txt')
query_list = os.listdir('./cran/query_files/')
for name in query_list:
    qry_id = name.split('.')[0]
    f = open('./cran/query_files/' + name, 'r')
    qry = f.read()
    f.close()
    qry = process_query(qry)
    retrieval_result = vsm_retrieve(qry, index)
    f = open('./cran/vsm_result/' + name, 'w')
    f.write(json.dumps(retrieval_result))
    f.close()
print(time.time() - start_time)

1.5505785942077637


In [142]:
start_time = time.time()
index = read_index('./cran/inverted_index_prob.txt')
query_list = os.listdir('./cran/query_files/')
for name in query_list:
    qry_id = name.split('.')[0]
    f = open('./cran/query_files/' + name, 'r')
    qry = f.read()
    f.close()
    qry = process_query(qry)
    retrieval_result = query_likelihood_retrieve(qry, index, l=0.5)
    f = open('./cran/prob_result/' + name, 'w')
    f.write(json.dumps(retrieval_result))
    f.close()
print(time.time() - start_time)

4.739270448684692


## 3.2. NFCorpus-test

In [138]:
f = open('./corpus/nfcorpus/test.all.queries', 'r', encoding='utf-8')
lines = f.readlines()
f.close()
for line in lines:
    if line == '':
        continue
    qry_id, qry = line.split('\t')
    f = open('./nfcorpus/query_files/' + qry_id + '.txt', 'w', encoding='utf-8')
    f.write(qry)
    f.close()

In [136]:
start_time = time.time()
index = read_index('./nfcorpus/inverted_index_vect.txt')
query_list = os.listdir('./nfcorpus/query_files/')
for name in query_list:
    qry_id = name.split('.')[0]
    f = open('./nfcorpus/query_files/' + name, 'r', encoding='utf-8')
    qry = f.read()
    f.close()
    qry = process_query(qry)
    retrieval_result = vsm_retrieve(qry, index)
    f = open('./nfcorpus/vsm_result/' + name, 'w', encoding='utf-8')
    f.write(json.dumps(retrieval_result))
    f.close()
print(time.time() - start_time)

32.61549711227417


In [146]:
start_time = time.time()
index = read_index('./nfcorpus/inverted_index_prob.txt')
query_list = os.listdir('./nfcorpus/query_files/')
for name in query_list:
    qry_id = name.split('.')[0]
    f = open('./nfcorpus/query_files/' + name, 'r', encoding='utf-8')
    qry = f.read()
    f.close()
    qry = process_query(qry)
    retrieval_result = query_likelihood_retrieve(qry, index, l=0.5)
    f = open('./nfcorpus/prob_result/' + name, 'w', encoding='utf-8')
    f.write(json.dumps(retrieval_result))
    f.close()
print(time.time() - start_time)

1735.146465063095


# 4. Evaluation

In [147]:
# --> an object, which we can use all methods in it, is created
ir = effectiveness()

## 4.1. Cranfield 1400

### Mô hình không gian vector

In [148]:
ir = effectiveness()
interactions = {}

f = open('./corpus/cran/cranqrel', 'r')
lines = f.readlines()
f.close()
for line in lines:
    qry_id, doc_id, _ = line.split()
    if interactions.get(qry_id) is None:
        interactions[qry_id] = {}

    if interactions[qry_id].get('related_documents') is None:
        interactions[qry_id]['related_documents'] = set()
    interactions[qry_id]['related_documents'].add(doc_id)

In [149]:
dir = os.listdir('./cran/vsm_result/')
for d in dir:
    qry_id = d.split('.')[0]
    f = open('./cran/vsm_result/' + d, 'r')
    content = f.read()
    f.close()
    rst_list = json.loads(content)
    interactions[qry_id]['total_result'] = len(rst_list)
    for i in range(len(rst_list)):
        if rst_list[i] in interactions[qry_id]['related_documents']:
            if interactions[qry_id].get('visited_documents') is None:
                interactions[qry_id]['visited_documents'] = []
            interactions[qry_id]['visited_documents'].append(rst_list[i])
            if interactions[qry_id].get('visited_documents_orders') is None:
                interactions[qry_id]['visited_documents_orders'] = {}
            interactions[qry_id]['visited_documents_orders'][rst_list[i]] = i+1

In [150]:
for qry_id in interactions:
    if interactions[qry_id].get('total_result') is None:
        interactions[qry_id]['total_result'] =  '0'
    if interactions[qry_id].get('visited_documents') is None:
        interactions[qry_id]['visited_documents'] = []
    if interactions[qry_id].get('visited_documents_orders') is None:
        interactions[qry_id]['visited_documents_orders'] = {}

In [151]:
print ("Mean Average Precision:")
mean_ap = ir.mean_ap(interactions,['all'])
print(mean_ap)

Mean Average Precision:
{'all': {'count': 225, 'value': 0.38703378534464195}}


In [152]:
print ("Eleven Point - Interpolated Average Precision:")
print ("Recall => Precision")
iap = ir.iap(interactions)
for key in iap:
    print(key, ':', iap[key])

Eleven Point - Interpolated Average Precision:
Recall => Precision
0.0 : 0.7861359193438143
0.1 : 0.7512610302073218
0.2 : 0.6392701577396239
0.3 : 0.5329488259116696
0.4 : 0.4518462924363577
0.5 : 0.38672406643723584
0.6 : 0.2971547690926209
0.7 : 0.2209523838865997
0.8 : 0.18170204096079456
0.9 : 0.13069001005036904
1.0 : 0.11511882697965727


In [153]:
f = open('./cran/vsm_rst.txt', 'w')
f.write(json.dumps(interactions, default=serialize_sets))
f.close()

### Mô hình xác suất Query Likelihood

In [154]:
ir = effectiveness()
interactions = {}

f = open('./corpus/cran/cranqrel', 'r')
lines = f.readlines()
f.close()
for line in lines:
    qry_id, doc_id, _ = line.split()
    if interactions.get(qry_id) is None:
        interactions[qry_id] = {}

    if interactions[qry_id].get('related_documents') is None:
        interactions[qry_id]['related_documents'] = set()
    interactions[qry_id]['related_documents'].add(doc_id)

In [155]:
dir = os.listdir('./cran/prob_result/')
for d in dir:
    qry_id = d.split('.')[0]
    f = open('./cran/prob_result/' + d, 'r')
    content = f.read()
    f.close()
    rst_list = json.loads(content)
    interactions[qry_id]['total_result'] = len(rst_list)
    for i in range(len(rst_list)):
        if rst_list[i] in interactions[qry_id]['related_documents']:
            if interactions[qry_id].get('visited_documents') is None:
                interactions[qry_id]['visited_documents'] = []
            interactions[qry_id]['visited_documents'].append(rst_list[i])
            if interactions[qry_id].get('visited_documents_orders') is None:
                interactions[qry_id]['visited_documents_orders'] = {}
            interactions[qry_id]['visited_documents_orders'][rst_list[i]] = i+1

In [156]:
for qry_id in interactions:
    if interactions[qry_id].get('total_result') is None:
        interactions[qry_id]['total_result'] =  '0'
    if interactions[qry_id].get('visited_documents') is None:
        interactions[qry_id]['visited_documents'] = []
    if interactions[qry_id].get('visited_documents_orders') is None:
        interactions[qry_id]['visited_documents_orders'] = {}

In [157]:
print ("Mean Average Precision:")
mean_ap = ir.mean_ap(interactions,['all'])
print(mean_ap)

Mean Average Precision:
{'all': {'count': 225, 'value': 0.405603669730284}}


In [158]:
print ("Eleven Point - Interpolated Average Precision:")
print ("Recall => Precision")
iap = ir.iap(interactions)
for key in iap:
    print(key, ':', iap[key])

Eleven Point - Interpolated Average Precision:
Recall => Precision
0.0 : 0.8344000289198265
0.1 : 0.8006503280034587
0.2 : 0.6864603822848977
0.3 : 0.554665330586629
0.4 : 0.44945223825311714
0.5 : 0.3887348060319673
0.6 : 0.3113657273362951
0.7 : 0.23169529902585412
0.8 : 0.18918105618024614
0.9 : 0.13492137570221535
1.0 : 0.11857584570608407


In [159]:
f = open('./cran/prob_rst.txt', 'w')
f.write(json.dumps(interactions, default=serialize_sets))
f.close()

## 4.2. NFCorpus

### Mô hình không gian vector

In [160]:
ir = effectiveness()
interactions = {}

f = open('./corpus/nfcorpus/test.2-1-0.qrel', 'r')
lines = f.readlines()
f.close()
for line in lines:
    qry_id, _, doc_id, _ = line.split()
    if interactions.get(qry_id) is None:
        interactions[qry_id] = {}

    if interactions[qry_id].get('related_documents') is None:
        interactions[qry_id]['related_documents'] = set()
    interactions[qry_id]['related_documents'].add(doc_id)

In [161]:
dir = os.listdir('./nfcorpus/vsm_result/')
for d in dir:
    qry_id = d.split('.')[0]
    if qry_id not in interactions:
        continue
    f = open('./nfcorpus/vsm_result/' + d, 'r')
    content = f.read()
    f.close()
    rst_list = json.loads(content)
    interactions[qry_id]['total_result'] = len(rst_list)
    for i in range(len(rst_list)):
        if rst_list[i] in interactions[qry_id]['related_documents']:
            if interactions[qry_id].get('visited_documents') is None:
                interactions[qry_id]['visited_documents'] = []
            interactions[qry_id]['visited_documents'].append(rst_list[i])
            if interactions[qry_id].get('visited_documents_orders') is None:
                interactions[qry_id]['visited_documents_orders'] = {}
            interactions[qry_id]['visited_documents_orders'][rst_list[i]] = i+1

In [162]:
for qry_id in interactions:
    if interactions[qry_id].get('total_result') is None:
        interactions[qry_id]['total_result'] =  '0'
    if interactions[qry_id].get('visited_documents') is None:
        interactions[qry_id]['visited_documents'] = []
    if interactions[qry_id].get('visited_documents_orders') is None:
        interactions[qry_id]['visited_documents_orders'] = {}

In [163]:
print ("Mean Average Precision:")
mean_ap = ir.mean_ap(interactions,['all'])
print(mean_ap)

Mean Average Precision:
{'all': {'count': 323, 'value': 0.2027836490060701}}


In [164]:
print ("Eleven Point - Interpolated Average Precision:")
print ("Recall => Precision")
iap = ir.iap(interactions)
for key in iap:
    print(key, ':', iap[key])

Eleven Point - Interpolated Average Precision:
Recall => Precision
0.0 : 0.6096476250282464
0.1 : 0.43853398593582427
0.2 : 0.334218016124052
0.3 : 0.25722762149618733
0.4 : 0.19817650741372284
0.5 : 0.16648342869225694
0.6 : 0.12595960971004824
0.7 : 0.0916692814734618
0.8 : 0.08233790962959209
0.9 : 0.07083097659219478
1.0 : 0.06272026337196357


In [165]:
f = open('./nfcorpus/vsm_rst.txt', 'w')
f.write(json.dumps(interactions, default=serialize_sets))
f.close()

### Mô hình Query Likelihood

In [166]:
ir = effectiveness()
interactions = {}

f = open('./corpus/nfcorpus/test.2-1-0.qrel', 'r')
lines = f.readlines()
f.close()
for line in lines:
    qry_id, _, doc_id, _ = line.split()
    if interactions.get(qry_id) is None:
        interactions[qry_id] = {}

    if interactions[qry_id].get('related_documents') is None:
        interactions[qry_id]['related_documents'] = set()
    interactions[qry_id]['related_documents'].add(doc_id)

In [167]:
dir = os.listdir('./nfcorpus/prob_result/')
for d in dir:
    qry_id = d.split('.')[0]
    f = open('./nfcorpus/prob_result/' + d, 'r')
    if qry_id not in interactions:
        continue
    content = f.read()
    f.close()
    rst_list = json.loads(content)
    interactions[qry_id]['total_result'] = len(rst_list)
    for i in range(len(rst_list)):
        if rst_list[i] in interactions[qry_id]['related_documents']:
            if interactions[qry_id].get('visited_documents') is None:
                interactions[qry_id]['visited_documents'] = []
            interactions[qry_id]['visited_documents'].append(rst_list[i])
            if interactions[qry_id].get('visited_documents_orders') is None:
                interactions[qry_id]['visited_documents_orders'] = {}
            interactions[qry_id]['visited_documents_orders'][rst_list[i]] = i+1

In [168]:
for qry_id in interactions:
    if interactions[qry_id].get('total_result') is None:
        interactions[qry_id]['total_result'] =  '0'
    if interactions[qry_id].get('visited_documents') is None:
        interactions[qry_id]['visited_documents'] = []
    if interactions[qry_id].get('visited_documents_orders') is None:
        interactions[qry_id]['visited_documents_orders'] = {}

In [169]:
print ("Mean Average Precision:")
mean_ap = ir.mean_ap(interactions,['all'])
print(mean_ap)

Mean Average Precision:
{'all': {'count': 323, 'value': 0.2172095682373519}}


In [170]:
print ("Eleven Point - Interpolated Average Precision:")
print ("Recall => Precision")
iap = ir.iap(interactions)
for key in iap:
    print(key, ':', iap[key])

Eleven Point - Interpolated Average Precision:
Recall => Precision
0.0 : 0.6421587821183629
0.1 : 0.4648908812516916
0.2 : 0.35977965517624294
0.3 : 0.2773239377317193
0.4 : 0.21195318144674277
0.5 : 0.1822185724843052
0.6 : 0.13751834797414308
0.7 : 0.10037797675996199
0.8 : 0.08913046005549542
0.9 : 0.077174623744552
1.0 : 0.06912051190070122


In [171]:
f = open('./nfcorpus/prob_rst.txt', 'w')
f.write(json.dumps(interactions, default=serialize_sets))
f.close()