In [1]:
import numpy as np
import json
import requests
import re
import sys
import time

from itertools import groupby
from joblib import dump, load
from sklearn import svm
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from elasticsearch import Elasticsearch

In [2]:
# Shared notebook state filled by Index.save_data:
#   paper_ids     - ids of the indexed papers, in indexing order
#   paper_authors - paper id -> that paper's author list
paper_ids, paper_authors = [], {}

In [3]:
def crawler(start_urls, limit):
    """Run the Scrapy spider registered as 'first' and block until it finishes.

    Args:
        start_urls: URLs handed to the spider as its ``start_urls`` argument.
        limit: value handed to the spider as its ``limit`` argument.
    """
    settings = get_project_settings()
    runner = CrawlerProcess(settings)
    spider_kwargs = {'start_urls': start_urls, 'limit': limit}
    runner.crawl('first', **spider_kwargs)
    # start() returns only once the crawl has completed.
    runner.start()

In [4]:
# start = time.time()
# crawler(start_urls = [
#             "https://www.semanticscholar.org/paper/The-Lottery-Ticket-Hypothesis%3A-Training-Pruned-Frankle-Carbin/f90720ed12e045ac84beb94c27271d6fb8ad48cf",
#             "https://www.semanticscholar.org/paper/Attention-is-All-you-Need-Vaswani-Shazeer/204e3073870fae3d05bcbc2f6a8e263d9b72e776",
#             "https://www.semanticscholar.org/paper/BERT%3A-Pre-training-of-Deep-Bidirectional-for-Devlin-Chang/df2b0e26d0599ce3e70df8a9da02e51594e0e992"
#         ], limit=2000)
# print(time.time() - start)

In [5]:
# Smoke test: confirm the local Elasticsearch node answers before indexing.
# requests has no default timeout, so without one this cell would hang
# indefinitely when the server is down or unreachable.
res = requests.get('http://localhost:9200', timeout=5)
print(res.content)

b'{\n  "name" : "DESKTOP-ROLDIVD",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "5oe-5bcqR7-aftBs4zCoOQ",\n  "version" : {\n    "number" : "7.7.1",\n    "build_flavor" : "unknown",\n    "build_type" : "unknown",\n    "build_hash" : "ad56dce891c901a492bb1ee393f12dfff473a423",\n    "build_date" : "2020-05-28T16:30:01.040088Z",\n    "build_snapshot" : false,\n    "lucene_version" : "8.5.1",\n    "minimum_wire_compatibility_version" : "6.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  "tagline" : "You Know, for Search"\n}\n'


In [6]:
# def es_iterate_all_documents(es, index, pagesize=350, **kwargs):
#     """
#     Helper to iterate over ALL documents stored in an index.
#     Yields the `_source` of every document, one page at a time.
#     """
#     offset = 0
#     while True:
#         result = es.search(index=index, **kwargs, body={
#             "size": pagesize,
#             "from": offset
#         })
#         hits = result["hits"]["hits"]
#         # Stop after no more docs
#         if not hits:
#             break
#         # Yield each entry
#         yield from (hit['_source'] for hit in hits)
#         # Continue from there
#         offset += pagesize

In [30]:
class Index:
    """Elasticsearch client bundled with the crawled papers loaded from a JSON file."""

    def __init__(self, host, port, data_dir):
        """Connect to Elasticsearch and load the papers.

        Args:
            host: Elasticsearch host name.
            port: Elasticsearch port.
            data_dir: path of the JSON file holding the list of papers
                (despite the name, this is opened as a file, not a directory).
        """
        self.es = Elasticsearch([{'host': host, 'port': port}])
        with open(data_dir, encoding="utf8") as json_file:
            self.papers = json.load(json_file)

    def delete(self, index_name='paper_index'):
        """Delete the index; 400/404 responses are ignored so a missing index is not an error."""
        self.es.indices.delete(index=index_name, ignore=[400, 404])

    def save_data(self, index_name='paper_index'):
        """Index every loaded paper and record it in the notebook-level lookup tables.

        Each paper is stored under its own id as ``{"paper": <paper>}``. The
        module-level ``paper_ids`` list and ``paper_authors`` dict are updated
        in place. An id is appended to ``paper_ids`` only once, so calling this
        method again (or loading a file with repeated papers) does not create
        duplicate entries, which would otherwise inflate the PageRank matrix.
        """
        already_listed = set(paper_ids)
        for paper in self.papers:
            paper_id = paper['id']
            if paper_id not in already_listed:
                paper_ids.append(paper_id)
                already_listed.add(paper_id)
            # A paper without an author list is stored with no authors instead
            # of aborting the whole indexing run.
            paper_authors[paper_id] = paper.get('authors') or []
            self.es.index(index=index_name, id=paper_id, body=json.dumps({"paper": paper}))

In [12]:
def calc_alpha(host, port, alpha, index_name='paper_index', tol=0.00001, max_iter=10000):
    """Compute PageRank over the indexed papers and store it on each document.

    The citation graph is built from every paper's ``references`` list
    (references to papers outside ``paper_ids`` are ignored). The resulting
    score is written back to Elasticsearch as ``paper.page_rank``.

    Args:
        host: Elasticsearch host name.
        port: Elasticsearch port.
        alpha: teleport probability (the surfer jumps to a uniformly random
            paper with this probability).
        index_name: index that holds the papers.
        tol: iteration stops once every component changes by less than this.
        max_iter: upper bound on power-iteration steps, so a non-converging
            chain (e.g. alpha == 0 on a periodic graph) cannot loop forever.

    Note: builds dense n x n matrices, where n == len(paper_ids).
    """
    es = Elasticsearch([{'host': host, 'port': port}])
    n = len(paper_ids)
    if n == 0:
        # Nothing indexed yet; avoids a division by zero below.
        print('no papers to rank')
        return

    # O(1) id -> row lookup; setdefault keeps the first position of an id,
    # matching what list.index() would return.
    position = {}
    for idx, paper_id in enumerate(paper_ids):
        position.setdefault(paper_id, idx)

    # adjacency[i][j] == 1 when paper i cites paper j. Rank must flow from the
    # citing paper to the cited one; the earlier version filled the transpose,
    # which rewarded papers for citing others instead of for being cited.
    adjacency = np.zeros((n, n))
    for paper_idx, paper_id in enumerate(paper_ids):
        entry = es.get(index=index_name, id=paper_id)['_source']['paper']
        for reference_id in entry.get('references') or ():
            reference_idx = position.get(reference_id)
            if reference_idx is not None:
                adjacency[paper_idx][reference_idx] = 1

    uniform = np.full((n, n), 1 / n)
    row_sums = np.sum(adjacency, axis=1, keepdims=True)
    has_links = row_sums > 0
    safe_sums = np.where(has_links, row_sums, 1)  # avoid 0/0 on dangling rows
    # Rows with outgoing links mix the normalized links with teleportation;
    # rows without any (dangling papers) jump uniformly.
    transition = np.where(
        has_links,
        (1 - alpha) * adjacency / safe_sums + alpha * uniform,
        uniform,
    )

    rank = np.full(n, 1 / n)
    for _ in range(max_iter):
        updated = rank @ transition
        converged = np.all(np.abs(updated - rank) < tol)
        rank = updated
        if converged:
            break
    else:
        print('warning: page rank did not converge after {} iterations'.format(max_iter))

    for idx, paper_id in enumerate(paper_ids):
        body = es.get(index=index_name, id=paper_id)['_source']
        # Plain float so the JSON serializer never has to deal with np.float64.
        body['paper']['page_rank'] = float(rank[idx])
        es.index(index=index_name, id=paper_id, body=body)
    print(rank)
    print(np.sum(rank))

In [13]:
def _build_search_body(title, title_weight, abstract, abstract_weight, date, date_weight,
                       with_page_rank=False, page_rank_weight=0):
    """Build the function_score request body shared by every search variant."""
    functions = [
        {"filter": {"match_phrase": {"paper.title": title}}, "weight": title_weight},
        {"filter": {"match_phrase": {"paper.abstract": abstract}}, "weight": abstract_weight},
        {"filter": {"range": {"paper.date": {"gte": date}}}, "weight": date_weight},
    ]
    if with_page_rank:
        page_rank_function = {
            "script_score": {
                "script": {
                    "source": "_score * saturation(doc['paper.page_rank'].value, 0.0001)"
                }
            }
        }
        # Only attach a weight when one was requested; the default of 0 keeps
        # the page-rank function unweighted, as it has always been.
        if page_rank_weight:
            page_rank_function["weight"] = page_rank_weight
        functions.append(page_rank_function)

    body = {
        "query": {
            "function_score": {
                "functions": functions,
                "score_mode": "sum",
                "boost": "5",
                "boost_mode": "multiply",
            }
        }
    }
    if not with_page_rank:
        body["sort"] = [{"_score": {"order": "desc"}}]
    return body


def _print_hits(response, numbered):
    """Print title, abstract, authors and date of each hit, optionally prefixed by its rank."""
    for idx, hit in enumerate(response['hits']['hits']):
        paper = hit['_source']['paper']
        fields = (paper['title'], '\n', paper['abstract'], '\n', paper['authors'], '\n', paper['date'])
        if numbered:
            print(idx, *fields)
        else:
            print(*fields)
        print('-' * 60)


def search(host, port, title, title_weight, abstract, abstract_weight, date, date_weight,
           use_page_rank, page_rank_weight=0, index_name='paper_index'):
    """Search the paper index and print the top 10 hits.

    Each of the title phrase, abstract phrase and minimum date contributes its
    weight to a document's score when it matches; the weights are summed.

    When ``use_page_rank`` is true, two result lists are printed: first the
    ranking that also adds the page-rank script score, then (after a ``*^``
    separator line) the plain ranking for comparison. Otherwise only the plain
    ranking is printed.

    Args:
        host, port: Elasticsearch address.
        title, abstract: phrases matched against ``paper.title`` / ``paper.abstract``.
        date: lower bound for ``paper.date``.
        title_weight, abstract_weight, date_weight: score added per matching filter.
        use_page_rank: whether to also print the page-rank-aware ranking.
        page_rank_weight: optional weight for the page-rank function; 0 (the
            default) leaves that function unweighted.
        index_name: index to query.
    """
    es = Elasticsearch([{'host': host, 'port': port}])
    criteria = (title, title_weight, abstract, abstract_weight, date, date_weight)

    if use_page_rank:
        ranked_body = _build_search_body(*criteria, with_page_rank=True,
                                         page_rank_weight=page_rank_weight)
        _print_hits(es.search(index=index_name, body=ranked_body, size=10), numbered=True)
        print('*^' * 60)

    plain_body = _build_search_body(*criteria)
    # The comparison list is numbered only when it follows the page-rank list.
    _print_hits(es.search(index=index_name, body=plain_body, size=10), numbered=use_page_rank)

In [None]:
# index = Index('localhost', 9200, 'papers.json')

In [None]:
# index.delete()

In [None]:
# index.save_data()

In [None]:
# calc_alpha('localhost', 9200, 0.1)

In [None]:
# es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
# for id in paper_ids:
#     print(es.get(index='paper_index', id=id)['_source']['paper']['page_rank'])

In [None]:
# search('localhost', 9200, title='Attention is All you Need', title_weight=10, abstract='We propose a novel neural attention', abstract_weight=15, date=2017, date_weight=5, use_page_rank=False)

In [None]:
# search('localhost', 9200, title='Attention is All you Need', title_weight=10, abstract='We propose a novel neural attention', abstract_weight=15, date=2017, date_weight=5, use_page_rank=True)

In [25]:
# Module-level state used by sort_by_HITS:
#   authors_link - cited author -> set of authors who cite them
#   authority    - author -> authority score
#   hub          - author -> hub score
authors_link, authority, hub = {}, {}, {}

In [26]:
def sort_by_HITS(host, port, authors_number, index_name='paper_index', iterations=5):
    """Rank authors by authority score and print the top ``authors_number``.

    An author A "links to" author B when a paper written by A references a
    paper written by B. The module-level ``authors_link`` (cited author ->
    citing authors), ``authority`` and ``hub`` dicts are rebuilt on every call,
    so running the function repeatedly yields the same scores instead of
    piling new iterations on top of the previous run.

    Scores are not normalized between iterations, so they grow quickly; only
    their relative order is meaningful.

    Args:
        host, port: Elasticsearch address.
        authors_number: how many authors to print.
        index_name: index that holds the papers.
        iterations: number of authority/hub update rounds.
    """
    es = Elasticsearch([{'host': host, 'port': port}])

    # Reset shared state in place so other cells that hold a reference to
    # these dicts keep seeing the current results.
    authors_link.clear()
    authority.clear()
    hub.clear()

    for paper_id in paper_ids:
        paper = es.get(index=index_name, id=paper_id)['_source']['paper']
        # A paper without an 'authors' field contributes no citing authors
        # (previously set(None) raised TypeError here).
        citing_authors = paper.get('authors') or []
        for reference_id in paper.get('references') or []:
            for reference_author in paper_authors.get(reference_id) or []:
                authors_link.setdefault(reference_author, set()).update(citing_authors)

    for _ in range(iterations):
        # Authority: add the hub score of everyone citing this author.
        for author, citing in authors_link.items():
            authority.setdefault(author, 1)
            for citing_author in citing:
                authority[author] += hub.get(citing_author, 1)
        # Hub: add the authority score of everyone this author cites.
        for author, citing in authors_link.items():
            for citing_author in citing:
                hub.setdefault(citing_author, 1)
                hub[citing_author] += authority.get(author, 1)

    top_authors = sorted(authority.items(), key=lambda item: item[1], reverse=True)[:authors_number]
    for name, score in top_authors:
        print(name, score)

In [27]:
sort_by_HITS('localhost', 9200, 10)

asf


In [28]:
authors_link

{}

In [21]:
# sort_by_HITS('localhost', 9200, 10)

In [31]:
def _ask(label='', cast=str):
    """Print ``label`` (no newline), read one line and convert it with ``cast``.

    Asks again when the conversion raises ValueError, so a typo in a numeric
    field no longer terminates the whole menu loop.
    """
    while True:
        print(label, end='')
        try:
            return cast(input())
        except ValueError:
            print('invalid value, please try again')


# Interactive driver: repeatedly shows the main menu and dispatches to the
# crawler, indexer, page-rank computation, search or HITS ranking.
while True:
    print('choose one of these:\n 1.crawl\n 2.indexing\n 3.evaluating papers\n 4.search\n 5.sort authors by HITS\n 6.exit')
    try:
        order = int(input())
    except ValueError:
        print('please enter a number between 1 and 6')
        continue
    if order == 1:
        print('enter start urls one by one and then press enter')
        start_urls = [_ask('url number {}'.format(i+1)) for i in range(3)]
        print('enter number of papers you want to crawl')
        crawl_limit = _ask('limit:', int)
        crawler(start_urls = start_urls, limit=crawl_limit)
    elif order == 2:
        print('enter address of json file')
        json_addr = input()
        print('first enter host address then enter port on which elasticsearch is running')
        host = _ask('host:')
        port = _ask('port:', int)
        try:
            index = Index(host, port, json_addr)
        except (OSError, ValueError) as error:
            # Missing/unreadable file or malformed JSON: report and return to the menu.
            print('could not load {}: {}'.format(json_addr, error))
            continue
        while True:
            print('to save data in elasticsrach enter 1\nto delete data saved in elasticsearch press 2\nto exit this mode press 3')
            sub_order = _ask(cast=int)
            if sub_order == 1:
                index.save_data()
            elif sub_order == 2:
                index.delete()
            elif sub_order == 3:
                break
    elif order == 3:
        print('enter host and port of server and then value of alpha you want to be applied')
        host = _ask('host:')
        port = _ask('port:', int)
        alpha = _ask('alpha:', float)
        calc_alpha(host, port, alpha)
    elif order == 4:
        print('enter host and port of server and then weights and values of fields. At the end specify if you want page rank to have an effect on the results or not')
        host = _ask('host:')
        port = _ask('port:', int)
        title_weight = _ask('title weight:', int)
        title = _ask('title:')
        abstract_weight = _ask('abstract weight:', int)
        abstract = _ask('abstract:')
        date_weight = _ask('date weight:', int)
        date = _ask('date:')
        use_page_rank = _ask('page rank effect: y or n') == 'y'
        search(host, port, title, title_weight, abstract, abstract_weight, date, date_weight, use_page_rank)
    elif order == 5:
        print('enter host and port of server and number of authors you want to be returned')
        host = _ask('host:')
        port = _ask('port:', int)
        authors_number = _ask('authors number:', int)
        sort_by_HITS(host, port, authors_number)
    elif order == 6:
        break

choose one of these:
 1.crawl
 2.indexing
 3.evaluating papers
 4.search
 5.sort authors by HITS
 6.exit
2
enter address of json file
D:\education\98-99-2\modern information retrieval\project\phase3\phase3\papers.json
first enter host address then enter port on which elasticsearch is running
host:localhost
port:9200
to save data in elasticsrach enter 1
to delete data saved in elasticsearch press 2
to exit this mode press 3
1
to save data in elasticsrach enter 1
to delete data saved in elasticsearch press 2
to exit this mode press 3
3
choose one of these:
 1.crawl
 2.indexing
 3.evaluating papers
 4.search
 5.sort authors by HITS
 6.exit
5
enter host and port of server and number of authors you want to be returned
host:localhost
port:9200
authors number:10
asf
Yoshua Bengio 7579474926266793
Ilya Sutskever 7210802856506559
Geoffrey E. Hinton 5786486920067150
Quoc V. Le 5608604046913263
Oriol Vinyals 5107951659190343
Andrew Y. Ng 4417444935661362
Jason Weston 4088627336399566
Christopher 

# Part 6

In [2]:
# Read the training split; every element keeps its trailing newline.
train_path = 'D:/education/98-99-2/modern information retrieval/project/phase3/data/train.txt'
with open(train_path, mode='r') as file:
    raw_train_data = list(file)

In [3]:
# Read the validation split; every element keeps its trailing newline.
val_path = 'D:/education/98-99-2/modern information retrieval/project/phase3/data/vali.txt'
with open(val_path, mode='r') as file:
    raw_val_data = list(file)

In [35]:
# Read the test split; every element keeps its trailing newline.
test_path = 'D:/education/98-99-2/modern information retrieval/project/phase3/data/test.txt'
with open(test_path, mode='r') as file:
    raw_test_data = list(file)

In [4]:
# query id -> list of {"relevance": int, "feature": [float, ...]} records,
# one dict per evaluation split.
test_docs, val_docs = {}, {}

In [6]:
# Group the validation lines by query id.
# Each line looks like "<relevance> qid:<id> 1:<f1> 2:<f2> ..."; tokens 2..47
# are the 46 feature values.
# Start from an empty dict so re-running this cell does not append every
# document a second time.
val_docs.clear()
for query_doc in raw_val_data:
    tokens = query_doc.split()  # split once instead of three times per line
    if not tokens:
        continue  # tolerate blank lines
    query_id = tokens[1].split(':')[-1]
    val_docs.setdefault(query_id, []).append({
        # Parse the whole first token rather than only its first character.
        "relevance": int(tokens[0]),
        "feature": [float(token.split(':')[-1]) for token in tokens[2:48]],
    })

In [37]:
# Group the test lines by query id.
# Each line looks like "<relevance> qid:<id> 1:<f1> 2:<f2> ..."; tokens 2..47
# are the 46 feature values.
# Start from an empty dict so re-running this cell does not append every
# document a second time.
test_docs.clear()
for query_doc in raw_test_data:
    tokens = query_doc.split()  # split once instead of three times per line
    if not tokens:
        continue  # tolerate blank lines
    query_id = tokens[1].split(':')[-1]
    test_docs.setdefault(query_id, []).append({
        # Parse the whole first token rather than only its first character.
        "relevance": int(tokens[0]),
        "feature": [float(token.split(':')[-1]) for token in tokens[2:48]],
    })

In [7]:
def _dcg(relevances):
    """Discounted cumulative gain: rel_1 + sum over i >= 2 of rel_i / log2(i)."""
    total = 0
    for idx, relevance in enumerate(relevances):
        # idx is 0-based, so idx + 1 is the 1-based rank; rank 1 is undiscounted.
        total += relevance if idx == 0 else relevance / np.log2(idx + 1)
    return total


def NDCG(ranking, ground_truth):
    """Normalized DCG of ``ranking`` against the ideal ordering ``ground_truth``.

    Args:
        ranking: relevance labels in the order produced by the ranker.
        ground_truth: relevance labels in ideal (descending) order.

    Returns:
        DCG(ranking) / DCG(ground_truth), or 0.0 when the ideal DCG is zero
        (no relevant document), instead of dividing by zero.
    """
    ideal_dcg = _dcg(ground_truth)
    if ideal_dcg == 0:
        return 0.0
    return _dcg(ranking) / ideal_dcg

In [8]:
def bubblesort(in_list):
    """Sort ``in_list`` in place with the pairwise classifier and return the top-5 relevances.

    Adjacent documents are compared by feeding the difference of their feature
    vectors to the global ``clf``; a prediction of 0 swaps the pair. After
    sorting, the ``relevance`` values of the first five documents are returned.
    """
    last = len(in_list) - 1
    while last > 0:
        for pos in range(last):
            upper, lower = in_list[pos], in_list[pos + 1]
            delta = np.asarray(upper['feature']) - np.asarray(lower['feature'])
            if clf.predict([delta]) == 0:
                in_list[pos], in_list[pos + 1] = lower, upper
        last -= 1
    return [doc['relevance'] for doc in in_list[:5]]

In [9]:
# Pairwise training set: feature-difference vectors and their 0/1 labels.
train_data, train_label = [], []

In [10]:
# Build the pairwise training set: for every two documents of the same query
# with different relevance labels, emit (better - worse) with label 1 and
# (worse - better) with label 0.
# Start from empty lists so re-running this cell does not duplicate the pairs.
train_data.clear()
train_label.clear()
grouped_docs = None
# groupby only merges consecutive lines, so this relies on the lines of a
# query being contiguous in raw_train_data.
for k, v in groupby(raw_train_data, key=lambda t: t.split()[1].split(':')[-1]):
    # Most relevant first; the label is the first character of the line.
    grouped_docs = sorted(list(v), key=lambda t : t[0], reverse=True)
    labels = [doc[0] for doc in grouped_docs]
    # Parse every feature vector once per query instead of once per pair.
    features = [
        np.array([float(token.split(':')[-1]) for token in doc.split()[2:48]])
        for doc in grouped_docs
    ]
    docs_number = len(grouped_docs)
    for i in range(docs_number - 1):
        for j in range(i+1, docs_number):
            # Because of the descending sort and i < j, differing labels always
            # mean document i is the more relevant one; the reverse case can
            # never occur, so no branch is needed for it.
            if labels[i] != labels[j]:
                train_data.append(features[i] - features[j])
                train_label.append(1)
                train_data.append(features[j] - features[i])
                train_label.append(0)

In [31]:
# Fit the pairwise SVM and report the wall-clock training time in seconds.
start = time.time()
clf = svm.SVC(C=0.6).fit(train_data, train_label)
elapsed = time.time() - start
print(elapsed)

711.9629595279694


In [32]:
# Persist the fitted classifier to disk so it can be reloaded without retraining.
dump(clf, 'svm_weights_0.6.joblib') 

['svm_weights_0.6.joblib']

In [38]:
# Replace the in-memory classifier with weights read from disk.
# NOTE(review): this loads 'svm_weights_0.8.joblib', while the dump cell wrote
# 'svm_weights_0.6.joblib' — confirm which model the NDCG cells should score.
# joblib's load unpickles the file, so only open weight files you created yourself.
clf = load('svm_weights_0.8.joblib') 

In [33]:
# Accumulate NDCG@5 over the validation queries; queries whose documents all
# share one relevance label are skipped.
ndcgs = 0
counter = 0
for query_id, docs in val_docs.items():
    ideal = sorted((d['relevance'] for d in docs), reverse=True)
    if ideal[0] == ideal[-1]:
        continue
    ndcgs += NDCG(bubblesort(docs), ideal[:5])
    counter += 1

In [34]:
ndcgs/counter

0.6795808823313301

In [41]:
# Accumulate NDCG@5 over the test queries; queries whose documents all share
# one relevance label are skipped.
ndcgs = 0
counter = 0
for query_id, docs in test_docs.items():
    ideal = sorted((d['relevance'] for d in docs), reverse=True)
    if ideal[0] == ideal[-1]:
        continue
    ndcgs += NDCG(bubblesort(docs), ideal[:5])
    counter += 1

In [42]:
ndcgs/counter

0.6786381129994296