In [139]:
import sys
import os
from elasticsearch import Elasticsearch
from zipfile import ZipFile
import re
import pandas as pd
import json
import requests
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import nltk
import pickle
import pytrec_eval
# Fetch the NLTK 'stopwords' and 'punkt' resources needed by the nltk imports above.
nltk.download('stopwords')
nltk.download('punkt')
from elasticsearch import NotFoundError

# Module-level Elasticsearch client shared by every function below.
# No hosts are passed, so the client library's default connection settings apply.
es = Elasticsearch()

[nltk_data] Downloading package stopwords to /home/katya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/katya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [140]:
# Directory names used by the notebook: input data and generated run files.
input_dir, output_dir = 'data/docs', 'results'

In [141]:
# Unpack every .zip archive found in the current working directory, in place.
# The handle is called `archive` rather than `zip`: at notebook (module) scope a
# `with ... as zip` target would rebind the built-in zip() for all later cells.
for filename in os.listdir():
    if filename.endswith(".zip"):
        with ZipFile(filename, 'r') as archive:
            archive.extractall()

In [142]:
def strip_punct(s):
    """Return ``s`` lower-cased and reduced to its ASCII alphanumeric tokens.

    Every character outside ``A-Z``, ``a-z`` and ``0-9`` acts as a separator;
    the remaining tokens are joined with single spaces.
    """
    tokens = re.findall('[A-Za-z0-9]+', s)
    return " ".join(tokens).lower()

In [143]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [144]:
def search(search_param, filename, q, ind):
    """Run every query in ``q`` against index ``ind`` and write a run file.

    Parameters
    ----------
    search_param : dict
        Elasticsearch request body whose ``['query']['bool']['should']`` entry
        is a list of ``{"match": {field: ...}}`` clauses. It is updated in
        place: before each request the value of every field in every clause is
        overwritten with the current query text.
    filename : str
        Base name of the run file, written to ``output_dir/<filename>.txt``.
    q : iterable
        The queries. A dict entry contributes its ``'title'`` value; any other
        entry is used as the query text directly.
    ind : str
        Name of the index to search.

    Output
    ------
    One space-separated line per hit, without header and without index:
    ``qid Q0 doc rank score tag``. ``qid`` is the 1-based position of the query
    in ``q`` and ``rank`` the 1-based position of the hit in the response.
    """
    rows = []

    for qid, query in enumerate(q, start=1):
        # isinstance instead of `type(...) == dict` so dict subclasses are accepted too.
        text = query['title'] if isinstance(query, dict) else query

        # Fill the placeholder of every match clause with the current query text.
        for clause in search_param['query']['bool']['should']:
            for field in clause['match']:
                clause['match'][field] = text

        response = es.search(index=ind, body=search_param)
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            rows.append((qid, 'Q0', hit['_id'], rank, hit['_score'], 'uh-t2-thor'))

    df = pd.DataFrame(rows, columns=['qid', 'Q0', 'doc', 'rank', 'score', 'tag'])

    out_path = output_dir+'/'+filename+'.txt'
    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, sep = ' ', index = False, header = False)

In [145]:
def search_extended(search_param, filename, q, ind):
    """Run the synonym-expanded queries in ``q`` and write a run file.

    Parameters
    ----------
    search_param : dict
        Elasticsearch request body whose ``['query']['bool']['should']`` entry
        is a list of ``{"match": {field: value}}`` clauses. It is updated in
        place for every row of ``q``: a clause value that is a dict containing
        ``"boost"`` gets its ``"query"`` set to ``row['query']``; any other
        clause value is replaced by ``row['syn']``.
    filename : str
        Base name of the run file, written to ``output_dir/<filename>.txt``.
    q : pandas.DataFrame
        One row per topic; the columns ``'query'`` and ``'syn'`` are read.
    ind : str
        Name of the index to search.

    Output
    ------
    One space-separated line per hit, without header and without index:
    ``qid Q0 doc rank score tag``. ``qid`` is the 1-based position of the row
    in ``q`` and ``rank`` the 1-based position of the hit in the response.
    """
    rows = []

    for qid, (_, row) in enumerate(q.iterrows(), start=1):
        for clause in search_param['query']['bool']['should']:
            match = clause["match"]
            for field in match:
                value = match[field]
                # The dict check matters: from the second row on, the plain
                # clauses hold the previous row's synonym *string*, and a bare
                # `"boost" in value` would then be a substring test that sends
                # any synonym text containing "boost" down the wrong branch.
                if isinstance(value, dict) and "boost" in value:
                    value["query"] = row['query']
                else:
                    match[field] = row['syn']

        response = es.search(index=ind, body=search_param)
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            rows.append((qid, 'Q0', hit['_id'], rank, hit['_score'], 'uh-t2-thor'))

    df = pd.DataFrame(rows, columns=['qid', 'Q0', 'doc', 'rank', 'score', 'tag'])

    out_path = output_dir+'/'+filename+'.txt'
    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, sep = ' ', index = False, header = False)

In [146]:
def create_index(b, k1, index):
    """Create index ``index`` with a BM25 default similarity and bulk-load it.

    Parameters
    ----------
    b, k1 : float
        Values stored as the ``b`` and ``k1`` settings of the index's default
        BM25 similarity.
    index : str
        Name of the index to create and fill.

    The documents are read from ``data/bulk_data.json``. The loop below treats
    every even-positioned entry as an ``{"index": {...}}`` action line whose
    ``_index`` is rewritten to ``index``.
    """
    # Index template: single shard, no replicas, custom BM25 parameters.
    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "similarity": {
                "default": {
                    "type": "BM25",
                    "b": b,
                    "k1": k1,
                }
            },
        }
    }

    es.indices.create(index = index, body = request_body)

    # Load the pre-built bulk payload.
    with open('data/bulk_data.json') as f:
        bulk_data = json.load(f)

    # Point every action line at the index being created.
    for action in bulk_data[::2]:
        action['index']['_index'] = index

    # The batch size must stay even so an action line is never separated from
    # the entry that follows it.
    for batch in chunks(bulk_data, 100):
        res = es.bulk(index = index, body = batch)
        # Bulk requests report per-item failures in the response instead of
        # raising; surface them rather than dropping documents silently.
        if res['errors']:
            print('Warning: bulk request to index {} reported item errors'.format(index))
    

In [147]:
def create_run(d, run_name, index, mode):
    """Build the request body around ``d`` and produce one run file.

    Parameters
    ----------
    d : list
        Clauses placed under ``query.bool.should`` of the request body.
    run_name : str
        Base name of the run file, passed on to the search function.
    index : str
        Name of the index to query.
    mode : str
        ``'simple'`` loads the pickled queries from
        ``data/topics_lemmatized.txt`` and calls ``search``; ``'syn'`` loads
        ``data/q_for_syn.tsv`` and calls ``search_extended``. Any other value
        prints ``Wrong arguments!`` and does nothing else.
    """
    if mode not in ('simple', 'syn'):
        print('Wrong arguments!')
        return

    # Both modes use the same request body: top 50 hits of a bool/should query.
    search_param = {
        'size': 50,
        "query": {
            "bool": {
                "should": d
            }
        }
    }

    if mode == 'simple':
        # NOTE(security): pickle.load can execute arbitrary code; only use it
        # on a topics file this project produced itself.
        # The context manager closes the file even when unpickling fails.
        with open("data/topics_lemmatized.txt", 'rb') as f:
            queries_lem = pickle.load(f)
        search(search_param, run_name, queries_lem, index)
    else:
        df_syn = pd.read_csv('data/q_for_syn.tsv', sep = '\t')
        search_extended(search_param, run_name, df_syn, index)
        
    

In [148]:
def evaluate_run(run_path, qrel_path, n_topics=50):
    """Score a run file against relevance judgements with pytrec_eval.

    Parameters
    ----------
    run_path : str
        Space-separated run file with the columns
        ``topic Q0 id rank score team`` (no header).
    qrel_path : str
        Space-separated qrels file with the columns
        ``topic Q0 id relevance`` (no header).
    n_topics : int, optional
        Number of topics. Topics are expected to be numbered
        ``1..n_topics``, and the averages are divided by this value.
        Defaults to 50.

    Returns
    -------
    tuple
        ``(s, r, p, f1)``: the per-topic ``ndcg_cut_10``, ``recall_10`` and
        ``P_10`` values summed and divided by ``n_topics``, followed by the
        harmonic mean of ``r`` and ``p`` (0 when both are 0).

    Side effect
    -----------
    The per-topic ``ndcg_cut_10`` values are written as JSON next to the run
    file, as ``<run_path without extension>_evaluated.json``.
    """
    run = pd.read_csv(run_path, sep = ' ', names = ['topic','Q0','id','rank','score','team'])
    qrel = pd.read_csv(qrel_path, sep = ' ', names = ['topic','Q0','id','relevance'])

    topic_ids = [str(i) for i in range(1, n_topics + 1)]

    # Every topic gets an entry up front, even if it has no judgements or hits.
    qrels = {topic: {} for topic in topic_ids}
    for _, row in qrel.iterrows():
        qrels[str(row['topic'])][row['id']] = row['relevance']

    runs = {topic: {} for topic in topic_ids}
    for _, row in run.iterrows():
        runs[str(row['topic'])][row['id']] = row['score']

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {'map_cut', 'ndcg_cut', 'recall', 'P'})
    res = evaluator.evaluate(runs)

    results = {topic: measures['ndcg_cut_10'] for topic, measures in res.items()}
    recall = {topic: measures['recall_10'] for topic, measures in res.items()}
    pres = {topic: measures['P_10'] for topic, measures in res.items()}

    # Name the output after the run being evaluated. sys.argv[1] is unrelated to
    # run_path (inside a notebook kernel it is a launcher argument), so using it
    # made every evaluation overwrite the same file.
    out_path = os.path.splitext(run_path)[0] + '_evaluated.json'
    with open(out_path, 'w') as f:
        json.dump(results, f)

    s = sum(results.values())/n_topics
    r = sum(recall.values())/n_topics
    p = sum(pres.values())/n_topics
    if r+p != 0:
        f1 = (2*(r*p)/(r+p))
    else:
        f1 = 0

    return s, r, p, f1


In [149]:
# Name of the experiment index, then build it with BM25 b=0.68 and k1=1.2.
index = 'final_0.68'
create_index(b=0.68, k1=1.2, index=index)



In [None]:
#runs for args
# One run per field combination. Each run sends a bool/should query holding one
# match clause per listed field; the empty strings are placeholders that are
# overwritten with the query text at search time.
simple_runs = [
    ('run_title', ['title_lem']),
    ('run_doc', ['lem']),
    ('run_arg', ['args']),
    ('run_title_arg', ['title_lem', 'args']),
    ('run_title_doc', ['title_lem', 'lem']),
    ('run_doc_arg', ['lem', 'args']),
    ('run_title_doc_arg', ['title_lem', 'lem', 'args']),
]

for run_name, fields in simple_runs:
    d = [{"match": {field: ""}} for field in fields]
    create_run(d, run_name, index, 'simple')

In [None]:
#runs for synonyms
# One run per field combination. For every listed field the should-list gets
#   * a boosted clause (boost 5) whose "query" receives the original query, and
#   * a plain clause whose value receives the synonym text.
# All boosted clauses come first, followed by all plain clauses.
# The plain placeholder is an empty string; the former `{""}` was a set literal,
# which is not a valid match value and cannot be serialised to JSON.
syn_runs = [
    ('run_title_syn', ['title_lem']),
    ('run_doc_syn', ['lem']),
    ('run_args_syn', ['args']),
    ('run_title_arg_syn', ['title_lem', 'args']),
    ('run_title_doc_syn', ['title_lem', 'lem']),
    ('run_doc_arg_syn', ['lem', 'args']),
    ('run_title_doc_arg_syn', ['title_lem', 'lem', 'args']),
]

for run_name, fields in syn_runs:
    d = [{"match": {field: {"query": "", "boost": 5}}} for field in fields]
    d += [{"match": {field: ""}} for field in fields]
    create_run(d, run_name, index, 'syn')

In [152]:
#evaluation
# Evaluate every .txt run file found in output_dir and collect one summary row
# (run name, nDCG, recall, precision, F1) per file.
qrels_file = 'touche2020-task2-relevance-withbaseline(1).qrels'
summary_rows = []

for filename in os.listdir(output_dir):
    if not filename.endswith(".txt"):
        continue
    run_file = (output_dir+"/{}").format(filename)
    s, r, p, f1 = evaluate_run(run_file, qrels_file)
    summary_rows.append((filename, s, r, p, f1))

df = pd.DataFrame(
    summary_rows,
    columns=['run', 'ndcg_cut10', 'recall_10', 'precision', 'f1-score'],
)

In [153]:
# Show the runs ordered by the ndcg_cut10 column, highest first.
print(df.sort_values(by='ndcg_cut10', ascending=False))

                          run  ndcg_cut10  recall_10  precision  f1-score
10      run_title_doc_syn.txt    0.445049   0.280011      0.428  0.338539
3           run_title_doc.txt    0.443412   0.271602      0.422  0.330495
12          run_title_arg.txt    0.386312   0.253063      0.376  0.302519
1       run_title_arg_syn.txt    0.379192   0.250281      0.370  0.298587
0       run_title_doc_arg.txt    0.376844   0.234387      0.352  0.281399
11  run_title_doc_arg_syn.txt    0.366942   0.229828      0.348  0.276831
13          run_title_syn.txt    0.348180   0.218751      0.340  0.266220
2               run_title.txt    0.328667   0.205588      0.312  0.247856
4             run_doc_syn.txt    0.323637   0.199024      0.300  0.239296
7                 run_doc.txt    0.318181   0.196172      0.292  0.234680
6             run_doc_arg.txt    0.285249   0.189760      0.276  0.224896
5         run_doc_arg_syn.txt    0.281281   0.186087      0.274  0.221645
8            run_args_syn.txt    0.270

In [154]:
# Clean-up: delete the index named by `index`.
es.indices.delete(index=index)

{'acknowledged': True}