In [15]:
import xml.etree.ElementTree as ET
import requests
from string import punctuation 
import json
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
import re
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')
import sys
import os
from zipfile import ZipFile
import pytrec_eval
from boilerpy3 import extractors
from urllib.error import HTTPError

## Convert XML-file with topics to json, save number of results for ClueWeb12

In [16]:
#parse the XML-file with queries

mytree = ET.parse('data/topics-task-2.xml')
myroot = mytree.getroot()

In [17]:
#preprocess the queries

q = []
topics = []
for item in myroot:
    d = {}
    for x in item:
        d[x.tag] = x.text.strip('\n')
        #print(d)
        
        if x.tag == "title":
            #print(x.text)
            
            #clean up punctuation:
            no_punctuation = ""
            for char in x.text:
                if char not in punctuation:
                    no_punctuation = no_punctuation + char
                    
            #set up the query with an operator AND
            #print(no_punctuation)
            s = ' AND '.join(no_punctuation.split())
            #print(s)
            
            #append the query to an array
            q.append(s)
    topics.append(d)

In [19]:
len(topics)

50

In [20]:
#save topics as json
with open('data/topics.json', 'w') as file:
    json.dump(topics, file)

In [21]:
results = []
chatnoir = "https://www.chatnoir.eu/api/v1/_search"
attr = {"apikey": "7dd15626-53aa-46c6-bd34-b2feaa2d9d81",
        "query": "hello world",
        "index": "cw12",
        "pretty": True
}

for x in q:
    attr["query"] = x
    #somehow index doesn't work correctly when passed as an array (it only searches the 1st index of the array), so search
    #in each index separately and sum up the results
    response = requests.post(chatnoir, data = attr)
    print(x)
    print(response.json()["meta"])
    res = response.json()["meta"]["total_results"]
    
    results.append(res)

What AND is AND the AND difference AND between AND sex AND and AND love
{'query_time': 11422, 'total_results': 194416, 'indices': ['cw12']}
Which AND is AND better AND a AND laptop AND or AND a AND desktop
{'query_time': 8621, 'total_results': 117594, 'indices': ['cw12']}
Which AND is AND better AND Canon AND or AND Nikon
{'query_time': 2724, 'total_results': 17321, 'indices': ['cw12']}
What AND are AND the AND best AND dish AND detergents
{'query_time': 2182, 'total_results': 6321, 'indices': ['cw12']}
What AND are AND the AND best AND cities AND to AND live AND in
{'query_time': 10167, 'total_results': 2602861, 'indices': ['cw12']}
What AND is AND the AND longest AND river AND in AND the AND US
{'query_time': 3627, 'total_results': 81792, 'indices': ['cw12']}
Which AND is AND healthiest AND coffee AND green AND tea AND or AND black AND tea AND and AND why
{'query_time': 4528, 'total_results': 1340, 'indices': ['cw12']}
What AND are AND the AND advantages AND and AND disadvantages AND

In [22]:
#save results as txt

with open("data/results.txt", "wb") as fp:   #Pickling
    pickle.dump(results, fp)

In [23]:
#test if results were saved correctly

with open("data/results.txt", "rb") as fp:   # Unpickling
    res = pickle.load(fp)
    
print(res)

[194416, 117594, 17321, 6321, 2602861, 81792, 1340, 210, 147688, 1033081, 8937, 43117, 110731, 11217, 38399, 270060, 50288, 1723, 184891, 7719, 151323, 12481, 328290, 3445, 86478, 17155, 8840, 4558, 8035, 20266, 26915, 2042, 25652, 5042, 554, 11735, 34, 4572, 1330, 89976, 1903, 5792, 172, 1101, 366, 280, 2342, 10405, 34834, 1259]


## Retrieve documents from ChatNoir

In [None]:
#open the file with topics

f = open("data/topics.json", encoding='utf8')
topics = json.load(f)

#open the file with amount of results for each topic
with open("data/results.txt", "rb") as fp:   # Unpickling
    res = pickle.load(fp)
    
print(res)

In [None]:
#preprocess the queries

for i in range(len(topics)):
    topics[i]['title'] = topics[i]['title'].replace(' ', ' AND ')
    topics[i]['title'] = re.sub('[?:,]', '', topics[i]['title'])
    topics[i]['results'] = res[i]

In [None]:
attr = {"apikey": "7dd15626-53aa-46c6-bd34-b2feaa2d9d81",
        "query": "",
        "index": "cw12",
        "size": 10
       }

In [None]:
extractor = extractors.ArticleExtractor()


def topics_iter(q):
    docs = []
    attr["query"] = q['title']
    #attr['size'] = q['results']
    if q['results']<=10000:
        num_of_res = q['results']
    else:
        num_of_res = 10000
    count = 0
    print(attr)
    print(num_of_res)
    url = "https://www.chatnoir.eu/api/v1/_search?"
    while count < num_of_res:
        attr["from"] = count
        while True:
            try:
                r = requests.post(url, json = attr)
                res = r.json()
                print(count)
                #print(res)
                res_len = len(res['results'])
                print(res_len)
            except KeyError:
                continue
            break
        
    
        for i in range(res_len):
            
            doc_url = "https://www.chatnoir.eu/cache?uuid="+res['results'][i]['uuid']+"&index=cw12&raw"
            #print(doc_url)
            try:
                doc = extractor.get_doc_from_url(doc_url)
                content = doc.content
                title = doc.title
                res['results'][i]['document'] = content
            except HTTPError:
                print("HTTPError")
                continue
        docs.append(res['results'])
        count+=10
        
    return docs

In [None]:
for q in topics[40:41]:
    q['documents'] = topics_iter(q)
    with open("data/docs/docs_for_topic_{}.txt".format(q['number']), "w") as f:
        json.dump(q, f)
        
    f.close()

## Lemmatize topics

In [None]:
f = open('data/topics.json')
topics = json.load(f)
f.close()
lis =[]
for i in range(len(topics)):
    x=(topics[i]['title'])
    lis.append(re.sub('[?:,]', '', x))
converted_list = [x.lower() for x in lis]
#print ("Topics: ", converted_list)
#print("\n")

tokenized_sents = [word_tokenize(i) for i in converted_list]
#for i in tokenized_sents:
    #print (i)
lis3 =[]

for i in tokenized_sents:
    tokens_without_sw = [word for word in i if not word in stopwords.words()]
    lemmatizer = WordNetLemmatizer()
    lemmatized_output_0 = ([lemmatizer.lemmatize(w,pos="n") for w in tokens_without_sw])
    lemmatized_output_1 = ' '.join(([lemmatizer.lemmatize(w,pos="v") for w in lemmatized_output_0]))
    lis3.append(lemmatized_output_1)   
#print("Lemmatized verbs and nouns: \n", lis3)
with open('data/topics_lemmatized.txt', 'wb') as fp:
    pickle.dump(lis3, fp)
fp.close()

## Create and save bulk data for the index

In [29]:
input_dir = 'data/docs'
output_dir = 'results'

In [30]:
def strip_punct(s):
    s = re.sub('[^A-Za-z0-9]', ' ', s)
    s = s.lower()
    return " ".join(s.split())

In [31]:
def return_arguments(doc_raw, model):
    url = 'https://demo.webis.de/targer-api/classify'+model
    #print(url)
    #print(doc_raw)
    res = requests.post(url, json=doc_raw)
    if res.status_code == 200:
#print(res)
        res = res.json()
        args = []
        for x in res:
            arg = []
            for el in x:
                if el['label']!='O' and float(el['prob'])>0.99:
                    arg.append(el['token'])
            s = ' '.join(arg)
            args.append(s)
        arguments = ' '.join(args)
        arguments = word_tokenize(strip_punct(arguments).lower())
        tokens_without_sw = [word for word in arguments if not word in stopwords.words()]
        lemmatizer = WordNetLemmatizer()
        lemmatized_output_0 = ([lemmatizer.lemmatize(w,pos="n") for w in tokens_without_sw])
        arguments = ' '.join(([lemmatizer.lemmatize(w,pos="v") for w in lemmatized_output_0]))
    else:
        #print('EMPTY')
        #print(res)
        arguments = ''
    return arguments

In [32]:
count = 0
c=0
t = 1
url = 'https://demo.webis.de/targer-api/classifyCombo'
headers = {'accept': 'application/json', 'Content-Type': 'text/plain'}
bulk_data = []
#extract docs from zip-files
for filename in os.listdir(input_dir):
    if filename.endswith(".zip"):
        with ZipFile((input_dir+"/{}").format(filename), 'r') as zip:
            zip.extractall(input_dir)
            
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        name = re.split('_|\.', filename)
        num = next(obj for obj in name if obj.isdigit())
        with open((input_dir+"/{}").format(filename), "r") as f:
            topic = json.load(f)
            topic['title'] = strip_punct(topic['title']).lower()
            print(filename)
            print(topic['title'])
            print(t)
            t+=1
            for n in topic["documents"]:
                print(len(n))
                #print(count)
                if count<10:
                    count+=1
                    for doc in n:
                        try:
                            doc_raw = doc['document'].rstrip('\n')
                            doc_raw = doc_raw.rstrip('\\n')
                            
                            doc['lem'] = word_tokenize(strip_punct(doc_raw).lower())
                            tokens_without_sw = [word for word in doc['lem'] if not word in stopwords.words()]
                            lemmatizer = WordNetLemmatizer()
                            lemmatized_output_0 = ([lemmatizer.lemmatize(w,pos="n") for w in tokens_without_sw])
                            doc['lem'] = ' '.join(([lemmatizer.lemmatize(w,pos="v") for w in lemmatized_output_0]))
                            doc['lem'] = doc['lem'].rstrip()
                            
                            title_lem = strip_punct(doc['title'].lower().rstrip())
                            title_lem = word_tokenize(title_lem)
                            tokens_without_sw = [word for word in title_lem if not word in stopwords.words()]
                            lemmatizer = WordNetLemmatizer()
                            lemmatized_output_0 = ([lemmatizer.lemmatize(w,pos="n") for w in tokens_without_sw])
                            title_lem = ' '.join(([lemmatizer.lemmatize(w,pos="v") for w in lemmatized_output_0]))
                            
                            #doc_raw = doc['document'].rstrip('\n')
                            #doc_raw = doc_raw.rstrip('\\n')
                            
                            
                            combo = return_arguments(doc_raw, 'Combo')
                            
                            #es = return_arguments(doc_raw, 'ES')
                            
                            #es_dep = return_arguments(doc_raw, 'ES_dep')
                            
                            #ibm = return_arguments(doc_raw, 'IBM')
                            
                            #new_pe = return_arguments(doc_raw, 'NewPE')
                            
                            #new_wd = return_arguments(doc_raw, 'NewWD')
                            
                            #wd = return_arguments(doc_raw, 'WD')
                            
                            #wd_dep = return_arguments(doc_raw, 'WD_dep')
                        
                            
                            
                            
                            b = {
                                    'query': topic['title'],
                                    'title': doc['title'],
                                    'title_lem': title_lem,
                                    'num': num,
                                    'uuid': doc['uuid'],
                                    'score': doc['score'],
                                    'document': doc['document'],
                                    'lem': doc['lem'],
                                    'args': combo
                                    #'es': es,
                                    #'es_dep': es_dep,
                                    #'ibm': ibm,
                                    #'new_pe': new_pe,
                                    #'new_wd': new_wd,
                                    #'wd': wd,
                                    #'wd_dep': wd_dep
                                }
                            #es.index(index='test_index',doc_type='doc',id=doc['trec_id'],body=b)
                            templ = {'index': {'_index': 'test_index', 
                                           '_type': 'doc', 
                                           '_id': doc['trec_id']}}
                            bulk_data.append(templ)
                            bulk_data.append(b)

                            c+=1
                        except KeyError:
                            pass
                else:
                    break
            count=0

docs_for_topic_31.txt
which and has and more and caffeine and coffee and or and tea
1
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_27.txt
which and one and is and better and an and electric and stove and or and a and gas and stove
2
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_9.txt
why and is and linux and better and than and windows
3
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_44.txt
which and company and has and a and larger and capitalization and apple and or and microsoft
4
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_34.txt
what and is and better and for and the and environment and a and real and or and a and fake and christmas and tree
5
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_50.txt
whose and salary and is and higher and basketball and or and soccer and players
6
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_22.txt
which and is and better and pepsi and or and coke
7
10
10
10
10
10
10
10
10
10
10
10
docs_for_topic_7.txt
which and is and healthiest and cof

In [33]:
with open("data/bulk_data.json", "w") as f:
    json.dump(bulk_data,f)