# DAT-640  Information Retrieval and Text Mining
## Final Project: MS-MARCO Document Re-Ranking
### Autors:
#### Asahi Cantu - 253964
#### Shaon Rahman - StudentID

### Project Description:
Microsoft MAchine Reading COmprehension Dataset  is a copmilation of queries and documents retrieved from Microsoft Bing Platform. It contains a big dataset ~ 22GB of documents and queries

# Section I - Package installation and definition

In [70]:
!pip install requests
!pip install elasticsearch
!pip install tqdm
!pip install xgboost
!pip install tensorflow
!pip install sklearn



In [71]:
import gzip
import math
import numpy as np
import os

import pandas as pd
import pytest
import pickle
import platform

import random
import requests
import tarfile
import time
import timeit
import subprocess

import sys
import zipfile

from subprocess import Popen,PIPE
from playsound import playsound

from collections import Counter
from collections import defaultdict
from elasticsearch import Elasticsearch
#from playsound import playsound
from tqdm.notebook import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
import xgboost
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tqdm.notebook import tqdm



from transformers import *
from summarizer import Summarizer



# Section II - Document extraction function definition

In [22]:
def save_picke(file_path,obj):
    with open(file_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(file_path):
    with open(file_path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj

def finished(n=1):
    file_path = os.path.join('..','assets','bell.wav')
    for i in range(n):
        playsound(file_path)
        time.sleep(1.5)

def download_file(target_path,url,override=False):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    file_downloaded = False
    file_path = os.path.join(target_path,local_filename)
    byte_pos = 0
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    if not override and os.path.exists(file_path):
        print(f'\tFile {file_path} already exists, skipping...')
        return file_path
    try:
        os.remove(file_path)
    except OSError:
        pass
    print(f'Getting file from {url}')
    while not file_downloaded:
        resume_header = {f'Range': 'bytes=%d-' % byte_pos}
        try:
            with requests.get(url, headers=resume_header, stream=True,  verify=False, allow_redirects=True) as r:
            #with requests.get(url, stream=True) as r:
                r.raise_for_status()
                for chunk in  r.iter_content(chunk_size=8192):
                    with open(file_path, 'ab') as f:
                        # If you have chunk encoded response uncomment if
                        # and set chunk_size parameter to None.
                        #if chunk: 
                        f.write(chunk)
                        byte_pos += 1
                file_downloaded = True
        except:
            print(f'An error occured while downloading. Retrying...{sys.exc_info()[0]} {sys.exc_info()[1]}')
    return file_path

def clear_indices(excluded_indices= []):
    for index in  [index for index  in es.indices.stats()['indices'].keys() if index not in excluded_indices]:
        es.indices.delete(index)
        
def create_index(es,index_name,body,overwrite = False):
    indices = es.indices.stats()['indices'].keys()
    if index_name in  indices:
        if overwrite:
            print(f'overwriting index {index_name}')
            es.indices.delete(index_name)
        else:
            print(f'Index {index_name} already exists')
    else:
        es.indices.create(index_name,body=body)
        
def extract_zip_files(file_path,out_path=None):
    if not out_path:
        out_path  = file_path.replace('.zip','')
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(out_path)
    return out_path

        
def extract_gz_files(file_path,override=False,n=8,max_n=None):
    x_file_out_path = file_path.replace('.gz','')
    if override:
        try:
            os.remove(x_file_out_path)
        except OSError:
            pass
    if os.path.exists(x_file_out_path):
        print(f'\tFile {x_file_out_path} already exists, skipping...')
    else:
        print(f'\tExtracting file {file_path}')
        gz_file = gzip.GzipFile(file_path, 'rb')
        n_i = 0
        while True:
            chunk = gz_file.read(n)
            n += len(chunk)
            if chunk == b'' or (max_n and n_i > max_n):
                break
            x_file_out = open(x_file_out_path, 'ab')
            x_file_out.write(chunk)
            x_file_out.close()
        gz_file.close()
        print(f'\t\tExtracted {x_file_out_path}!')
    return x_file_out_path

def get_gz_lines(file_path):
    total_lines = 0
    with gzip.GzipFile(file_path,'rb') as file:
        try:
            while True:
                next(file)
                total_lines +=1
        except StopIteration:
            pass
    return total_lines
                    
def get_lines(file_path):
    total_lines = 0
    with open(file_path,'rb') as file:
        try:
            while True:
                next(file)
                total_lines +=1
        except StopIteration:
            pass
    return total_lines

def get_samples_from_file(file,doc_lines, doc_samples):
    samples = []
    for i in tqdm(range(doc_lines)):
        line = next(file)
        if i in doc_samples:
            samples.append(line)
    return samples

def extract_rand_samples_from_gz_file(file_path,sample_factor):
    doc_lines = get_gz_lines(file_path)
    doc_samples_count =  int(doc_lines * sample_factor)
    doc_samples = set()
    while len(doc_samples) < doc_lines:
        doc_samples.add(random.randint(0,doc_lines-1))     
    with gzip.GzipFile(file_path,'rb') as file:
        return get_samples_from_file(file,doc_lines,doc_samples)
    
def extract_rand_samples_from_file(file_path,sample_factor):
    doc_lines = get_lines(file_path)
    doc_samples_count =  int(doc_lines * sample_factor)
    doc_samples = set()
    while len(doc_samples) < doc_lines:
        doc_samples.add(random.randint(0,doc_lines-1))     
    with open(file_path,'rb') as file:
        return get_samples_from_file(file,doc_lines,doc_samples)


# Section III - Elastic Search downloading and index creation
### Downloading and executing a new instance of ElasticSearch
The code below uses an automated approach todownload and create an instance of elasticSearch. Skip this if alreay have one


In [23]:
os_name =platform.system().lower()
file_path = os.path.join('..','input')
if not os.path.exists(os.path.join(file_path,'elasticsearch-7.9.3')):
    if os_name == 'windows':
            url = 'https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.3-windows-x86_64.zip'
            file = download_file(file_path,url,override=False)
            x_file= extract_zip_files(file,file_path)
            os.remove(file)
            command= os.path.join(x_file,'elasticsearch-7.9.3','bin','elasticsearch.bat')
            subprocess.call([command])
            #command= os.path.join(x_file,'elasticsearch-7.9.3','bin','elasticsearch.bat')
            #p1 = Popen([command], stdout=PIPE)
    elif os_name == 'linux':
        url = 'https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.3-linux-x86_64.tar.gz'
        file = download_file(file_path,url,override=False)
        x_file = extract_gz_files(file)
        os.remove(file)
        #command= os.path.join(x_file,'bin','elasticsearch')
        #subprocess.call([command])
    else:
        path = 'https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.3-darwin-x86_64.tar.gz'
        file = download_file('es',path,override=False)
        x_file = extract_gz_files(file)
        os.remove(file)
        #command= os.path.join(x_file,'bin','elasticsearch')
        #subprocess.call([command])
else:
    print('Elastic Search file already exists, skipping...')
 
    



Elastic Search file already exists, skipping...


In [55]:
FIELDS = ['url','title', 'body']
INDEX_NAME = 'ms-marco'
body = {
    'mappings': {
            'properties': {
                'title': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'body': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                }
            }
        }
    }
overwrite = False # DO NOT CHANGE THIS FLAG!!!
user = 'elastic'
password = 'IfKREtTr7fCqMYTD8NKE4yBi'
remote_url = f'https://{user}:{password}@6a0fe46eef334fada72abc91933b54e8.us-central1.gcp.cloud.es.io:9243'

#es = Elasticsearch(hosts=remote_url)
es = Elasticsearch()
create_index(es,INDEX_NAME,body,overwrite = overwrite)
print(es.info())

{'name': 'ODIN', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'KvITBbqER528OPZnWCQk8A', 'version': {'number': '7.9.3', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': 'c4138e51121ef06a6404866cddc601906fe5c868', 'build_date': '2020-10-16T10:36:16.141335Z', 'build_snapshot': False, 'lucene_version': '8.6.2', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


## Execute these shell commands to install and run elasticsearch locally

%%script bash
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.3-linux-x86_64.tar.gz
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.3-linux-x86_64.tar.gz.sha512
shasum -a 512 -c elasticsearch-7.9.3-linux-x86_64.tar.gz.sha512 
tar -xzf elasticsearch-7.9.3-linux-x86_64.tar.gz
rm elasticsearch-7.9.3-linux-x86_64.tar.gz
rm elasticsearch-7.9.3-linux-x86_64.tar.gz.sha512

!useradd elasticuser
!chown -R elasticuser elasticsearch-7.9.3

%%script bash --bg --out script_out
su elasticuser -c ./elasticsearch-7.9.3/bin/elasticsearch &


# Section V - MS-MARCO Dataset Downloading
## Download MS-MARCO files if not available yet

In [25]:

urls = [
'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs-lookup.tsv.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/docleaderboard-queries.tsv.gz'
,'https://msmarco.blob.core.windows.net/msmarcoranking/docleaderboard-top100.tsv.gz'
]

source_path = '../input/MS-MARCO'

if not os.path.isdir(source_path):
        os.mkdir(source_path)


gzfiles = []
for url in urls:
    gzfile = download_file(source_path,url,override=False)
    gzfiles.append(gzfile)

	File ../input/MS-MARCO\msmarco-docs.tsv.gz already exists, skipping...
	File ../input/MS-MARCO\msmarco-docs-lookup.tsv.gz already exists, skipping...
	File ../input/MS-MARCO\msmarco-doctrain-queries.tsv.gz already exists, skipping...
	File ../input/MS-MARCO\msmarco-docdev-queries.tsv.gz already exists, skipping...
	File ../input/MS-MARCO\msmarco-docdev-top100.gz already exists, skipping...
	File ../input/MS-MARCO\msmarco-docdev-qrels.tsv.gz already exists, skipping...
	File ../input/MS-MARCO\docleaderboard-queries.tsv.gz already exists, skipping...
	File ../input/MS-MARCO\docleaderboard-top100.tsv.gz already exists, skipping...


# Section VI - Document sampling and extraction
### Will extract the 10% of dev queries and related documents for indexing and feature extraction


In [27]:
DOCUMENT_SAMPLE_FACTOR= 0.1
random.seed(1111)

Query samples come in the form of:
```
174249	does xpress bet charge to deposit money in your account
320792	how much is a cost to run disneyland
1090270	botulinum definition
1101279	do physicians pay for insurance from their salaries?
201376	here there be dragons comic
54544	blood diseases that are sexually transmitted
118457	define bona fides

```

Therefore each line of code has to be split in 2,where index[0] = Query ID and index[1] = query_text

In [36]:
query_samples = extract_rand_samples_from_gz_file(os.path.join(source_path,'msmarco-docdev-queries.tsv.gz'),DOCUMENT_SAMPLE_FACTOR)
query_samples = [q.decode('UTF-8').replace('\r\n','').split('\t') for q in query_samples]
query_samples = {q[0]:q[1] for q in query_samples}

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5193.0), HTML(value='')))




Get top 100 retrieved documents from development dataset for the query ids retrieved from document **msmarco-docdev-top100.gz**.
100 Documents come in the form of:
```
174249 Q0 D3126539 1 -5.99003 IndriQueryLikelihood
174249 Q0 D978773 2 -6.18444 IndriQueryLikelihood
174249 Q0 D399803 3 -6.20982 IndriQueryLikelihood
174249 Q0 D2204704 4 -6.24312 IndriQueryLikelihood
174249 Q0 D3126541 5 -6.24726 IndriQueryLikelihood
174249 Q0 D398816 6 -6.27273 IndriQueryLikelihood
174249 Q0 D2168983 7 -6.29127 IndriQueryLikelihood
174249 Q0 D3126537 8 -6.30813 IndriQueryLikelihood
174249 Q0 D3297846 9 -6.32111 IndriQueryLikelihood
174249 Q0 D531991 10 -6.34283 IndriQueryLikelihood
174249 Q0 D2479861 11 -6.34364 IndriQueryLikelihood

```
Only columns 0,2,3 and 4 are important


In [56]:
query_doc_top100 = {}
with gzip.GzipFile(os.path.join(source_path,'msmarco-docdev-top100.gz'),'rb') as file:
    try:
        while True:
            line = next(file).decode('UTF-8').replace('\r\n','').split(' ')
            if line[0] in query_samples:
                query_doc_top100[line[2]] = [line[0],line[3],line[4]]
    except StopIteration:
        pass

Extract all qrels from the file msmarco-docdev-qrels.tsv. 
This file contains only one relevant document per query andcomes in the form:
```
   2 0 D1650436 1
1215 0 D1202771 1
1288 0 D1547717 1
1576 0 D1313702 1
2235 0 D2113408 1
2798 0 D2830290 1
```
Where:
* Column 0 = Query Id
* Column 1 = Document Id
The rest of the columns are irrelevant, since the present document in the file highlights always '1' in column 3 for being a relevant document

In [80]:
qrels = {}
with gzip.GzipFile(os.path.join(source_path,'msmarco-docdev-qrels.tsv.gz'),'rb') as file:
    try:
        while True:
            line = next(file).decode('UTF-8').replace('\r\n','').split(' ')
            query_id = line[0]
            if query_id in query_samples:
                qrels[query_id] = line[2]
    except StopIteration:
        pass

Now get all the documents whose document id is present in  query_doc_top100 from 'msmarco-docs.tsv.gz'
Such documents come in the form of:
```
D250947 https://www.michaeljfox.org/    LATEST FROM THE BLOG    LATEST FROM THE BLOGMOR
```
Where
* Column 0 = Document id
* Column 1 = URL
* Column 2 = Title
* Column 3 = Body

Only columns 0, 2 and 3 are important
Once documents are extracted they are added to elasticSearch.

In [None]:
doc_ids = set(qrels.values())
doc_ids = doc_ids.union(set(query_doc_top100.keys()))
docs_len = len(doc_ids)
docs = {}
with gzip.GzipFile(os.path.join(source_path,'msmarco-docs.tsv.gz'),'rb') as file:
    added_docs = 0
    try:
        while True:
            if added_docs == query_doc_top100_len:
                break
            line = next(file).decode('UTF-8').replace('\r\n','').split('\t')
            doc_id = line[0]
            if doc_id in query_doc_top100:
                doc= {'title':line[2].strip(),'body':line[3].strip()}
                docs[doc_id] = doc
                es.index(index=INDEX_NAME, id=doc_id, body=doc)
                added_docs +=1
                print(f'\rAdded {doc_id}, {added_docs} of {query_doc_top100_len}...',end='')
    except StopIteration:
        pass

Added D881082, 4477 of 400737....

In [None]:
out_path = os.path.join('..','out')
if not os.path.exists(out_path):
    os.mkdir(out_path)
save_picke(os.path.join(out_path,'query_samples.pickle'),query_samples)
save_picke(os.path.join(out_path,'query_doc_top100.picke'),query_doc_top100)
save_picke(os.path.join(out_path,'docs.picke'),docs)
save_picke(os.path.join(out_path,'qrels.picke'),qrels)

# Section VII - Query analytics and feature extraction algorithms

In [97]:
def analyze_query(es, query, field, index='ms-marco'):
    """Analyzes a query with respect to the relevant index.

    Arguments:
        es: Elasticsearch object instance.
        query: String of query terms.
        field: The field with respect to which the query is analyzed.
        index: Name of the index with respect to which the query is analyzed.

    Returns:
        A list of query terms that exist in the specified field among the documents in the index.
    """
    tokens = es.indices.analyze(index=index, body={'text': query})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        ## Use a boolean query to find at least one document that contains the term.
        hits = es.search(index=index, body={'query': {'match': {field: t['token']}}},
                         _source=False, size=1).get('hits', {}).get('hits', {})
        doc_id = hits[0]['_id'] if len(hits) > 0 else None
        if doc_id is None:
            continue
        query_terms.append(t['token'])
    return query_terms


def get_doc_term_freqs(es, doc_id, field, index='toy_index'):
    """Gets the term frequencies of a field of an indexed document.

    Arguments:
        es: Elasticsearch object instance.
        doc_id: Document identifier with which the document is indexed.
        field: Field of document to consider for term frequencies.
        index: Name of the index where document is indexed.

    Returns:
        Dictionary of terms and their respective term frequencies in the field and document.
    """
    tv = es.termvectors(index=index, id=doc_id, fields=field, term_statistics=True)
    if tv['_id'] != doc_id:
        return None
    print(tv)
    if field not in tv['term_vectors']:
        return None
    term_freqs = {}
    for term, term_stat in tv['term_vectors'][field]['terms'].items():
        term_freqs[term] = term_stat['term_freq']
    return term_freqs


def get_query_term_freqs(es, query_terms):
    """Gets the term frequencies of a list of query terms.

    Arguments:
        es: Elasticsearch object instance.
        query_terms: List of query terms, analyzed using `analyze_query` with respect to some relevant index.

    Returns:
        A list of query terms that exist in the specified field among the documents in the index.
    """
    c = Counter()
    for term in query_terms:
        c[term] += 1
    return dict(c)


def extract_query_features(query_terms, es, index='toy_index'):
    """Extracts features of a query.

        Arguments:
            query_terms: List of analyzed query terms.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service.
        Returns:
            Dictionary with keys 'query_length', 'query_sum_idf', 'query_max_idf', and 'query_avg_idf'.
    """
    q_features = {}

    if len(query_terms) == 0:
        q_features['query_length'] = 0
        q_features['query_sum_idf'] = 0
        q_features['query_max_idf'] = 0
        q_features['query_avg_idf'] = 0
        return q_features

    q_features['query_length'] = len(query_terms)

    count_docs_with_term = []
    total_docs_in_index = int(es.cat.count(index=index, params={"format": "json"})[0]['count'])

    for query in query_terms:
        res = es.count(index=index, body={
            'query':
                {'match':
                     {'body': query}
                 }
        })['count']
        count_docs_with_term.append(res)

    q_features['query_sum_idf'] = sum([np.log(total_docs_in_index / freq) for freq in count_docs_with_term])
    q_features['query_max_idf'] = max([np.log(total_docs_in_index / freq) for freq in count_docs_with_term])
    q_features['query_avg_idf'] = np.mean([np.log(total_docs_in_index / freq) for freq in count_docs_with_term])

    return q_features


def extract_doc_features(doc_id, es, index='toy_index'):
    """Extracts features of a document.

        Arguments:
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service.

        Returns:
            Dictionary with keys 'doc_length_title', 'doc_length_body'.
    """
    doc_features = {}

    terms = get_doc_term_freqs(es, doc_id, 'body', index)
    print(terms)
    if terms is None:
        doc_features['doc_length_body'] = 0
    else:
        doc_features['doc_length_body'] = sum(terms.values())

    terms = get_doc_term_freqs(es, doc_id, 'title', index)
    if terms is None:
        doc_features['doc_length_title'] = 0
    else:
        doc_features['doc_length_title'] = sum(terms.values())

    return doc_features


def extract_query_doc_features(query_terms, doc_id, es, index='toy_index'):
    """Extracts features of a query and document pair.

        Arguments:
            query_terms: List of analyzed query terms.
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service.

        Returns:
            Dictionary with keys 'unique_query_terms_in_title', 'unique_query_terms_in_body',
            'sum_TF_title', 'sum_TF_body', 'max_TF_title', 'max_TF_body', 'avg_TF_title', 'avg_TF_body'.
    """
    q_doc_features = {}

    if len(query_terms) == 0:
        q_doc_features['unique_query_terms_in_title'] = 0
        q_doc_features['unique_query_terms_in_body'] = 0
        q_doc_features['sum_TF_body'] = 0
        q_doc_features['max_TF_body'] = 0
        q_doc_features['avg_TF_body'] = 0
        q_doc_features['sum_TF_title'] = 0
        q_doc_features['max_TF_title'] = 0
        q_doc_features['avg_TF_title'] = 0
        return q_doc_features

    terms_title = get_doc_term_freqs(es, doc_id, 'title', index)
    terms_body = get_doc_term_freqs(es, doc_id, 'body', index)

    def agg(terms_dict, query_terms_list, func):
        freq_list = []
        for term in query_terms_list:
            if term in terms_dict.keys():
                freq_list.append(terms_dict[term])
            else:
                freq_list.append(0)
        return func(freq_list)

    if terms_title is None:
        q_doc_features['sum_TF_title'] = 0
        q_doc_features['max_TF_title'] = 0
        q_doc_features['avg_TF_title'] = 0
    else:
        q_doc_features['sum_TF_title'] = agg(terms_title, query_terms, sum)
        q_doc_features['max_TF_title'] = agg(terms_title, query_terms, max)
        q_doc_features['avg_TF_title'] = agg(terms_title, query_terms, np.mean)

    if terms_body is None:
        q_doc_features['sum_TF_body'] = 0
        q_doc_features['max_TF_body'] = 0
        q_doc_features['avg_TF_body'] = 0
    else:
        q_doc_features['sum_TF_body'] = agg(terms_body, query_terms, sum)
        q_doc_features['max_TF_body'] = agg(terms_body, query_terms, max)
        q_doc_features['avg_TF_body'] = agg(terms_body, query_terms, np.mean)

    # UNIQUE QUERY TERMS
    query_terms = set(query_terms)
    if terms_title is None:
        q_doc_features['unique_query_terms_in_title'] = 0
    else:
        q_doc_features['unique_query_terms_in_title'] = len([t for t in query_terms if t in terms_title.keys()])
    if terms_body is None:
        q_doc_features['unique_query_terms_in_body'] = 0
    else:
        q_doc_features['unique_query_terms_in_body'] = len([t for t in query_terms if t in terms_body.keys()])

    return q_doc_features


FEATURES_QUERY = ['query_length', 'query_sum_idf', 'query_max_idf', 'query_avg_idf']
FEATURES_DOC = ['doc_length_title', 'doc_length_body']
FEATURES_QUERY_DOC = ['unique_query_terms_in_title', 'sum_TF_title', 'max_TF_title', 'avg_TF_title',
                      'unique_query_terms_in_body', 'sum_TF_body', 'max_TF_body', 'avg_TF_body'
                      ]


def extract_features(query_terms, doc_id, es, index='toy_index'):
    """Extracts query features, document features and query-document features of a query and document pair.

        Arguments:
            query_terms: List of analyzed query terms.
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service.

        Returns:
            List of extracted feature values in a fixed order.
    """
    feature_vect = []

    query_features = extract_query_features(query_terms, es, index=index)
    for f in FEATURES_QUERY:
        feature_vect.append(query_features[f])

    doc_features = extract_doc_features(doc_id, es, index=index)
    for f in FEATURES_DOC:
        feature_vect.append(doc_features[f])

    query_doc_features = extract_query_doc_features(query_terms, doc_id, es, index=index)
    for f in FEATURES_QUERY_DOC:
        feature_vect.append(query_doc_features[f])

    return feature_vect

def prepare_ltr_training_data(query_ids, es,qrels, index='ms-marco'):
    """Prepares feature vectors and labels for query and document pairs found in the training data.

        Arguments:
            query_ids: List of query IDs.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service.

        Returns:
            X: List of feature vectors extracted for each pair of query and retrieved or relevant document.
            y: List of corresponding labels.
    """
    X = []
    y = []

    # YOUR CODE HERE

    for query_id in tqdm(query_ids):
        relevent_doc = qrels[query_id]
        query = qrels[query_id]
        analyzed_terms = analyze_query(es, query, 'body', index=index)

        extracted_feature = extract_features(analyzed_terms, relevent_doc, es, index=index)
        X.append(extracted_feature)
        y.append(1)

        hits = es.search(index=index, q=' '.join(analyzed_terms), _source=True, size=100)['hits']['hits']

        for hit in hits:
            doc_id = hit['_id']
            if doc_id != relevent_doc:
                extracted_feature = extract_features(analyzed_terms, doc_id, es, index=index)
                X.append(extracted_feature)
                y.append(0)
    return X, y

def get_reciprocal_rank(doc_rankings, relevant_doc_id):
    """Computes Reciprocal Rank (RR).

    Args:
        system_ranking: Ranked list of document IDs.
        ground_truth: Set of relevant document IDs.

    Returns:
        RR (float).
    """
    for i, doc_id in enumerate(doc_rankings):
        if doc_id == relevant_doc_id:
            return 1 / (i + 1)
    return 0

def get_mean_eval_measure(system_rankings, eval_function):
    """Computes a mean of any evaluation measure over a set of queries.

    Args:
        system_rankings: Dict with query ID as key and a ranked list of document IDs as value.
        ground_truths: Dict with query ID as key and a set of relevant document IDs as value.
        eval_function: Callback function for the evaluation measure that mean is computed over.

    Returns:
        Mean evaluation measure (float).
    """
    sum_score = 0
    for query_id, system_ranking in system_rankings.items():
        sum_score += eval_function(system_ranking, QRELS[query_id])
    return sum_score / len(system_rankings)

def load_basic_rankings(filepath, avoid_queries, max_size=100):
    basic_rankings = defaultdict(list)

    with open(filepath, 'r') as file:
        for line in file:
            record = line.split(' ')
            query_id = int(record[0])
            doc_id = record[2]

            if query_id in avoid_queries:
                continue

            if query_id not in QRELS.keys():
                continue

            basic_rankings[query_id].append(doc_id)

            if(len(basic_rankings)) >= max_size:
                break

        return basic_rankings
    
def rerank_score(basic_rankings, ltr_model):
    reranked = {}
    for query_id, doc_rankings in tqdm(basic_rankings.items(), desc='Reranking'):

        query = QUERIES[query_id]
        query_terms = analyze_query(es, query, 'body', INDEX_NAME)

        if query_terms is None:
            continue

        features = []
        for doc_id in doc_rankings:
            ft = extract_features(query_terms, doc_id, es, INDEX_NAME)
            features.append(ft)

        doc_reranked = ltr_model.rank(features, doc_rankings)
        reranked[query_id] = doc_reranked

    score = get_mean_eval_measure(reranked, get_reciprocal_rank)
    return score

class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        Arguments:
            classifier: An instance of scikit-learn regressor.
        """
        self.model = regressor

    def train(self, X, y):
        """Trains an LTR model.

        Arguments:
            X: Features of training instances.
            y: Relevance assessments of training instances.
        """
        assert self.model is not None
        self.model = self.model.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and rank documents for a given query.

        Arguments:
            ft: A list of feature vectors for query-document pairs.
            doc_ids: A list of document ids.
        Returns:
            List of tuples, each consisting of document ID and predicted relevance label.
        """
        assert self.model is not None
        rel_labels = self.model.predict(np.array(ft))
        sort_indices = np.argsort(rel_labels)[::-1]
        results = []
        for i in sort_indices:
            results.append(doc_ids[i])
        return results
    

# Section VIII - Baseline Model - ML Algorithms for Document Re-ranking

In [98]:
train_query_ids = list(qrels.keys())
train_data_path = os.path.join(out_path,'training_data.pickle')
if os.path.isfile(train_data_path):
    with open(train_data_path, 'rb') as file:
        training_data = pickle.load(file)
else:
    training_data = prepare_ltr_training_data(train_query_ids, es,qrels, index=INDEX_NAME)
    with open(train_data_path, 'wb') as file:
        pickle.dump(training_data, file)
X_train, y_train = training_data

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5193.0), HTML(value='')))

{'_index': 'ms-marco', '_type': '_doc', '_id': 'D1650436', '_version': 1, 'found': True, 'took': 96, 'term_vectors': {'body': {'field_statistics': {'sum_doc_freq': 207292521, 'doc_count': 400720, 'sum_ttf': 621599814}, 'terms': {'0': {'doc_freq': 74108, 'ttf': 464701, 'term_freq': 2}, '0.0110117': {'doc_freq': 1, 'ttf': 1, 'term_freq': 1}, '0.0210235': {'doc_freq': 1, 'ttf': 1, 'term_freq': 1}, '0.0270309': {'doc_freq': 1, 'ttf': 1, 'term_freq': 1}, '0.0290041': {'doc_freq': 1, 'ttf': 1, 'term_freq': 1}, '00003': {'doc_freq': 136, 'ttf': 149, 'term_freq': 1}, '00003495': {'doc_freq': 60, 'ttf': 67, 'term_freq': 1}, '0008': {'doc_freq': 219, 'ttf': 312, 'term_freq': 4}, '001': {'doc_freq': 1256, 'ttf': 3415, 'term_freq': 1}, '0019': {'doc_freq': 153, 'ttf': 194, 'term_freq': 1}, '00289': {'doc_freq': 7, 'ttf': 8, 'term_freq': 2}, '00481': {'doc_freq': 1, 'ttf': 1, 'term_freq': 1}, '00583': {'doc_freq': 7, 'ttf': 8, 'term_freq': 1}, '0070': {'doc_freq': 74, 'ttf': 86, 'term_freq': 1}, '0

KeyError: 'term_vectors'

In [None]:
basic_rankings = load_basic_rankings('data/msmarco-docdev-top100.tsv',
                                     avoid_queries=train_query_ids, max_size=300)
get_mean_eval_measure(basic_rankings, get_reciprocal_rank)

In [None]:
clf = RandomForestRegressor(max_depth=5, random_state=0, n_jobs=4)
ltr = PointWiseLTRModel(clf)
ltr.train(X_train, y_train)
print('Random Forest Score:', rerank_score(basic_rankings, ltr))

In [None]:
clf = xgboost.XGBRegressor(base_score=0.25, max_depth=10, random_state=0,
                           objective='reg:linear', verbosity=0, n_estimators=100)
ltr = PointWiseLTRModel(clf)
ltr.train(np.array(X_train), np.array(y_train))
print('XGBoost Score:', rerank_score(basic_rankings, ltr))

In [None]:
clf = svm.SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
ltr = PointWiseLTRModel(clf)
ltr.train(np.array(X_train), np.array(y_train))
print('svm Score:', rerank_score(basic_rankings, ltr))

# Section IX - Advanced Model - Deep Learning with BERT Document Re-ranking

In [None]:
pretrained_model = "distilbert-base-cased"

custom_config = AutoConfig.from_pretrained(pretrained_model)
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
custom_model = AutoModel.from_pretrained(pretrained_model, config=custom_config)

model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

doc_ids_training = []

with open('../input/docidsfortraining/doc_id_for_training.txt', 'r') as file:
    doc_ids_training = file.readlines()[0].split()

print(len(doc_ids_training))


def get_lines(file_path):
    total_lines = 0
    with open(file_path,'rb') as file:
        try:
            while True:
                next(file)
                total_lines +=1
        except StopIteration:
            pass
    return total_lines
        
    
def summarize(lines):
    corpus = {}
    for line in tqdm(lines):
        doc_id = line[0]
        doc_title = line[2].lower()
        doc_body = line[3].lower()
        summary = model(doc_body, max_length=250)
        corpus[doc_id] = (doc_title, summary)
    return corpus

def write_to_file(corpus,file_path):
    with open(file_path,'ab') as file:
        for key,val in corpus.items():
            file.write(f'{key}\t{val[0]}\t{val[1]}\n')
            
            
def process_file_per_batch(file_path,file_out,batch_step,max_hits=None):
    print(f'Getting total lines from file {file_path}')
    get_lines(file_path)
    print(f'\tFile read with {total_lines} lines to process')
    print(f'Total of {batch_step} batches will be processed....')
    hits = 0
    lines = []
    with open(file_path,'rb') as file:
        for i in tqdm(range(total_lines)):
            batch_mod = i % batch_step
            if (i > 0 and  batch_mod == 0) or (i == total_lines - 1):
                corpus = summarize(lines)
                write_to_file(corpus,file_out)
                lines= []
                hits += 1
            else:
                line = next(file)
                line = line.decode('UTF-8').split('\t')
                lines.append(line)
            if max_hits and hits ==max_hits:
                break   
                
                
in_file ='../input/msmarcotrainingdoc/required_docs.tsv'
file_out = '/kaggle/working/required_docs_sumarized.tsv'

if os.path.exists(file_out):
    os.remove(file_out)

max_per_batch = 1000
batch_step = round(total_lines / max_per_batch)
process_file_per_batch(in_file,file_out,batch_step,max_hits=None)


  doc_id = line[0]
    if doc_id not in doc_ids_training:
        continue

    doc_title = line[2].lower()
    doc_body = line[3].lower()

    summary = model(doc_body, max_length=250)

    corpus[doc_id] = (doc_title, summary)
    

    with  gzip.GzipFile('MS-MARCO/summarized_docs.tsv.gz', 'wb') as gz_file:
for key, val in tqdm(corpus.items()):
    title, body = val
    gz_file.write(f'{key}\t{title}\t{body}\n')