In [1]:
import time
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
#load dataset containing queries. 
queries=pd.read_table("msmarco-doctrain-queries/queries.doctrain.tsv",delimiter="\t")
#load top 100 document relevant to each of these queries.
top100_docs=pd.read_table('msmarco-doctrain-top100/msmarco-doctrain-top100',delimiter=' ')

#defined column names
queries.columns=['qid','query']
top100_docs.columns=["qid","Q0","docid","rank","score","runstring"]

In [3]:
#load dataset containing documents.
docs=dd.read_table('msmarco-docs/msmarco-docs.tsv',blocksize=100e6)
#defined column names
docs.columns=["docid","url","title","body"]
docs.head()

Unnamed: 0,docid,url,title,body
0,D301595,http://childparenting.about.com/od/physicalemo...,Developmental Milestones and Your 8-Year-Old C...,School-Age Kids Growth & Development Developme...
1,D1359209,http://visihow.com/Check_for_Lice_Nits,Check for Lice Nits,Check for Lice Nits Edited by Mian Sheilette O...
2,D2147834,http://www.nytimes.com/2010/01/05/business/glo...,Dubai Opens a Tower to Beat All,Global Business Dubai Opens a Tower to Beat Al...
3,D1568809,http://www.realtor.com/realestateandhomes-sear...,"Coulterville, CA Real Estate & Homes for Sale","Coulterville, CA Real Estate & Homes for Sale4..."
4,D3233725,http://www.akc.org/dog-breeds/dogo-argentino/c...,Dogo Argentino,Dogo Argentino Miscellaneous The Dogo Argentin...


In [4]:
docs.isna().sum(axis=0).compute()

docid        0
url          0
title    27291
body     12014
dtype: int64

In [5]:
docs_ = docs.dropna()
docs_.isna().sum(axis=0).compute()

docid    0
url      0
title    0
body     0
dtype: int64

In [6]:
queries['len']=queries['query'].apply(lambda x: len(x))
lengthy_queries = queries[queries['len'] >=40]
# Creating Training Set of Queries
train_queries = lengthy_queries.sample(n=2000,replace=False,random_state=122)
# Creating Testing Set of Queries
merged_queries = queries.merge(lengthy_queries, how="left",indicator=True)
unassigned_queries = merged_queries[merged_queries['_merge']== 'left_only']
test_queries = unassigned_queries.sample(n=700,replace=False,random_state=230)

In [7]:
unique_docids=docs_['docid'].unique().compute()

In [8]:
def get_top_documents(queries):
    '''
    The function takes a set of queries and returns the top 10 and bottom 10 documents associated with the query from our dataset.
    A label is given to the top 10 (1) as `relevant` and 0 to the bottom 10 as `irrelevant`.
    '''
    ## every unique query id in sampled data
    query_ids=queries.qid.unique().tolist()
    ## filter the top 100 docs by the taking samples of these query ids 
    condition1 = (top100_docs['qid'].isin(query_ids))
    condition2 = (top100_docs['docid'].isin(unique_docids.values))
    top_docs = top100_docs[condition1 & condition2].reset_index(drop=True)
    ## Get relevance of top documents
    relevant=list(range(1,11))
    non_relevant=list(range(91,101))
    top_docs['relevance']=top_docs['rank'].apply(lambda x: 1 if x in relevant else ( 0 if x in non_relevant else np.nan))
    top_docs.dropna(inplace=True)
    return top_docs

In [9]:
#get top documents for queries in test set.
top_test_docs = get_top_documents(test_queries)
#get top documents for queries in training set. 
top_train_docs = get_top_documents(train_queries)

In [10]:
def create_corpus(result):
    unique_docid=result['docid'].unique()
    condition=docs['docid'].isin(unique_docid)
    corpus=docs[condition].reset_index(drop=True)
    corpus=corpus.drop(columns='url')
    print('Number of Rows=>',len(corpus))
    return corpus.compute()

training_corpus=create_corpus(top_train_docs)
testing_corpus=create_corpus(top_test_docs)

Number of Rows=> 37406
Number of Rows=> 13669


In [11]:
#save training and testing corpus.
training_corpus.to_csv("training_corpus.csv",index=False)
testing_corpus.to_csv("testing_corpus.csv",index=False)

In [17]:
#save training and testing corpus.
train_queries.to_csv("train_queries.csv",index=False)
test_queries.to_csv("test_queries.csv",index=False)

In [13]:
temp_doc=training_corpus.sample(1)
print('Title=>',temp_doc.title.values)
print('Body:\n',temp_doc.body.values)

Title=> ['Wilmington, NC Airports']
Body:
 ['Home Wilmington, NC Airports Wilmington, NC Airports About Search Results Sort: Default BBB Rated A+/AAll Filters Airfares - Up To 70% Off Call Toll Free For Lowest Fares Use "CALLCENTER20" - Extra 20% Offtripvers.com/Smart Fares1. ILM - Wilmington International Airport BBB Rating: A+1740 Airport Blvd Wilmington, NC 28405 (910) 341-4333Airports Website From Business: The Wilmington International Airport ILM Public Safety Department is a full service law enforcement agency responsible for all law enforcement and security respon…2. 17ID - Coyote Ridge Airport309 Putnam Dr Wilmington, NC 28411 (910) 686-4645Airports3. 03NC - Pilots Ridge Airport716 Pilots Ridge Rd Wilmington, NC 28412 (910) 617-6950Airports4. NC20 - Canaan Air Base Airport100 Cannon Rd Wilmington, NC 28411 (252) 633-0328Airports5. 7NC1 - Stag Air Park Airport245 Creekside East Dr Wilmington, NC 28411 (910) 686-9612Airports6. Airport Taxi Solutions1306 Grackle Ct Wilmington, NC 

In [15]:
for i,v in enumerate(train_queries['query'].sample(10)):
    print(i,'=>',v)

0 => how many days it takes for canadian visa for work permit
1 => how to check on the status of your tax refund check
2 => how many hours difference between pst and gmt
3 => what is the definition of neutron of an atom
4 => how much money can you earn as an enrolled agent
5 => where does intestinal bacteria come from
6 => how far is centerville tn from nashville
7 => when was washington admitted into the union
8 => what kind of cases does internal affairs investigate?
9 => how much do registered nurses make in a year?
