In [3]:
# !pip install --upgrade ir_datasets

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import ir_datasets

import config

In [2]:
dataset_name = config.DATASET
dataset = ir_datasets.load(dataset_name)

In [3]:
dataset.docs_count()

522931

In [4]:
n_docs = 1000

for i, data in enumerate(zip(dataset.queries_iter(), dataset.docs_iter(), dataset.qrels_iter())):
    query, doc, rel = data
    print(query)
    print(doc)
    print(rel)
    break

BeirQuery(query_id='318', text='How does Quora look to a moderator?', metadata={})
BeirDoc(doc_id='1', text='What is the step by step guide to invest in share market in india?', title='', metadata={})
TrecQrel(query_id='318', doc_id='317', relevance=1, iteration='0')


In [5]:
df_queries = pd.DataFrame(dataset.queries_iter())
print(len(df_queries))
df_queries.head()

5000


Unnamed: 0,query_id,text,metadata
0,318,How does Quora look to a moderator?,{}
1,378,How do I refuse to chose between different thi...,{}
2,379,Did Ben Affleck shine more than Christian Bale...,{}
3,399,What are the effects of demonitization of 500 ...,{}
4,420,Why creativity is important?,{}


In [6]:
df_docs = pd.DataFrame(dataset.docs_iter())
print(len(df_docs))
df_docs.head()

522931


Unnamed: 0,doc_id,text,title,metadata
0,1,What is the step by step guide to invest in sh...,,{}
1,2,What is the step by step guide to invest in sh...,,{}
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,,{}
3,4,What would happen if the Indian government sto...,,{}
4,5,How can I increase the speed of my internet co...,,{}


In [7]:
df_qrels = pd.DataFrame(dataset.qrels_iter())
print(len(df_qrels))
df_qrels.head()

7626


Unnamed: 0,query_id,doc_id,relevance,iteration
0,318,317,1,0
1,378,377,1,0
2,379,29976,1,0
3,379,380,1,0
4,379,45646,1,0


In [8]:
df_qrels.relevance.value_counts()

1    7626
Name: relevance, dtype: int64

In [9]:
df_docs = df_docs.rename({"text":"doc_text"}, axis=1).drop(["metadata", "title"], axis=1)
df_docs.head()

Unnamed: 0,doc_id,doc_text
0,1,What is the step by step guide to invest in sh...
1,2,What is the step by step guide to invest in sh...
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,4,What would happen if the Indian government sto...
4,5,How can I increase the speed of my internet co...


In [10]:
df_queries = df_queries.rename({"text":"query_text"}, axis=1).drop(["metadata"], axis=1)
df_queries.head()

Unnamed: 0,query_id,query_text
0,318,How does Quora look to a moderator?
1,378,How do I refuse to chose between different thi...
2,379,Did Ben Affleck shine more than Christian Bale...
3,399,What are the effects of demonitization of 500 ...
4,420,Why creativity is important?


In [11]:
df = df_qrels.merge(df_docs).merge(df_queries)[["query_text", "doc_text", "relevance"]]
print(len(df))
df.head()

7626


Unnamed: 0,query_text,doc_text,relevance
0,How does Quora look to a moderator?,What does the Quora website look like to membe...,1
1,How do I refuse to chose between different thi...,Is it possible to pursue many different things...,1
2,Did Ben Affleck shine more than Christian Bale...,"According to you, whose Batman performance was...",1
3,Did Ben Affleck shine more than Christian Bale...,"No fanboys please, but who was the true batman...",1
4,Did Ben Affleck shine more than Christian Bale...,Who do you think portrayed Batman better: Chri...,1


In [12]:
df.isna().sum()

query_text    0
doc_text      0
relevance     0
dtype: int64

In [13]:
df.relevance.value_counts()

1    7626
Name: relevance, dtype: int64

In [14]:
df_negative = df.copy()

In [15]:
np.random.shuffle(df_negative.doc_text)
df_negative["relevance"] = 0
df_negative.head()

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  np.random.shuffle(df_negative.doc_text)


Unnamed: 0,query_text,doc_text,relevance
0,How does Quora look to a moderator?,What are some amazing pics?,0
1,How do I refuse to chose between different thi...,How do you run fast with the ball glued to you...,0
2,Did Ben Affleck shine more than Christian Bale...,Which are the best colleges for electrical eng...,0
3,Did Ben Affleck shine more than Christian Bale...,"I'm 8 days late for my period, Am I pregnant?",0
4,Did Ben Affleck shine more than Christian Bale...,Can someone survive a shot to the head?,0


In [16]:
df = pd.concat([df, df_negative])
print(len(df))
df.head()

15252


Unnamed: 0,query_text,doc_text,relevance
0,How does Quora look to a moderator?,What does the Quora website look like to membe...,1
1,How do I refuse to chose between different thi...,Is it possible to pursue many different things...,1
2,Did Ben Affleck shine more than Christian Bale...,"According to you, whose Batman performance was...",1
3,Did Ben Affleck shine more than Christian Bale...,"No fanboys please, but who was the true batman...",1
4,Did Ben Affleck shine more than Christian Bale...,Who do you think portrayed Batman better: Chri...,1


In [17]:
save_path = Path(f"data/{dataset_name}")
save_path.mkdir(exist_ok=True, parents=True)
df.to_pickle(save_path/"data.pkl")