DATASETS

In [48]:
!pip install -U beir
!pip install nltk
!pip install pandas



In [49]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from beir import util

In [50]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
dataset_url = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
def load_dataset(dataset_name):
    out_dir = os.path.join(os.getcwd(), 'datasets')
    util.download_and_unzip(dataset_url.format(dataset_name), out_dir)

In [52]:
def filter_words(w, filter_stopwords):
    if filter_stopwords:
        return w.isalnum() and w not in stop_words
    else:
        return w

In [53]:
def build_doc_stats(dataset_name, fdist, doc_count, sentence_count, query_count):
    term_count = fdist.N()
    bin_count = fdist.B()
    word_per_doc = term_count / doc_count
    valid_sentence_count = sentence_count
    if valid_sentence_count == 0:
        print('WARNING! "{}" corpus does not have title values.'.format(dataset_name))
        valid_sentence_count = 1
    word_per_sent = term_count / valid_sentence_count
    # print(fdist.most_common(10))
    stats_dict = {
        '# of docs': [doc_count],
        '# of terms': [term_count],
        '# of unique terms': [bin_count],
        '# of sentences': [sentence_count],
        '# of terms per sentences': [word_per_sent],
        '# of terms per docs': [word_per_doc]}
    if query_count != 0:
        stats_dict['# of queries'] = query_count
    stats_df = pd.DataFrame(data=stats_dict)
    stats_df.index = [dataset_name]
    return stats_df

In [54]:
def get_corpus_df(dataset_name, file_name):
    corpus_path = os.path.join(os.getcwd(), 'datasets', dataset_name, file_name)
    return pd.read_json(path_or_buf=corpus_path, lines=True)

In [55]:
def stats_corpus(dataset_name, column_name, add_queries):
    load_dataset(dataset_name)
    query_count = 0
    if add_queries:
        queries_df = get_corpus_df(dataset_name, 'queries.jsonl')
        query_count = len(queries_df.index)
    corpus_df = get_corpus_df(dataset_name, 'corpus.jsonl')
    doc_count = len(corpus_df.index)
    sentence_count = 0
    fdist = FreqDist()
    for line in corpus_df.get(column_name):
        sentence_count += len(sent_tokenize(line))
        tokens = [w.lower() for w in word_tokenize(line) if filter_words(w, True)]
        for word in tokens:
            fdist[word] += 1
    return build_doc_stats(dataset_name, fdist, doc_count, sentence_count, query_count)

In [56]:
datasets = ['scifact', 'scidocs', 'nfcorpus']
title_stats = []
text_stats = []
for dataset in datasets:
    title_stats.append(stats_corpus(dataset, 'title', False))
for dataset in datasets:
    text_stats.append(stats_corpus(dataset, 'text', True))



---



**SCIFACT** - [Homepage](https://allenai.org/data/scifact) - [Paper](https://arxiv.org/abs/2004.14974)

**SCIDOCS** - [Homepage](https://allenai.org/data/scidocs) - [Paper](https://arxiv.org/abs/2004.07180)

**NFCORPUS** - [Homepage](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) - [Paper](https://www.cl.uni-heidelberg.de/~riezler/publications/papers/ECIR2016.pdf)



---



**STATISTICS FOR TITLES**

In [57]:
pd.concat(title_stats)

Unnamed: 0,# of docs,# of terms,# of unique terms,# of sentences,# of terms per sentences,# of terms per docs
scifact,5183,46307,9404,5291,8.752032,8.934401
scidocs,25657,181503,18755,26058,6.965347,7.07421
nfcorpus,3633,31914,6184,3763,8.480999,8.784476


**STATISTICS FOR DOCUMENTS**

In [58]:
pd.concat(text_stats)

Unnamed: 0,# of docs,# of terms,# of unique terms,# of sentences,# of terms per sentences,# of terms per docs,# of queries
scifact,5183,658662,32591,47121,13.978099,127.081227,1109
scidocs,25657,2686311,71106,195663,13.729274,104.7009,1000
nfcorpus,3633,504134,24388,35130,14.350527,138.765208,3237


end of fun.