DATASETS

In [17]:
!pip install -U beir
!pip install nltk
!pip install pandas



In [18]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from beir import util

In [19]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
dataset_url = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
def load_dataset(dataset_name):
    out_dir = os.path.join(os.getcwd(), 'datasets')
    util.download_and_unzip(dataset_url.format(dataset_name), out_dir)

In [21]:
def filter_words(w, filter_stopwords):
    if filter_stopwords:
        return w.isalnum() and w not in stop_words
    else:
        return w

In [22]:
def stats_corpus(dataset_name, column_name):
    load_dataset(dataset_name)
    corpus_path = os.path.join(os.getcwd(), 'datasets', dataset_name, 'corpus.jsonl')
    corpus_json = pd.read_json(path_or_buf=corpus_path, lines=True)
    sentence_count = 0
    doc_count = 0
    fdist = FreqDist()
    for line in corpus_json.get(column_name):
        doc_count += 1
        sentence_count += len(sent_tokenize(line))
        tokens = [w.lower() for w in word_tokenize(line) if filter_words(w, True)]
        for word in tokens:
            fdist[word] += 1
    term_count = fdist.N()
    bin_count = fdist.B()
    word_per_doc = term_count / doc_count
    valid_sentence_count = sentence_count
    if valid_sentence_count == 0:
        print('WARNING! "{}" corpus does not have title values.'.format(dataset_name))
        valid_sentence_count = 1
    word_per_sent = term_count / valid_sentence_count
    stats_dict = {
        '# of Documents': [doc_count],
        '# of Sentences': [sentence_count],
        '# of Terms': [term_count],
        '# of Unique Terms': [bin_count],
        'Rate of Terms per Documents': [word_per_doc],
        'Rate of Terms per Sentences': [word_per_sent]}
    stats_df = pd.DataFrame(data=stats_dict)
    stats_df.index = [dataset_name]
    return stats_df

In [23]:
datasets = ['arguana', 'fiqa']
title_stats = []
text_stats = []
for dataset in datasets:
    title_stats.append(stats_corpus(dataset, 'title'))
for dataset in datasets:
    text_stats.append(stats_corpus(dataset, 'text'))





---



**ARGUANA** - [Homepage](http://argumentation.bplaced.net/arguana/data) - [Paper](https://aclanthology.org/P18-1023.pdf)

**FIQA** - [Homepage](https://sites.google.com/view/fiqa/home) - [Paper](https://www.researchgate.net/publication/324629350_WWW'18_Open_Challenge_Financial_Opinion_Mining_and_Question_Answering)



---



**STATISTICS FOR TITLES**

In [24]:
pd.concat(title_stats)

Unnamed: 0,# of Documents,# of Sentences,# of Terms,# of Unique Terms,Rate of Terms per Documents,Rate of Terms per Sentences
arguana,8674,2699,24362,501,2.808623,9.026306
fiqa,57638,0,0,0,0.0,0.0


**STATISTICS FOR DOCUMENTS**

In [25]:
pd.concat(text_stats)

Unnamed: 0,# of Documents,# of Sentences,# of Terms,# of Unique Terms,Rate of Terms per Documents,Rate of Terms per Sentences
arguana,8674,65730,822246,33063,94.794328,12.509448
fiqa,57638,419639,4187377,66790,72.649589,9.978522


end of fun.