# Dataset Information

In [None]:
from utils import (
    download_dataset,
    read_as_dataframe,
    ALLNLI_DATASET_URL,
    STS_BENCHMARK_DATASET_URL,
)

In [3]:
def download_as_dataframe(url: str, download_path: str):
    """Download a dataset and load the downloaded file as a dataframe.

    Args:
        url: Location of the dataset, passed to ``download_dataset``.
        download_path: Local path handed to ``download_dataset`` and then
            read back with ``read_as_dataframe``.

    Returns:
        Whatever ``read_as_dataframe`` returns for ``download_path``.
    """
    # Fetch first so the file is in place before it is read.
    download_dataset(url, download_path)
    frame = read_as_dataframe(download_path)
    return frame


# Training dataset (ALLNLI).
training_ds = download_as_dataframe(
    url=ALLNLI_DATASET_URL,
    download_path="datasets/allnli.tsv.gz",
)
# Evaluation (i.e. benchmark) dataset (STSBENCHMARK).
benchmark_ds = download_as_dataframe(
    url=STS_BENCHMARK_DATASET_URL,
    download_path="datasets/stsbenchmark.tsv.gz",
)

## Summary for datasets

In [4]:
def _print_counts(counts):
    """Print every ``key: count`` entry of *counts* on its own line."""
    for key, count in counts.items():
        print(f"\t\t{key}: {count}")


def _print_summary(title, frame, category_column, category_heading):
    """Print columns, size, split counts and category counts of *frame*.

    Args:
        title: Heading line printed before the summary.
        frame: Dataframe to summarise; it is indexed by ``"split"`` and by
            ``category_column``.
        category_column: Column whose value counts are listed last.
        category_heading: Text printed in front of the category names.

    Returns:
        A ``(split_counts, category_counts)`` tuple of plain dicts, so the
        caller can keep both mappings around after printing.
    """
    print(title)
    print("\tColumns:", frame.columns.to_list())
    print("\tNumber of sentence pairs:", len(frame))
    split_counts = frame["split"].value_counts().to_dict()
    _print_counts(split_counts)
    category_counts = frame[category_column].value_counts().to_dict()
    print(category_heading, list(category_counts.keys()))
    _print_counts(category_counts)
    return split_counts, category_counts


# Training dataset: categories are the values of the "label" column.
original_split, labels = _print_summary(
    "Summary for training dataset",
    training_ds,
    "label",
    "\tLabels:",
)
# Blank line separating the two summaries.
print()
# Benchmark dataset: categories are the values of the "genre" column.
original_split, labels = _print_summary(
    "Summary for benchmark dataset",
    benchmark_ds,
    "genre",
    "\tGenres:",
)


Summary for training dataset
	Columns: ['split', 'dataset', 'filename', 'sentence1', 'sentence2', 'label']
	Number of sentence pairs: 981382
		train: 942069
		dev: 19657
		test: 19656
	Labels: ['entailment', 'contradiction', 'neutral']
		entailment: 327954
		contradiction: 327058
		neutral: 326370

Summary for benchmark dataset
	Columns: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2']
	Number of sentence pairs: 8628
		train: 5749
		dev: 1500
		test: 1379
	Genres: ['main-news', 'main-captions', 'main-forums', 'main-forum']
		main-news: 4299
		main-captions: 3250
		main-forums: 629
		main-forum: 450
