In [1]:
import dataset

In [2]:
# Work with the first data fold; read_data() is called before any split is touched.
data = dataset.get_dataset().get_data_folds()[0]
data.read_data()

print(f"Number of features: {data.num_features}")

# Per-split summary: how many queries and documents each split holds.
for split_name in ("train", "validation", "test"):
    print(f"Split: {split_name}")
    # each split is exposed as an attribute of `data` under its own name
    split = getattr(data, split_name)
    print(f"\tNumber of queries {split.num_queries()}")
    print(f"\tNumber of docs {split.num_docs()}")


Number of features: 501
Split: train
	Number of queries 19943
	Number of docs 473092
Split: validation
	Number of queries 2993
	Number of docs 71041
Split: test
	Number of queries 6734
	Number of docs 163439


In [3]:
import numpy as np

In [4]:
# query ids are the integers 0 .. num_queries - 1
queries = np.arange(0, data.train.num_queries())
# inspect 10 randomly drawn query ids
for qid in np.random.choice(queries, size=10):
    # query_feat returns the features of all documents of the query,
    # one row per document (cross-checked against doc_feat below)
    qd_features = data.train.query_feat(qid)
    labels = data.train.query_labels(qid)
    num_docs = qd_features.shape[0]
    print("QID {} has {} documents".format(qid, num_docs))
    # there must be exactly one label per document
    assert num_docs == labels.shape[0]

    # doc_feat returns the features of a single (query, document) pair;
    # rebuild the query's feature matrix from it, one row at a time
    doc_ids = np.arange(num_docs)
    qd_features_2 = np.zeros_like(qd_features)
    for did in doc_ids:
        qd_features_2[did] = data.train.doc_feat(qid, did)
    # the rebuilt matrix must match what query_feat returned
    assert (qd_features == qd_features_2).all()

QID 6167 has 19 documents
QID 1922 has 64 documents
QID 17539 has 36 documents
QID 5683 has 19 documents
QID 788 has 22 documents
QID 2682 has 22 documents
QID 7489 has 8 documents
QID 2704 has 52 documents
QID 5470 has 32 documents
QID 8804 has 7 documents


In [5]:
# Example helper: extract the features / labels for a set of row indices.
# Valid indices run from 0 to feature_matrix.shape[0] (exclusive).
def get_batch(split, idx):
    """Return the feature rows and labels selected by ``idx``.

    Args:
        split: object exposing ``feature_matrix`` and ``label_vector``
            (for example ``data.train``).
        idx: index or indices applied to the first axis of
            ``feature_matrix`` and to ``label_vector``.

    Returns:
        A tuple ``(features, labels)`` holding
        ``split.feature_matrix[idx, :]`` and ``split.label_vector[idx]``.
    """
    features = split.feature_matrix[idx, :]
    labels = split.label_vector[idx]
    return features, labels

# draw 10 distinct random row indices from
# 0 to feature_matrix.shape[0] (exclusive)
num_rows = data.train.feature_matrix.shape[0]
shuffled_rows = np.random.permutation(np.arange(num_rows))
batch_idx = shuffled_rows[:10]
X, y = get_batch(data.train, batch_idx)
# features and labels must cover the same number of rows
assert X.shape[0] == y.shape[0]