# Using Mixup on TREC Dataset

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Utility imports
import spacy
import re
import string
import time

# Extras
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

from datasets import load_dataset # using the 🤗 Hugging Face datasets library


## Set Random Seed

In [2]:
SEED = 420

# Seed the NumPy and PyTorch RNGs so that a fresh "Restart & Run All"
# starts from the same random state.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # explicit for GPU runs; ignored when CUDA is unavailable

# cuDNN: request deterministic kernels and switch off the autotuner, which may
# otherwise pick different convolution algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Load dataset

In [3]:
# Load the TREC question-classification dataset via the `datasets` library
# (the recorded output below shows it being reused from the local cache).
dataset = load_dataset("trec")

Using custom data configuration default
Reusing dataset trec (C:\Users\ariha\.cache\huggingface\datasets\trec\default\1.1.0\751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9)
100%|██████████| 2/2 [00:00<00:00, 107.79it/s]


## Exploring the dataset

In [4]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label-coarse', 'label-fine', 'text'],
        num_rows: 5452
    })
    test: Dataset({
        features: ['label-coarse', 'label-fine', 'text'],
        num_rows: 500
    })
})


In [5]:
# Names of the splits contained in the DatasetDict.
dataset.keys()

dict_keys(['train', 'test'])

In [6]:
# Print the training split's reported size, its description text and its
# feature schema, in that order.
train_split = dataset["train"]
for field in ("dataset_size", "description", "features"):
    print(getattr(train_split, field))


413073
The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set. The dataset has 6 labels, 47 level-2 labels. Average length of each sentence is 10, vocabulary size of 8700.

Data are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set.

{'label-coarse': ClassLabel(num_classes=6, names=['DESC', 'ENTY', 'ABBR', 'HUM', 'NUM', 'LOC'], names_file=None, id=None), 'label-fine': ClassLabel(num_classes=47, names=['manner', 'cremat', 'animal', 'exp', 'ind', 'gr', 'title', 'def', 'date', 'reason', 'event', 'state', 'desc', 'count', 'other', 'letter', 'religion', 'food', 'country', 'color', 'termeq', 'city', 'body', 'dismed', 'mount', 'money', 'product', 'period', 'substance', 'sport', 'plant', 'techmeth', 'vol

In [7]:
# Sample data point in TREC: the first training example, showing its coarse
# label id, fine label id and the raw question text.

dataset["train"][0]

{'label-coarse': 0,
 'label-fine': 0,
 'text': 'How did serfdom develop in and then leave Russia ?'}

## Tokenize Data

In [11]:
nlp = spacy.load("en_core_web_sm")

# Both patterns are compiled once when the cell runs instead of on every call.
# Runs of non-ASCII characters (anything outside \x00-\x7F).
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# ASCII punctuation, digits, and the CR / tab / newline characters.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Lower-case ``text``, strip noise characters and split it into tokens.

    Non-ASCII runs, ASCII punctuation, digits and CR/tab/newline characters are
    each replaced by a space before the text is passed to the spaCy tokenizer.
    Those replacements leave runs of consecutive spaces behind, for which the
    tokenizer emits whitespace-only tokens; they are filtered out so they do
    not end up in the token list (and in any vocabulary built from it).

    Args:
        text: Raw input string.

    Returns:
        List of token strings without whitespace-only entries.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    cleaned = _PUNCT_DIGIT_RE.sub(" ", ascii_only.lower())
    return [token.text for token in nlp.tokenizer(cleaned) if not token.is_space]


In [None]:
# Empty Counter; presumably meant to accumulate token frequencies produced by
# tokenize() for vocabulary building. Nothing in the visible cells fills it
# yet (TODO confirm).
counts = Counter()

In [8]:
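# Disabled alternative: a pretrained BERT tokenizer (the tokenize() cell above uses spaCy instead).
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")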

100%|██████████| 231508/231508 [00:01<00:00, 129681.82B/s]


In [18]:
BATCH_SIZE = 256  # examples per mini-batch

# Restrict the returned columns to the question text and the coarse label and
# request the "torch" output format.
# NOTE: set_format changes `dataset` in place, so later cells see this view.
dataset.set_format(type="torch", columns=["text", "label-coarse"])

# shuffle=True reshuffles the training examples every epoch instead of always
# iterating them in stored order; the order is reproducible through the
# torch seed set at the top of the notebook.
train_dataloader = DataLoader(dataset["train"], batch_size=BATCH_SIZE, shuffle=True)

# Peek at a single batch to check what the loader yields.
next(iter(train_dataloader))