In [2]:
from datasets import load_dataset
import pandas as pd
import random
random.seed(5)
from sklearn.model_selection import train_test_split
import os

# 20NG
origin: SetFit/20_newsgroups

In [2]:
# Load the 'SetFit/20_newsgroups' dataset with `datasets.load_dataset`.
dataset = load_dataset('SetFit/20_newsgroups')
# Bare expression: displays the available splits, their features and row counts.
dataset

Using custom data configuration SetFit--20_newsgroups-bba9acf94c3d61ec
Reusing dataset json (/home/v-biyangguo/.cache/huggingface/datasets/SetFit___json/SetFit--20_newsgroups-bba9acf94c3d61ec/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 7532
    })
})

In [4]:
# Filter empty contents: keep only rows whose text is a non-empty string.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string values are rejected by type instead.
orig_train_dataset = dataset['train'].filter(
    lambda x: isinstance(x['text'], str) and x['text'] != '')
print(orig_train_dataset)
orig_test_dataset = dataset['test'].filter(
    lambda x: isinstance(x['text'], str) and x['text'] != '')
print(orig_test_dataset)


orig_train_contents = orig_train_dataset['text']
orig_train_labels = orig_train_dataset['label']
test_contents = orig_test_dataset['text']
test_labels = orig_test_dataset['label']

# Hold out 30% of the filtered train split as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

Loading cached processed dataset at /home/v-biyangguo/.cache/huggingface/datasets/SetFit___json/SetFit--20_newsgroups-bba9acf94c3d61ec/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-0d420d30af8f0d62.arrow
Loading cached processed dataset at /home/v-biyangguo/.cache/huggingface/datasets/SetFit___json/SetFit--20_newsgroups-bba9acf94c3d61ec/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-533f82bed5639e9a.arrow


Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 11096
})
Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 7370
})
7767 7767
3329 3329
7370 7370


In [5]:
# full dataset: write the complete train/dev/test splits as CSV files.

data_path = './20ng_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.

for num in [50, 100, 200, 500, 1000]:
    data_path = f'./20ng_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [7]:
# check: how many distinct labels appear in the 100-row train and dev subsets
data_train = pd.read_csv('20ng_100/train.csv')
data_dev = pd.read_csv('20ng_100/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(20, 20)

# Yahoo Answers
origin: yahoo_answers_topics

In [6]:
# Load the 'yahoo_answers_topics' dataset with `datasets.load_dataset`.
dataset = load_dataset('yahoo_answers_topics')
# Bare expression: displays the available splits, their features and row counts.
dataset

Reusing dataset yahoo_answers_topics (/home/v-biyangguo/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})

In [7]:
# yahoo is too large, so only the first 10k rows of each split are considered;
# rows whose best_answer is not a non-empty string are then dropped.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string values are rejected by type instead.
orig_train_dataset = dataset['train'].select(range(10000)).filter(
    lambda x: isinstance(x['best_answer'], str) and x['best_answer'] != '')
print(orig_train_dataset)
orig_test_dataset = dataset['test'].select(range(10000)).filter(
    lambda x: isinstance(x['best_answer'], str) and x['best_answer'] != '')
print(orig_test_dataset)

# concat question title and answer as the content
orig_train_questions = orig_train_dataset['question_title']
orig_train_answers = orig_train_dataset['best_answer']
orig_train_contents = [' '.join([q, a]) for q, a in zip(orig_train_questions, orig_train_answers)]
orig_train_labels = orig_train_dataset['topic']

test_questions = orig_test_dataset['question_title']
test_answers = orig_test_dataset['best_answer']
test_contents = [' '.join([q, a]) for q, a in zip(test_questions, test_answers)]
test_labels = orig_test_dataset['topic']

# Hold out 30% of the filtered train rows as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

Loading cached processed dataset at /home/v-biyangguo/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439/cache-e3ea7e3b492b45e6.arrow
Loading cached processed dataset at /home/v-biyangguo/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439/cache-955613c61170deca.arrow


Dataset({
    features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
    num_rows: 10000
})
Dataset({
    features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
    num_rows: 9992
})
7000 7000
3000 3000
9992 9992


In [8]:
# full dataset: write the complete train/dev/test splits as CSV files.
data_path = './yahoo10k_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.
for num in [50, 100, 200, 500, 1000]:
    data_path = f'./yahoo10k_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [8]:
# check: how many distinct labels appear in the 50-row train and dev subsets
data_train = pd.read_csv('yahoo10k_50/train.csv')
data_dev = pd.read_csv('yahoo10k_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(10, 10)

# HuffPost
origin: khalidalt/HuffPost

In [12]:
# Load the 'khalidalt/HuffPost' dataset with `datasets.load_dataset`.
dataset = load_dataset('khalidalt/HuffPost')
# Bare expression: displays the available splits, their features and row counts.
dataset

No config specified, defaulting to: huff_post/default
Reusing dataset huff_post (/home/v-biyangguo/.cache/huggingface/datasets/khalidalt___huff_post/default/1.1.0/7e696fa9c5f0fda2ed35e66f7b84cdbb17d017a09c3c05b4e6e864d2a1000499)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['category', 'headline', 'authors', 'link', 'short_description', 'date', 'label'],
        num_rows: 200853
    })
})

In [15]:
# The original dataset has 41 classes, which is too many, so we only keep 5 classes, same as BBC news.
selected_categories = ['ENTERTAINMENT','SPORTS','BUSINESS','TECH','POLITICS']
categories_translation = {
    'ENTERTAINMENT':'entertainment',
    'SPORTS':'sport',
    'BUSINESS':'business',
    'TECH':'tech',
    'POLITICS':'politics',
} # to be consistent with BBC news dataset

# Keep only the selected categories and rows with a non-empty short_description.
# This step used to be commented out, although the recorded output (a single
# 54427-row Dataset) shows it had been executed; without it the cell does not
# reproduce on a fresh kernel. The result gets its own name so that `dataset`
# is not overwritten and the cell can be re-run.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string descriptions are rejected by type instead.
huffpost_dataset = (
    dataset['test']
    .filter(lambda x: x['category'] in selected_categories)
    .filter(lambda x: isinstance(x['short_description'], str) and x['short_description'] != '')
)
print(huffpost_dataset) # 54427 rows in the recorded run

# huffpost is too large, so only sample 7k train / 3k dev / 10k test rows.
# Indices are shuffled with the module-level `random` generator, so the result
# depends on the seed set at import time and on any earlier use of `random`.
alist = list(range(len(huffpost_dataset)))
random.shuffle(alist)
alist_train = alist[:7000]
alist_dev = alist[7000:10000]
alist_test = alist[10000:20000]

train_dataset = huffpost_dataset.select(alist_train)
dev_dataset = huffpost_dataset.select(alist_dev)
test_dataset = huffpost_dataset.select(alist_test)
print(train_dataset)
print(dev_dataset)
print(test_dataset)


# concat headline and short_description as the content
train_questions = train_dataset['headline']
train_answers = train_dataset['short_description']
train_contents = [' '.join([q, a]) for q, a in zip(train_questions, train_answers)]
train_labels = [categories_translation[l] for l in train_dataset['category']]

dev_questions = dev_dataset['headline']
dev_answers = dev_dataset['short_description']
dev_contents = [' '.join([q, a]) for q, a in zip(dev_questions, dev_answers)]
dev_labels = [categories_translation[l] for l in dev_dataset['category']]

test_questions = test_dataset['headline']
test_answers = test_dataset['short_description']
test_contents = [' '.join([q, a]) for q, a in zip(test_questions, test_answers)]
test_labels = [categories_translation[l] for l in test_dataset['category']]

# Sanity check: sizes of contents/labels and number of distinct labels per split.
print(len(train_contents), len(train_labels), len(set(train_labels)))
print(len(dev_contents), len(dev_labels), len(set(dev_labels)))
print(len(test_contents), len(test_labels), len(set(test_labels)))

Dataset({
    features: ['category', 'headline', 'authors', 'link', 'short_description', 'date', 'label'],
    num_rows: 54427
})
Dataset({
    features: ['category', 'headline', 'authors', 'link', 'short_description', 'date', 'label'],
    num_rows: 7000
})
Dataset({
    features: ['category', 'headline', 'authors', 'link', 'short_description', 'date', 'label'],
    num_rows: 3000
})
Dataset({
    features: ['category', 'headline', 'authors', 'link', 'short_description', 'date', 'label'],
    num_rows: 10000
})
7000 7000 5
3000 3000 5
10000 10000 5


In [16]:
# full dataset: write the complete train/dev/test splits as CSV files.
data_path = './5huffpost_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.
for num in [50, 100, 200, 500, 1000]:
    data_path = f'./5huffpost_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [19]:
# check: how many distinct labels appear in the 50-row train and dev subsets
# (make sure the train set contains all categories)
data_train = pd.read_csv('5huffpost_50/train.csv')
data_dev = pd.read_csv('5huffpost_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(5, 4)

# BBC news
origin: SetFit/bbc-news

In [21]:
# Load the 'SetFit/bbc-news' dataset with `datasets.load_dataset`.
dataset = load_dataset('SetFit/bbc-news')
# Bare expression: displays the available splits, their features and row counts.
dataset

Using custom data configuration SetFit--bbc-news-003ad310d9aedc64
Reusing dataset json (/home/v-biyangguo/.cache/huggingface/datasets/SetFit___json/SetFit--bbc-news-003ad310d9aedc64/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1225
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1000
    })
})

In [22]:
# Filter empty contents: keep only rows whose text is a non-empty string.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string values are rejected by type instead.
orig_train_dataset = dataset['train'].filter(
    lambda x: isinstance(x['text'], str) and x['text'] != '')
print(orig_train_dataset)
orig_test_dataset = dataset['test'].filter(
    lambda x: isinstance(x['text'], str) and x['text'] != '')
print(orig_test_dataset)


# The textual label (`label_text`) is used here instead of the `label` column.
orig_train_contents = orig_train_dataset['text']
orig_train_labels = orig_train_dataset['label_text']
test_contents = orig_test_dataset['text']
test_labels = orig_test_dataset['label_text']

# Hold out 30% of the filtered train split as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 1225
})


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 1000
})
857 857
368 368
1000 1000


In [23]:
# full dataset: write the complete train/dev/test splits as CSV files.
data_path = './bbc_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.
# A slice longer than the split simply yields the whole split, so the larger
# `num` values produce fewer than `num` rows when the split is smaller.
for num in [50, 100, 200, 500, 1000]:
    data_path = f'./bbc_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [24]:
# check: how many distinct labels appear in the 50-row train and dev subsets
# (make sure the train set contains all categories)
data_train = pd.read_csv('bbc_50/train.csv')
data_dev = pd.read_csv('bbc_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(5, 5)

# IMDB
origin: imdb

In [26]:
# Load the 'imdb' dataset with `datasets.load_dataset`.
dataset = load_dataset('imdb')
# Bare expression: displays the available splits, their features and row counts.
dataset

Reusing dataset imdb (/home/v-biyangguo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [27]:
# Filter empty contents: keep only rows whose text is a non-empty string.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string values are rejected by type instead.
orig_train_dataset = dataset['train'].filter(
    lambda x: isinstance(x['text'], str) and x['text'] != '')
print(orig_train_dataset)
orig_test_dataset = dataset['test'].filter(
    lambda x: isinstance(x['text'], str) and x['text'] != '')
print(orig_test_dataset)


orig_train_contents = orig_train_dataset['text']
orig_train_labels = orig_train_dataset['label']
test_contents = orig_test_dataset['text']
test_labels = orig_test_dataset['label']

# Hold out 30% of the filtered train split as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

  0%|          | 0/25 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


  0%|          | 0/25 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})
17500 17500
7500 7500
25000 25000


In [28]:
# full dataset: write the complete train/dev/test splits as CSV files.
data_path = './imdb_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.
for num in [50, 100, 200, 500, 1000]:
    data_path = f'./imdb_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [29]:
# check: how many distinct labels appear in the 50-row train and dev subsets
# (make sure the train set contains all categories)
data_train = pd.read_csv('imdb_50/train.csv')
data_dev = pd.read_csv('imdb_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(2, 2)

# SST2
origin: glue sst2

In [3]:
# Load the 'sst2' configuration of the 'glue' dataset with `datasets.load_dataset`.
dataset = load_dataset('glue','sst2')
# Bare expression: displays the available splits, their features and row counts.
dataset

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /home/v-biyangguo/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /home/v-biyangguo/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
# since the sst2 doesn't provide the labels for test set
# we use the dev set as the test here, and split dev set from the original train set
# Filter empty contents: keep only rows whose sentence is a non-empty string.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string values are rejected by type instead.
orig_train_dataset = dataset['train'].filter(
    lambda x: isinstance(x['sentence'], str) and x['sentence'] != '')
print(orig_train_dataset)
orig_test_dataset = dataset['validation'].filter(
    lambda x: isinstance(x['sentence'], str) and x['sentence'] != '')
print(orig_test_dataset)


orig_train_contents = orig_train_dataset['sentence']
orig_train_labels = orig_train_dataset['label']
test_contents = orig_test_dataset['sentence']
test_labels = orig_test_dataset['label']

# Hold out 30% of the filtered train split as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

  0%|          | 0/68 [00:00<?, ?ba/s]

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 872
})
47144 47144
20205 20205
872 872


In [6]:
# full dataset: write the complete train/dev/test splits as CSV files.
data_path = './sst2_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.
for num in [50, 100, 200, 500, 1000]:
    data_path = f'./sst2_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [7]:
# check: how many distinct labels appear in the 50-row train and dev subsets
# (make sure the train set contains all categories)
data_train = pd.read_csv('sst2_50/train.csv')
data_dev = pd.read_csv('sst2_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(2, 2)

# SST2-longer version

In [3]:
# Load the 'sst2' configuration of the 'glue' dataset with `datasets.load_dataset`.
dataset = load_dataset('glue','sst2')
# Bare expression: displays the available splits, their features and row counts.
dataset

Reusing dataset glue (/home/v-biyangguo/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
# since the sst2 doesn't provide the labels for test set
# we use the dev set as the test here, and split dev set from the original train set
# Train side: keep only the longer sentences (more than 20 space-separated tokens).
orig_train_dataset = dataset['train'].filter(lambda x: len(x['sentence'].split(' ')) > 20)
print(orig_train_dataset)
# Test side: only drop rows whose sentence is not a non-empty string.
# NaN cannot be caught by an `in [...]` membership test (NaN != NaN), so
# missing / non-string values are rejected by type instead.
orig_test_dataset = dataset['validation'].filter(
    lambda x: isinstance(x['sentence'], str) and x['sentence'] != '')
print(orig_test_dataset)


orig_train_contents = orig_train_dataset['sentence']
orig_train_labels = orig_train_dataset['label']
test_contents = orig_test_dataset['sentence']
test_labels = orig_test_dataset['label']

# Hold out 30% of the filtered train split as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

  0%|          | 0/68 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/v-biyangguo/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7d03d9a4ffc90207.arrow


Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8294
})
Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 872
})
5805 5805
2489 2489
872 872


In [7]:
# full dataset: write the complete train/dev/test splits as CSV files.
data_path = './sst2-l_full'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create gap.
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content': train_contents, 'label': train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content': dev_contents, 'label': dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset: the first `num` rows of train and of dev; the test set is always written in full.
for num in [50, 100, 200, 500, 1000]:
    data_path = f'./sst2-l_{num}'
    os.makedirs(data_path, exist_ok=True)
    pd.DataFrame({'content': train_contents[:num], 'label': train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content': dev_contents[:num], 'label': dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content': test_contents, 'label': test_labels}).to_csv(f"{data_path}/test.csv")

In [8]:
# check: how many distinct labels appear in the 50-row train and dev subsets
# (make sure the train set contains all categories)
data_train = pd.read_csv('sst2-l_50/train.csv')
data_dev = pd.read_csv('sst2-l_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(2, 2)

# Yahoo Answers Clf

In [3]:
# Load the 'yahoo_answers_topics' dataset with `datasets.load_dataset`.
dataset = load_dataset('yahoo_answers_topics')
# Bare expression: displays the available splits, their features and row counts.
dataset

Reusing dataset yahoo_answers_topics (/home/v-biyangguo/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})

In [5]:
# Keep only rows whose best_answer is a string longer than 50 characters
# (the isinstance guard avoids a TypeError from len() on a missing answer).
# yahoo is too large, so only the first 10k qualifying rows of each split are used
orig_train_dataset = dataset['train'].filter(
    lambda x: isinstance(x['best_answer'], str) and len(x['best_answer']) > 50
).select(range(10000))
print(orig_train_dataset)
orig_test_dataset = dataset['test'].filter(
    lambda x: isinstance(x['best_answer'], str) and len(x['best_answer']) > 50
).select(range(10000))
print(orig_test_dataset)

# Only the answer text is used as the content here (no question title).
orig_train_contents = orig_train_dataset['best_answer']
orig_train_labels = orig_train_dataset['topic']


test_contents = orig_test_dataset['best_answer']
test_labels = orig_test_dataset['topic']

# Hold out 30% of the selected train rows as the dev set (fixed random_state).
train_contents, dev_contents, train_labels, dev_labels = train_test_split(
    orig_train_contents, orig_train_labels, test_size=0.3, random_state=42)

# Sanity check: contents and labels must have the same length in every split.
print(len(train_contents), len(train_labels))
print(len(dev_contents), len(dev_labels))
print(len(test_contents), len(test_labels))

  0%|          | 0/1400 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
    num_rows: 10000
})


  0%|          | 0/60 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
    num_rows: 10000
})
7000 7000
3000 3000
10000 10000


In [6]:
# full dataset
data_path = './yahooA10k_full'
if not os.path.exists(data_path):
    os.mkdir(data_path)
pd.DataFrame({'content':train_contents, 'label':train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content':dev_contents, 'label':dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content':test_contents, 'label':test_labels}).to_csv(f"{data_path}/test.csv")

# sub dataset
for num in [50,100,200,500,1000]:
    data_path = './yahooA10k_%s'%num
    if not os.path.exists(data_path):
        os.mkdir(data_path)
    pd.DataFrame({'content':train_contents[:num], 'label':train_labels[:num]}).to_csv(f"{data_path}/train.csv")
    pd.DataFrame({'content':dev_contents[:num], 'label':dev_labels[:num]}).to_csv(f"{data_path}/dev.csv")
    pd.DataFrame({'content':test_contents, 'label':test_labels}).to_csv(f"{data_path}/test.csv")

In [7]:
# check: how many distinct labels appear in the 50-row train and dev subsets
data_train = pd.read_csv('yahooA10k_50/train.csv')
data_dev = pd.read_csv('yahooA10k_50/dev.csv')
data_train['label'].nunique(), data_dev['label'].nunique()

(10, 10)

# Reddit-topics
origin: jamescalam/reddit-topics

In [2]:
# Load the 'jamescalam/reddit-topics' dataset with `datasets.load_dataset`.
# NOTE(review): `load_dataset` is also imported in the first cell of the notebook.
from datasets import load_dataset
data = load_dataset('jamescalam/reddit-topics')
# Bare expression: displays the available splits, their features and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['sub', 'title', 'selftext', 'upvote_ratio', 'id', 'created_utc'],
        num_rows: 3791
    })
})

In [5]:
# Distinct values of the `sub` column in the train split.
set(data['train']['sub'])

{'LanguageTechnology', 'Python', 'investing', 'pytorch'}