# 1. Loading

In [None]:
# Import only the names this notebook uses instead of `from datasets import *`,
# so it is obvious where `load_dataset` and `Dataset` come from.
from datasets import Dataset, load_dataset

# Load every available split of the dataset from the Hugging Face Hub.
datasets = load_dataset("madao33/new-title-chinese")

In [3]:
datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 5850
    })
    validation: Dataset({
        features: ['title', 'content'],
        num_rows: 1679
    })
})

## 1.1 different ways of slicing

In [4]:
dataset = load_dataset("madao33/new-title-chinese", split="train[10:100]")
dataset

Dataset({
    features: ['title', 'content'],
    num_rows: 90
})

In [5]:
dataset = load_dataset("madao33/new-title-chinese", split="train[:50%]")
dataset

Dataset({
    features: ['title', 'content'],
    num_rows: 2925
})

In [6]:
dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]"])
dataset

[Dataset({
     features: ['title', 'content'],
     num_rows: 2925
 }),
 Dataset({
     features: ['title', 'content'],
     num_rows: 2925
 })]

In [15]:
# Randomly hold out 10% of the train split as a test split; the result is only displayed, not stored.
dataset = datasets["train"]
dataset.train_test_split(test_size=0.1) # stratify_by_column="label" keeps the label distribution balanced across splits (needs a label column, which this dataset does not have)

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 5265
    })
    test: Dataset({
        features: ['title', 'content'],
        num_rows: 585
    })
})

In [14]:
datasets['train']

Dataset({
    features: ['title', 'content'],
    num_rows: 5850
})

## 1.2 Inspecting

In [16]:
datasets["train"][:5]

{'title': ['望海楼美国打“台湾牌”是危险的赌博',
  '大力推进高校治理能力建设',
  '坚持事业为上选贤任能',
  '“大朋友”的话儿记心头',
  '用好可持续发展这把“金钥匙”'],
 'content': ['近期，美国国会众院通过法案，重申美国对台湾的承诺。对此，中国外交部发言人表示，有关法案严重违反一个中国原则和中美三个联合公报规定，粗暴干涉中国内政，中方对此坚决反对并已向美方提出严正交涉。\n事实上，中国高度关注美国国内打“台湾牌”、挑战一中原则的危险动向。近年来，作为“亲台”势力大本营的美国国会动作不断，先后通过“与台湾交往法”“亚洲再保证倡议法”等一系列“挺台”法案，“2019财年国防授权法案”也多处触及台湾问题。今年3月，美参院亲台议员再抛“台湾保证法”草案。众院议员继而在4月提出众院版的草案并在近期通过。上述法案的核心目标是强化美台关系，并将台作为美“印太战略”的重要伙伴。同时，“亲台”议员还有意制造事端。今年2月，5名共和党参议员致信众议院议长，促其邀请台湾地区领导人在国会上发表讲话。这一动议显然有悖于美国与台湾的非官方关系，其用心是实质性改变美台关系定位。\n上述动向出现并非偶然。在中美建交40周年之际，两国关系摩擦加剧，所谓“中国威胁论”再次沉渣泛起。美国对华认知出现严重偏差，对华政策中负面因素上升，保守人士甚至成立了“当前中国威胁委员会”。在此背景下，美国将台海关系作为战略抓手，通过打“台湾牌”在双边关系中增加筹码。特朗普就任后，国会对总统外交政策的约束力和塑造力加强。其实国会推动通过涉台法案对行政部门不具约束力，美政府在2018年并未提升美台官员互访级别，美军舰也没有“访问”台湾港口，保持着某种克制。但从美总统签署国会通过的法案可以看出，国会对外交产生了影响。立法也为政府对台政策提供更大空间。\n然而，美国需要认真衡量打“台湾牌”成本。首先是美国应对危机的代价。美方官员和学者已明确发出警告，美国卷入台湾问题得不偿失。美国学者曾在媒体发文指出，如果台海爆发危机，美国可能需要“援助”台湾，进而导致新的冷战乃至与中国大陆的冲突。但如果美国让台湾自己面对，则有损美国的信誉，影响美盟友对同盟关系的支持。其次是对中美关系的危害。历史证明，中美合则两利、斗则两伤。中美关系是当今世界最重要的双边关系之一，保持中美关系的稳定发展，不仅

In [17]:
datasets["train"]['title'][:2]


['望海楼美国打“台湾牌”是危险的赌博', '大力推进高校治理能力建设']

In [11]:
datasets["train"].column_names

['title', 'content']

In [12]:
datasets["train"].features

{'title': Value(dtype='string', id=None),
 'content': Value(dtype='string', id=None)}

## 1.3 filtering

In [19]:
filtered_dataset = datasets["train"].filter(lambda example: '中国' in example['title'])

Filter:   0%|          | 0/5850 [00:00<?, ? examples/s]

In [20]:
filtered_dataset['title'][:5]

['聚焦两会，世界探寻中国成功秘诀',
 '望海楼中国经济的信心来自哪里',
 '“中国奇迹”助力世界减贫跑出加速度',
 '和音瞩目历史交汇点上的中国',
 '中国风采感染世界']

## 1.4 mapping

In [None]:
def add_prefix(example):
    """Prepend 'PREFIX: ' to the example's title.

    The example is modified in place and the same object is returned.
    """
    prefixed_title = 'PREFIX: ' + example['title']
    example['title'] = prefixed_title
    return example

prefix_dataset = datasets["train"].map(add_prefix)
prefix_dataset['title'][:5]

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def process_function(example):
    """Tokenize the content as model input and the title as the target.

    Both texts are truncated to at most 512 tokens. The title's token ids are
    stored under "labels" next to the tokenizer outputs for the content.
    """
    model_inputs = tokenizer(example['content'], max_length=512, truncation=True)
    title_encoding = tokenizer(example['title'], max_length=512, truncation=True)
    model_inputs["labels"] = title_encoding["input_ids"]
    return model_inputs


In [None]:
process_dataset = datasets.map(process_function, batched=True)

In [24]:
process_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5850
    })
    validation: Dataset({
        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1679
    })
})

In [None]:
## multi-processing
process_dataset = datasets.map(process_function, num_proc=4)

In [None]:
process_dataset = datasets.map(process_function, batched=True, remove_columns=datasets['train'].column_names)

In [29]:
process_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5850
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1679
    })
})

## 1.5 Save and load

```
from datasets import load_from_disk

# Write the processed dataset to PATH, then read the saved copy back.
process_dataset.save_to_disk(PATH)
reloaded_dataset = load_from_disk(PATH)
```

# 2. Customized Dataset

## 2.0 from memory

In [36]:
data = [
    {"CN": "我爱你", "EN": "I love you"},
    {"CN": "我恨你", "EN": "I hate you"},
]
dataset = Dataset.from_list(data)
dataset


Dataset({
    features: ['CN', 'EN'],
    num_rows: 2
})

## 2.1 load from files

In [None]:
dataset = load_dataset('csv', data_files='data/ChnSentiCorp_htl_all.csv')
dataset

In [None]:
# Dataset.from_csv builds a single Dataset directly from the CSV file.
# The result must be bound to `dataset` (not the misspelled `datatset`),
# otherwise the line below displays the object left over from the previous cell.
dataset = Dataset.from_csv('data/ChnSentiCorp_htl_all.csv')
dataset

## 2.2 load from directory

In [None]:
dataset = load_dataset('csv', data_dir = 'data/csv_chunks/')
dataset

## 2.3 load with scripts

```
import json
import datasets
from datasets import DownloadManager, DatasetInfo


class CMRC2018TRIAL(datasets.GeneratorBasedBuilder):
    # Skeleton of a dataset loading script: the three methods below are stubs
    # that only carry docstrings describing what a real implementation must do.

    def _info(self) -> DatasetInfo:
        """
            Define the dataset's metadata; the data fields must be declared here.
                - description
                - features
        :return:
        """

    def _split_generators(self, dl_manager: DownloadManager):
        """
            Return datasets.SplitGenerator objects.
            Each one involves two arguments: name and gen_kwargs
            name: which split of the dataset the generator produces
            gen_kwargs: the path of the file to read; must match the parameters of _generate_examples
        :param dl_manager:
        :return: [ datasets.SplitGenerator ]
        """

    def _generate_examples(self, filepath):
        """
            Produce the individual examples, using yield.
            Each example also needs a key; an id that counts up from 0 is enough.
        :param filepath:
        :return:
        """
```


In [None]:
from datasets import *
# be careful with the dataset path here; I hard coded it in the script so that it can be run in the notebook (respecting the relative path)
dataset = load_dataset('scripts/04_load_script.py', split = 'train')

# 3. Dataset With DataCollator

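`DataCollatorWithPadding` pads the samples of each batch dynamically, up to the length of the longest sample in that batch, so that all samples in a batch have the same shape.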

In [6]:
from transformers import DataCollatorWithPadding, AutoTokenizer
from datasets import load_dataset
dataset = load_dataset('csv', data_files='data/ChnSentiCorp_htl_all.csv', split='train')
# remove the na rows
dataset = dataset.filter(lambda example: example['review'] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def process_function(example):
    """Tokenize the reviews (truncated to 128 tokens) and attach their labels.

    No padding is applied here, so the encoded samples keep different lengths.
    """
    # Short samples are left unpadded at this stage; padding is applied dynamically later.
    encoded = tokenizer(example['review'], max_length=128, truncation=True)
    encoded["labels"] = example['label']
    return encoded

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [8]:
tokenized_dataset = dataset.map(process_function, batched=True, remove_columns=dataset.column_names)

In [10]:
samples = tokenized_dataset[:4]
print(len(samples['input_ids']))

4


In [12]:
[len(sen) for sen in samples['input_ids']]

[52, 29, 44, 128]

In [14]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_dataset, collate_fn=collator, shuffle=True, batch_size=4)

In [22]:
# Print the input_ids shape of the first 11 batches to show that the padded
# length varies from batch to batch.
for batch_idx, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if batch_idx >= 10:
        break

torch.Size([4, 128])
torch.Size([4, 93])
torch.Size([4, 128])
torch.Size([4, 109])
torch.Size([4, 128])
torch.Size([4, 63])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 86])
torch.Size([4, 128])


# 4. fine tuning with `Datasets`

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

## 4.1 load dataset

In [26]:
dataset = load_dataset('csv', data_files='data/ChnSentiCorp_htl_all.csv', split='train')
dataset = dataset.filter(lambda example: example['review'] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## 4.2 train test split

In [28]:
# split the dataset
datasets = dataset.train_test_split(test_size=0.1)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## 4.3 tokenize

In [None]:
import torch
checkpoint = "hfl/rbt3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def process_function(example):
    """Tokenize the reviews (truncated to 128 tokens) and copy the labels over.

    The tokenizer output gains a "labels" entry holding the example's 'label' value.
    """
    encoded = tokenizer(example['review'], max_length=128, truncation=True)
    encoded["labels"] = example['label']
    return encoded

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets['train'].column_names)

In [30]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## 4.4 collator

In [39]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(tokenized_datasets['train'], collate_fn=collator, shuffle=True, batch_size=32)
eval_dataloader = DataLoader(tokenized_datasets['test'], collate_fn=collator, shuffle=False, batch_size=64)

## 4.5 Model and optimizer

In [40]:
from torch.optim import Adam
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
if torch.cuda.is_available():
    model.cuda()
optimizer = Adam(model.parameters(), lr=2e-5)    

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4.6 Training

In [43]:
def evaluate():
    """Return the model's accuracy on the evaluation set as a Python float.

    Reads the notebook-level ``model`` and ``eval_dataloader``. The model is
    switched to eval mode and is left in that mode when the function returns.

    Returns:
        Fraction of evaluation examples whose arg-max prediction equals the label.
    """
    model.eval()
    correct = 0
    # inference_mode already disables gradient tracking, so no nested
    # torch.no_grad() is needed around the forward pass.
    with torch.inference_mode():
        for batch in eval_dataloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            logits = model(**batch).logits
            preds = torch.argmax(logits, dim=-1)
            # .item() keeps the running count a plain int rather than a tensor.
            correct += (preds == batch['labels']).sum().item()
    # Divide by the size of the dataset that was actually iterated.
    return correct / len(eval_dataloader.dataset)


def train(num_epoch=1, log_step=100):
    """Fine-tune the notebook-level ``model`` and report accuracy after each epoch.

    Uses the notebook-level ``train_dataloader`` and ``optimizer``.

    Args:
        num_epoch: Number of full passes over ``train_dataloader``.
        log_step: Print the training loss every ``log_step`` optimizer steps.
    """
    global_step = 0
    for ep in range(num_epoch):
        # evaluate() switches the model to eval mode, so training mode has to
        # be re-enabled at the start of every epoch.
        model.train()
        for batch in train_dataloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            global_step += 1
            if global_step % log_step == 0:
                print(f"global_step: {global_step:6d}, loss: {loss.item():.4f}")
        # Evaluate once per epoch, after the batch loop. Calling evaluate()
        # inside the loop would run the whole evaluation set after every step
        # and leave the model in eval mode for the remaining training steps.
        acc = evaluate()
        print(f"epoch: {ep}, acc: {acc:.4f}")
            


In [44]:
train()

global_step:    100, loss: 0.2824
global_step:    200, loss: 0.4679
epoch: 0, acc: 0.9048
