In [1]:
!pip install transformers
!git clone https://github.com/huggingface/transformers.git

fatal: destination path 'transformers' already exists and is not an empty directory.


# BertForSequenceClassification情感分析

In [0]:
import torch

In [0]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class SentimentDataset(Dataset):
	"""Map-style dataset backed by a header-less, tab-separated file.

	Each line of the file is read as two columns that are named ``text`` and
	``label``; one sample is returned as a dict with those two keys.
	"""

	COLUMNS = ("text", "label")

	def __init__(self, path_to_file):
		"""Read the whole file at ``path_to_file`` into a DataFrame."""
		self.dataset = pd.read_csv(path_to_file, names=list(self.COLUMNS), sep="\t")

	def __len__(self):
		"""Number of rows read from the file."""
		return len(self.dataset)

	def __getitem__(self, idx):
		"""Return ``{"text": ..., "label": ...}`` for the row labelled ``idx``."""
		return {column: self.dataset.loc[idx, column] for column in self.COLUMNS}

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertModel
from torch.utils.data import DataLoader
from transformers import AdamW

In [0]:
# Hyperparameters
# -- model --
num_labels = class_num = 2
hidden_dropout_prob = 0.5
# -- optimisation --
learning_rate, weight_decay = 1e-5, 1e-2
epochs, batch_size = 5, 16
# -- tokenisation --
max_len = 100

# File locations: everything lives below `base_path`.
base_path = "/content/drive/My Drive/Colab Notebooks/"
vocab_file = "".join([base_path, "PyTorch_Pretrained_Model/chinese_wwm_pytorch/vocab.txt"])

In [8]:
# Show the GPU model when CUDA is available. Querying device 0 unconditionally
# raises on a CPU-only runtime, which the notebook otherwise supports through
# its CPU fallback for `device`.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device available (running on CPU)"

'Tesla P100-PCIE-16GB'

In [0]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
# Models and tensors are moved onto it with `.to(device)`.
if torch.cuda.is_available():
	device = torch.device("cuda:0")
else:
	device = torch.device("cpu")

In [0]:
# Load the train and validation splits and wrap each in a DataLoader.
# `base_path` already ends with "/", so the joined path contains a doubled slash.
data_path = base_path + "/data/sentiment/"
train_file = data_path + "sentiment.train.data"
valid_file = data_path + "sentiment.valid.data"

sentiment_train_set = SentimentDataset(train_file)
sentiment_valid_set = SentimentDataset(valid_file)

# Both loaders share batch size and worker count; only the training one shuffles.
loader_options = dict(batch_size=batch_size, num_workers=2)
sentiment_train_loader = DataLoader(sentiment_train_set, shuffle=True, **loader_options)
sentiment_valid_loader = DataLoader(sentiment_valid_set, shuffle=False, **loader_options)

In [0]:
# Load the pretrained sequence-classification model. The size of the
# classification output is taken from the `num_labels` hyperparameter instead of
# relying on the library default.
# NOTE(review): the tokenizer in this notebook is built from the
# "chinese_wwm_pytorch" vocab.txt while the weights loaded here are
# "bert-base-uncased"; the two vocabularies presumably differ — confirm which
# checkpoint is intended.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

In [0]:
# Define the optimizer, the loss function and the tokenizer.
# Parameters whose name contains one of the `no_decay` fragments get a weight
# decay of 0.0; every other parameter uses `weight_decay`.
no_decay = ['bias', 'LayerNorm.weight']
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
	is_exempt = any(fragment in param_name for fragment in no_decay)
	(no_decay_params if is_exempt else decay_params).append(param)

optimizer_grouped_parameters = [
	{'params': decay_params, 'weight_decay': weight_decay},
	{'params': no_decay_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
criterion = nn.CrossEntropyLoss()
tokenizer = BertTokenizer(vocab_file)

In [0]:
def convert_text_to_ids(tokenizer, text, max_len=100):
	"""Encode one sentence or a batch of sentences with ``tokenizer.encode_plus``.

	Args:
		tokenizer: object exposing ``encode_plus(text, max_length=...,
			add_special_tokens=...)`` whose result has the keys ``"input_ids"``
			and ``"token_type_ids"``.
		text: a single ``str``, or a ``list``/``tuple`` of strings.
		max_len: value forwarded to ``encode_plus`` as ``max_length``.

	Returns:
		``(input_ids, token_type_ids)``. For a single string both are the
		values returned by the tokenizer; for a list/tuple both are lists
		holding one such value per sentence (not padded).

	Raises:
		TypeError: if ``text`` is neither a string nor a list/tuple. The
			previous version printed a message and then failed with an
			``UnboundLocalError`` on the return statement.
	"""
	def _encode(sentence):
		# One tokenizer call per sentence; special tokens are always added.
		encoded = tokenizer.encode_plus(sentence, max_length=max_len, add_special_tokens=True)
		return encoded["input_ids"], encoded["token_type_ids"]

	if isinstance(text, str):
		return _encode(text)
	if isinstance(text, (list, tuple)):
		input_ids = []
		token_type_ids = []
		for sentence in text:
			ids, type_ids = _encode(sentence)
			input_ids.append(ids)
			token_type_ids.append(type_ids)
		return input_ids, token_type_ids
	raise TypeError(
		"Unexpected input: expected str or list/tuple of str, got " + type(text).__name__
	)


def seq_padding(tokenizer, X):
	"""Right-pad a batch of id lists to a common length and return a LongTensor.

	The padding value is the id the tokenizer assigns to ``"[PAD]"``. A batch
	with at most one element is converted to a tensor as-is.
	"""
	pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
	if len(X) > 1:
		width = max(map(len, X))
		padded = []
		for seq in X:
			missing = width - len(seq)
			# Rows that already have the full width are kept untouched.
			padded.append(seq + [pad_id] * missing if missing > 0 else seq)
		X = padded
	return torch.tensor(X, dtype=torch.long)

In [0]:
def train(model, iterator, optimizer, criterion, device, clip=None):
	"""Run one training epoch and return ``(mean_batch_loss, accuracy)``.

	Relies on the notebook-level ``tokenizer``, ``convert_text_to_ids`` and
	``seq_padding``.

	Args:
		model: model called with ``input_ids``, ``token_type_ids``,
			``attention_mask`` and ``labels``; its output is indexed as
			``output[0]`` (loss) and ``output[1]`` (logits).
		iterator: DataLoader yielding dict batches with the keys ``"text"``
			and ``"label"``.
		optimizer: optimizer; zeroed and stepped once per batch.
		criterion: not used — the loss returned by the model is optimised.
			Kept so existing calls keep working.
		device: torch device the model and the batch tensors are moved to.
		clip: optional maximum gradient norm. ``None`` (default) disables
			clipping. Added so that the notebook's six-argument calls
			``train(..., device, 1)`` work with this definition.

	Returns:
		Tuple of (sum of batch losses / number of batches,
		number of correct predictions / ``len(iterator.dataset)``).
	"""
	model.to(device)
	model.train()
	# Same id that seq_padding uses to fill short sequences.
	pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
	epoch_loss = 0
	epoch_acc = 0
	seen = 0
	for i, batch in enumerate(iterator):
		label = batch["label"]
		text = batch["text"]
		input_ids, token_type_ids = convert_text_to_ids(tokenizer, text)
		input_ids = seq_padding(tokenizer, input_ids)
		token_type_ids = seq_padding(tokenizer, token_type_ids)
		# Mask out padded positions; without an explicit mask the model treats
		# every position, padding included, as a real token.
		attention_mask = (input_ids != pad_id).long()
		# Labels get shape (batch_size, 1)
		label = label.unsqueeze(1)
		# Reset gradients
		optimizer.zero_grad()
		# Move the batch to the target device
		input_ids = input_ids.to(device)
		token_type_ids = token_type_ids.to(device)
		attention_mask = attention_mask.to(device)
		label = label.to(device)
		# Output layout: (loss), logits, (hidden_states), (attentions);
		# hidden_states and attentions are not necessarily present.
		output = model(
			input_ids=input_ids,
			token_type_ids=token_type_ids,
			attention_mask=attention_mask,
			labels=label,
		)
		loss = output[0]
		y_pred_prob = output[1]
		y_pred_label = y_pred_prob.argmax(dim=1)
		# Number of correct predictions in this batch
		acc = ((y_pred_label == label.view(-1)).sum()).item()
		# Backward pass; clipping has to happen after the gradients exist
		loss.backward()
		if clip is not None:
			torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
		optimizer.step()
		# Accumulate epoch statistics
		epoch_loss += loss.item()
		epoch_acc += acc
		seen += len(label)
		if i % 100 == 0:
			# Divide by the samples actually seen so a short batch cannot skew
			# the running accuracy.
			print("current loss:", epoch_loss / (i+1), "\t", "current acc:", epoch_acc / seen)
	# len(iterator.dataset) equals the old len(iterator.dataset.dataset) for
	# SentimentDataset and also works for datasets without a `.dataset` attribute.
	return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset)

def evaluate(model, iterator, criterion, device):
	"""Evaluate the model on ``iterator`` and return ``(mean_batch_loss, accuracy)``.

	Relies on the notebook-level ``tokenizer``, ``convert_text_to_ids`` and
	``seq_padding``. Runs under ``torch.no_grad()`` with the model in eval mode.

	Args:
		model: model called with ``input_ids``, ``token_type_ids``,
			``attention_mask`` and ``labels``; its output is indexed as
			``output[0]`` (loss) and ``output[1]`` (logits).
		iterator: DataLoader yielding dict batches with the keys ``"text"``
			and ``"label"``.
		criterion: not used — the loss returned by the model is reported.
			Kept so existing calls keep working.
		device: torch device the model and the batch tensors are moved to.

	Returns:
		Tuple of (sum of batch losses / number of batches,
		number of correct predictions / ``len(iterator.dataset)``).
	"""
	model.to(device)
	model.eval()
	# Same id that seq_padding uses to fill short sequences.
	pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
	epoch_loss = 0
	epoch_acc = 0
	with torch.no_grad():
		for batch in iterator:
			label = batch["label"]
			text = batch["text"]
			input_ids, token_type_ids = convert_text_to_ids(tokenizer, text)
			input_ids = seq_padding(tokenizer, input_ids)
			token_type_ids = seq_padding(tokenizer, token_type_ids)
			# Mask out padded positions; without an explicit mask the model
			# treats every position, padding included, as a real token.
			attention_mask = (input_ids != pad_id).long()
			# Labels get shape (batch_size, 1)
			label = label.unsqueeze(1)
			# Move the batch to the target device
			input_ids = input_ids.to(device)
			token_type_ids = token_type_ids.to(device)
			attention_mask = attention_mask.to(device)
			label = label.to(device)
			output = model(
				input_ids=input_ids,
				token_type_ids=token_type_ids,
				attention_mask=attention_mask,
				labels=label,
			)
			loss = output[0]
			y_pred_label = output[1].argmax(dim=1)
			acc = ((y_pred_label == label.view(-1)).sum()).item()
			epoch_loss += loss.item()
			epoch_acc += acc
	# len(iterator.dataset) equals the old len(iterator.dataset.dataset) for
	# SentimentDataset and also works for datasets without a `.dataset` attribute.
	return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset)

In [15]:
# One pass per epoch: train on the training loader, then score the validation loader.
for epoch_idx in range(epochs):
	train_loss, train_acc = train(
		model, sentiment_train_loader, optimizer, criterion, device
	)
	print("train loss: ", train_loss, "\t", "train acc:", train_acc)

	valid_loss, valid_acc = evaluate(
		model, sentiment_valid_loader, criterion, device
	)
	print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc, end="\n\n")

current loss: 0.7206894159317017 	 current acc: 0.375
current loss: 0.6639068952881464 	 current acc: 0.5928217821782178
current loss: 0.659623192762261 	 current acc: 0.605410447761194
current loss: 0.6472823614891977 	 current acc: 0.6243770764119602
current loss: 0.6229137607717752 	 current acc: 0.6510286783042394
current loss: 0.6070191398887577 	 current acc: 0.6681636726546906
current loss: 0.5884566437236084 	 current acc: 0.6857321131447587
current loss: 0.5727972561086976 	 current acc: 0.7018544935805991
current loss: 0.5588198389937071 	 current acc: 0.7135611735330837
current loss: 0.5455629944007484 	 current acc: 0.7237791342952276
current loss: 0.5343364701493756 	 current acc: 0.7319555444555444
train loss:  0.5273547310449346 	 train acc: 0.736623314923689
valid loss:  0.4159237858133786 	 valid acc: 0.8180956892468024

current loss: 0.2922477126121521 	 current acc: 0.875
current loss: 0.3790179410635835 	 current acc: 0.8366336633663366
current loss: 0.3748158735422

In [20]:
import os

# Persist the fine-tuned model and the tokenizer into separate directories.
saved_model = "./saved_model"
saved_tokenizer = "./saved_tokenizer"
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directories exist from a previous run.
os.makedirs(saved_model, exist_ok=True)
os.makedirs(saved_tokenizer, exist_ok=True)
model.save_pretrained(saved_model)
tokenizer.save_pretrained(saved_tokenizer)

('./saved_tokenizer/vocab.txt',
 './saved_tokenizer/special_tokens_map.json',
 './saved_tokenizer/added_tokens.json')

In [17]:
# Test again: same train/validate cycle as before.
# The trailing 1 is passed positionally as a sixth argument; it lines up with the
# `clip` parameter of the six-parameter `train` definition in this notebook.
for epoch_idx in range(epochs):
	train_loss, train_acc = train(
		model, sentiment_train_loader, optimizer, criterion, device, 1
	)
	print("train loss: ", train_loss, "\t", "train acc:", train_acc)

	valid_loss, valid_acc = evaluate(
		model, sentiment_valid_loader, criterion, device
	)
	print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc)

current loss: 0.5664684772491455 	 current acc: 0.875
current loss: 0.264671977609396 	 current acc: 0.8935643564356436
current loss: 0.25866135283933944 	 current acc: 0.898320895522388
current loss: 0.2383091023371267 	 current acc: 0.9061461794019934
current loss: 0.237193513689493 	 current acc: 0.905860349127182
current loss: 0.23496822162927267 	 current acc: 0.907185628742515
current loss: 0.23098835738967738 	 current acc: 0.908797836938436
current loss: 0.22928264006951055 	 current acc: 0.9099500713266762
current loss: 0.2334908884014009 	 current acc: 0.9083177278401997
current loss: 0.2309247796340007 	 current acc: 0.9090593784683685
current loss: 0.2257420639698203 	 current acc: 0.9111513486513486
train loss:  0.22612134802375425 	 train acc: 0.9113961636676762
valid loss:  0.24776305052254236 	 valid acc: 0.906679298910469
current loss: 0.1803281456232071 	 current acc: 0.9375
current loss: 0.14353447514447834 	 current acc: 0.9467821782178217
current loss: 0.1450958402

KeyboardInterrupt: ignored

In [0]:
from transformers import BertConfig

In [0]:
# Build ONE config carrying both the label count and the custom dropout.
# The previous form passed a second config through a `config=` keyword of
# BertConfig.from_pretrained; that keyword is not a config attribute, so the
# dropout setting never reached the config that was actually used.
config = BertConfig.from_pretrained(
	"bert-base-uncased",
	num_labels=num_labels,
	hidden_dropout_prob=hidden_dropout_prob,
)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

# The optimizer created earlier still holds the parameters of the previous
# `model` object, so stepping it would never update this new model. Rebuild the
# parameter groups and the optimizer for the freshly loaded model.
optimizer_grouped_parameters = [
	{
		'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
		'weight_decay': weight_decay,
	},
	{
		'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
		'weight_decay': 0.0,
	},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [0]:
# Train/validate cycle for the re-created model; both metric lines are printed
# only after the validation pass of each epoch has finished.
# The trailing 1 is passed positionally as a sixth argument; it lines up with the
# `clip` parameter of the six-parameter `train` definition in this notebook.
for epoch_idx in range(epochs):
	train_loss, train_acc = train(
		model, sentiment_train_loader, optimizer, criterion, device, 1
	)
	valid_loss, valid_acc = evaluate(
		model, sentiment_valid_loader, criterion, device
	)

	print("\n")
	print("train loss: ", train_loss, "\t", "train acc:", train_acc)
	print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc, end="\n\n")

In [0]:
# Display the `is_decoder` attribute of the `config` object built above.
config.is_decoder

False

In [0]:
# Print only the first parameter tensor of the model (nothing if it has none).
for first_param in list(model.parameters())[:1]:
  print(first_param)

Parameter containing:
tensor([[-0.0102, -0.0616, -0.0264,  ..., -0.0202, -0.0378, -0.0091],
        [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
        [-0.0197, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
        ...,
        [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
        [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
        [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
       device='cuda:0', requires_grad=True)


# 测试专用

In [0]:
def train(model, iterator, optimizer, criterion, device, clip):
	"""Debug variant of ``train``: inspect the first batch only, then stop.

	NOTE(review): this definition reuses the name ``train``, so once this cell
	is executed it replaces the earlier training function in the kernel.

	For the first batch it prints the labels and their size, runs the model
	WITHOUT ``labels`` and prints ``output[1]`` and its size, then leaves the
	loop through an unconditional ``break``. Everything after the ``break``
	inside the loop (loss, accuracy, backward pass, optimizer step, progress
	print) is unreachable and kept only as a parked copy of the training code.

	Args:
		model: model called with ``input_ids`` and ``token_type_ids``.
		iterator: DataLoader yielding dict batches with ``"text"`` and ``"label"``.
		optimizer: only ``zero_grad()`` is reached before the ``break``.
		criterion: unused.
		device: torch device the model and the batch tensors are moved to.
		clip: unused — the gradient-clipping call is commented out.

	Returns:
		``(epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset))``;
		both accumulators are still 0 because they are never updated.
	"""
	model.to(device)
	model.train()
	epoch_loss = 0
	epoch_acc = 0
	for i, batch in enumerate(iterator):
		label = batch["label"] # shape is batch_size
		text = batch["text"]
		print(label)
		print(label.size())
		input_ids, token_type_ids = convert_text_to_ids(tokenizer, text)
		input_ids = seq_padding(tokenizer, input_ids)
		token_type_ids = seq_padding(tokenizer, token_type_ids)
		# Labels get shape (batch_size, 1)
		label = label.unsqueeze(1)
		# LongTensor is required
		input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
		# Reset gradients
		optimizer.zero_grad()
		# Move to the GPU / target device
		input_ids, token_type_ids, label = input_ids.to(device), token_type_ids.to(device), label.to(device)
		# (loss), logits, (hidden_states), (attentions)
		# (hidden_states), (attentions) are not necessarily present
		# output = model(input_ids=input_ids, token_type_ids=token_type_ids, labels=label)
		output = model(input_ids=input_ids, token_type_ids=token_type_ids)
		y_pred_prob = output[1] # with BertModel the shape is (batch_size, 768)
		print(y_pred_prob)
		print(y_pred_prob.size())
		print(label) # shape is (batch_size, 1)
		print(label.size())
		# Stop after inspecting the first batch; the rest of the loop body is unreachable.
		break
		y_pred_label = y_pred_prob.argmax(dim=1)
		# Compute the loss
		#loss = criterion(y_pred_prob.view(-1, 2), label.view(-1))
		loss = output[0]
		# Compute the number of correct predictions
		acc = ((y_pred_label == label.view(-1)).sum()).item()
		# Gradient clipping (disabled)
		#torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
		# Backward pass
		loss.backward()
		optimizer.step()
		# Accumulate the epoch's loss and acc
		epoch_loss += loss.item()
		epoch_acc += acc
		if i % 100 == 0:
			print("current loss:", epoch_loss / (i+1), "\t", "current acc:", epoch_acc / ((i+1)*len(label)))
	# return epoch_loss / len(iterator), epoch_acc / (len(iterator) * iterator.batch_size)
	return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)