## 1 导入工具包

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss# 多标签分类loss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import AutoModel,AutoConfig,AutoTokenizer
from tqdm import tqdm, trange
from ast import literal_eval

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 2080 Ti'

In [4]:
device

device(type='cuda')

## 2 加载数据

In [5]:
train = pd.read_csv('dataset/train.csv')  # toxic-comment-classification
test = pd.read_csv('dataset/test.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
def check_missing():
    print("train null nums")
    print(train.shape[0]-train.count())
    print("----这是分割线----")
    print("test null nums")
    print(test.shape[0]-test.count())

check_missing()  # 训练集没有缺失值

train null nums
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64
----这是分割线----
test null nums
id              0
comment_text    0
dtype: int64


In [7]:
print('Unique comments: ', train.comment_text.nunique() == train.shape[0])  # 训练集的每一个文本都是不同的
print('Null values: ', train.isnull().values.any())                         # 训练集没有缺失值

Unique comments:  True
Null values:  False


In [8]:
train.comment_text.str.split().str.len().describe()

count    159571.000000
mean         67.273527
std          99.230702
min           1.000000
25%          17.000000
50%          36.000000
75%          75.000000
max        1411.000000
Name: comment_text, dtype: float64

In [9]:
(train.comment_text.str.split().str.len() > 200).sum()  # 统计文本word个数大于200的文本个数

10087

In [10]:
MAX_LEN = 160

In [11]:
cols = train.columns # 数据集的列名
label_cols = list(cols[2:]) # 数据集中的标签列名
num_labels = len(label_cols) # 标签个数
print('Label columns: ', label_cols)
print('num_labels: ', num_labels)

Label columns:  ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_labels:  6


In [12]:
# 统计每个标签为1或0的个数
print(train[label_cols].sum())
print("-------------------------")
print(train.shape[0] - train[label_cols].sum())  # 或train[label_cols].eq(0).sum()

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
-------------------------
toxic            144277
severe_toxic     157976
obscene          151122
threat           159093
insult           151694
identity_hate    158166
dtype: int64


## 3 数据预处理

In [13]:
train = train.sample(frac=1).reset_index(drop=True)  # shuffle训练集
    
# sample(): 对原数据框进行一定比例的随机抽取并打乱顺序
# frac=1: 返回的抽样行数占总行数的比例 1为全排列
# reset_index(): 重置索引
# drop=True: 丢弃原标签

In [14]:
train['one_hot_labels'] = list(train[label_cols].values)  # 将六个标签合成独热编码
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,one_hot_labels
0,9eac22375e6d5046,"""\n\nThanks for starting a section about this ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
1,c700575117591400,Stop Deleting Images Without First Consulting ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
2,a7c7bbc60a3dc340,"""::::No, under international norms Kosovo is s...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
3,f568037f7cb6e4dd,"""\n\n(1) Could you please explain to me how my...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
4,add837a78ecbaa4d,"""\n\nThis article should not be """"speedy delet...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [15]:
labels = list(train['one_hot_labels'].values)
comments = list(train['comment_text'].values)

## 4 Tokenizer

In [16]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)  # 全部转换为小写字母
encodings = tokenizer.batch_encode_plus(  # batch_encode_plus能够批量梳理文本
    comments,# 传入列表
    max_length=MAX_LEN,
    padding=True,
    truncation=True
) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys()) # 分词之后的输出

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [17]:
input_ids = encodings['input_ids']
token_type_ids = encodings['token_type_ids']
attention_masks = encodings['attention_mask']

In [18]:
label_counts = train.one_hot_labels.astype(str).value_counts()  # [0, 0, 0, 0, 0, 0] 14023  |  [0, 0, 0, 0, 0, 1] 356 ....
one_freq = label_counts[label_counts==1].keys()  # 只有1个样本的独热编码

# 找出 只有1个样本的独热编码 的index
# ！！！这里用sorted倒序排序，是为了先弹出index大的值。若先弹出index小的，大index会改变！！！
one_freq_idxs = sorted(list(train[train.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [135067, 71828]


In [19]:
# 将单个样本的例子拿出来，为了之后强制加入训练集（避免学习不到少样本数据）
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

## 5 构建数据集

In [20]:
# 训练集和验证集划分
# ！！这里用到了stratify=labels 保证了训练集和测试集会按照y的比例分配，所以前面不能有类别1样本！！
train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids,attention_masks,random_state=2023, test_size=0.10, stratify = labels)

# Add one frequency data to train data 将单样本强制加入训练集
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

# 将原始id转为torch 张量
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

  train_labels = torch.tensor(train_labels)


In [21]:
# 批数据一般大小 ：8 16 32 64  128 256
batch_size = 32

# 训练集 
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)  # 对tensor进行打包,很像python zip函数
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)  # 指定shuffle=True不指定sampler 等同于 指定sampler=train_sampler

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data) # 按顺序遍历
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [22]:
# 保存处理好的数据
# torch.save(validation_dataloader,'validation_data_loader')
# torch.save(train_dataloader,'train_data_loader')

## 6 加载预训练模型

In [23]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels) 
# num_labels：6   默认情况2分类

model.cuda()  # 等于model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

xxxxxlForSequenceClassification 输出有两个        
1. loss
2. logits

### 设置优化器

In [24]:
paras = [para for para in model.named_parameters()]  # 模型参数

In [25]:
from transformers import AdamW

In [26]:
# 对不同参数设置weight_decay_rate
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']  # 参数名字是否包含这三个keyword, 处理方法不同
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [27]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # 默认优化器



## 7 模型训练

In [29]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3 # 训练轮数，15万训练集 任务比较简单的，最多设置5

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train() # 设置训练模式

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):# 遍历批数据
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    # 每一批数据展开: batch_size个 train_inputs, train_labels, train_masks, train_token_types

    # 接收batch的输入
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # # Forward pass for multiclass classification
    # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    # loss = outputs[0]
    # logits = outputs[1]

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss() # 计算损失
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())# 记录loss    

    # Backward pass
    loss.backward() # loss反向求导
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # 计算准确率
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.04957876766314058


Epoch:  33%|███▎      | 1/3 [24:02<48:04, 1442.03s/it]

F1 Validation Accuracy:  79.29873772791024
Flat Validation Accuracy:  92.6740615403898
Train loss: 0.03338945735124392


Epoch:  67%|██████▋   | 2/3 [47:41<23:48, 1428.66s/it]

F1 Validation Accuracy:  78.54814814814814
Flat Validation Accuracy:  92.66779469825156
Train loss: 0.026383054758235158


Epoch: 100%|██████████| 3/3 [1:11:22<00:00, 1427.57s/it]

F1 Validation Accuracy:  79.03040734366036
Flat Validation Accuracy:  92.72419627749578





In [31]:
torch.save(model.state_dict(), 'bert_model_toxic')

## 8 预测与评估 

In [39]:
test_df = pd.read_csv('dataset/test.csv')
test_labels_df = pd.read_csv('dataset/test_labels.csv')
test_df = test_df.merge(test_labels_df, on='id', how='left')
test_label_cols = list(test_df.columns[2:])
print('Null values: ', test_df.isnull().values.any()) #should not be any null sentences or labels
print('Same columns between train and test: ', label_cols == test_label_cols) #columns should be the same
test_df.head()

Null values:  False
Same columns between train and test:  True


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [42]:
test_df = test_df[~test_df[test_label_cols].eq(-1).any(axis=1)] # remove irrelevant rows/comments with -1 values
test_df['one_hot_labels'] = list(test_df[test_label_cols].values)  # 生成labels的独热编码
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,one_hot_labels
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [44]:
# Gathering input data
test_labels = list(test_df.one_hot_labels.values)
test_comments = list(test_df.comment_text.values)

In [45]:
# 测试集分词编码
test_encodings = tokenizer.batch_encode_plus(test_comments,max_length=MAX_LEN,padding=True,truncation=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [46]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)

# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Save test dataloader
# torch.save(test_dataloader,'test_data_loader')

In [47]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  tokenized_texts.append(b_input_ids)
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

### ？？？下面报错 ？？？

In [52]:
red_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')

clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

ValueError: Found input variables with inconsistent numbers of samples: [63978, 15957]