In [87]:
import numpy as np
import pandas as pd
import os 
import torch
import torch.nn as nn
import transformers
import tokenizers

分析数据

In [88]:
# Show every column when a DataFrame is printed/displayed.
pd.set_option('display.max_columns', None)

path = './input'
df_train = pd.read_csv(os.path.join(path, 'train.csv'))
df_test = pd.read_csv(os.path.join(path, 'test.csv'))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df_train.info()
print(df_train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB
None
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                        

In [89]:
# Very few rows contain nulls, so they are simply dropped. The index is rebuilt
# afterwards so it stays contiguous (0..n-1): code that looks rows up by
# integer position through `series[i]` would otherwise hit a KeyError at the
# label of the dropped row. Rebinding (instead of inplace=True) also keeps this
# cell idempotent on re-run.
df_train = df_train.dropna().reset_index(drop=True)

构建DataLoader

In [90]:
# Global settings shared by every cell below.
# Pretrained weights: bert-base-uncased, see https://huggingface.co/models
class config:
    # --- files -------------------------------------------------------------
    # Directory holding the pretrained BERT files (vocab.txt is read from here).
    BERT_PATH = './input'
    MODEL_PATH = 'pytorch_model.bin'

    # --- sequence / batching -------------------------------------------------
    # Length every encoded example is padded to.
    MAX_LENTH = 128
    TRAIN_BATCH_SIZE = 2
    TEST_BATCH_SIZE = 32
    EPOCH = 3

    # --- tokenizer -----------------------------------------------------------
    # WordPiece tokenizer built once from the vocabulary file; input is lowercased.
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        os.path.join(BERT_PATH, 'vocab.txt'),
        lowercase=True,
    )

In [91]:
# Fields of the encoding returned by the BERT tokenizer:
#   1. ids      - vocabulary id of each token ([CLS] is 101, [SEP] is 102)
#   2. type_ids
#   3. tokens
#   4. offsets
tmp = config.TOKENIZER.encode('negative neutral positive')
# One field per line: ids, then tokens, then character offsets.
print(tmp.ids, tmp.tokens, tmp.offsets, sep='\n')

[101, 4997, 8699, 3893, 102]
['[CLS]', 'negative', 'neutral', 'positive', '[SEP]']
[(0, 0), (0, 8), (9, 16), (17, 25), (0, 0)]


In [92]:
class TweetDataset:
    """Map-style dataset turning (tweet, selected text, sentiment) rows into BERT inputs.

    Every item is laid out as ``[CLS] sentiment [SEP] tweet tokens [SEP]`` and
    is padded (or truncated) to exactly ``config.MAX_LENTH`` positions. The
    training labels are the positions, inside that layout, of the first and
    last tweet token that overlap ``selected_text``.
    """

    # Vocabulary ids of the three sentiment words as encoded by config.TOKENIZER.
    SENTIMENT_ID = {
        'negative': 4997,
        'neutral': 8699,
        'positive': 3893,
    }
    # Special-token ids of the BERT vocabulary.
    CLS_ID = 101
    SEP_ID = 102
    # Positions taken by "[CLS] sentiment [SEP]" in front of the tweet tokens.
    PREFIX_LEN = 3

    def __init__(self, tweet, selected_tweet, sentiment):
        """Store the three parallel columns.

        Args:
            tweet: iterable of full tweet strings.
            selected_tweet: iterable of the sentiment-bearing substrings.
            sentiment: iterable of 'negative' / 'neutral' / 'positive'.
        """
        # Materialise as plain lists so that __getitem__ indexes by POSITION.
        # A pandas Series is indexed by label, which raises KeyError as soon as
        # the index has a gap (e.g. after dropna()).
        self.tweet = list(tweet)
        self.selected_text = list(selected_tweet)
        self.sentiment = list(sentiment)
        self.tokenizer = config.TOKENIZER

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode one example.

        Returns:
            dict with LongTensors ``ids``, ``token_type_ids``, ``mask``
            (each of length ``config.MAX_LENTH``), ``tweet_off``
            (``config.MAX_LENTH`` x 2 character offsets), scalar
            ``target_start`` / ``target_end``, plus the raw ``tweet`` and
            ``selected`` strings.

        Raises:
            ValueError: if ``selected_text`` is empty, is not a substring of
                the tweet, or overlaps no token of the tweet.
        """
        tweet = self.tweet[item]
        selected_text = self.selected_text[item]
        sentiment = self.sentiment[item]

        # 1) Character span [idx0, idx1] of the selected text inside the tweet.
        #    rfind keeps the "last occurrence wins" behaviour of a full scan.
        idx0 = tweet.rfind(selected_text) if selected_text else -1
        if idx0 < 0:
            raise ValueError(
                'selected_text %r is not a non-empty substring of tweet %r'
                % (selected_text, tweet)
            )
        idx1 = idx0 + len(selected_text) - 1

        # 2) Tokenize the tweet; [1:-1] strips the [CLS]/[SEP] added by encode().
        tok_tweet = self.tokenizer.encode(tweet)
        input_ids_orig = tok_tweet.ids[1:-1]
        tweet_offset = tok_tweet.offsets[1:-1]

        # 3) Per-character 0/1 mask of the selected span, projected onto tokens:
        #    a token is a target if any of its characters lies in the span.
        char_target = [0] * len(tweet)
        char_target[idx0:idx1 + 1] = [1] * len(selected_text)
        target_idx = [
            j for j, (offset1, offset2) in enumerate(tweet_offset)
            if sum(char_target[offset1:offset2]) > 0
        ]
        if not target_idx:
            raise ValueError(
                'selected_text %r overlaps no token of tweet %r'
                % (selected_text, tweet)
            )
        target_start = target_idx[0]
        target_end = target_idx[-1]

        # 4) Truncate so that prefix + tweet + trailing [SEP] fits MAX_LENTH;
        #    otherwise items would have different lengths and could not be
        #    stacked into a batch. Labels are clamped to the last kept token.
        tweet_room = config.MAX_LENTH - self.PREFIX_LEN - 1
        if len(input_ids_orig) > tweet_room:
            input_ids_orig = input_ids_orig[:tweet_room]
            tweet_offset = tweet_offset[:tweet_room]
            target_start = min(target_start, tweet_room - 1)
            target_end = min(target_end, tweet_room - 1)

        # 5) Assemble [CLS] sentiment [SEP] tweet [SEP].
        input_ids = (
            [self.CLS_ID, self.SENTIMENT_ID[sentiment], self.SEP_ID]
            + input_ids_orig
            + [self.SEP_ID]
        )
        # Segment ids: 0 for the sentiment segment, 1 for the tweet segment
        # (including its closing [SEP]).
        token_type_ids = [0] * self.PREFIX_LEN + [1] * (len(input_ids_orig) + 1)
        mask = [1] * len(input_ids)
        # Special tokens carry the dummy offset (0, 0).
        tweet_offset = [(0, 0)] * self.PREFIX_LEN + tweet_offset + [(0, 0)]
        # Shift the labels past the prefix.
        target_start += self.PREFIX_LEN
        target_end += self.PREFIX_LEN

        # 6) Right-pad everything to MAX_LENTH.
        padding_lenght = config.MAX_LENTH - len(input_ids)
        if padding_lenght > 0:
            input_ids = input_ids + [0] * padding_lenght
            token_type_ids = token_type_ids + [0] * padding_lenght
            mask = mask + [0] * padding_lenght
            tweet_offset = tweet_offset + [(0, 0)] * padding_lenght

        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tweet_off': torch.tensor(tweet_offset, dtype=torch.long),
            'target_start': torch.tensor(target_start, dtype=torch.long),
            'target_end': torch.tensor(target_end, dtype=torch.long),
            'tweet': tweet,
            'selected': selected_text
        }

定义模型


In [102]:
# Outputs of BertModel (see https://www.cnblogs.com/deep-deep-learning/p/12792041.html):
#   sequence_output, pooled_output, (hidden_states), (attentions)
#   1) sequence_output: embedding of every token            [batch, length, embedding]
#   2) pooled_output:   output for the [CLS] token          [batch, embedding]
#   3) hidden_states:   outputs of all 13 layers, 13 * [batch, length, embedding]
#                       (only returned when config.output_hidden_states=True)
#   4) attentions:      the attention weights
class Tweet(transformers.BertPreTrainedModel):
    """BERT encoder with a linear head predicting span start/end logits per token.

    The head reads the concatenation of the last two hidden layers, so the
    configuration passed in must have ``output_hidden_states = True``.
    """

    def __init__(self, conf):
        """Load the pretrained encoder from ``config.BERT_PATH`` and build the head.

        Args:
            conf: BERT configuration with ``output_hidden_states`` enabled.
        """
        super(Tweet, self).__init__(conf)
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH, config=conf)

        # Fine-tune the whole encoder, not only the linear head.
        for param in self.bert.parameters():
            param.requires_grad = True
        self.drop_out = nn.Dropout(0.1)
        # Linear head over two concatenated hidden layers -> (start, end) logits.
        # The width is taken from the configuration instead of a hard-coded 768.
        self.l0 = nn.Linear(conf.hidden_size * 2, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each of shape [batch, length].

        Args:
            ids: token ids, [batch, length].
            mask: attention mask, [batch, length].
            token_type_ids: segment ids, [batch, length].
        """
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Integer indexing works for both the plain-tuple return and the
        # ModelOutput object of newer transformers releases; with hidden states
        # enabled (and attentions disabled) they sit at position 2.
        hidden_states = outputs[2]
        # Concatenate the last and second-to-last layers along the feature axis.
        # The tensors are used directly: wrapping them in torch.tensor() would
        # copy them out of the autograd graph, and the encoder would then
        # receive no gradient at all.
        out = torch.cat((hidden_states[-1], hidden_states[-2]), dim=-1)
        out = self.drop_out(out)  # [batch, length, 2 * hidden]
        logist = self.l0(out)     # [batch, length, 2]
        start_logits, end_logits = logist.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)  # [batch, length]
        end_logits = end_logits.squeeze(-1)
        return start_logits, end_logits

定义Optimizer和Loss

In [103]:
# Training loss: cross-entropy over token positions.
def loss_fn(start_logist, end_logist, start_position, end_position):
    """Sum of the cross-entropy losses for the span start and the span end.

    Args:
        start_logist: start logits, one score per token position.
        end_logist: end logits, one score per token position.
        start_position: index of the true start token for each example.
        end_position: index of the true end token for each example.

    Returns:
        Scalar tensor: start cross-entropy plus end cross-entropy.
    """
    criterion = nn.CrossEntropyLoss()
    heads = ((start_logist, start_position), (end_logist, end_position))
    start_term, end_term = (criterion(scores, gold) for scores, gold in heads)
    return start_term + end_term

In [104]:
# Build the model, the AdamW optimizer, the LR scheduler and early stopping.

# Model: the hidden states of every layer are requested because Tweet.forward
# concatenates the last two of them.
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
model_config.output_hidden_states = True
model = Tweet(conf=model_config)

param_optimizer = list(model.named_parameters())

# Biases and LayerNorm parameters get no weight decay. Note that this group is
# trained with lr=5e-5, i.e. a LARGER rate than the 3e-5 of the decayed group.
# NOTE(review): the original comment said "smaller learning rate" - confirm
# which of the two was intended.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimzer_parameter = [
    {'params':[p for n, p in param_optimizer if not any(i in n for i in no_decay)], 'weight_decay':0.01,'lr':3e-5},
    {'params':[p for n, p in param_optimizer if any(i in n for i in no_decay)],  'weight_decay':0.0,'lr':5e-5}
]
# torch.optim.AdamW replaces the deprecated transformers.AdamW (no longer
# importable from recent transformers releases). eps=1e-6 keeps the default of
# the transformers implementation; lr/weight_decay come from the groups above.
optimzer = torch.optim.AdamW(optimzer_parameter, lr=5e-5, eps=1e-6)

# Dynamic learning-rate adjustment (ReduceLROnPlateau):
#   optimizer (Optimizer) - wrapped optimizer.
#   factor (float)        - factor by which the learning rate is reduced,
#                           new_lr = lr * factor. Default: 0.1.
#   patience (int)        - number of epochs with no improvement after which the
#                           learning rate is reduced. E.g. with patience = 2 the
#                           first 2 epochs without improvement are ignored and
#                           the LR is only decreased after the 3rd one if the
#                           loss still has not improved. Default: 10.
#   threshold (float)     - threshold for measuring the new optimum, to only
#                           focus on significant changes. Default: 1e-4.
# NOTE(review): with config.EPOCH = 3 and one scheduler step per epoch,
# patience=3 can never trigger a reduction - confirm the intended values.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimzer,factor=0.1,patience=3,eps=1e-8)

# Early stopping, to guard against possible catastrophic forgetting.
from utils import EarlyStopping
# Make sure the checkpoint directory exists before anything is written to it
# (presumably EarlyStopping saves the model to `path` - verify in utils).
os.makedirs('./output', exist_ok=True)
es = EarlyStopping(patience=3,path='./output/checkpoint.pt')

Some weights of the model checkpoint at ./input were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
# Training DataLoader: wraps the three training columns in a TweetDataset and
# batches it with the configured training batch size.
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    dataset=TweetDataset(
        tweet=df_train['text'],
        selected_tweet=df_train['selected_text'],
        sentiment=df_train['sentiment'],
    ),
    batch_size=config.TRAIN_BATCH_SIZE,
)

In [106]:
# This task is scored with the word-level Jaccard similarity between the
# reference selection and the predicted selection.
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lowercased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting sets of words.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        float in [0, 1]. When neither string contains a word the two (empty)
        sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return len(a & b) / union_size

def calculate_jaccard_score(tweet,orig_selected, start_logist, end_logist, offset):
    """Jaccard score between the reference selection and a predicted token span.

    Args:
        tweet: the full tweet text.
        orig_selected: the reference selected text.
        start_logist: predicted start token index (an index despite the name).
        end_logist: predicted end token index (an index despite the name).
        offset: per-token (char_start, char_end) pairs, e.g. (0, 1), (1, 9).

    Returns:
        ``jaccard(orig_selected, predicted_text)``, where the predicted text
        runs from the first character of the start token to the end of the
        end token.
    """
    # An inverted prediction collapses to the single token at the end index.
    first_token = end_logist if start_logist > end_logist else start_logist
    char_begin = offset[first_token][0]
    char_stop = offset[end_logist][1]
    return jaccard(orig_selected, tweet[char_begin:char_stop])

In [107]:
from tqdm.autonotebook import tqdm
def main():
    """Fine-tune ``model`` on ``train_dataloader`` for up to ``config.EPOCH`` epochs.

    After every epoch the summed training loss is handed to the LR scheduler
    and to the early-stopping helper; training ends early once the helper sets
    ``early_stop``. The progress bar shows the loss and the mean Jaccard score
    of the current batch.
    """
    for epoch in range(config.EPOCH):
        # from_pretrained() hands back the encoder in eval mode (dropout off);
        # switch the whole module tree to training mode before optimising.
        model.train()
        tk0 = tqdm(train_dataloader, total=len(train_dataloader))
        # Plain Python float: accumulating the loss tensors themselves would
        # keep autograd history alive and pass a tensor to the scheduler.
        losses = 0.0
        for data in tk0:
            start_logist, end_logist = model(data['ids'], data['mask'], data['token_type_ids'])
            loss = loss_fn(start_logist, end_logist, data['target_start'], data['target_end'])
            batch_loss = loss.item()
            # Weight by batch size so a smaller final batch counts proportionally.
            losses += batch_loss * len(data['ids'])
            optimzer.zero_grad()
            loss.backward()
            optimzer.step()
            # Most likely start/end token of every example in the batch.
            output_start = torch.argmax(start_logist, dim=-1)
            output_end = torch.argmax(end_logist, dim=-1)
            jaccards = [
                calculate_jaccard_score(
                    tweet,
                    data['selected'][p_i],
                    output_start[p_i],
                    output_end[p_i],
                    data['tweet_off'][p_i],
                )
                for p_i, tweet in enumerate(data['tweet'])
            ]
            tk0.set_postfix({'loss':batch_loss,'jaccard':np.mean(jaccards)})
        # NOTE(review): both helpers monitor the TRAINING loss; a held-out
        # validation loss would be the usual signal - confirm intent.
        scheduler.step(losses)
        es(losses, model)
        if es.early_stop:
            break

In [None]:
# Entry point: start training when this cell/script is executed directly.
if __name__ == "__main__":
    main()