<a href="https://colab.research.google.com/github/annychu/CTBC_JOB/blob/master/emotion_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install torch



In [2]:
import pandas as pd
from transformers import BertForSequenceClassification
from transformers import BertTokenizer, BertConfig
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch
from sklearn.utils import shuffle

In [3]:
df_train = pd.read_csv("01_Hoteltrain.csv", encoding="utf-8")
df_train = df_train.drop(columns=["index"])
if len(df_train[df_train.isnull().values==True]) > 0:
    df_train = df_train.dropna()


# 只取384個字元做預測
MAX_LENGTH = 384

def substr(string):
    string = string[:MAX_LENGTH]
    return string

df_train["review"] = df_train["review"].apply(substr)

columns = ["review", "label"]
df_train = df_train.reindex(columns=columns)

# 訓練資料分成90:10的訓練和驗證集
df_train_copy = df_train.copy()
train_set = df_train_copy.sample(frac=0.9, random_state=0)
test_set = df_train_copy.drop(train_set.index)

train_set.to_csv("train.tsv", sep="\t", index=False, encoding="utf-8")
test_set.to_csv("test.tsv", sep="\t", index=False, encoding="utf-8")
    
print(train_set)
print(test_set)

                                                 review  label
398   房間空間還好滿寬敞的,但是朝南面挨著馬路比較吵,房間的隔音效果只能說一般.硬體設施上特別是衛...      1
3833  感覺還可以,就是二層樓房窗外沒有風景,被一些建築物遮擋,可能是廣告之類,晚上還可以,有窗簾,...      1
4836                          還好住的地方居然窗口是對著居民區的設施一般般床不錯      1
4572  環境差得很.進房間就一股醜味.設施簡陋,電視不清楚,最重要的是房間價格,自己到店大堂可以還價...      0
636             房間感覺還可以，但是洗漱用的毛巾浴巾品質不好，感覺沒有洗乾淨，房間隔音效果不好      0
...                                                 ...    ...
753                        房間還行，交通也方便，當然火車站周邊略顯嘈雜，總體還行。      1
1938                             酒店不錯,已多次住過,同等價格下次還會入住.      1
673          房間隔音效果超級差，晚上竟然能聽到外面在搓麻將和講話聲！半夜被吵醒好幾次，我暈死！！      0
2712  酒店裝修氣味很濃，房間感覺一般，與4星有不少差距。入住期間居然不開空調，熱得要死，剛好房間又...      0
3511  這裡的早餐是我看到的最差的一個,基本上沒什麼吃的,就看到服務員在不聽的加白粥,下次在來我是不...      1

[4500 rows x 2 columns]
                                                 review  label
0     來往虹橋機場,絕對方便,938公共汽車直接門對門,上車睡覺就可以了(一個半小時)賓館服務不專...      0
21    其實住美卡不是第一次，這次算我最失望的一次！一直以來我比較喜歡這個酒店，最近我覺得很差很差．...      0
24    其實這個價格相對這個狀況的酒店已經不便宜了（36

In [0]:
'''

設計一個Dataset，每次將csv裡的一筆資料轉成bert相容格式，並回傳:

tokens_tensor :: 句子，包含"[CLS]"
segments_tensor :: 皆為1，識別句子。
label_tensor :: 就tensor

'''

class sentimentDataset(Dataset):

    def __init__(self, dataPath, tokenizer):
        self.dataPath = dataPath
        self.traindf = pd.read_csv(dataPath, sep="\t", encoding="utf-8")
        self.tokenizer = tokenizer
        self.len = len(self.traindf)
    
    def __getitem__(self, idx):
        # 一一取出資料
        text, label = self.traindf.iloc[idx,:].values
        
        # token tensor
        word_cls = ["[CLS]"]
        token = self.tokenizer.tokenize(text)
        word_cls += token
        ids = self.tokenizer.convert_tokens_to_ids(word_cls)
        len_ids = len(ids)
        tokens_tensor = torch.tensor(ids)

        # segment tensor
        segments_tensor = torch.tensor([1]*len_ids)
        # label tensor
        label_tensor = torch.tensor(label)

        return tokens_tensor, segments_tensor, label_tensor

    def __len__(self):
        return self.len

In [0]:
'''

設計一個mini_batch

input : sentimentDataset回傳值的集合

output : 

tokens_tensors :: (batch_size, max_seq_len_in_batch)
segments_tensors :: (batch_size, max_seq_len_in_batch)
masks_tensors :: (batch_size, max_seq_len_in_batch)  # 界定自注意力範圍，1是關注，0是padding 不需要關注。
label_tensors :: (batch_size)

'''

def mini_batch(sentimentSet):
    
    tokens_tensors = [s[0] for s in sentimentSet]
    segments_tensors = [s[1] for s in sentimentSet]
    label_tensors = torch.stack([s[2] for s in sentimentSet])

    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    masks_tensors = torch.zeros(tokens_tensors.shape)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_tensors

In [0]:
'''

複寫一個model 並加上linear & BN

'''

from torch.nn import CrossEntropyLoss
import torch.nn.functional as F 

class BertForSequenceClassification_sentiment(BertForSequenceClassification):

    def __init__(self, config):
        super(BertForSequenceClassification_sentiment, self).__init__(config)

        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(config.hidden_size, 768)
        self.classifier2 = nn.Linear(768, 64)
        self.classifier3 = nn.Linear(64, self.config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):    
        
        outputs = self.bert(input_ids = input_ids,
                            attention_mask = attention_mask,
                            token_type_ids = token_type_ids,
                            position_ids = position_ids,
                            head_mask = head_mask,
                            # inputs_embeds = inputs_embeds,
                            )
        
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        pooled_output = self.classifier(pooled_output)
        pooled_output = self.classifier2(pooled_output)
        logits = self.classifier3(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)



In [7]:
PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 2

model = BertForSequenceClassification_sentiment.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name:}:{n:}")
    else:
        print("{:20}{}".format(name, module))

# frozen bert and train linear layers
# for i in model.bert.parameters():
#     i.requires_grad = False

para_sum = sum(j.numel() for j in [i for i in model.parameters() if i.requires_grad == True ])
print("總共要訓練的參數:{}個".format(para_sum))

bert:embeddings
bert:encoder
bert:pooler
dropout             Dropout(p=0.2, inplace=False)
classifier          Linear(in_features=768, out_features=768, bias=True)
classifier2         Linear(in_features=768, out_features=64, bias=True)
classifier3         Linear(in_features=64, out_features=2, bias=True)
總共要訓練的參數:102907586個


In [8]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
sentimentSet = sentimentDataset("train.tsv", tokenizer)

BATCH_SIZE = 8
sentimentLoader = DataLoader(sentimentSet, batch_size=BATCH_SIZE, collate_fn=mini_batch)

# take a look
data = next(iter(sentimentLoader))

tokens_tensors, segments_tensors, masks_tensors, label_tensors = (i for i in data)

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_tensors.shape}
{label_tensors}
""")


tokens_tensors.shape   = torch.Size([8, 273]) 
tensor([[ 101, 2791, 7279,  ..., 1762,  857, 8013],
        [ 101, 2697, 6221,  ...,    0,    0,    0],
        [ 101, 6917, 1962,  ...,    0,    0,    0],
        ...,
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101, 7478, 2382,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([8, 273])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([8, 273])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])
------------

In [0]:
acc_train = []
acc_test = []
train_loss = []
TIMES = 0

In [10]:
'''

設計一個可以批次訓練程式

'''
times = 20
TIMES += times
for i in range(times):
    # training mode
    model.train()

    lr = 1e-6
    optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=lr)

    # move to gpu
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    EPOCHS = 1
    print("Device:{}".format(device))
    print("="*50 + "Training and Testing Start" + "="*50)
    for epoch in range(EPOCHS):
        
        # 紀錄acc
        corr = 0
        total = 0

        # 紀錄loss
        running_loss = 0.0
        goal_loss = 0.005

        # 逐一取出
        for data in sentimentLoader:

            tokens_tensors, segments_tensors, masks_tensors, label_tensors = (i.to(device) for i in data)

            optimizer.zero_grad()
            # forward pass
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=label_tensors)

            loss, logits = outputs[:2]
            _, pred_indice = torch.max(logits.data, 1)

            # 計算corr, total
            total += label_tensors.size(0)
            corr += (pred_indice == label_tensors).sum().item()

            loss.backward()
            optimizer.step()
            # 疊加loss
            running_loss += loss.item()
        # 平均一下loss
        aver = total / BATCH_SIZE
        running_loss_aver = running_loss / aver
        train_loss.append(running_loss_aver)
        # acc
        acc = corr / total
        acc_train.append(acc)

        print(f"[epoch {i+1:}] loss: {running_loss_aver:.3f}  acc: {acc*100:.2f}%")
        
    '''

    驗證

    '''
    sentimentSet_text = sentimentDataset("test.tsv", tokenizer)

    BATCH_SIZE_TEXT = 200
    sentimentLoader_text = DataLoader(sentimentSet_text, batch_size=BATCH_SIZE_TEXT, collate_fn=mini_batch)

    # eval mode
    model.eval()

    # 紀錄acc
    corr = 0
    total = 0

    with torch.no_grad():
        # 逐一取出
        for text_data in sentimentLoader_text:

            tokens_tensors, segments_tensors, masks_tensors, label_tensors = (i.to(device) for i in text_data)

            # forward pass
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors
                            )

            logits = outputs[0]
            _, pred_indice = torch.max(logits.data, 1)

            # 計算corr, total
            total += label_tensors.size(0)
            corr += (pred_indice == label_tensors).sum().item()

        # acc
        acc = corr / total
        acc_test.append(acc)
    print(f"total:{total} correct:{corr} valid_acc:{acc*100:.2f}%")
    print("="*50 + "Training and Testing End" + "="*50)

model_name = f"bert_sentiment_wordmax_{MAX_LENGTH}_loss_{running_loss_aver:.3f}_lr_{lr}.pkl"
torch.save(model, model_name)
         

Device:cuda:0
[epoch 1] loss: 0.449  acc: 79.58%
total:500 correct:440 valid_acc:88.00%
Device:cuda:0
[epoch 2] loss: 0.243  acc: 91.20%
total:500 correct:439 valid_acc:87.80%
Device:cuda:0
[epoch 3] loss: 0.206  acc: 92.56%
total:500 correct:450 valid_acc:90.00%
Device:cuda:0
[epoch 4] loss: 0.182  acc: 93.69%
total:500 correct:453 valid_acc:90.60%
Device:cuda:0
[epoch 5] loss: 0.165  acc: 94.22%
total:500 correct:443 valid_acc:88.60%
Device:cuda:0
[epoch 6] loss: 0.146  acc: 95.20%
total:500 correct:448 valid_acc:89.60%
Device:cuda:0
[epoch 7] loss: 0.128  acc: 95.80%
total:500 correct:449 valid_acc:89.80%
Device:cuda:0
[epoch 8] loss: 0.110  acc: 96.42%
total:500 correct:443 valid_acc:88.60%
Device:cuda:0
[epoch 9] loss: 0.102  acc: 96.64%
total:500 correct:447 valid_acc:89.40%
Device:cuda:0
[epoch 10] loss: 0.090  acc: 97.27%
total:500 correct:452 valid_acc:90.40%
Device:cuda:0
[epoch 11] loss: 0.077  acc: 97.84%
total:500 correct:450 valid_acc:90.00%
Device:cuda:0
[epoch 12] loss:

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [0]:
'''

定義一個可以input文字和可以oupput預測判斷的class。


'''
class sentimentModel():
    
    def __init__(self, text=None, tokenizerName="bert-base-chinese"):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.predict_map = {0:"Negative", 1: "Positive"}
        self.modelPath = model_name
        self.tokenizer = BertTokenizer.from_pretrained(tokenizerName)
        self.text = text
        
    r'''
    設計把一句話轉成bert相容的形式。
    
    input : model的路徑
    output : model you want
    '''
    def loadModel(self, modelPath):

        # load model and set to cuda if they are here
        model = torch.load(modelPath, map_location=self.device)
        model = model.to(self.device)

        print(f"Device:{self.device}")
        return model
    
    r'''
    設計把一句話轉成bert相容的形式。

    input : 一段話，最大長度不超過80個字
    output : 
            token_tensor : 把文字轉成電腦可以理解的方式
            segment_tensor : 皆設為1
            mask_tensor : self attention 關注的地方
    '''
    def convert_text_to_bertEat(self, text):

        if type(text) != str:
            # raise TypeError("Input must be str.")
            text = str(text)
        elif len(text) > MAX_LENGTH:
            # raise ValueError("the len(s) must less than 384.")
            text = text[:MAX_LENGTH]

        # 取得3個tensor
        token = self.tokenizer.tokenize(text)
        word_cls = ["[CLS]"]
        word_cls += token
        word_cls_len = len(word_cls)
        ids = self.tokenizer.convert_tokens_to_ids(word_cls)
        token_tensor = torch.tensor(ids)

        segment_tensor = torch.tensor([1] * word_cls_len)
        
        mask_tensor = torch.zeros(token_tensor.shape)
        mask_tensor = mask_tensor.masked_fill(token_tensor !=0, 1)

        # covert tensor, 1D to 2D
        token_tensor = token_tensor.view(-1,token_tensor.size(0))
        segment_tensor = segment_tensor.view(-1,segment_tensor.size(0))
        mask_tensor = mask_tensor.view(-1,mask_tensor.size(0))

        return token_tensor, segment_tensor, mask_tensor

    r'''
    設計一個可以預測的函式，
    input : 能被bert吃的文字
    output : 情感結果
    '''
    def sentimentPredict(self, model, token_tensor, segment_tensor, mask_tensor):
        
        # predict mode
        model.eval()
        with torch.no_grad():
            # move all to cuda
            if torch.cuda.is_available():
                token_tensor = token_tensor.to(self.device).long()
                segment_tensor = segment_tensor.to(self.device).long()
                mask_tensor = mask_tensor.to(self.device).long()

            data = [token_tensor, segment_tensor, mask_tensor]

            outputs = model(*data[:3])
            logits = outputs[0]

            _, pred_num = torch.max(logits.data, 1)

            pred = self.predict_map[pred_num.item()]
            

        return pred_num.item(), pred


    r'''
    
    封包，流程化。
    
    input : 文字
    output : 信號, 回覆的文字 
 
    '''

    def predict_process(self, text):
        # model = self.loadModel(self.modelPath)

        # print(f"要預測的話:\n{text}")
        # print("=" * 25 + "開始預測" + "=" * 25)
        token_tensor, segment_tensor, mask_tensor = self.convert_text_to_bertEat(text)
        pred_num, pred = self.sentimentPredict(model, token_tensor, segment_tensor, mask_tensor)

        # print(f"預測結果為:{pred}")
        # print("=" * 25 + "預測結束" + "=" * 25)
        return pred_num  

In [0]:
df_pred = pd.read_csv("01_Hoteltest.csv", encoding="utf-8")

df_pred["review"] = df_pred["review"].apply(substr)

columns = ["index", "review", "label"]
df_pred = df_pred.reindex(columns=columns)

df_pred["label"] = None


In [41]:
emotion_class = sentimentModel()
model = emotion_class.loadModel(emotion_class.modelPath)
print("=" * 25 + "開始預測" + "=" * 25)
result = []
result = df_pred["review"].apply(emotion_class.predict_process)
print("=" * 25 + "預測結束" + "=" * 25)
df_pred["label"] = result
df_pred

Device:cuda:0


Unnamed: 0,index,review,label
0,1,"""此期間預訂，入住首日酒店贈送每間房10元洗衣券一張，通過攜程預訂，入住首日每房還可獲贈歡迎...",1
1,2,&#35828;&#23454;&#35805;，&#23545;景&#21306;酒店的硬...,1
2,3,(1)房間衛生乾淨空間大!(2)早餐美味風富菜色多!,1
3,4,（1）酒店冊子介紹說房間內提供飲用水，水壺內沒有水，給前臺提意見。前臺說飲用水就是衛生間的自...,0
4,5,*房間很不錯，服務很好，就是位置偏點，在機場到市區的路邊，打車到江北商業圈起步價。*早餐不錯。,1
...,...,...,...
2760,2761,鷺江賓館的位置非常好，交通輪渡可謂四通八達。賓館旁邊是步行街，對面是鼓浪嶼，如果住海景房的話...,1
2761,2762,鹽城來了很多次，第一次住鹽阜賓館，我的確很失望整個牆壁黑咕隆咚的，好像被煙熏過一樣傢俱非常的...,0
2762,2763,"觀海木樓建築別致,風景優美,環境幽靜,絕佳的度假勝地,8628房間位置極好.只是每天不是24...",1
2763,2764,觀景大床房，住了2晚。酒店位置很好，距離虹橋機場只有不到10公里，出行、叫車都算方便。據說是...,1


In [0]:
columns = ["index", "label", "review"]
df_pred = df_pred.reindex(columns=columns)
df_pred.to_csv("pred.csv", index=False, encoding="utf-8")