In [0]:
# !pip install transformers tqdm boto3 requests regex
import torch
from transformers import BertTokenizer
import pandas as pd
import sys
# The custom class (BertForSequenceClassificationReviewsSentiments) lives outside
# the default search locations, so its directories must be on sys.path before the
# import below can find the file.
root_path = '/content/drive/My Drive'
folder_path = root_path + '/ATSC_model'
# Register each location once, in order, announcing every path that gets added.
for _candidate in (root_path, folder_path):
  if _candidate not in sys.path:
    print('新增路徑')
    sys.path.append(_candidate)
# Show the resulting search path, one entry per line.
for p in sys.path:
  print(p)
from BertForSequenceClassificationReviewsSentiments import BertForSequenceClassificationReviewsSentiments

In [0]:
class ReviewTargetSentimentGenerator:
    """Predict the sentiment a review expresses towards each dish it mentions.

    The input frame must provide a ``text`` column (the review) and a ``tag``
    column (dish names produced by the upstream NER step; several dishes are
    comma-separated, and the value is null when no dish was found).
    """

    # Checkpoint used when the caller does not supply ``model_path``.
    DEFAULT_MODEL_PATH = '/content/drive/My Drive/ATSC_model/model_res_reviews0527.pkl'
    # Mapping from the predicted class index to its label.
    LABELS = {0: 'negative', 1: 'neutral', 2: 'positive'}
    # Column layout of the frame returned by ``SentimentGenerator``.
    OUTPUT_COLUMNS = ["review", "target", "sentiment", "service", "CP", "environment"]
    # Upper bound on the encoded sequence length (bert-base-chinese offers 512
    # positions). NOTE(review): confirm the fine-tuned checkpoint uses the same limit.
    MAX_SEQ_LEN = 512

    def __init__(self, df, model_path=None):
        """Store the input frame and load the model and tokenizer.

        Args:
            df: DataFrame with ``text`` and ``tag`` columns.
            model_path: Optional path of the pickled model; defaults to
                ``DEFAULT_MODEL_PATH``.
        """
        self.df = df
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # SECURITY: torch.load unpickles the file, which can execute arbitrary
        # code -- only load checkpoints from a trusted source.
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        self.model = torch.load(model_path or self.DEFAULT_MODEL_PATH,
                                map_location=device)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

    def df_tran(self):
        """Convert the NER output into the model's input format.

        Rows without a tag are dropped, and a row listing several
        comma-separated dishes is expanded into one row per dish. Whitespace
        around each dish name is stripped and empty names are skipped.

        Returns:
            DataFrame with ``text`` and ``tag`` columns, one row per
            (review, dish) pair, indexed 0..n-1.
        """
        tagged = self.df[self.df['tag'].notnull()]
        texts = []
        tags = []
        for review, tag_field in zip(tagged['text'], tagged['tag']):
            # split() on a value without a comma yields the value itself, so a
            # single-dish row needs no special case.
            for dish in str(tag_field).split(","):
                dish = dish.strip()
                if dish:
                    texts.append(review)
                    tags.append(dish)
        return pd.DataFrame({"text": texts, "tag": tags})

    def _encode(self, review, target):
        """Build (token ids, segment ids, attention mask) tensors, each of shape (1, seq_len)."""
        tokens_target = self.tokenizer.tokenize(target)
        tokens_review = self.tokenizer.tokenize(review)
        # Reserve room for [CLS] and the two [SEP] markers; trim the review,
        # never the target, when the pair would exceed the length limit.
        room = self.MAX_SEQ_LEN - len(tokens_target) - 3
        tokens_review = tokens_review[:max(room, 0)]

        first_segment = ["[CLS]"] + tokens_review + ["[SEP]"]
        second_segment = tokens_target + ["[SEP]"]

        ids = self.tokenizer.convert_tokens_to_ids(first_segment + second_segment)
        tokens_tensor = torch.tensor(ids, dtype=torch.long)
        segments_tensor = torch.tensor(
            [0] * len(first_segment) + [1] * len(second_segment), dtype=torch.long)
        # Attend to every position whose token id is non-zero.
        masks_tensor = (tokens_tensor != 0).long()
        return (tokens_tensor.unsqueeze(0),
                segments_tensor.unsqueeze(0),
                masks_tensor.unsqueeze(0))

    def SentimentGenerator(self):
        """Produce a DataFrame holding the predicted sentiment per (review, dish).

        Returns:
            DataFrame with the columns in ``OUTPUT_COLUMNS``. ``service``,
            ``CP`` and ``environment`` are always filled with 0.

        Side effects:
            Moves ``self.model`` to the selected device and switches it to
            evaluation mode.
        """
        # Build the (review, dish) pairs once instead of on every access.
        pairs = self.df_tran()

        # Device selection and model placement are loop-invariant.
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = self.model.to(device)
        # Evaluation mode disables dropout so predictions are deterministic.
        model.eval()

        records = []
        with torch.no_grad():
            for review, target in zip(pairs["text"], pairs["tag"]):
                tokens, segments, masks = (
                    t.to(device) for t in self._encode(review, target))
                outputs = model(input_ids=tokens,
                                token_type_ids=segments,
                                attention_mask=masks)
                # The first output holds the logits; take the best class.
                pred = outputs[0].argmax(dim=1).item()
                records.append({"review": review,
                                "target": target,
                                "sentiment": self.LABELS[pred],
                                "service": 0,
                                "CP": 0,
                                "environment": 0})
        # Build the frame in one go; growing it row by row is quadratic and
        # DataFrame.append no longer exists in pandas 2.x.
        return pd.DataFrame(records, columns=self.OUTPUT_COLUMNS)

In [0]:
# df = pd.read_csv("/content/drive/My Drive/ATSC_model/T+台中高麗屋_review.csv")

# df = ReviewTargetSentimentGenerator(df)
# df_result = df.SentimentGenerator()
# df_result

Unnamed: 0,review,target,sentiment,service,CP,environment
0,"""從開店吃到現在已經吃上無數次，大概每個時期的味道都吃過了。，，今天久違（大概有半年左右）去...",銅板烤肉,negative,0,0,0
1,"""炒馬麵很好吃份量也不少 服務還可以""",炒馬麵,positive,0,0,0
2,"""東西算是便宜大碗，但個人對他們銅板烤肉的肉，調味的方式不是很喜歡，但還是很推他們的馬麵，好吃!""",銅板烤肉,negative,0,0,0
3,"""東西算是便宜大碗，但個人對他們銅板烤肉的肉，調味的方式不是很喜歡，但還是很推他們的馬麵，好吃!""",馬麵,positive,0,0,0
4,"""平價的韓式餐點，銅盤烤肉及炒馬麵划算必點。生意太好，可能與人併桌，若吃完就走不介意，雖店家...",銅盤烤肉,positive,0,0,0
...,...,...,...,...,...,...
337,"""週日晚上五點半到現場已經客滿囉，只等了一下第一輪的客人已用餐完畢，馬上就輪到我們囉，，銅板...",豆腐鍋,neutral,0,0,0
338,"""炒馬麵很大一份，銅盤烤肉，肉量很可以，料多味美，真讚！""",炒馬麵,positive,0,0,0
339,"""炒馬麵很大一份，銅盤烤肉，肉量很可以，料多味美，真讚！""",銅盤烤肉,positive,0,0,0
340,"""點了石鍋拌飯，吃到了2樣不得了的東西，一個是鋼絲刷的鐵絲，一個是2片朔膠片。""",石鍋拌飯,negative,0,0,0
