In [12]:
import sys
sys.path.append("..")
import mysql
from mysql import connect_mysql
import time
import datetime

# Open the database connection that sentiment() reads comments from and writes
# predictions to. connect_mysql comes from the project-local `mysql` module,
# which is importable because of the sys.path.append("..") above.
conn = connect_mysql()

In [13]:
import pandas as pd
from opencc import OpenCC
import numpy as np
import torch

In [14]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [15]:
import torch
import numpy as np
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [16]:
cc = OpenCC('s2twp')  # s2twp: Simplified Chinese -> Traditional Chinese (Taiwan), including regional phrasing

In [17]:
# Tokenizer for the `bert-base-chinese` checkpoint, the same checkpoint name
# that BertClassifier passes to BertModel.from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [18]:
def content_preprocessing(text_list=None, max_length=150):
  """Turn raw comment strings into fixed-length BERT input tensors.

  Each text is first passed through the module-level OpenCC converter `cc`
  (Simplified -> Taiwan Traditional Chinese) and then encoded with the
  module-level `tokenizer`. Every row is padded AND truncated to `max_length`
  so that all rows have the same length and can be stacked into one tensor.

  @param    text_list (sequence of str | None): texts to encode. `None` or an
            empty sequence produces empty tensors.
  @param    max_length (int): number of token ids per row, counting the special
            tokens the tokenizer adds.
  @return   input_ids (torch.Tensor): token ids, shape (len(text_list), max_length)
  @return   attention_masks (torch.Tensor): attention masks, same shape as input_ids
  """
  if text_list is None or len(text_list) == 0:
    # Keep the empty result two-dimensional and integer-typed so it has the
    # same layout as a non-empty result.
    return (torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long))

  input_ids = []
  attention_masks = []
  for text in text_list:
    # Simplified -> Traditional Chinese
    content = cc.convert(text)

    # Without truncation=True a text longer than max_length keeps its full
    # length, the rows become ragged and torch.tensor() below fails.
    tokens = tokenizer.encode_plus(
      content,
      add_special_tokens=True,
      max_length=max_length,
      padding='max_length',
      truncation=True,
    )

    input_ids.append(tokens.get('input_ids'))
    attention_masks.append(tokens.get('attention_mask'))

  # Convert the equally sized rows to tensors
  return torch.tensor(input_ids), torch.tensor(attention_masks)

In [19]:
import torch.nn as nn
from transformers import BertModel

# Define the BertClassifier model
class BertClassifier(nn.Module):
  """Pretrained Chinese BERT encoder with a small feed-forward classification head.
  """
  def __init__(self, freeze_bert=False):
    """
    Build the encoder (`self.bert`) and the classification head (`self.classifier`).
    @param    freeze_bert (bool): when `True`, gradients are switched off for every
              BERT parameter so only the head is trainable; leave `False` to
              fine-tune BERT as well.
    """
    super().__init__()

    # Sizes: BERT hidden state, hidden layer of the head, number of output labels
    bert_hidden_size = 768
    head_hidden_size = 50
    num_labels = 2

    # Pretrained encoder
    self.bert = BertModel.from_pretrained('bert-base-chinese')

    # Head: Linear -> ReLU -> Linear
    # (a Dropout(0.5) between the ReLU and the output layer is currently disabled)
    head_layers = [
      nn.Linear(bert_hidden_size, head_hidden_size),
      nn.ReLU(),
      nn.Linear(head_hidden_size, num_labels),
    ]
    self.classifier = nn.Sequential(*head_layers)

    # Optionally stop gradients from flowing into the encoder
    if freeze_bert:
      self.bert.requires_grad_(False)

  def forward(self, input_ids, attention_mask):
    """
    Run the encoder, then classify from the hidden state at sequence position 0.
    @param    input_ids (torch.Tensor): token ids with shape (batch_size, max_length)
    @param    attention_mask (torch.Tensor): attention mask with shape (batch_size, max_length)
    @return   logits (torch.Tensor): unnormalised scores with shape (batch_size, num_labels)
    """
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    # First element of the encoder output holds the per-token hidden states;
    # position 0 along the sequence axis is the `[CLS]` token.
    token_states = encoder_outputs[0]
    cls_state = token_states[:, 0, :]

    return self.classifier(cls_state)

In [20]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Run a forward pass over a dataloader and return class probabilities.

    The model is switched to evaluation mode and every batch is moved to the
    device the model's parameters live on, so the inputs always match the
    model regardless of where it was loaded.

    @param    model (torch.nn.Module): callable as `model(input_ids, attention_mask)`
              and returning logits with shape (batch_size, num_labels).
    @param    test_dataloader (iterable): yields batches whose first two items
              are the input-id tensor and the attention-mask tensor. It must
              yield at least one batch, because torch.cat cannot concatenate
              an empty list.
    @return   probs (numpy.ndarray): softmax probabilities with shape
              (num_samples, num_labels).
    """
    # Evaluation mode disables dropout layers for inference.
    model.eval()

    # Follow the model's own device instead of a module-level global; a model
    # without parameters has no device, so its batches are left where they are.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    all_logits = []

    # No gradients are needed for prediction.
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_attn_mask = tuple(batch)[:2]
            if model_device is not None:
                b_input_ids = b_input_ids.to(model_device)
                b_attn_mask = b_attn_mask.to(model_device)

            all_logits.append(model(b_input_ids, b_attn_mask))

    # Concatenate logits from each batch along the sample axis
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax over the label axis to obtain probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [21]:
def sentiment(threshold=0.9, batch_size=32, model_path='../Bert/SentimentAnalyzer.pt'):
    """Predict a sentiment label for pending comments and write it back to MySQL.

    Reads every row of `comments` whose `sentiment` column equals 2 (presumably
    the "not yet classified" marker -- confirm against the schema), runs the
    saved classifier on the comment text and stores 1 where the probability of
    class 1 exceeds `threshold`, otherwise 0.

    @param    threshold (float): minimum class-1 probability for a prediction of 1.
    @param    batch_size (int): number of comments per forward pass.
    @param    model_path (str): path of the pickled model passed to torch.load.
    @return   None. Results are printed and persisted through the module-level `conn`.
    """
    with conn.cursor() as cursor:
        cursor.execute('SELECT No, comment FROM comments WHERE sentiment = 2')
        rows = cursor.fetchall()

    # Nothing to do: avoids torch.cat on an empty list and a division by zero below.
    if not rows:
        print('No comments to classify.')
        return

    no = [row[0] for row in rows]
    real_data = [row[1] for row in rows]

    print('Tokenizing data...')
    inputs, masks = content_preprocessing(real_data)

    # Sequential sampling keeps predictions aligned with the `no` list.
    dataset = TensorDataset(inputs, masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    # NOTE(review): torch.load unpickles the whole model object, which can run
    # arbitrary code -- only load checkpoints from a trusted source. Recent
    # PyTorch releases may also require an explicit `weights_only=False` here;
    # confirm against the installed version.
    # Loading onto `device` keeps the model on the same device the batches are sent to.
    model = torch.load(model_path, map_location=device)

    probs = bert_predict(model, dataloader)

    # Get predictions from the probabilities
    preds = np.where(probs[:, 1] > threshold, 1, 0)

    # Number of comments predicted as class 1
    print("Number of tweets predicted non-negative: ", preds.sum())

    print(preds.sum())
    percent = (preds.sum() / len(preds)) * 100

    print(int(percent))

    print(preds)

    # Values are sent as strings, as before, so the driver never sees numpy scalars.
    params = [(str(pred), str(row_no)) for pred, row_no in zip(preds, no)]
    try:
        # One batched statement and a single commit instead of one cursor and
        # one commit per row; the whole write-back succeeds or is rolled back.
        with conn.cursor() as cursor:
            cursor.executemany('UPDATE comments SET sentiment = %s WHERE No = %s', params)
        conn.commit()
    except Exception:
        conn.rollback()
        raise


In [22]:
# Run the labelling pass. This has side effects: sentiment() issues UPDATE
# statements against the `comments` table and commits them.
sentiment()

Tokenizing data...
Number of tweets predicted non-negative:  0
0
0
[0 0 0 0 0]


------------