In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
#data_dir =  '/content/drive/MyDrive/Colab Notebooks/'
data_dir = './data/'

#file to download to run model:  
#1) http://nlp.stanford.edu/data/glove.840B.300d.zip for general embedding
#2) https://howardhsu.github.io/dataset/ for domain embedding

In [5]:
#!pip install fasttext
#!pip install transformers
#import nltk
#nltk.download('punkt')

In [9]:
import pandas as pd
import numpy as np
import scipy
import nltk
import re
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AdamW
from fasttext import load_model
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, data_utils
from tqdm import tqdm
from sklearn.metrics import f1_score
import xml.etree.ElementTree as ET

In [10]:
# generate word_index list
def build_vocab(data_dir, plain = []):
  """plain is a empty str file which will record all text from official dataset"""
  for fn in os.listdir(data_dir):
      if fn.endswith('.xml'):
          with open(data_dir+fn) as f:
            dom=ET.parse(f)
            root=dom.getroot()
            for sent in root.iter("sentence"):
                text = sent.find('text').text.lower()
                token = word_tokenize(text)
                plain = plain + token
  vocab = sorted(set(plain))
  word_idx = {}
  for idx, word in enumerate(vocab):
        word_idx[word] = idx+1   
  return word_idx

word_indx = build_vocab(data_dir)

In [11]:
len(word_indx)

7876

In [12]:
def gen_np_embedding(fn, word_idx, dim=100, emb=False):
    if emb:
        model = load_model(fn+".bin")
    embedding=np.zeros((len(word_idx)+2, dim) )

    with open(fn) as f:
        for l in f:
            # for each line, get the word and its vector
            rec=l.rstrip().split(' ')
            if len(rec)==2: #skip the first line.
                continue 
            # if the word in word_idx, fill the embedding
            if rec[0] in word_idx:
                embedding[word_idx[rec[0]]] = np.array([float(r) for r in rec[1:] ])
    for w in word_idx:
        if embedding[word_idx[w] ].sum()==0.:
            if emb:
                embedding[word_idx[w] ] = model.get_word_vector(w)
    return embedding

In [13]:
fn = data_dir + 'restaurant_emb.vec'
res_domain_embedding = gen_np_embedding(fn, word_indx, dim = 100, emb = True)

fn = data_dir + 'laptop_emb.vec'
lap_domain_embedding = gen_np_embedding(fn, word_indx, dim = 100, emb = True)

fn = data_dir + 'glove.840B.300d.txt'
general_embedding = gen_np_embedding(fn, word_indx, dim = 300, emb = False)



In [14]:
general_embedding.shape

(7878, 300)

In [15]:
def create_train_data_restaurant(fn, word_idx,  sent_len=83, sent_num=3050):

    corpus = []
    corpus_tag = []
    opsList = []
    train_X = np.zeros((sent_num, sent_len), np.int16)

    train_y = np.zeros((sent_num, sent_len), np.int16) 


    dom=ET.parse(fn)
    root=dom.getroot()
    # iterate the review sentence
    for sx, sent in enumerate(root.iter("sentence") ) : 
        text = sent.find('text').text.lower()
        # tokenize the current sentence
        token = word_tokenize(text)
        corpus.append(token)
           
        # write word index and tag in train_X and train_X_tag
        for wx, word in enumerate(token):
            train_X[sx, wx] = word_idx[word]

        # create a list for opinions in this sentence
        opList = []
        # iterate the opinions
        for ox, apin in enumerate(sent.iter('aspectTerms') ) :
            for ax, opin in enumerate(apin.iter('aspectTerm')):
              target, polarity, start, end = opin.attrib['term'], opin.attrib['polarity'], int(opin.attrib['from']), int(opin.attrib['to'])
              # find word index (instead of str index) if start,end is not (0,0)
              if end != 0:
                  if start != 0:
                      start = len(word_tokenize(text[:start]))
                  end = len(word_tokenize(text[:end]))-1
                  # for training only identify aspect word, but not polarity
                  train_y[sx, start] = 1
                  if end > start:
                      train_y[sx, start+1:end] = 2   
              opList.append([target, polarity, start, end])
          # get a list of opinions attributes
        opsList.append(opList)
    
    return train_X, train_y

fn = data_dir + 'Restaurants_Train_v2.xml'
X_train_res, y_train_res = create_train_data_restaurant(fn, word_indx,sent_len=100)
fn = data_dir + 'Laptop_Train_v2.xml'
X_train_lap, y_train_lap = create_train_data_restaurant(fn, word_indx,sent_len=100)
fn_test = data_dir + 'Restaurants_Test_Gold.xml'
X_test, y_test = create_train_data_restaurant(fn_test, word_indx, sent_len=100)

In [None]:
# not used - try to add a POS tag layer

#pos_tag_list = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS','NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP','SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',',','.',':','$','#',"``","''",'(',')','-LRB-','-RRB-','HYPH','NFP','SYM','PUNC','AFX','ADD']
#pos_len = 53

from nltk.tag import StanfordPOSTagger
POSdir = data_dir + 'stanford-postagger-full/'

def create_pos_tag(fn, POSdir, sent_len=83, sent_num=3050):
  pos_tag_list = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS','NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP','SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',',','.',':','$','#',"``","''",'(',')','-LRB-','-RRB-','HYPH','NFP','SYM','PUNC','AFX','ADD']
  tag_to_num = {tag:i+1 for i, tag in enumerate(sorted(pos_tag_list))}

  corpus_tag = []
  
  train_X_tag = np.zeros((sent_num, sent_len), np.int16)
  
  dom=ET.parse(fn)
  root=dom.getroot()

  for sx, sent in enumerate(root.iter("sentence") ) : 
    text = sent.find('text').text
    token = word_tokenize(text)

    jar = POSdir+'stanford-postagger.jar'
    model = POSdir+'models/english-left3words-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

    pos_tag_stf = []

    for _, tag in pos_tagger.tag(token):
      if tag not in tag_to_num:
        pos_tag_stf.append(-1)
      else:
        pos_tag_stf.append(tag_to_num[tag])
          
      # write word index and tag in train_X and train_X_tag
    for wx, word in enumerate(token):
        train_X_tag[sx, wx] = pos_tag_stf[wx]

  return train_X_tag

fn = data_dir + 'Restaurants_Train_v2.xml'
X_train_tag_res = create_pos_tag(fn, POSdir, sent_len=100, sent_num=3050)

In [17]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    elif isinstance(data, dict):
        for k, v in data.items():
            data[k] = v.to(device)
        return data
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [18]:
device = get_default_device()
print(device)

NUM_EPOCHS = 5
TRAIN_BATCH_SIZE = 100
TEST_BATCH_SIZE = 8

NUM_ASPECT_TAGS = 3

cpu


In [55]:
def loss_fn(pred, true):
    pred = pred.view(-1, 3)
    #print(pred.shape)
    true = true.view(-1)
    #print(true.shape)
    loss = torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(pred, dim = 1 ), true)
    return loss

def cal_acc(pred_tags, true_tags):
    
    batch = pred_tags.shape[0]
    acc = 0

    for i in range(batch):
        pred_array = pred_tags[i].cpu().detach().numpy()
        true_array = true_tags[i].cpu().detach().numpy()

        f1_scores_class = f1_score(true_array, pred_array, labels = np.unique(true_array), average='weighted')
        acc += np.sum(pred_array == true_array) / len(pred_array)

    return acc / batch, f1_scores_class

class Model(torch.nn.Module):
    def __init__(self, gen_emb, domain_emb, num_classes=3, dropout=0.5):
        super(Model, self).__init__()
        self.gen_embedding = torch.nn.Embedding(gen_emb.shape[0], gen_emb.shape[1])
        self.gen_embedding.weight=torch.nn.Parameter(torch.from_numpy(gen_emb), requires_grad=False)
        self.domain_embedding = torch.nn.Embedding(domain_emb.shape[0], domain_emb.shape[1])
        self.domain_embedding.weight=torch.nn.Parameter(torch.from_numpy(domain_emb), requires_grad=False)
        self.conv1=torch.nn.Conv1d(gen_emb.shape[1]+domain_emb.shape[1], 128, 5, padding=2 )
        self.conv2=torch.nn.Conv1d(gen_emb.shape[1]+domain_emb.shape[1], 128, 3, padding=1 )
        self.dropout=torch.nn.Dropout(dropout)

        self.conv3=torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv4=torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv5=torch.nn.Conv1d(256, 256, 5, padding=2)

        self.lstm = nn.LSTM(256, hidden_size = 128, num_layers= 1, bidirectional= True, batch_first=True)

        self.linear_ae=torch.nn.Linear(256, num_classes)
  
    def forward(self, x_train, y_train):
    
        x_emb=torch.cat((self.gen_embedding(x_train), self.domain_embedding(x_train) ), dim=2)

        x_emb=self.dropout(x_emb).transpose(1, 2)

        x_conv = torch.nn.functional.relu(torch.cat((self.conv1(x_emb.float()), self.conv2(x_emb.float())), dim=1))
        x_conv = self.dropout(x_conv)
        
        x_conv = torch.nn.functional.relu(self.conv3(x_conv))
        x_conv = self.dropout(x_conv)
        
        x_conv = torch.nn.functional.relu(self.conv4(x_conv))
        x_conv = self.dropout(x_conv)
        
        x_conv = torch.nn.functional.relu(self.conv5(x_conv))

        x_conv = x_conv.transpose(1, 2)
        
        x_lstm, (hidden, cell) = self.lstm(x_conv)

        x_logit = self.linear_ae(x_lstm)

        x_result = torch.argmax(x_logit, dim =2)
        loss = loss_fn(x_logit, y_train)

        return x_result, loss

In [56]:
dataset = data_utils.TensorDataset(torch.Tensor(X_train_res), torch.Tensor(y_train_res))
train_loader = data_utils.DataLoader(dataset, batch_size=35, shuffle=True)

dataset_test = data_utils.TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test))
test_loader = data_utils.DataLoader(dataset, batch_size=35, shuffle=True)

model = to_device(Model(general_embedding, res_domain_embedding), device)
print(model)

Model(
  (gen_embedding): Embedding(7878, 300)
  (domain_embedding): Embedding(7878, 100)
  (conv1): Conv1d(400, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv2): Conv1d(400, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (dropout): Dropout(p=0.5, inplace=False)
  (conv3): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv4): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv5): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (lstm): LSTM(256, 128, batch_first=True, bidirectional=True)
  (linear_ae): Linear(in_features=256, out_features=3, bias=True)
)


In [49]:
torch.cuda.empty_cache()

num_train_steps = int(len(X_train_res) / TRAIN_BATCH_SIZE * NUM_EPOCHS)
parameters = [p for p in model.parameters() if p.requires_grad]
optimizer= AdamW(parameters, lr=3e-5)

best_loss = np.inf

for epoch in range(NUM_EPOCHS):
  train_losses = []
  train_acc = []
  test_loss = []
  test_acc = []
  train_f1 = []
  test_f1 = []

  model.train()
  for data in tqdm(train_loader, total=len(train_loader)):
    optimizer.zero_grad()

    pred_tags, loss = model(data[0].long(), data[1].long(), testing = True)
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    true_tags = data[1].long()

    acc, f1_score_value = cal_acc(pred_tags, true_tags)

    train_acc.append(acc)
    train_f1.append(f1_score_value)

  avg_train_f1 = sum(train_f1) / len(train_f1)

  avg_train_loss = sum(train_losses) / len(train_losses)
  avg_train_acc = sum(train_acc) / len(train_acc)

  model.eval()

  for data in tqdm(test_loader, total=len(test_loader)):
        
    pred_tags, loss = model(data[0].long(), data[1].long(), testing = True)

    test_loss.append(loss.item())
    true_tags = data[1].long()

    acc, f1_score_value = cal_acc(pred_tags, true_tags)

    test_acc.append(acc)
    test_f1.append(f1_score_value)

  avg_test_f1 = sum(test_f1) / len(test_f1)
  avg_test_loss = sum(test_loss) / len(test_loss)
  avg_test_acc = sum(test_acc) / len(test_acc)
        
  print("Train acc: {:.2f}%; Test acc: {:.2f}%".format(avg_train_acc*100, avg_test_acc*100))
  print("Train Loss: {:.5f}; Test Loss: {:.5f}".format(avg_train_loss, avg_test_loss))
  print("Train F1 score: {:.5f}; Test F1 score: {:.5f}".format(avg_train_f1, avg_test_f1))


100%|██████████| 88/88 [01:12<00:00,  1.22it/s]
100%|██████████| 88/88 [00:27<00:00,  3.21it/s]


Train acc: 98.61%; Test acc: 98.61%
Train Loss: 0.75179; Test Loss: 0.13065
Train F1 score: 0.97747; Test F1 score: 0.97527


100%|██████████| 88/88 [01:12<00:00,  1.21it/s]
100%|██████████| 88/88 [00:27<00:00,  3.25it/s]


Train acc: 98.61%; Test acc: 98.60%
Train Loss: 0.06993; Test Loss: 0.05725
Train F1 score: 0.97932; Test F1 score: 0.97789


100%|██████████| 88/88 [01:10<00:00,  1.24it/s]
100%|██████████| 88/88 [00:27<00:00,  3.24it/s]


Train acc: 98.61%; Test acc: 98.61%
Train Loss: 0.05520; Test Loss: 0.05220
Train F1 score: 0.97997; Test F1 score: 0.98167


100%|██████████| 88/88 [01:11<00:00,  1.24it/s]
100%|██████████| 88/88 [00:27<00:00,  3.21it/s]


Train acc: 98.61%; Test acc: 98.61%
Train Loss: 0.05137; Test Loss: 0.04878
Train F1 score: 0.97543; Test F1 score: 0.97831


100%|██████████| 88/88 [01:11<00:00,  1.23it/s]
100%|██████████| 88/88 [00:27<00:00,  3.21it/s]

Train acc: 98.59%; Test acc: 98.61%
Train Loss: 0.04914; Test Loss: 0.04637
Train F1 score: 0.97611; Test F1 score: 0.97721



