In [None]:
# Install the pinned TensorFlow 1.x GPU build via a shell escape.
!pip install tensorflow-gpu==1.15.0

In [None]:
pip install keras==2.2.4

In [None]:
from google.colab import drive
import os
# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive/')
# Switch to the project folder so the relative paths used later
# (e.g. "./MAMS_train.xml") resolve against it.
os.chdir('/content/drive/My Drive/Aspect Category Detection/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras import layers
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import xml.etree.ElementTree as ET, getopt, logging, sys, random, re, copy, os
from lxml import etree
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

from keras import backend as K

Using TensorFlow backend.


In [None]:
def getSentences(file):
  """Read sentences and their aspect-category annotations from an XML file.

  The file is parsed with an lxml parser in recovery mode. Every <sentence>
  element directly under the root contributes one entry to each result list.

  Returns:
    (sentences, annotations) where
      sentences[i]   is a one-element list holding the <text> content, and
      annotations[i] is a one-element list holding a list of
                     [category, polarity] pairs, one per <aspectCategory>
                     found under the sentence's <aspectCategories> elements.
  """
  parser = etree.XMLParser(recover=True, encoding="utf-8")
  root = ET.parse(file, parser).getroot()

  texts, annotations = [], []
  for node in root.findall('sentence'):
    texts.append([node.find('text').text])
    pairs = [
      [category.get('category'), category.get('polarity')]
      for group in node.findall('aspectCategories')
      for category in group.findall('aspectCategory')
    ]
    # Keep the extra level of nesting that downstream cells iterate over.
    annotations.append([pairs])

  return texts, annotations

In [None]:
# Parse the MAMS train and test XML files into sentence texts and their
# [category, polarity] annotations (see getSentences for the nesting).
train_sentences, train_adnotations = getSentences("./MAMS_train.xml")
test_sentences, test_adnotations = getSentences("./MAMS_test.xml")

In [None]:
# Peek at the first five parsed training sentences.
train_sentences[:5]

[["It might be the best sit down food I've had in the area, so if you are going to the upright citizen brigade, or the garden, it could be just the place for you."],
 ['Hostess was extremely accommodating when we arrived an hour early for our reservation.'],
 ["We were a couple of minutes late for our reservation and minus one guest, but we didn't think we deserved the attitude we got from the hostess."],
 ['Though the service might be a little slow, the waitresses are very friendly.'],
 ['Although we arrived at the restaurant 10 min late, the hostess did not have a table for us.']]

In [None]:
# Peek at the annotations belonging to the first five training sentences.
train_adnotations[:5]

[[[['food', 'positive'], ['place', 'neutral']]],
 [[['staff', 'positive'], ['miscellaneous', 'neutral']]],
 [[['miscellaneous', 'neutral'], ['staff', 'negative']]],
 [[['service', 'negative'], ['staff', 'positive']]],
 [[['staff', 'negative'], ['miscellaneous', 'neutral']]]]

In [None]:
# Containers for the joined review strings and the per-sentence sets of
# aspect categories, for the train and test splits respectively.
train_reviews = []
train_aspects = []
test_reviews = []
test_aspects = []

In [None]:
# Rebuild both lists from scratch (instead of appending to existing ones) so
# that re-running this cell does not duplicate the training data.

# Each parsed sentence is a list of text fragments; join them into one string.
train_reviews = [' '.join(review) for review in train_sentences]

# Distinct aspect categories annotated on each sentence. Each annotation `a`
# is a [category, polarity] pair; only the category (a[0]) is kept.
train_aspects = [
    {a[0] for adnotation_set in ta for a in adnotation_set}
    for ta in train_adnotations
]

In [None]:
# Rebuild both lists from scratch (instead of appending to existing ones) so
# that re-running this cell does not duplicate the test data.

# Each parsed sentence is a list of text fragments; join them into one string.
test_reviews = [' '.join(review) for review in test_sentences]

# Distinct aspect categories annotated on each sentence. Each annotation `a`
# is a [category, polarity] pair; only the category (a[0]) is kept.
test_aspects = [
    {a[0] for adnotation_set in ta for a in adnotation_set}
    for ta in test_adnotations
]

In [None]:
def getLabels(aspects, categories=('food', 'place', 'staff', 'miscellaneous',
                                   'service', 'price', 'menu', 'ambience')):
    """Build one binary indicator list per aspect category.

    Args:
        aspects: iterable with one entry per sentence; each entry is a
            collection (a set in this notebook) of the category names
            annotated on that sentence.
        categories: category names to encode, in output order. The default
            keeps the order this function has always returned:
            food, place, staff, miscellaneous, service, price, menu, ambience
            (note that price comes before menu).

    Returns:
        A tuple with one list per category. Element i of a list is 1 when the
        category is present in aspects[i] and 0 otherwise.
    """
    # Materialise once: the input is scanned once per category, and callers
    # may pass a one-shot iterator.
    aspects = list(aspects)
    return tuple(
        [1 if category in aspect else 0 for aspect in aspects]
        for category in categories
    )

In [None]:
# Per-category binary label vectors for both splits. getLabels returns them in
# the order: food, place, staff, miscellaneous, service, price, menu, ambience,
# so suffix 1 is food, ..., suffix 8 is ambience.
train_labels = list(getLabels(train_aspects))
train1, train2, train3, train4, train5, train6, train7, train8 = train_labels

test_labels = list(getLabels(test_aspects))
test1, test2, test3, test4, test5, test6, test7, test8 = test_labels

In [None]:
#Vectorizing data
# Bag-of-words counts over lowercased unigrams and bigrams with English stop
# words removed; the vocabulary is fitted on the training reviews only.
# NOTE(review): x_train / x_test are rebound to tokenizer sequences in the
# following cell; only x_train.shape[1] is read from these matrices there.
vectorizer = CountVectorizer(analyzer='word', lowercase=True, stop_words='english', ngram_range=(1,2))
vectorizer.fit(train_reviews)
x_train = vectorizer.transform(train_reviews)
x_test = vectorizer.transform(test_reviews)

In [None]:
# Second dimension of whatever x_train currently holds (the CountVectorizer
# matrix when cells are run top to bottom).
# NOTE(review): input_dim is not read by any cell visible in this notebook.
input_dim = x_train.shape[1]

# Word-level tokenizer fitted on the training reviews, capped at num_words=5000.
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_reviews)

# Rebind x_train / x_test to lists of integer word-index sequences.
x_train = tokenizer.texts_to_sequences(train_reviews)
x_test = tokenizer.texts_to_sequences(test_reviews)
# Sized from the full word_index (not the num_words cap); the +1 presumably
# leaves room for index 0 used as padding -- confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1 

In [None]:
# Bring every index sequence to a common length of `maxlen`, padding at the
# end of the sequence ('post').
maxlen = 100
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')

In [None]:
#Pretrained Word Embeddings
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for `word_index` from a text embedding file.

    Each non-blank line of the file is whitespace-separated: the first field
    is the word, the remaining fields are its vector components.

    Args:
        filepath: path to the UTF-8 encoded embedding file.
        word_index: mapping from word to its integer row index.
        embedding_dim: number of vector components to keep per word.

    Returns:
        A numpy array of shape (len(word_index) + 1, embedding_dim). Rows of
        words that do not occur in the file stay all-zero.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line (e.g. trailing newline at EOF):
                # nothing to unpack, skip instead of raising ValueError.
                continue
            word, vector = fields[0], fields[1:]
            idx = word_index.get(word)
            if idx is None:
                continue
            # Only the last 300 fields are treated as the vector -- presumably a
            # guard for tokens that themselves contain spaces; then truncate
            # to the requested dimensionality.
            embedding_matrix[idx] = np.array(vector[-300:], dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [None]:
# Build the embedding matrix for the tokenizer's vocabulary from the GloVe
# 840B / 300d text file, keeping all 300 components.
# NOTE(review): getPredictions builds its own matrix from the same file by
# default, so this module-level copy is not read by the visible cells.
embedding_dim = 300
embedding_matrix = create_embedding_matrix('./glove.840B.300d.txt', tokenizer.word_index, embedding_dim)

In [None]:
def f1(y_true, y_pred):
    """F1 metric built from Keras backend ops.

    Both precision and recall are computed over whatever tensors are passed
    in (i.e. a batch-wise value when used as a Keras metric):
      precision = true positives / predicted positives
      recall    = true positives / actual positives
    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to each denominator to avoid division by zero.
    """
    eps = K.epsilon()

    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))

    prec = true_pos / (predicted_pos + eps)
    rec = true_pos / (actual_pos + eps)

    # Harmonic mean of precision and recall.
    return 2*((prec*rec)/(prec+rec+eps))

In [None]:
from tensorflow.python.keras.callbacks import EarlyStopping
def getPredictions(x_train, x_test, train, test, embedding_matrix=None, epochs=15, batch_size=10):
    """Train a binary CNN classifier for one aspect category and predict on x_test.

    The model is: Embedding (initialised from pretrained vectors, trainable)
    -> Conv1D(64, 3, relu) -> GlobalMaxPool1D -> Dense(10, relu)
    -> Dense(1, sigmoid), compiled with Adam and binary cross-entropy.

    Relies on the notebook-level names `tokenizer`, `vocab_size` and `maxlen`.

    Args:
        x_train, x_test: padded index sequences for the two splits.
        train, test: binary labels for the aspect category being learned.
        embedding_matrix: optional prebuilt matrix of shape
            (vocab_size, embedding_dim). When omitted, the matrix is built
            from './glove.840B.300d.txt' with 300 dimensions, as before.
            Passing it in avoids re-reading the embedding file on every call.
        epochs: number of training epochs (default 15).
        batch_size: training batch size (default 10).

    Returns:
        (predictions, predictions_class): the sigmoid outputs for x_test and
        the same outputs thresholded at 0.5 as int32, both with one column.
    """
    if embedding_matrix is None:
        embedding_dim = 300
        embedding_matrix = create_embedding_matrix('./glove.840B.300d.txt', tokenizer.word_index, embedding_dim)
    else:
        embedding_dim = embedding_matrix.shape[1]

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=True))
    model.add(layers.Conv1D(64, 3, activation='relu'))
    model.add(layers.GlobalMaxPool1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1, 'accuracy'])

    # NOTE(review): the test split doubles as validation data here; it is only
    # used for per-epoch reporting, not for model selection.
    model.fit(x_train, train, epochs=epochs, verbose=1, validation_data=(x_test, test), batch_size=batch_size)

    predictions = model.predict(x_test)
    # Same rule Sequential.predict_classes applies to a single sigmoid output,
    # without running a second forward pass over x_test.
    predictions_class = (predictions > 0.5).astype('int32')
    return predictions, predictions_class

In [None]:
# Train one classifier per aspect category (same order as train_labels /
# test_labels) and collect its probability and class predictions.
aspect_results = []
for aspect_no, (aspect_train, aspect_test) in enumerate(zip(train_labels, test_labels), start=1):
    print("Getting Predictions%d" % aspect_no)
    aspect_results.append(getPredictions(x_train, x_test, aspect_train, aspect_test))

# Expose the results under the numbered names the later cells use.
(
    (predictions1, predictions_class1),
    (predictions2, predictions_class2),
    (predictions3, predictions_class3),
    (predictions4, predictions_class4),
    (predictions5, predictions_class5),
    (predictions6, predictions_class6),
    (predictions7, predictions_class7),
    (predictions8, predictions_class8),
) = aspect_results

Getting Predictions1











Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 3149 samples, validate on 800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Getting Predictions2
Train on 3149 samples, validate on 800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Getting Predictions3
Train on 3149 samples, validate on 800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Getting Predictions4
Train on 3149 samples, validate on 800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/

In [None]:
# Flatten the single-column model outputs to 1-D arrays. np.ravel is used
# instead of [:, 0] so that re-running this cell on already flattened arrays
# is a no-op rather than an IndexError.
predictions1 = np.ravel(predictions1)
predictions_class1 = np.ravel(predictions_class1)

predictions2 = np.ravel(predictions2)
predictions_class2 = np.ravel(predictions_class2)

predictions3 = np.ravel(predictions3)
predictions_class3 = np.ravel(predictions_class3)

predictions4 = np.ravel(predictions4)
predictions_class4 = np.ravel(predictions_class4)

predictions5 = np.ravel(predictions5)
predictions_class5 = np.ravel(predictions_class5)

predictions6 = np.ravel(predictions6)
predictions_class6 = np.ravel(predictions_class6)

predictions7 = np.ravel(predictions7)
predictions_class7 = np.ravel(predictions_class7)

predictions8 = np.ravel(predictions8)
predictions_class8 = np.ravel(predictions_class8)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
# Accuracy, (tp + tn) / (p + n), for each of the eight aspect classifiers,
# followed by their unweighted mean.
(accuracy1, accuracy2, accuracy3, accuracy4,
 accuracy5, accuracy6, accuracy7, accuracy8) = (
    accuracy_score(truth, predicted)
    for truth, predicted in zip(
        (test1, test2, test3, test4, test5, test6, test7, test8),
        (predictions_class1, predictions_class2, predictions_class3, predictions_class4,
         predictions_class5, predictions_class6, predictions_class7, predictions_class8),
    )
)

final_accuracy = sum((accuracy1, accuracy2, accuracy3, accuracy4,
                      accuracy5, accuracy6, accuracy7, accuracy8)) / 8
final_accuracy

0.9349999999999999

In [None]:
# Precision, tp / (tp + fp), for each of the eight aspect classifiers,
# followed by their unweighted mean.
(precision1, precision2, precision3, precision4,
 precision5, precision6, precision7, precision8) = (
    precision_score(truth, predicted)
    for truth, predicted in zip(
        (test1, test2, test3, test4, test5, test6, test7, test8),
        (predictions_class1, predictions_class2, predictions_class3, predictions_class4,
         predictions_class5, predictions_class6, predictions_class7, predictions_class8),
    )
)

final_precision_score = sum((precision1, precision2, precision3, precision4,
                             precision5, precision6, precision7, precision8)) / 8
final_precision_score

0.8643908433424881

In [None]:
# Recall, tp / (tp + fn), for each of the eight aspect classifiers,
# followed by their unweighted mean.
(recall1, recall2, recall3, recall4,
 recall5, recall6, recall7, recall8) = (
    recall_score(truth, predicted)
    for truth, predicted in zip(
        (test1, test2, test3, test4, test5, test6, test7, test8),
        (predictions_class1, predictions_class2, predictions_class3, predictions_class4,
         predictions_class5, predictions_class6, predictions_class7, predictions_class8),
    )
)

final_recall_score = sum((recall1, recall2, recall3, recall4,
                          recall5, recall6, recall7, recall8)) / 8
final_recall_score

0.8559661557244651

In [None]:
# F1, 2 tp / (2 tp + fp + fn), for each of the eight aspect classifiers,
# followed by their unweighted mean.
(f1_score1, f1_score2, f1_score3, f1_score4,
 f1_score5, f1_score6, f1_score7, f1_score8) = (
    f1_score(truth, predicted)
    for truth, predicted in zip(
        (test1, test2, test3, test4, test5, test6, test7, test8),
        (predictions_class1, predictions_class2, predictions_class3, predictions_class4,
         predictions_class5, predictions_class6, predictions_class7, predictions_class8),
    )
)

final_f1_score = sum((f1_score1, f1_score2, f1_score3, f1_score4,
                      f1_score5, f1_score6, f1_score7, f1_score8)) / 8
final_f1_score

0.8585907814245815

In [None]:
# ROC AUC per aspect classifier, followed by the unweighted mean.
# The score is computed from the predicted probabilities (predictionsN), not
# the thresholded classes: with hard 0/1 labels the ROC curve collapses to a
# single operating point and the ranking quality is not measured.
auc1 = roc_auc_score(test1, predictions1)
auc2 = roc_auc_score(test2, predictions2)
auc3 = roc_auc_score(test3, predictions3)
auc4 = roc_auc_score(test4, predictions4)
auc5 = roc_auc_score(test5, predictions5)
auc6 = roc_auc_score(test6, predictions6)
auc7 = roc_auc_score(test7, predictions7)
auc8 = roc_auc_score(test8, predictions8)

final_auc = (auc1 + auc2 + auc3 + auc4 + auc5 + auc6 + auc7 + auc8) / 8
final_auc

0.9009801102567434