In [1]:
import numpy as np
import json
import os
from collections import OrderedDict
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU,Bidirectional
from keras.optimizers import SGD

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from pprint import pprint
train_file = "/home/atyam/Downloads/aspects-annotated-dataset/tripadvisor/train.unique.json"
test_file  = "/home/atyam/Downloads/aspects-annotated-dataset/tripadvisor/test.unique.json"


Using TensorFlow backend.


In [3]:
def extract_data_json(file):
    data = []
    for line in open(file):
        data.append(json.loads(line))
    return data

In [4]:
def take_data(file):
    segs,seglabel = [],[]
    for line in file:
        segs += line['segments']
        seglabel += [ i for i in line["segmentLabels"] ]
    annotation = OrderedDict(zip(segs,seglabel))
    return annotation

In [5]:
def make_trainable(ann_data):
	"""
	This makes a list of sentences and a list of feature aspects + opinion.
	"""
	text, labels = [], []
	for item in ann_data.items():
		text.append(item[0])
		label_dicts = item[1]
		extra = []
		for i in label_dicts:
			extra.append(i+"_"+label_dicts[i])
		labels.append(extra)
	return text, labels


In [6]:
def to_categorical(lab_list):
	"""
	Converts labels to 1 of k encoding.
	"""
	y_cat = []
	labels = ['BUILDING_in', 'BUILDING_ip', 'BUILDING_ix', 'BUILDING_n', 'BUILDING_p', 'BUILDING_x', 'BUSINESS_in', 'BUSINESS_ip', 'BUSINESS_p', 'BUSINESS_x', 'CHECKIN_in', 'CHECKIN_ip', 'CHECKIN_ix', 'CHECKIN_n', 'CHECKIN_p', 'CHECKIN_x', 'CLEANLINESS_in', 'CLEANLINESS_ip', 'CLEANLINESS_n', 'CLEANLINESS_p', 'CLEANLINESS_x', 'FOOD_i', 'FOOD_in', 'FOOD_ip', 'FOOD_ix', 'FOOD_n', 'FOOD_p', 'FOOD_x', 'LOCATION_in', 'LOCATION_ip', 'LOCATION_ix', 'LOCATION_n', 'LOCATION_p', 'LOCATION_x', 'NOTRELATED_in', 'NOTRELATED_ip', 'NOTRELATED_n', 'NOTRELATED_p', 'NOTRELATED_x', 'OTHER_in', 'OTHER_ip', 'OTHER_ix', 'OTHER_n', 'OTHER_p', 'OTHER_x', 'ROOMS_in', 'ROOMS_ip', 'ROOMS_ix', 'ROOMS_n', 'ROOMS_p', 'ROOMS_x', 'SERVICE_in', 'SERVICE_ip', 'SERVICE_ix', 'SERVICE_n', 'SERVICE_p', 'SERVICE_x', 'VALUE_in', 'VALUE_ip', 'VALUE_n', 'VALUE_p', 'VALUE_x']
	for x in lab_list:
		temp = [0]*63
		for y in x:
			if y in labels:
				temp[labels.index(y)] = labels.index(y)
		y_cat.append(temp)
	return np.array(y_cat)


In [7]:
train_data = extract_data_json(train_file)


In [8]:
print train_data[:5]

[{u'annotatorId': 2, u'segmentLabels': [{u'OTHER': u'p'}, {u'NOTRELATED': u'x'}, {u'OTHER': u'p'}, {u'SERVICE': u'p'}, {u'ROOMS': u'p'}, {u'CLEANLINESS': u'p', u'ROOMS': u'p'}, {u'FOOD': u'p'}, {u'FOOD': u'p'}, {u'LOCATION': u'ip'}, {u'OTHER': u'p'}, {u'OTHER': u'p'}, {u'OTHER': u'p'}], u'ratingOverall': 5, u'ratingRoom': 5, u'author': u'travellerseattle', u'hotelId': u'277882', u'reviewId': u'277882:54', u'ratingLocation': 4, u'ratingService': 5, u'ratingBusiness': 3, u'ratingValue': 5, u'ratingCleanliness': 5, u'date': u'May 15, 2007', u'ratingCheckin': 5, u'segments': [u'LOVED THE HAMPTON INN SEAPORT!!!!!!!!!!!!!!!!!!', u'Just returned from a 3 night stay.', u'This is a FABULOUS hotel.', u'The front desk staff, the doormen, the breakfast staff, EVERYONE is incredibly friendly and helpful and warm and welcoming.', u'The room was fabulous too.', u'Really comfy beds, great decorating, and super super clean.', u'The breakfasts are great - fresh fruit, bagels, muffins, hot eggs and sausa

In [9]:
train_datalabels = take_data(train_data)


In [13]:
print (train_datalabels.items())[:3]

[(u'LOVED THE HAMPTON INN SEAPORT!!!!!!!!!!!!!!!!!!', {u'OTHER': u'p'}), (u'Just returned from a 3 night stay.', {u'NOTRELATED': u'x'}), (u'This is a FABULOUS hotel.', {u'OTHER': u'p'})]


In [14]:
train_text,labels = make_trainable(train_datalabels)

In [15]:
print train_text[:3],labels[:3]

[u'LOVED THE HAMPTON INN SEAPORT!!!!!!!!!!!!!!!!!!', u'Just returned from a 3 night stay.', u'This is a FABULOUS hotel.'] [[u'OTHER_p'], [u'NOTRELATED_x'], [u'OTHER_p']]


In [16]:
train_labels = to_categorical(labels)

In [21]:
##train_labels is a list of lists

In [17]:
print train_labels[:3]

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 43  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0 38  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 43  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [22]:
test_data = extract_data_json(test_file)
test_datalabels = take_data(test_data)
test_text, labels = make_trainable(test_datalabels)
test_labels = to_categorical(labels)


In [34]:
tokenizer = Tokenizer()
# tokenizer.fit_on_texts('i am a girl')
tokenizer.fit_on_texts(train_text)
tokenizer.fit_on_texts(test_text)

In [24]:
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)


In [36]:
print train_sequences[:3],test_sequences[:3]

[[218, 1, 3878, 595, 5403], [48, 357, 29, 3, 143, 64, 38], [21, 10, 3, 575, 12]] [[350, 169, 55, 14, 1], [2402, 11, 92, 124, 11, 3, 3199, 152, 1, 72, 10, 305, 4, 217, 7, 3, 37, 47, 273, 1, 193, 18, 1, 2402, 796, 149, 4, 3, 1932], [16, 5, 553, 2, 22, 74, 4469, 40, 3, 115, 717, 57, 5, 37, 3, 817, 8, 1306, 2042, 839, 1766, 197, 1364, 580, 897, 4754, 7, 33, 3, 350, 169, 72, 11, 3, 307, 8, 196, 479, 15, 6, 1023, 51, 4, 102, 303, 143, 274, 1, 94, 64, 2, 74, 88, 179, 18, 1, 377]]


In [35]:
# p = tokenizer.texts_to_sequences('i am a boy')
# print p

In [37]:
train_data = pad_sequences(train_sequences, maxlen=40)
test_data = pad_sequences(test_sequences, maxlen=40)


In [40]:
print train_data[:3]

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0  218    1 3878  595 5403]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0   48  357   29    3  143   64   38]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   21   10    3  575   12]]


In [41]:
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)
data = train_data[indices]
labels = train_labels[indices]
_validation_samples = int(0.2 * data.shape[0])


In [42]:
x_train = data[:-_validation_samples]
y_train = labels[:-_validation_samples]
x_val = data[-_validation_samples:]
y_val = labels[-_validation_samples:]



In [43]:
print("Training data:", x_train.shape, "Training labels:", y_train.shape, "Validation data:", x_val.shape, "Validation_labels", y_val.shape)
print("Test data : ", test_data.shape, "Test_labels:", test_labels.shape)


('Training data:', (2690, 40), 'Training labels:', (2690, 63), 'Validation data:', (672, 40), 'Validation_labels', (672, 63))
('Test data : ', (1484, 40), 'Test_labels:', (1484, 63))


In [66]:
model = Sequential()
model.add(Embedding(10000, 128, dropout=0.2))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(63))
model.add(Activation('softmax'))
sgd = SGD(lr=0.005,decay=1e-5,momentum=0.9,nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [67]:
print(x_train[:1],y_train[:1])

(array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   12,   19,    3, 1281]], dtype=int32), array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]))


In [68]:
print(x_val[2],y_val[2])

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,   13,    5,    3,  115, 2609,   20,   41,
       3854,   13,   40,   15,   25,  189,  441], dtype=int32), array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, 60,  0,  0]))


In [69]:
print test_data[0],test_labels[0]

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 350
 169  55  14   1] [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 43  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [70]:
model.fit(x_train, y_train, batch_size=32, nb_epoch=50, validation_data=(x_val, y_val))


Train on 2690 samples, validate on 672 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f467739b950>

In [71]:
model.fit(x_train, y_train, batch_size=32, nb_epoch=50, validation_data=(x_val, y_val))

Train on 2690 samples, validate on 672 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f467667ded0>

In [72]:
error_loss,accuracy = model.evaluate(test_data,test_labels,batch_size=32)



In [73]:
print accuracy

0.227088948787


In [82]:
preds = model.predict_classes(test_data,verbose=1)



In [83]:
preds[2]

32

In [79]:
len(test_labels)

1484