In [1]:
import pandas as pd
import numpy as np
import os
import random
import keras
from keras.layers import LeakyReLU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Dropout, Bidirectional, Conv1D, MaxPooling1D
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Seed every RNG that is importable in this notebook so a fresh
# "Restart & Run All" repeats the same random draws.
# - `random` drives the train/valid tagging of rows.
# - numpy's global generator is seeded as well because numpy-backed code
#   (shuffling, sampling) would otherwise differ between runs.
# NOTE(review): the deep-learning backend keeps its own RNG, which is not
# seeded here; model weights may still differ from run to run.
random.seed(42)
np.random.seed(42)

In [3]:
# Load the hotel reviews CSV into a DataFrame.
reviews_csv = "../data/TripAdvisor_hotel_reviews/tripadvisor_hotel_reviews.csv"
df = pd.read_csv(reviews_csv)

In [4]:
# Display the loaded DataFrame (columns: Review text and integer Rating).
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [5]:
# Tag every row as "train" or "valid" with one draw per row from the `random`
# module's global generator (seeded earlier with random.seed(42)).
# randrange(0, 100) yields an integer in 0..99 and `> 10` is true for 11..99,
# i.e. 89 of 100 values, so the expected ratio is about 89% train / 11% valid.
# NOTE(review): if a 90/10 split was intended the comparison should be `>= 10`.
df["split"] = df.apply(lambda x: "train" if random.randrange(0,100) > 10 else "valid", axis=1)

In [6]:
# Materialise the two partitions from the "split" tag column.
split_tag = df["split"]
df_train = df[split_tag == "train"]
df_val = df[split_tag == "valid"]

In [7]:
# Show the Rating values of the training rows that are rated 5.
df_train.loc[df_train['Rating'] == 5, 'Rating']

3        5
4        5
5        5
6        5
8        5
        ..
20471    5
20473    5
20476    5
20480    5
20482    5
Name: Rating, Length: 8125, dtype: int64

In [8]:
# Build the vocabulary from the training reviews only; the validation reviews
# are not passed to fit_on_texts.
# Note that the out-of-vocabulary token is the literal string 'oov' *including*
# the single quotes.
tokenizer=Tokenizer(oov_token="'oov'")
tokenizer.fit_on_texts(df_train['Review'])

In [9]:
# Every review is represented by exactly `maxlen` token ids.
maxlen = 200

# Step 1: map each review to its list of token ids.
train_sequences = tokenizer.texts_to_sequences(df_train['Review'])
val_sequences = tokenizer.texts_to_sequences(df_val['Review'])

# Step 2: pad / truncate to the fixed length (pad_sequences defaults apply).
train_X = pad_sequences(train_sequences, maxlen=maxlen)
val_X = pad_sequences(val_sequences, maxlen=maxlen)

In [10]:
# Regression targets: the Rating column as-is.
train_Y = df_train["Rating"]
val_Y = df_val["Rating"]

# One-hot targets: ratings are shifted down by one before being encoded
# into 5 classes.
train_Y_cat = to_categorical(train_Y - 1, num_classes=5)
val_Y_cat = to_categorical(val_Y - 1, num_classes=5)

In [11]:
glove_dir = "../data/TripAdvisor_hotel_reviews/"

# Parse the GloVe text file into {word: float32 vector}.
# Each line is split on whitespace: the first field is the word and the
# remaining fields are its coefficients.
embedding_index = {}
# `with` guarantees the file is closed even if a line fails to parse; the
# original open()/close() pair leaked the handle in that case.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
print('Found %s word vectors ' % len(embedding_index))

Found 400000 word vectors 


In [12]:
# Embedding table with one row per tokenizer index. The +1 leaves room for
# index 0, which Keras' Tokenizer does not assign to any word.
max_words = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

# Copy in the GloVe vector for every vocabulary word that has one; rows for
# words without a GloVe entry stay all-zero.
for word, idx in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[idx] = embedding_vector

In [13]:
# Regression network:
#   frozen pre-trained embeddings -> bidirectional LSTM(32) -> Dense(8)
#   -> LeakyReLU -> single linear output, trained with mean squared error.
model = Sequential()
# The embedding layer is initialised from embedding_matrix and kept fixed
# (trainable=False).
model.add(Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(8))
model.add(LeakyReLU(alpha=0.3))
model.add(Dense(1, activation="linear"))
model.compile(optimizer="Adam", loss='mean_squared_error', metrics=['mse'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 100)          4942100   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                34048     
_________________________________________________________________
dense (Dense)                (None, 8)                 520       
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
Total params: 4,976,677
Trainable params: 34,577
Non-trainable params: 4,942,100
_________________________________________________________________
None


In [14]:
# Restrict the process to a single GPU via CUDA environment variables.
# NOTE(review): this cell runs after Keras was imported and the model was
# built in earlier cells. CUDA_VISIBLE_DEVICES is normally only honoured when
# it is set before the framework initialises CUDA, so it may have no effect
# here — confirm, and consider moving these lines above the Keras imports.
# (`os` is already imported in the first cell; this import is redundant.)
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="4"  # device index 4 under the ordering set above

In [15]:
# Train on the padded training sequences against the Rating targets and
# evaluate on the validation partition after every epoch.
model.fit(
    train_X,
    train_Y,
    epochs=30,
    batch_size=256,
    validation_data=(val_X, val_Y),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f460c2fb780>

In [16]:
# Wrap the trained model so that a single forward pass returns the output of
# every layer (in layer order) instead of only the final prediction.
layer_outputs = [lyr.output for lyr in model.layers]
extractor = keras.Model(inputs=model.inputs, outputs=layer_outputs)

# Run the full training matrix through the extractor in one call.
features = extractor(train_X)

In [17]:
# Take the second-to-last layer's activations as a numpy array. Given the
# model definition above, that is the LeakyReLU output following Dense(8),
# i.e. 8 features per review.
X = features[-2].numpy()

In [18]:
# Targets as a column vector: one row per training review, a single column.
y = train_Y.values[:, np.newaxis]

In [22]:
# Inspect the extracted feature matrix.
X

array([[ 1.1798579 , -0.49264535,  1.1256189 , ...,  0.7632245 ,
         0.17701972,  0.03133251],
       [ 0.31453654, -0.5542245 ,  0.42863744, ...,  0.43070337,
        -0.07814672, -0.00660958],
       [ 1.6489334 , -0.57017606,  1.3921802 , ...,  1.0850699 ,
        -0.0455947 ,  0.2979708 ],
       ...,
       [ 0.7484249 , -0.33370164,  1.0032203 , ...,  1.0528516 ,
         0.2597388 ,  0.13129671],
       [ 0.3649792 , -0.47781938,  0.24737458, ...,  0.02840276,
        -0.27283886,  0.5970943 ],
       [ 0.82802033, -0.45970428,  0.6262628 , ...,  0.2577461 ,
        -0.045151  ,  0.10608499]], dtype=float32)

In [23]:
# Inspect the target column vector.
y

array([[4],
       [2],
       [5],
       ...,
       [2],
       [1],
       [2]])

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Linear probe: how well does an ordinary least-squares fit on the extracted
# features explain the rating? The last expression is the R^2 on held-out rows.

# NOTE(review): only the first N_PROBE_ROWS rows are used; the reason for the
# subset is not documented. Set to None to use every row.
N_PROBE_ROWS = 3000

# Split BEFORE scaling. The partition depends only on the number of rows and
# random_state, so it is the same one the previous version produced.
x_train, x_test, y_train, y_test = train_test_split(
    X[:N_PROBE_ROWS], y[:N_PROBE_ROWS], test_size=0.2, random_state=1234
)

# Fit the scalers on the training part only and merely apply them to the
# held-out part. Previously they were fitted on all rows (including the rows
# used for scoring), which leaks held-out statistics into preprocessing.
# StandardScaler copies its input by default, so X and y are left untouched.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
y_train = y_scaler.fit_transform(y_train)
x_test = x_scaler.transform(x_test)
y_test = y_scaler.transform(y_test)

lm = LinearRegression()
lm.fit(x_train, y_train)
pred = lm.predict(x_test)
r2_score(y_test, pred)

0.7683413167908243

In [25]:
# Sanity check: (number of training reviews, number of extracted features).
X.shape

(18345, 8)

In [26]:
# Sanity check: one target row per feature row, single column.
y.shape

(18345, 1)

In [27]:
# Save the feature matrix X and the target column y together in one .npz
# archive (stored under the keys "X" and "y").
features_path = '../data/TripAdvisor_hotel_reviews/extracted_features.npz'
np.savez(features_path, X=X, y=y)