In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv


In [6]:
import pandas as pd

### Obtain and preprocess data

In [7]:
# Load the TripAdvisor hotel reviews and preview the first rows.
reviews_path = "../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [8]:
# Count how many reviews carry each star rating, to see the class balance before binarising.
df["Rating"].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [9]:
# Ratings of 3 stars or fewer are labelled negative (0); ratings above 3 are positive (1).
def getSentiment(rating):
    """Convert a star rating into a binary sentiment label.

    Args:
        rating: Numeric star rating of a review.

    Returns:
        0 (negative) when ``rating <= 3``, otherwise 1 (positive).
    """
    return 0 if rating <= 3 else 1

In [10]:
# Derive the binary label from Rating, then drop Rating because only Review + Sentiment
# are used from here on. The frame is rebuilt with assign/drop instead of inplace=True,
# so the change to df is visible in the assignment itself.
df = df.assign(Sentiment=df["Rating"].map(getSentiment)).drop(columns=["Rating"])
df.head()

Unnamed: 0,Review,Sentiment
0,nice hotel expensive parking got good deal sta...,1
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not 4* experience hotel monaco seat...,0
3,"unique, great stay, wonderful time hotel monac...",1
4,"great stay great stay, went seahawk game aweso...",1


### Split data into training and validation sets

In [30]:
from sklearn.model_selection import train_test_split

# Features are the raw review strings; the target is the binary sentiment label.
X, y = df["Review"], df["Sentiment"]

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [31]:
# Sanity-check the number of rows in each split.
print(X_train.shape,X_test.shape)

(16392,) (4099,)


In [14]:
# Word count of every review (split on single spaces), computed for the whole column at once.
review_lengths = df["Review"].str.split(" ").str.len()

# Length of the first review, printed for reference.
print(int(review_lengths.iloc[0]))

# Longest review in the dataset. No hard-coded row count, so this works for any number of rows.
max_len = int(review_lengths.max())
print(max_len)

89
1933


### Tokenize the words 

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Keep only the 5000 most common words; the vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Replace every review in both splits with its list of integer word ids.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)



### Pad the sequences

In [33]:
from keras.preprocessing import sequence
max_len = 250

# Force every sequence to exactly max_len ids: shorter reviews are zero-padded at the
# front (the pad_sequences default), longer ones lose their trailing tokens (truncating="post").
X_train, X_test = (
    sequence.pad_sequences(token_ids, maxlen=max_len, truncating="post")
    for token_ids in (X_train, X_test)
)

In [17]:
# Number of distinct words the tokenizer saw, plus one because word ids start at 1
# and 0 is the padding value.
# NOTE(review): the Tokenizer was created with num_words=5000, so the tokenized training
# sequences presumably never contain ids >= 5000 — confirm whether the embedding really
# needs the full vocabulary size before changing it.
VOCAB_SIZE=len(tokenizer.word_index)+1
VOCAB_SIZE

46609

### Model creation

In [34]:
import tensorflow as tf
# Embedding -> LSTM -> one sigmoid unit producing a score between 0 and 1.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, 32))  # 32-dimensional vector per word id
model.add(tf.keras.layers.LSTM(32))                   # 32 units, one output vector per review
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [35]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          1491488   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,499,841
Trainable params: 1,499,841
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy ("acc") is tracked as the metric.
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["acc"])

In [37]:
# Train for 5 epochs; the held-out split (X_test, y_test) is passed as the validation data.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe8c016e250>

In [40]:
# Peek at the first five (word, id) pairs of the fitted vocabulary.
for shown, (word, word_id) in enumerate(tokenizer.word_index.items(), start=1):
    print("Key:", word)
    print("Value:", word_id)
    print("break")
    if shown >= 5:
        break


Key: hotel
Value: 1
break
Key: room
Value: 2
break
Key: not
Value: 3
break
Key: great
Value: 4
break
Key: n't
Value: 5
break


In [48]:
from tensorflow import keras
# sample_text = 'This is a sample sentence.'
# tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
# ["this","is","a","sample","sentence"]

def encode_text(text):
    """Turn one raw review string into a model-ready batch of word ids.

    Uses the same pipeline as the training data: the fitted ``tokenizer`` (so only
    its ``num_words`` most frequent words are kept and unknown words are dropped),
    followed by ``pad_sequences`` with the same ``maxlen`` and ``truncating="post"``.

    Args:
        text: The review text to encode.

    Returns:
        Integer array of shape ``(1, max_len)``; index it with ``[0]`` to get the
        1-D sequence.
    """
    # texts_to_sequences takes a list of texts and returns one id list per text.
    token_ids = tokenizer.texts_to_sequences([text])
    return sequence.pad_sequences(token_ids, maxlen=max_len, truncating="post")

In [49]:
# Quick check of the encoder on a short sentence; the result is a zero-padded array of ids.
encode_text("This hotel is great")

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

### Make predictions

In [50]:
import numpy as np

def predict(text):
    """Classify a review string as "Positive" or "Negative" with the trained model.

    Args:
        text: Raw review text.

    Returns:
        "Negative" when the model's output is at most 0.5, otherwise "Positive".
    """
    # encode_text already returns a batch containing one sequence, so it is fed to the model as-is.
    score = model.predict(encode_text(text))[0]
    return "Negative" if score <= 0.5 else "Positive"

In [51]:
# Sanity check with a clearly favourable review.
text="Best hotel ever.The room are very nice and spacious.The staffs are very friendly and kind."
predict(text)



'Positive'

In [52]:
# Sanity check with a clearly unfavourable review.
text2="This is the worst hotel that I have every stayed in.The rooms are very dirty and small."
predict(text2)

'Negative'

In [53]:
# Persist the trained network under the "my_model" path.
# NOTE(review): only the Keras model is saved in the code shown here; the fitted tokenizer
# and max_len are also needed to encode new text — confirm they are stored elsewhere.
model.save("my_model")