# LSTM Rating Classifier using Word Embeddings

Q:
https://www.youtube.com/watch?v=nam2zR7p7Os
https://www.youtube.com/watch?v=Wp-Wb456kSU
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing

In [37]:
import pandas as pd
from keras.preprocessing.text import one_hot
from keras.utils import pad_sequences # not preprocessing.sequence (deprecated)
from numpy import asarray , array
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [38]:
yelp = pd.read_csv("../2_text_preprocessing/cleaned_data_yelp.csv",index_col=None)

### Word Embeddings with Keras

In [39]:
from keras.preprocessing.text import Tokenizer

comments_array = asarray(yelp['Lemmatized_Comment'])
vocab_size = 25000
tokenizer=Tokenizer(num_words=vocab_size)
#get index per word
tokenizer.fit_on_texts(comments_array)
tokenizer.word_index

{'der': 1,
 'sein': 2,
 'und': 3,
 'ein': 4,
 'in': 5,
 'ich': 6,
 'zu': 7,
 'haben': 8,
 'es': 9,
 'nicht': 10,
 'sehr': 11,
 'auch': 12,
 'gut': 13,
 'mit': 14,
 'wir': 15,
 'man': 16,
 'aber': 17,
 'werden': 18,
 'essen': 19,
 'für': 20,
 'auf': 21,
 'an': 22,
 'sich': 23,
 'können': 24,
 'von': 25,
 'lecker': 26,
 'so': 27,
 'hier': 28,
 'noch': 29,
 'als': 30,
 'geben': 31,
 'bei': 32,
 'mein': 33,
 'alle': 34,
 'restaurant': 35,
 'nur': 36,
 'preis': 37,
 'mal': 38,
 'kommen': 39,
 'dieser': 40,
 'da': 41,
 'etwas': 42,
 'service': 43,
 'dass': 44,
 'wie': 45,
 'wieder': 46,
 'freundlich': 47,
 'nach': 48,
 'wenn': 49,
 'immer': 50,
 'dann': 51,
 'schon': 52,
 'klein': 53,
 'nett': 54,
 'aus': 55,
 'kein': 56,
 'oder': 57,
 'wirklich': 58,
 'bedienung': 59,
 'super': 60,
 'was': 61,
 'ganz': 62,
 'laden': 63,
 'uns': 64,
 'leider': 65,
 'berlin': 66,
 'mehr': 67,
 'tisch': 68,
 'gehen': 69,
 'schön': 70,
 'einfach': 71,
 'finden': 72,
 'groß': 73,
 'jeder': 74,
 'mir': 75,
 'schm

In [40]:
def int_encode_comments(comment_list, tokenizer):
    return tokenizer.texts_to_sequences(comment_list)

embedded_comments = int_encode_comments(comments_array, tokenizer)
print(embedded_comments[:5])

[[6, 8, 23, 311, 7, 35, 21, 15267, 1438, 3, 40, 35, 8, 13, 916, 102, 8, 6, 9, 429, 3, 9, 10, 2422], [1, 2825, 2, 4, 245, 198, 4338, 35, 5, 472, 1295, 9, 2, 182, 10, 173, 2719, 17, 162, 945, 3, 762, 350, 14, 70, 4339, 1, 43, 2, 871, 47, 1420, 3, 145, 173, 15268, 1, 19, 39, 94, 126, 11, 1602, 55, 3, 27, 76, 9, 12, 1, 4338, 3597, 962, 3, 512, 8, 64, 11, 13, 76, 1, 37, 2, 289, 3, 20, 66, 215, 10, 7, 212, 15, 2, 36, 42, 580, 373, 52, 79, 11, 201, 453, 3793, 18, 15, 24, 64, 156, 10, 67, 1007, 22, 134, 68, 687, 21, 3423, 18, 1, 42, 10575, 218, 81, 4338, 85, 219, 13, 37, 256, 390, 206, 14, 348, 21, 1, 763, 3, 54, 43, 2, 28, 11, 13, 1083], [15269, 5562, 8, 15, 23, 224, 10576, 15270, 2826, 3, 2825, 18, 7, 6206, 15271, 31, 34, 3, 8370, 67, 30, 1, 301, 30, 6, 550, 39, 1439, 9, 5563, 17, 6, 2, 40, 305, 71, 10577, 2197, 15272, 3, 34, 8370, 14, 75, 546, 1019, 3424, 2, 1, 379, 4340, 8, 105, 67, 15273, 17, 2, 7, 1629], [19, 2, 4663, 149, 186, 6, 10, 398, 8, 229, 2, 33, 662, 5, 2825, 11, 825, 418, 6, 73

#### LSTM Model Preparation

1) Define padding size

In [41]:
def get_max_token_length_per_cmt(Comments_arr: list[str])-> int:
    return max(list(map(lambda x: len(x.split()),Comments_arr)))
padding_size = get_max_token_length_per_cmt(comments_array)
print(str(padding_size))

779


2) Pad embedded comments

In [42]:
padded_cmts = pad_sequences(embedded_comments, maxlen=padding_size)

3. One-Hot-Encoding of Rating
Q: https://www.atmosera.com/blog/multiclass-classification-with-neural-networks/

In [43]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# need to pre-encode labels (1-5) to (0-4) because to_categorical assumes 0 as lowest label value
# otherwise we would get 6 classes in the one-hot encoded arrays

yelp['Rating_Encoded'] = LabelEncoder().fit_transform(yelp['Rating'])
print(yelp.Rating)
print(to_categorical(yelp.Rating))

0       5
1       4
2       5
3       2
4       5
       ..
9701    3
9702    3
9703    5
9704    5
9705    5
Name: Rating, Length: 9706, dtype: int64
[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]


#### Train-Dev-Test Split

In [44]:
seed=101010
y=yelp["Rating_Encoded"]

X_train_dev, X_test, y_train_dev, y_test = train_test_split(padded_cmts, y, test_size = 0.1, random_state=seed)
X_train, X_dev, y_train, y_dev = train_test_split(X_train_dev, y_train_dev, test_size = 0.1, random_state=seed)


#### Define LSTM Model

In [58]:
from keras import optimizers

model = Sequential()
model.add(Embedding(input_dim=vocab_size,  output_dim=128, input_length=padding_size))
model.add(LSTM(units=128, activation='tanh'))
#model.add(Flatten())
model.add(Dense(5, activation='sigmoid'))
opt = optimizers.RMSprop(learning_rate=1e-06)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 779, 128)          3200000   
                                                                 
 lstm_12 (LSTM)              (None, 128)               131584    
                                                                 
 dense_10 (Dense)            (None, 5)                 645       
                                                                 
Total params: 3,332,229
Trainable params: 3,332,229
Non-trainable params: 0
_________________________________________________________________


#### Train LSTM Model

In [59]:
hist = model.fit(X_train,
                 to_categorical(y_train),
                 #batch_size=128,
                 epochs=2,
                 validation_data=(X_dev,to_categorical(y_dev)))

Epoch 1/2
Epoch 2/2


In [57]:
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score
import numpy as np
pred_test= np.argmax(model.predict(X_test), axis=1)
print(pred_test)
print(accuracy_score(pred_test,y_test))

[4 3 3 2 4 2 2 4 3 3 3 3 3 2 3 3 4 4 2 4 3 3 4 2 2 3 3 4 4 4 3 4 3 4 4 4 4
 4 0 1 3 2 4 2 4 2 1 1 3 3 3 4 4 4 3 1 3 4 4 1 2 4 4 2 4 4 4 2 4 1 4 2 4 2
 4 3 4 4 4 3 4 4 4 4 2 2 3 2 4 2 3 4 2 4 3 4 4 0 3 2 4 4 4 2 4 2 4 4 2 4 4
 3 1 4 1 4 1 3 1 4 4 3 1 4 3 3 4 4 4 0 2 2 3 4 4 3 2 4 2 3 4 3 3 4 4 4 4 4
 4 4 4 3 4 4 3 4 4 4 4 4 4 2 3 1 3 4 1 3 3 4 2 3 0 1 2 3 4 2 4 2 3 4 3 4 4
 3 4 3 2 4 4 2 3 4 3 2 3 3 3 4 2 4 3 3 2 4 4 4 4 4 4 4 3 4 3 3 4 4 4 3 3 4
 3 2 3 2 3 1 4 4 4 3 4 4 3 3 2 3 4 4 4 4 4 4 4 2 0 3 4 4 4 1 2 3 4 3 4 3 4
 2 2 1 1 3 3 3 4 3 3 4 2 4 4 2 4 4 3 2 4 2 1 2 4 3 2 3 4 3 4 2 4 4 4 3 3 2
 3 4 1 3 3 2 1 0 4 4 3 4 3 4 3 4 4 3 3 4 2 4 4 2 4 3 3 2 4 4 4 3 4 3 2 4 4
 4 3 4 3 4 3 3 4 4 4 4 4 3 4 2 4 4 3 2 2 3 4 4 3 4 3 2 4 3 3 2 4 4 3 4 4 3
 4 4 4 3 4 4 4 3 3 3 2 2 1 4 3 4 3 2 4 3 3 2 3 4 3 2 3 4 4 1 3 4 4 1 4 4 4
 3 4 2 4 0 4 4 1 4 3 4 2 4 0 2 2 2 0 2 3 4 2 2 3 3 2 3 4 2 3 4 4 2 4 2 4 4
 4 4 4 2 3 2 4 3 4 4 4 4 1 4 3 2 3 4 4 3 2 3 4 0 4 4 3 4 3 4 4 4 3 4 4 2 3
 3 4 4 4 1 3 4 3 4 2 4 3 