In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding



SSM REVIEWS Classification

In [2]:
# Toy dataset: each review text is paired with its label
# (1 = good sentiment, 0 = bad sentiment).
labelled_reviews = [
    ('nice business school', 1),
    ('I learned a lot about big data and AI, very good experience', 1),
    ('bad experience', 0),
    (' it was ok, but I do note recommanded to anyone', 0),
    ('very nice shcool, I learned a lot and I made friends with whom I started projects', 1),
    ('Need more improvement', 0),
]
# Split the pairs back into the parallel containers used by the rest of the notebook.
reviews = [text for text, _ in labelled_reviews]
sentiment = np.array([label for _, label in labelled_reviews])

In [3]:
# First step: turn every word of a review into an integer index.
# Despite its name, Keras' one_hot does not build one-hot vectors: it hashes each word
# to an integer below the given size (50 here), so a review becomes a list with one
# integer per word.
# The indices are not guaranteed to be unique (two different words can share one), and
# per the Keras docs the default hash is not stable across Python sessions, so the
# numbers shown below may differ on a fresh kernel.

one_hot("nice business school",50)

[12, 23, 6]

In [4]:
#In this run 'nice' was mapped to 12, 'business' to 23 and 'school' to 6 (the exact indices may differ in another session)

In [5]:
# Encode every review as a list of word indices taken from a 50-slot vocabulary.
vocab_size = 50
encoded_reviews = [one_hot(review, vocab_size) for review in reviews]
print(encoded_reviews)

[[12, 23, 6], [8, 33, 10, 19, 6, 1, 18, 17, 40, 20, 29, 38], [21, 38], [29, 41, 48, 47, 8, 35, 16, 25, 9, 17], [20, 12, 15, 8, 33, 10, 19, 17, 8, 8, 14, 3, 9, 8, 47, 31], [26, 45, 19]]


In [6]:
#Padding is needed because the reviews have different lengths and the model expects inputs of equal length

In [8]:
# Pad every review to the length of the longest one so all inputs share the same shape.
# The length is derived from the data instead of being hard-coded to 16: with a fixed
# value, any longer review added later would be silently truncated by pad_sequences.
max_length = max(len(encoded) for encoded in encoded_reviews)
# padding='post' appends the zeros at the end of each review.
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
padded_reviews  # every row now has max_length entries

array([[12, 23,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8, 33, 10, 19,  6,  1, 18, 17, 40, 20, 29, 38,  0,  0,  0,  0],
       [21, 38,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [29, 41, 48, 47,  8, 35, 16, 25,  9, 17,  0,  0,  0,  0,  0,  0],
       [20, 12, 15,  8, 33, 10, 19, 17,  8,  8, 14,  3,  9,  8, 47, 31],
       [26, 45, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [9]:
# Number of values in each word's embedding vector (passed as the second argument of the Embedding layer).
embeded_vector_size = 4

In [12]:
# Build the classifier as a three-layer stack: Embedding -> Flatten -> Dense.
model = Sequential([
    # Maps each of the vocab_size word indices to a trainable vector of
    # embeded_vector_size values; input_length is the padded review length
    # (number of word positions per review, not the number of features).
    Embedding(vocab_size, embeded_vector_size, input_length=max_length, name="embedding"),
    # Joins the per-word vectors of one review into a single flat vector.
    Flatten(),
    # One sigmoid unit: a single value between 0 and 1 per review.
    Dense(1, activation='sigmoid'),
])

In [13]:
# Training data: the padded index sequences are the inputs (X),
# the sentiment labels are the targets (y).
X, y = padded_reviews, sentiment

In [14]:
# binary_crossentropy is used because the labels are only 0 or 1.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 4)             200       
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 265
Trainable params: 265
Non-trainable params: 0
_________________________________________________________________
None


- The maximum sequence length is 16
- The embedding vector size is 4
- Flatten outputs 64 values (4*16)
- Dense is 1 because there is a single output neuron

In [18]:
# Train silently for 50 epochs. Storing the returned History object keeps the notebook
# from echoing its memory-address repr and leaves the per-epoch metrics available
# through history.history.
# Note: re-running this cell continues training from the current weights.
history = model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x2122fb3c4f0>

In [16]:
# Score the model on the same six reviews it was trained on, so this is the
# training accuracy rather than a measure of how well it generalises.
loss, accuracy = model.evaluate(x=X, y=y)
accuracy  # the recorded run got 5 of the 6 reviews right (~83%)



0.8333333134651184

In [19]:
# Fetch the learned embedding matrix from the layer named 'embedding'.
# get_weights() returns a list whose first entry is that matrix; it has one row
# per vocabulary index, which is why its length equals the vocabulary size (50).
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]
len(weights)

50

In [20]:
# Embedding vector learned for the word "nice" (4 values, one per embedding feature).
# The row is looked up through one_hot instead of the hard-coded 12, because the
# default hash behind one_hot can assign different indices in another kernel session.
nice_index = one_hot("nice", vocab_size)[0]
weights[nice_index]

array([ 0.04734632,  0.00258898,  0.02213548, -0.03280065], dtype=float32)

In [21]:
# Compare with the vector learned for the word "good". Its row is looked up through
# one_hot rather than hard-coded as 29, since the indices can change between sessions.
good_index = one_hot("good", vocab_size)[0]
weights[good_index]

array([ 0.11190153,  0.12063858,  0.01715918, -0.10536227], dtype=float32)

In [22]:
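#Words with similar meaning would normally end up with similar vectors, but six reviews are far too little data to learn that.
#Hash collisions add noise as well: in the encoded reviews above, 'good' shares index 29 with 'it'.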