In [12]:
import pandas as pd
import numpy as np

import os
import pickle

from sklearn.model_selection import train_test_split

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import gensim

In [3]:
#loading data
df_all = pd.read_pickle('df_all.pkl')

In [4]:
#changing 'rating' column to integer
df_all['rating'] = df_all['rating'].astype(int)

In [5]:
df_1_rating = df_all[df_all['rating'] == 1]
df_2_rating = df_all[df_all['rating'] == 2]
df_3_rating = df_all[df_all['rating'] == 3]
df_4_rating = df_all[df_all['rating'] == 4]
df_5_rating = df_all[df_all['rating'] == 5]

In [6]:
df_negativ = pd.DataFrame()
df_negativ = df_negativ.append(df_1_rating)
df_negativ = df_negativ.append(df_2_rating)
df_negativ.shape

(17832, 2)

In [7]:
df_positiv = pd.DataFrame()
df_positiv = df_positiv.append(df_4_rating)
df_positiv = df_positiv.append(df_5_rating)
df_positiv = df_positiv.sample(n=17832)
df_positiv.shape

(17832, 2)

In [8]:
df_all = pd.DataFrame()
df_all = df_all.append(df_negativ)
df_all = df_all.append(df_positiv)
df_all.shape

(35664, 2)

In [9]:
df_all['sentiment'] = df_all['rating'].map(lambda x: 0 if x<3 else 1)
df_all.sample(5)

Unnamed: 0,rating,fullTextHtml,sentiment
217280,4,<p></p>\n<div>\n Cover und Titel sind aufeina...,1
219588,4,"<p>""Herbstfunkeln"" ist der erste Teil der ""Cor...",1
170155,5,<p></p>\n<p>Die beiden Autoren Daniel Juhr und...,1
87185,2,<p>Leider hab ich das Buch nach nicht einmal 1...,0
319767,5,<p></p>\n<p><span>Rezension zu „Die Duftapothe...,1


In [10]:
X = df_all['fullTextHtml'].values
y = df_all['sentiment'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=8888)

In [14]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([13288, 13460], dtype=int64))

In [15]:
np.unique(y_test, return_counts=True)

(array([0, 1], dtype=int64), array([4544, 4372], dtype=int64))

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26748,), (8916,), (26748,), (8916,))

DATA PREPARATION

In [34]:
#word embeddings with Keras Tokenizer
tokenizer = Tokenizer()

total_reviews = np.concatenate((X_train, X_test))
tokenizer.fit_on_texts(total_reviews)

#pad sequences
review_lengths = [len(s.split()) for s in total_reviews]
#pad_length = int(np.mean(review_lengths))
pad_length = 100

#define vocabulary size
vocab_size = len(tokenizer.word_index)+1

X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=pad_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=pad_length, padding='post')


#embedding layer
embedding_dim =100 #dimension of vector
Embedding(vocab_size, embedding_dim, input_length=pad_length)

<keras.layers.embeddings.Embedding at 0x2356ff9f9e8>

BUILDING MODEL

In [35]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=pad_length),
    GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          22954500  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 22,967,301
Trainable params: 22,967,301
Non-trainable params: 0
_________________________________________________________________


TRAINING MODEL

In [36]:
model.fit(X_train_pad, y_train, batch_size=64, epochs=5, verbose=2, validation_data=(X_test_pad, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 26748 samples, validate on 8916 samples
Epoch 1/5
 - 191s - loss: 0.4124 - acc: 0.8089 - val_loss: 0.2635 - val_acc: 0.8943
Epoch 2/5
 - 182s - loss: 0.2163 - acc: 0.9218 - val_loss: 0.2441 - val_acc: 0.9023
Epoch 3/5
 - 184s - loss: 0.1336 - acc: 0.9555 - val_loss: 0.2511 - val_acc: 0.9121
Epoch 4/5
 - 188s - loss: 0.0816 - acc: 0.9742 - val_loss: 0.2636 - val_acc: 0.9124
Epoch 5/5
 - 184s - loss: 0.0453 - acc: 0.9865 - val_loss: 0.2992 - val_acc: 0.9132


<keras.callbacks.History at 0x2355bfa4d68>

In [37]:
#testing on some examples
sample_1 = 'Dieser Film ist fantastisch! Er gefällt mir wirklich, weil er so aufregend ist.'
sample_2 = 'Sehr schöner Film.'
sample_3 = 'Was für ein schlechter Film das war. Ich hätte etwas viel Besseres erwartet.'
sample_4 = 'Dieser Film ist wirklich scheiße. Keine Aktion und die Handlung ist flach wie ein Tisch.'
sample_5 = 'Sehr interessantes Kino. Tolles Spiel der Schauspieler.'

test_samples = [sample_1, sample_2, sample_3, sample_4, sample_5]
test_samples_tokens = tokenizer.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=pad_length)

#predict
model.predict(x=test_samples_tokens_pad)

array([[0.84106183],
       [0.9651642 ],
       [0.02035591],
       [0.05424052],
       [0.9096227 ]], dtype=float32)

Train word2vec Embedding

In [39]:
#The first step is to prepare the text corpus for learning the embedding by creating word tokens, removing punctuation, 
#removing stop words etc. The word2vec algorithm processes documents sentence by sentence.
tokenizer = RegexpTokenizer(r'\w+')

review_lines = list()
lines = df_all['fullTextHtml'].values.tolist()


for line in lines:
    tokens = tokenizer.tokenize(line)
    #convert to lower case
    tokens = [w.lower() for w in tokens]
    #remove tokens that are not alphabetic
    tokens = [t for t in tokens if t.isalpha]
    #filter out stop words
    stop_words = set(stopwords.words('german'))
    tokens = [t for t in tokens if t not in stop_words]
    review_lines.append(tokens)
    
len(review_lines)

35664

In [40]:
#train word2vec model
model = gensim.models.Word2Vec(sentences=review_lines, size=embedding_dim, window=5, workers=4, min_count=1)

#vocab size
words = list(model.wv.vocab)
print ('Vocabulary size is: {}'.format(len(words)))

Vocabulary size is: 206481


Test Word2Vec Model

In [43]:
model.wv.most_similar('langweilig')

[('langatmig', 0.8203978538513184),
 ('uninteressant', 0.7964168787002563),
 ('eintönig', 0.77809739112854),
 ('gelangweilt', 0.7662367820739746),
 ('öde', 0.7570236921310425),
 ('ermüdend', 0.752723753452301),
 ('vorhersehbar', 0.7268033027648926),
 ('zäh', 0.7245047092437744),
 ('unnötig', 0.7159459590911865),
 ('abgedreht', 0.7103122472763062)]

In [45]:
model.wv.most_similar_cosmul(positive=['frau', 'könig'], negative=['mann'])

[('eingekerkerte', 0.8936576247215271),
 ('kurfürst', 0.8806434273719788),
 ('friebert', 0.8764792680740356),
 ('sklavin', 0.8740070462226868),
 ('geschichtsstudentin', 0.8692702054977417),
 ('eismagierin', 0.8690164089202881),
 ('quichote', 0.8689929842948914),
 ('königs', 0.8688567876815796),
 ('herrscher', 0.8683165311813354),
 ('mächtigen', 0.8673161268234253)]

In [46]:
#save model
model.wv.save_word2vec_format('w2v_lovelybooks_model', binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Use Pre-trained Embedding

In [49]:
#extract the word embedding from file
embeddings_index ={}
f = open(os.path.join('','w2v_lovelybooks_model'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

convert word embedding into tokenized vector

In [51]:
#vectorize the text samples into a 2D integer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_lines)
sequences = tokenizer.texts_to_sequences(review_lines)

#pad sequences
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))
review_pad = pad_sequences(sequences, maxlen=pad_length)
sentiment = df_all['sentiment'].values
print('Shape of review sensor: ', review_pad.shape)
print('Shape of sentiment sensor: ', sentiment.shape)

Found 206481 unique tokens.
Shape of review sensor:  (35664, 100)
Shape of sentiment sensor:  (35664,)


In [52]:
#Now we will map embeddings from the loaded word2vec model for each word to the tokenizer.word_index vocabulary 
#and create a matrix with  word vectors.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i > num_words: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        #words not found in embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
print(num_words)

206482


In [53]:
#We are now ready with the trained embedding vector to be used directly in the embedding layer. 
#In the below code, the only change from previous model is using the embedding_matrix as input to the Embedding layer 
#and setting trainable = False, since the embedding is already learned.
embedding_layer = Embedding(num_words, 
                            embedding_dim, 
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=pad_length,
                            trainable=False)

model = Sequential([
    embedding_layer,
    GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          20648200  
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 20,661,001
Trainable params: 12,801
Non-trainable params: 20,648,200
_________________________________________________________________


Training sentiment model

In [54]:
#split the data into training set and validation set
validation_split_ratio = 0.25
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(validation_split_ratio*review_pad.shape[0])

X_train_pad = review_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_test_pad = review_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

print('Shape of X_train_pad tensor: ', X_train_pad.shape)
print('Shape of y_train tensor: ', y_train.shape)
print('Shape of X_test_pad tensor: ', X_test_pad.shape)
print('Shape of y_test tensor: ', y_test.shape)

Shape of X_train_pad tensor:  (26748, 100)
Shape of y_train tensor:  (26748,)
Shape of X_test_pad tensor:  (8916, 100)
Shape of y_test tensor:  (8916,)


In [55]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([13351, 13397], dtype=int64))

In [56]:
np.unique(y_test, return_counts=True)

(array([0, 1], dtype=int64), array([4481, 4435], dtype=int64))

In [57]:
#training the model
model.fit(X_train_pad, y_train, batch_size=128, epochs=10, verbose=2, validation_data=(X_test_pad, y_test))

Train on 26748 samples, validate on 8916 samples
Epoch 1/10
 - 43s - loss: 0.5974 - acc: 0.6555 - val_loss: 0.3542 - val_acc: 0.8491
Epoch 2/10
 - 33s - loss: 0.3707 - acc: 0.8368 - val_loss: 0.2766 - val_acc: 0.8859
Epoch 3/10
 - 34s - loss: 0.3137 - acc: 0.8650 - val_loss: 0.2527 - val_acc: 0.8958
Epoch 4/10
 - 35s - loss: 0.2821 - acc: 0.8794 - val_loss: 0.2280 - val_acc: 0.9081
Epoch 5/10
 - 32s - loss: 0.2627 - acc: 0.8892 - val_loss: 0.2185 - val_acc: 0.9125
Epoch 6/10
 - 33s - loss: 0.2514 - acc: 0.8948 - val_loss: 0.2135 - val_acc: 0.9138
Epoch 7/10
 - 31s - loss: 0.2390 - acc: 0.9005 - val_loss: 0.2076 - val_acc: 0.9170
Epoch 8/10
 - 32s - loss: 0.2274 - acc: 0.9055 - val_loss: 0.2040 - val_acc: 0.9178
Epoch 9/10
 - 32s - loss: 0.2259 - acc: 0.9059 - val_loss: 0.1991 - val_acc: 0.9173
Epoch 10/10
 - 33s - loss: 0.2185 - acc: 0.9088 - val_loss: 0.1978 - val_acc: 0.9216


<keras.callbacks.History at 0x2355bfb2c50>