In [1]:
import pandas as pd
import numpy as np

import os
import pickle

from sklearn.model_selection import train_test_split

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import gensim

Using TensorFlow backend.


In [2]:
#loading data
df_all = pd.read_pickle('df_all.pkl')

In [3]:
#changing 'rating' column to integer
df_all['rating'] = df_all['rating'].astype(int)

In [4]:
df_1_rating = df_all[df_all['rating'] == 1]
df_2_rating = df_all[df_all['rating'] == 2]
df_3_rating = df_all[df_all['rating'] == 3]
df_4_rating = df_all[df_all['rating'] == 4]
df_5_rating = df_all[df_all['rating'] == 5]

In [5]:
df_negativ = pd.DataFrame()
df_negativ = df_negativ.append(df_1_rating)
df_negativ = df_negativ.append(df_2_rating)
df_negativ.shape

(17832, 2)

In [6]:
df_positiv = pd.DataFrame()
df_positiv = df_positiv.append(df_4_rating)
df_positiv = df_positiv.append(df_5_rating)
df_positiv = df_positiv.sample(n=df_negativ.shape[0])
df_positiv.shape

(17832, 2)

In [7]:
df_all = pd.DataFrame()
df_all = df_all.append(df_negativ)
df_all = df_all.append(df_positiv)
df_all.shape

(35664, 2)

In [8]:
df_all['sentiment'] = df_all['rating'].map(lambda x: 0 if x<3 else 1)
df_all.sample(5)

Unnamed: 0,rating,fullTextHtml,sentiment
179024,5,<p></p>\n<p>Schwimmen macht Spaß – Jana im Sch...,1
268700,5,"<p></p>\n<p><span>Was macht man, wenn man nur ...",1
301977,4,<p></p>\n<p><strong>Inhaltsangabe:</strong><br...,1
40184,2,"<p></p>\n<div>\n Achtung, Spoiler bzgl. Teil ...",0
257285,4,"<p><span>""Dragon Fortune"" </span><span>lässt s...",1


In [9]:
X = df_all['fullTextHtml'].values
y = df_all['sentiment'].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=8888, stratify=y)

In [11]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([13374, 13374], dtype=int64))

In [12]:
np.unique(y_test, return_counts=True)

(array([0, 1], dtype=int64), array([4458, 4458], dtype=int64))

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26748,), (8916,), (26748,), (8916,))

In [21]:
#function to clear the data and split to tokens
def process(seq):
    seq = seq.lower()
    
    punctuations = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    for char in punctuations:
        seq = seq.replace(char, ' ')
    tokens = seq.split(' ')

    stopwords = ['buch', '', 'p', 'br', 'style', 'auf', 'div', 'ul', 'n']
    tokens = [token for token in tokens if token not in stopwords]
    return tokens

In [22]:
#clearing the data
X_train = [process(seq) for seq in X_train]
X_test = [process(seq) for seq in X_test]

DATA PREPARATION

In [25]:
#word embeddings with Keras Tokenizer
tokenizer = Tokenizer()

total_reviews = np.concatenate((X_train, X_test))
tokenizer.fit_on_texts(total_reviews)

#pad sequences
#review_lengths = [len(s.split()) for s in total_reviews]
#pad_length = int(np.mean(review_lengths))
pad_length = 100

#define vocabulary size
vocab_size = len(tokenizer.word_index)+1

X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=pad_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=pad_length, padding='post')


#embedding layer
embedding_dim =100 #dimension of vector
Embedding(vocab_size, embedding_dim, input_length=pad_length)

<keras.layers.embeddings.Embedding at 0x1d02a14def0>

BUILDING MODEL

In [26]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=pad_length),
    GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          22912700  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 22,925,501
Trainable params: 22,925,501
Non-trainable params: 0
_________________________________________________________________


TRAINING MODEL

In [27]:
model.fit(X_train_pad, y_train, batch_size=64, epochs=5, verbose=2, validation_data=(X_test_pad, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 26748 samples, validate on 8916 samples
Epoch 1/5
 - 219s - loss: 0.3839 - acc: 0.8240 - val_loss: 0.2903 - val_acc: 0.8766
Epoch 2/5
 - 215s - loss: 0.2034 - acc: 0.9275 - val_loss: 0.2219 - val_acc: 0.9145
Epoch 3/5
 - 214s - loss: 0.1315 - acc: 0.9561 - val_loss: 0.2473 - val_acc: 0.9104
Epoch 4/5
 - 193s - loss: 0.0789 - acc: 0.9735 - val_loss: 0.2467 - val_acc: 0.9118
Epoch 5/5
 - 187s - loss: 0.0501 - acc: 0.9834 - val_loss: 0.2737 - val_acc: 0.9135


<keras.callbacks.History at 0x1d01cc5f198>

In [28]:
#testing on some examples
sample_1 = 'Dieser Film ist fantastisch! Er gefällt mir wirklich, weil er so aufregend ist.'
sample_2 = 'Sehr schöner Film.'
sample_3 = 'Was für ein schlechter Film das war. Ich hätte etwas viel Besseres erwartet.'
sample_4 = 'Dieser Film ist wirklich scheiße. Keine Aktion und die Handlung ist flach wie ein Tisch.'
sample_5 = 'Sehr interessantes Kino. Tolles Spiel der Schauspieler.'

test_samples = [sample_1, sample_2, sample_3, sample_4, sample_5]
test_samples_tokens = tokenizer.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=pad_length)

#predict
model.predict(x=test_samples_tokens_pad)

array([[0.6984233 ],
       [0.97958404],
       [0.00228065],
       [0.0195238 ],
       [0.99113816]], dtype=float32)

Train word2vec Embedding

In [29]:
#The first step is to prepare the text corpus for learning the embedding by creating word tokens, removing punctuation, 
#removing stop words etc. The word2vec algorithm processes documents sentence by sentence.
tokenizer = RegexpTokenizer(r'\w+')

review_lines = list()
lines = df_all['fullTextHtml'].values.tolist()


for line in lines:
    tokens = tokenizer.tokenize(line)
    #convert to lower case
    tokens = [w.lower() for w in tokens]
    #remove tokens that are not alphabetic
    tokens = [t for t in tokens if t.isalpha]
    #filter out stop words
    stop_words = set(stopwords.words('german'))
    tokens = [t for t in tokens if t not in stop_words]
    review_lines.append(tokens)
    
len(review_lines)

35664

In [30]:
#train word2vec model
model = gensim.models.Word2Vec(sentences=review_lines, size=embedding_dim, window=5, workers=4, min_count=1)

#vocab size
words = list(model.wv.vocab)
print ('Vocabulary size is: {}'.format(len(words)))

Vocabulary size is: 205903


Test Word2Vec Model

In [31]:
model.wv.most_similar('langweilig')

[('langatmig', 0.8075939416885376),
 ('uninteressant', 0.7792505025863647),
 ('gelangweilt', 0.7586060762405396),
 ('öde', 0.7402040958404541),
 ('eintönig', 0.7377558350563049),
 ('ermüdend', 0.7332887053489685),
 ('unnötig', 0.7288734912872314),
 ('vorhersehbar', 0.7247648239135742),
 ('langgezogen', 0.7162354588508606),
 ('unspektakulär', 0.7069278955459595)]

In [32]:
model.wv.most_similar_cosmul(positive=['frau', 'könig'], negative=['mann'])

[('kurfürst', 0.9012632369995117),
 ('herbalistin', 0.888185977935791),
 ('königs', 0.882672131061554),
 ('coolste', 0.8804940581321716),
 ('krieger', 0.8790843486785889),
 ('königin', 0.8789700865745544),
 ('ledige', 0.8754025101661682),
 ('prinzessin', 0.8716257214546204),
 ('mächtigen', 0.8710815906524658),
 ('militärakademie', 0.8694743514060974)]

In [33]:
#save model
model.wv.save_word2vec_format('w2v_lovelybooks_model', binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Use Pre-trained Embedding

In [34]:
#extract the word embedding from file
embeddings_index ={}
f = open(os.path.join('','w2v_lovelybooks_model'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

convert word embedding into tokenized vector

In [35]:
#vectorize the text samples into a 2D integer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_lines)
sequences = tokenizer.texts_to_sequences(review_lines)

#pad sequences
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))
review_pad = pad_sequences(sequences, maxlen=pad_length)
sentiment = df_all['sentiment'].values
print('Shape of review tensor: ', review_pad.shape)
print('Shape of sentiment tensor: ', sentiment.shape)

Found 205903 unique tokens.
Shape of review tensor:  (35664, 100)
Shape of sentiment tensor:  (35664,)


In [36]:
#Now we will map embeddings from the loaded word2vec model for each word to the tokenizer.word_index vocabulary 
#and create a matrix with  word vectors.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i > num_words: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        #words not found in embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
print(num_words)

205904


In [37]:
#We are now ready with the trained embedding vector to be used directly in the embedding layer. 
#In the below code, the only change from previous model is using the embedding_matrix as input to the Embedding layer 
#and setting trainable = False, since the embedding is already learned.
embedding_layer = Embedding(num_words, 
                            embedding_dim, 
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=pad_length,
                            trainable=False)

model = Sequential([
    embedding_layer,
    GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          20590400  
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 20,603,201
Trainable params: 12,801
Non-trainable params: 20,590,400
_________________________________________________________________


Training sentiment model

In [38]:
#split the data into training set and validation set
validation_split_ratio = 0.25
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(validation_split_ratio*review_pad.shape[0])

X_train_pad = review_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_test_pad = review_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

print('Shape of X_train_pad tensor: ', X_train_pad.shape)
print('Shape of y_train tensor: ', y_train.shape)
print('Shape of X_test_pad tensor: ', X_test_pad.shape)
print('Shape of y_test tensor: ', y_test.shape)

Shape of X_train_pad tensor:  (26748, 100)
Shape of y_train tensor:  (26748,)
Shape of X_test_pad tensor:  (8916, 100)
Shape of y_test tensor:  (8916,)


In [39]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([13350, 13398], dtype=int64))

In [40]:
np.unique(y_test, return_counts=True)

(array([0, 1], dtype=int64), array([4482, 4434], dtype=int64))

In [41]:
#training the model
model.fit(X_train_pad, y_train, batch_size=128, epochs=10, verbose=2, validation_data=(X_test_pad, y_test))

Train on 26748 samples, validate on 8916 samples
Epoch 1/10
 - 24s - loss: 0.5582 - acc: 0.6960 - val_loss: 0.3316 - val_acc: 0.8574
Epoch 2/10
 - 17s - loss: 0.3633 - acc: 0.8415 - val_loss: 0.2718 - val_acc: 0.8868
Epoch 3/10
 - 20s - loss: 0.3062 - acc: 0.8690 - val_loss: 0.2505 - val_acc: 0.8956
Epoch 4/10
 - 18s - loss: 0.2808 - acc: 0.8816 - val_loss: 0.2364 - val_acc: 0.9025
Epoch 5/10
 - 18s - loss: 0.2595 - acc: 0.8902 - val_loss: 0.2165 - val_acc: 0.9107
Epoch 6/10
 - 24s - loss: 0.2507 - acc: 0.8974 - val_loss: 0.2076 - val_acc: 0.9155
Epoch 7/10
 - 23s - loss: 0.2376 - acc: 0.9022 - val_loss: 0.2094 - val_acc: 0.9124
Epoch 8/10
 - 22s - loss: 0.2354 - acc: 0.9022 - val_loss: 0.2024 - val_acc: 0.9154
Epoch 9/10
 - 20s - loss: 0.2298 - acc: 0.9049 - val_loss: 0.1986 - val_acc: 0.9188
Epoch 10/10
 - 19s - loss: 0.2215 - acc: 0.9063 - val_loss: 0.1958 - val_acc: 0.9180


<keras.callbacks.History at 0x1d092e3ff98>