In [1]:
import os
import urllib.request

import gensim
import numpy as np
import pandas as pd
from keras.layers import (Dense, Input, 
    GlobalMaxPooling1D, Conv1D, Embedding)
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import (
    pad_sequences)
from keras.preprocessing.text import Tokenizer
from gensim.models.keyedvectors import (
    KeyedVectors)



In [2]:
# Sentiment dataset read straight from the book's website; the preview
# below shows the columns id, value (the label) and lemmata (the text).
url = "https://cssbook.net/d/dutch_sentiment.csv"
h = pd.read_csv(url)
h.head()

Unnamed: 0,id,value,lemmata
0,10007,0,Rabobank voorspellen flink stijging hypotheekr...
1,10027,0,D66 willen reserve provincie aanspreken voor g...
2,10037,1,UWV dit jaar veel baan
3,10059,1,proost op geslaagd beursgang bols
4,10099,0,helft werknemer gaan na 65ste met pensioen


In [11]:
# Tokenize texts
# Build the vocabulary from the `lemmata` column; num_words caps how many
# distinct word ids the tokenizer will emit.
tokenizer = Tokenizer(num_words=9999)
tokenizer.fit_on_texts(h["lemmata"])
word_index = tokenizer.word_index

# Encode every text as a list of word ids, then pad to one fixed length
# so the whole corpus becomes a single 2-D integer array.
sequences = tokenizer.texts_to_sequences(h["lemmata"])
tokens = pad_sequences(sequences, maxlen=1000)

# Prepare embeddings layer
# Pretrained word2vec vectors, downloaded once and cached in the
# working directory under the name `fn`.
fn = "w2v_320d_trimmed"
if not os.path.exists(fn):
    # A dedicated name keeps the dataset `url` from the earlier cell
    # intact regardless of whether the cache file already exists.
    emb_url = f"https://cssbook.net/d/{fn}"
    print(f"Downloading embeddings from {emb_url}")
    # Download to a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file that the
    # existence check above would later accept as a valid cache.
    tmp_fn = f"{fn}.part"
    try:
        urllib.request.urlretrieve(emb_url, tmp_fn)
        os.replace(tmp_fn, fn)
    finally:
        if os.path.exists(tmp_fn):
            os.remove(tmp_fn)
embeddings = KeyedVectors.load_word2vec_format(fn)

# Row i of the matrix holds the pretrained vector for the word whose
# tokenizer index is i. Words without a pretrained vector keep an
# all-zero row. The extra row (+1) leaves room for index 0, which is
# presumably reserved for padding -- confirm against the Tokenizer docs.
emb_matrix = np.zeros(
    (len(tokenizer.word_index) + 1,
     embeddings.vector_size))
for word, i in tokenizer.word_index.items():
    if word in embeddings:
        emb_matrix[i] = embeddings[word]

# Initialise the Embedding layer with the pretrained vectors and let
# training fine-tune them (trainable=True).
embedding_layer = Embedding(
    emb_matrix.shape[0], emb_matrix.shape[1],
    input_length=tokens.shape[1], trainable=True,
    weights=[emb_matrix])
    
# The network is a 1-D convolutional model (Conv1D followed by global
# max pooling), not a recurrent one, so the progress message says CNN.
print("Building CNN model")
sequence_input = Input(shape=(tokens.shape[1],),
                       dtype="int32")
seq = embedding_layer(sequence_input)
# Intermediate tensors use their own name so that `m` only ever refers
# to the finished Model object.
hidden = Conv1D(filters=128, kernel_size=3,
                activation="relu")(seq)
hidden = GlobalMaxPooling1D()(hidden)
hidden = Dense(64, activation="relu")(hidden)
# Single tanh unit: one score in [-1, 1] per text, which the evaluation
# cell later bins into the labels -1, 0 and 1.
preds = Dense(1, activation="tanh")(hidden)
m = Model(sequence_input, preds)
m.summary()

Building RNN model
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1000, 320)         2176640   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 998, 128)          123008    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 2,307,969
Trainable params: 2,307,969
Non-trainable params: 0
______________________________

In [12]:
# Split data into train and test
# The first n_train rows are used for training, the rest for testing
# (a positional split; the rows are not shuffled).
n_train = 4000
train_data, test_data = tokens[:n_train], tokens[n_train:]
train_labels, test_labels = h.value[:n_train], h.value[n_train:]

# Train model
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated in TF 2.x Keras and rejected outright by newer releases.
m.compile(loss="mean_absolute_error",
          optimizer=RMSprop(learning_rate=.004))
# Keras expects one target row per sample, so reshape the label vector
# into a column of shape (n_samples, 1).
labels = np.asarray(train_labels).reshape(-1, 1)
m.fit(train_data, labels, epochs=5, batch_size=128)

# Validate against test data
output = m.predict(test_data)

# Bin output into -1, 0, 1
# Each prediction row holds a single score; scores above .3 count as
# positive, scores above -.3 as neutral, everything else as negative.
pred = []
for row in output:
    score = row[0]
    if score > .3:
        label = 1
    elif score > -.3:
        label = 0
    else:
        label = -1
    pred.append(label)

# Accuracy is the share of binned predictions that equal the true label.
correct = [guess == truth
           for guess, truth in zip(pred, test_labels)]
acc = sum(correct) / len(pred)
print(f"Accuracy: {acc}")

Train model
Validate against test data
Accuracy: 0.46468561584840656
