In [3]:
import os
import pyprind
import numpy as np
import pandas as pd
from collections import Counter
from string import punctuation

# Read csv file
from rnn import SentimentRNN

df = pd.read_csv('music_review.csv', encoding='utf-8')

counts = Counter()
pbar = pyprind.ProgBar(len(df['review']), title='Counting words occurrences')

for i, review in enumerate(df['review']):
    text = ''.join([c if c not in punctuation else ' '+c+' ' for c in str(review)]).lower()
    df.loc[i, 'review'] = text
    pbar.update()
    counts.update(text.split())

# Mapping the each unique word into an integer
word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {
    word: ii for ii, word in enumerate(word_counts, 1)
}

mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to integers')
for review in df['review']:
    mapped_reviews.append([word_to_int[word] for word in review.split()])
    pbar.update()

Counting words occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:07:41
Map reviews to integers


['i', 'album', 'song', 'the', 'quot']


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02


In [4]:
# zero padding process
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)

for i, row in enumerate(mapped_reviews):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]

# split data set into training part and testing part
X_train = sequences[:44705, :]
y_train = df.loc[:44705, 'sentiment'].values

X_test = sequences[44705:, :]
y_test = df.loc[44705:, 'sentiment'].values

np.random.seed(123)

In [None]:
n_words = max(list(word_to_int.values())) + 1

rnn = SentimentRNN(n_words=n_words,
                   seq_len=sequence_length,
                   embed_size=256,
                   lstm_size=128,
                   num_layers=1,
                   batch_size=100,
                   learning_rate=0.001)

rnn.train(X_train, y_train, num_epochs=40)
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print('Test acc.: %.3f' % (np.sum(preds==y_true)/len(y_true)))

 << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

 << lstm output >> Tensor("rnn/transpose:0", shape=(100, 200, 128), dtype=float32)

 << final state >> (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_2:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>),)

 << logits >> Tensor("logits_squeezed:0", shape=(100,), dtype=float32)

 << predictions >>  {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
Epoch: 1/40 Iteration: 20 | Train loss: 0.54186
Epoch: 1/40 Iteration: 40 | Train loss: 0.24232
Epoch: 1/40 Iteration: 60 | Train loss: 0.43569
Epoch: 1/40 Iteration: 80 | Train loss: 0.38813
Epoch: 1/40 Iteration: 100 | Tra