# Sentiment classification

IMDB sentiment classification using convolutional networks CNN 1D

In [5]:
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Set parameters.
max_features = 6_000
max_length = 400
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train observations')
print(len(x_test), 'test_observations')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


25000 train observations
25000 test_observations


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [8]:
# Creating numbers to word mapping
wind = imdb.get_word_index()
revind = dict((v, k) for k, v in wind.items())
print(x_train[0])
print(y_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


In [9]:
def decode(sent_list):
    new_words = []
    for i in sent_list:
        new_words.append(revind[i])
    comb_words = ' '.join(new_words)
    return comb_words

In [10]:
print(decode(x_train[0]))

the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over landed for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but of script you not while history he heart to real at barrel but when from one bit then have two

In [12]:
# Pad sequence for computational efficiency.
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [13]:
# Deep learning architecture parameters.
batch_size = 32
embedding_dims = 60
num_kernels = 260
kernel_size = 3
hidden_dims = 300
epochs = 3

In [14]:
# Building the model.
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=max_length))
model.add(Dropout(0.2))

model.add(Conv1D(num_kernels, kernel_size, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))

model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 60)           360000    
_________________________________________________________________
dropout (Dropout)            (None, 400, 60)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 398, 260)          47060     
_________________________________________________________________
global_max_pooling1d (Global (None, 260)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               78300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation (Activation)      (None, 300)               0

In [15]:
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x14908fcd0>

In [22]:
y_train_predclass = model.predict(x_train, batch_size=batch_size)
y_test_predclass = model.predict(x_test, batch_size=batch_size)

In [17]:
y_train_predclass.shape = y_train.shape
y_test_predclass.shape = y_test.shape

In [45]:
y_train_pred = np.where(y_train_predclass > 0.5, 1, 0)
# Model accuracies and metrics calculation.
print('CNN 1D - Train accuracy:')
print(round(accuracy_score(y_train, y_train_pred), 3))

print('CNN 1D - Training data:')
print(classification_report(y_train, y_train_pred))

print('CCN 1D - Train confusion matrix:')
print(pd.crosstab(y_train, y_train_pred, rownames=['Actual'], colnames=['Predicted']))

CNN 1D - Train accuracy:
0.96
CNN 1D - Training data:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     12500
           1       0.94      0.98      0.96     12500

    accuracy                           0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000

CCN 1D - Train confusion matrix:
Predicted      0      1
Actual                 
0          11718    782
1            212  12288


In [46]:
y_test_pred = np.where(y_test_predclass > 0.5, 1, 0)
# Model accuracies and metrics calculation.
print('CNN 1D - Test accuracy:')
print(round(accuracy_score(y_test, y_test_pred), 3))

print('CNN 1D - Testing data:')
print(classification_report(y_test, y_test_pred))

print('CCN 1D - Test confusion matrix:')
print(pd.crosstab(y_test, y_test_pred, rownames=['Actual'], colnames=['Predicted']))

CNN 1D - Test accuracy:
0.883
CNN 1D - Testing data:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88     12500
           1       0.85      0.93      0.89     12500

    accuracy                           0.88     25000
   macro avg       0.89      0.88      0.88     25000
weighted avg       0.89      0.88      0.88     25000

CCN 1D - Test confusion matrix:
Predicted      0      1
Actual                 
0          10443   2057
1            870  11630
