# Text Classification wth Keras

## Introduction

In this notebook, we will build a text classification model using Keras. We will use the <u>Sentiment Analysis</u> Data Set from [UCI](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). We will create sequential, CNN, and RNN models to predict the sentiment of a given text. 


In [2]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import layers, models

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd

# set seed for reproducibility
np.random.seed(1234)

## Data Load

In [3]:
# load in each file
# data appears to be tab delimited

amazon = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t', header=None)
imdb = pd.read_csv('imdb_labelled.txt', delimiter='\t', header=None)
yelp = pd.read_csv('yelp_labelled.txt', delimiter='\t', header=None)

# combine all data into one dataframe
df = pd.concat([amazon, imdb, yelp])
df.columns = ['text', 'label']
df.head()

Unnamed: 0,text,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


## Data Preprocessing

In [4]:
# split df into train and test
i = np.random.rand(len(df)) < 0.8
train = df[i]
test = df[~i]
print("train data size: ", train.shape)
print("test data size: ", test.shape)

train data size:  (2180, 2)
test data size:  (568, 2)


In [5]:
# tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['text'])

X_train = tokenizer.texts_to_matrix(train['text'])
X_test = tokenizer.texts_to_matrix(test['text'])


# use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train['label'])
y_train = encoder.transform(train['label'])
y_test = encoder.transform(test['label'])

# check shape
print("train shapes:", X_train.shape, y_train.shape)
print("test shapes:", X_test.shape, y_test.shape)
print("test first five labels:", y_test[:5])

train shapes: (2180, 4698) (2180,)
test shapes: (568, 4698) (568,)
test first five labels: [1 0 0 0 0]


## Sequential Model

In [6]:
model = models.Sequential()

model.add(layers.Dense(512, input_shape=(X_train.shape[1],)))
model.add(layers.Activation('relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1))
model.add(layers.Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               2405888   
                                                                 
 activation (Activation)     (None, 512)               0         
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 513       
                                                                 
 activation_1 (Activation)   (None, 1)                 0         
                                                                 
Total params: 2,406,401
Trainable params: 2,406,401
Non-trainable params: 0
_________________________________________________________________


In [7]:
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
score = model.evaluate(X_test, y_test, batch_size=100, verbose=1)
print('Accuracy: ', score[1])

Accuracy:  0.8309859037399292


In [9]:
score

[0.4387046992778778, 0.8309859037399292]

In [10]:
# calculate metrics
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).astype(int).flatten()
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

# print the confusion matrix
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

print('Classification Report: \n', classification_report(y_test, y_pred))

Accuracy:  0.8309859154929577
Precision:  0.8137254901960784
Recall:  0.8645833333333334
F1:  0.8383838383838385
Confusion Matrix: 
 [[223  57]
 [ 39 249]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.80      0.82       280
           1       0.81      0.86      0.84       288

    accuracy                           0.83       568
   macro avg       0.83      0.83      0.83       568
weighted avg       0.83      0.83      0.83       568



## CNN Model

In [11]:
model = models.Sequential()

model.add(layers.Conv1D(256, 5, padding='valid', activation='relu', strides=1, input_shape=(X_train.shape[1], 1)))
model.add(layers.MaxPooling1D(pool_size=4))
model.add(layers.Dropout(0.5))
model.add(layers.Conv1D(128, 5, padding='valid', activation='relu', strides=1))
model.add(layers.MaxPooling1D(pool_size=4))
model.add(layers.Dropout(0.5))
model.add(layers.Flatten())
model.add(layers.Dense(1))
model.add(layers.Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 4694, 256)         1536      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1173, 256)        0         
 )                                                               
                                                                 
 dropout_1 (Dropout)         (None, 1173, 256)         0         
                                                                 
 conv1d_1 (Conv1D)           (None, 1169, 128)         163968    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 292, 128)         0         
 1D)                                                             
                                                                 
 dropout_2 (Dropout)         (None, 292, 128)         

In [12]:
history = model.fit(X_train, y_train, epochs=20, batch_size=128, validation_split=0.1, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# calculate metrics
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).astype(int).flatten()
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

# print the confusion matrix
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

print('Classification Report: \n', classification_report(y_test, y_pred))

Accuracy:  0.6883802816901409
Precision:  0.7126436781609196
Recall:  0.6458333333333334
F1:  0.6775956284153006
Confusion Matrix: 
 [[205  75]
 [102 186]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.73      0.70       280
           1       0.71      0.65      0.68       288

    accuracy                           0.69       568
   macro avg       0.69      0.69      0.69       568
weighted avg       0.69      0.69      0.69       568



## RNN Model

In [14]:
# preprocess slightly differently for an RNN model

# tokenize text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train['text'])

X_train = tokenizer.texts_to_sequences(train['text'])
X_test = tokenizer.texts_to_sequences(test['text'])

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

maxlen = 100

X_train_pad = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [15]:
model = models.Sequential()
model.add(layers.Embedding(vocab_size, 64, input_length=maxlen))
model.add(layers.SimpleRNN(64))
model.add(layers.Dense(2, activation='softmax'))

model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           300672    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 2)                 130       
                                                                 
Total params: 309,058
Trainable params: 309,058
Non-trainable params: 0
_________________________________________________________________


In [19]:
history = model.fit(X_train_pad, y_train, epochs=20, batch_size=128, validation_data=(X_test_pad, y_test), verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
# calculate metrics
y_pred = model.predict(X_test_pad)
y_pred = np.argmax(y_pred, axis=1)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))

# print the confusion matrix
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

print('Classification Report: \n', classification_report(y_test, y_pred))

Accuracy:  0.6936619718309859
Precision:  0.6815286624203821
Recall:  0.7430555555555556
F1:  0.7109634551495018
Confusion Matrix: 
 [[180 100]
 [ 74 214]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.64      0.67       280
           1       0.68      0.74      0.71       288

    accuracy                           0.69       568
   macro avg       0.70      0.69      0.69       568
weighted avg       0.69      0.69      0.69       568

