<a href="https://colab.research.google.com/github/bodamohannaik/keras_code_examples/blob/master/nlp/keras_text_classification_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import string
import tensorflow as tf

# Download IMDB movie reviews

Dataset citation:

```bibtex
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
```

In [None]:
# Download and unpack the IMDB review archive into ./aclImdb.
# -f: fail on HTTP errors instead of saving an error page as the archive;
# -L: follow redirects in case the file has moved.
!curl -fLO https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  29.3M      0  0:00:02  0:00:02 --:--:-- 29.3M


In [None]:
# List the top-level contents (files and subfolders) of the extracted dataset directory
!ls aclImdb

imdbEr.txt  imdb.vocab	README	test  train


In [None]:
# Print the README shipped with the dataset (describes its contents and labeling)
!cat aclImdb/README

Large Movie Review Dataset v1.0

Overview

This dataset contains movie reviews along with their associated binary
sentiment polarity labels. It is intended to serve as a benchmark for
sentiment classification. This document outlines how the dataset was
gathered, and how to use the files provided. 

Dataset 

The core dataset contains 50,000 reviews split evenly into 25k train
and 25k test sets. The overall distribution of labels is balanced (25k
pos and 25k neg). We also include an additional 50,000 unlabeled
documents for unsupervised learning. 

In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their associated with observed labels.  In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a scor

In [None]:
# Remove the unlabeled reviews so that only the labeled neg/pos folders remain
# under aclImdb/train for the directory-based loader.
# -f keeps this cell re-runnable when the folder has already been deleted.
!rm -rf aclImdb/train/unsup

# Load DataSet

In [None]:
# Keyword arguments shared by all three loaders: binary labels, an explicit
# class order of ['neg', 'pos'], batches of 32 reviews, and a fixed seed.
common_loader_args = dict(label_mode='binary', class_names=['neg', 'pos'], batch_size=32, seed=0)

# aclImdb/train is split 80/20 into training and validation subsets; both calls
# use the same seed and validation_split.
dataset_train = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset="training", validation_split=0.2, **common_loader_args)
dataset_val = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset="validation", validation_split=0.2, **common_loader_args)
# aclImdb/test is loaded in full as the held-out test set.
dataset_test = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test', **common_loader_args)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# View up to 10 raw reviews with their labels from the first training batch.
for sample_reviews, labels in dataset_train.take(1):
  # Convert to NumPy before formatting: concatenating str(tensor) with a string
  # tensor prints a nested 'tf.Tensor(b"tf.Tensor(...' repr with escaped bytes
  # instead of readable text.
  for review, label in zip(sample_reviews.numpy()[:10], labels.numpy()[:10]):
    print("-"*80)
    # Reviews arrive as UTF-8 bytes; the label is a single float (0. or 1.).
    print(f"{int(label.item())}: {review.decode('utf-8', errors='replace')}")

--------------------------------------------------------------------------------
tf.Tensor(b"tf.Tensor([0.], shape=(1,), dtype=float32):Violence whether real or not always has an impact. In this film the violence is about as crass as you could ask for. In the Great Ecstacy the director has successfully demonstrated what extremes of violence people are capable of. But what was the point? The violence looks like a mix of No\xc3\xab's 'Irreversible, and ' Kubrick's 'Clockwork Orange'...both of which are remarkable films. Don't get me wrong, I'm not opposed to screen violence at all and I've seen some nasty stuff in my film-going years, but this film as a whole is totally juvenile. The story is never developed enough to offer any reason for the extreme violence, the rizla paper thin reason we are give for Robert's demise is his introduction to drugs. Danny Dyer plays the character who is partly responsible for Robert's drug fuelled demise, however he is on screen for less than 5 minutes. L

# Vectorization

In [None]:
def custom_standardize(text):
  """Normalize raw review text before tokenization.

  Lowercases the text, replaces HTML tags (e.g. ``<br />``) with a space,
  removes ASCII punctuation, and normalizes whitespace.

  Args:
    text: A Python string or a string tensor accepted by ``tf.strings`` ops.

  Returns:
    A string tensor holding the cleaned text.
  """
  text = tf.strings.lower(text)
  # Substitute a space (not an empty string) for each tag so the words on either
  # side of it are not fused: "bad.<br /><br />The" must not become "badthe".
  text = tf.strings.regex_replace(input = text, pattern = "<.+?>", rewrite=" ")
  text = tf.strings.regex_replace(input = text, pattern =f"[{re.escape(string.punctuation)}]", rewrite="")
  # Collapse the whitespace runs left behind by the removals above and trim the ends.
  text = tf.strings.regex_replace(input = text, pattern = r"\s+", rewrite=" ")
  return tf.strings.strip(text)

In [None]:
# Sanity check: run the standardizer on a snippet containing HTML line breaks, capitals and punctuation
custom_standardize('the worst ones ever made. <br /><br />This is a movie that doesn\'t keep its promises.')

<tf.Tensor: shape=(), dtype=string, numpy=b'the worst ones ever made this is a movie that doesnt keep its promises'>

In [None]:
# Upper bound on the vocabulary size kept by the vectorizer
max_features = 20000
# Number of token ids produced per review
output_sequence_length = 500
vectorization_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardize,
    split='whitespace',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=output_sequence_length,
)
# Learn the vocabulary from the training reviews only; labels are dropped first.
train_texts_only = dataset_train.map(lambda review_batch, _label_batch: review_batch)
vectorization_layer.adapt(train_texts_only)

In [None]:
# Report how many tokens the adapted vectorizer holds
print(f"vocabulary size: {vectorization_layer.vocabulary_size()}")

vocabulary size: 20000


In [None]:
# Sanity check: vectorize one review snippet and inspect the resulting integer token ids
vectorization_layer(['the worst ones ever made. <br /><br />This is a movie that doesn\'t keep its promises.'])

<tf.Tensor: shape=(1, 500), dtype=int64, numpy=
array([[   2,  239,  524,  121,   90,   10,    7,    4,   17,   12,  144,
         379,   29, 4528,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

# Build Model

In [None]:
# Raw review strings are fed straight into the model
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)

# Strings -> integer token ids, using the adapted vectorizer
token_ids = vectorization_layer(inputs)

# Token ids -> 128-dimensional embeddings
embedded = tf.keras.layers.Embedding(input_dim=max_features, output_dim=128)(token_ids)

# Two strided 1D convolutions that share every setting except the filter count
conv_args = dict(kernel_size=7, padding="valid", strides=3, activation='relu')
conv_features = tf.keras.layers.Conv1D(filters=64, **conv_args)(embedded)
conv_features = tf.keras.layers.Conv1D(filters=32, **conv_args)(conv_features)
conv_features = tf.keras.layers.Dropout(rate=.5)(conv_features)

# Max over the sequence axis gives one vector per review
pooled = tf.keras.layers.GlobalMaxPooling1D()(conv_features)

# Dense head ending in a single sigmoid unit
hidden = tf.keras.layers.Dense(units=128, activation='relu')(pooled)
hidden = tf.keras.layers.Dropout(rate=.5)(hidden)
probability = tf.keras.layers.Dense(units=1, activation='sigmoid')(hidden)

# Assemble the end-to-end model and print its layer summary
model = tf.keras.Model(inputs=inputs, outputs=probability)
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 500)              0         
 torization)                                                     
                                                                 
 embedding_7 (Embedding)     (None, 500, 128)          2560000   
                                                                 
 conv1d_11 (Conv1D)          (None, 165, 64)           57408     
                                                                 
 conv1d_12 (Conv1D)          (None, 53, 32)            14368     
                                                                 
 dropout_1 (Dropout)         (None, 53, 32)            0         
                                                           

In [None]:
# Binary cross-entropy pairs with the model's single sigmoid output unit.
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
# Keep the History object so per-epoch metrics can be inspected or plotted later;
# assigning it also stops its bare repr from being echoed as the cell output.
history = model.fit(dataset_train, epochs = 10, validation_data = dataset_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efd976b2d50>

In [None]:
# Evaluate on the test dataset; the result lists the loss followed by the compiled 'accuracy' metric
model.evaluate(dataset_test)



[0.7486086487770081, 0.8207600116729736]