# Text sentiment classification starting from raw text files.

## Imports

In [23]:
import os

os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
import tensorflow as tf
import numpy as np
from keras import layers

## Load the data: IMDB movie review sentiment classification

In [15]:
# Download the IMDB review archive (about 80 MB) into the working directory,
# then unpack it; extraction creates the aclImdb/ folder inspected below.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  3118k      0  0:00:26  0:00:26 --:--:-- 3250k


In [16]:
# List the extracted archive: aclImdb/ holds the train/ and test/ subfolders
# alongside vocabulary and README files.
!ls aclImdb

imdbEr.txt  imdb.vocab	README	test  train


In [17]:
# List the test split: the neg/ and pos/ folders are the two classes
!ls aclImdb/test

labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt


In [18]:
# List the train split: besides neg/ and pos/ it also ships an unlabeled unsup/ folder
!ls aclImdb/train

labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt


In [19]:
# The aclImdb/train/pos and aclImdb/train/neg folders hold the positive and
# negative reviews as plain-text files. Print one positive example.
!cat aclImdb/train/pos/11558_10.txt

"The Odd Couple" is one of those movies that far surpasses its reputation. People all know it, they hum the theme song, they complain of living with a sloppy "Oscar" or a fastidious "Felix"...but they're under-selling the film without knowing it. This isn't just about a neat guy living with a sloppy guy; it's a portrait of two friends helping each other through the agony of divorce. It's also damn funny from start to finish, but it's the kind of comedy that arises from realistic, stressful, and just plain awful situations. So, some viewers have actually found the film to be a bit uncomfortable, but I think its verisimilitude is its strength. Besides, Matthau's bulldog face just cracks me up! My favorite comedy, by a country mile.

In [20]:
# Print one negative example for comparison
!cat aclImdb/train/neg/11008_1.txt

I bought this at tower records after seeing the info-mercial about fifteen hundred times on comedy central. I was actually really looking forward to watching this. My god where did i go wrong? Now before i give my review let me just say that i am a person who can pretty much find the good in all movies, hell i own over 1,500 dvd's! With that said, the underground comedy movie ranks up there with the worst film i have EVER seen. I tried to give it a chance, but not only was it not funny. It had no point, did not offend what-so-ever and was all around stupid. God who in their right mind thought these pieces of crap were funny? this is going right to the bottom of the bin...

In [21]:
# Keeping pos and neg subfolders only (remove unsup)
!rm -r aclImdb/train/unsup

## Splitting the data

In [24]:
batch_size = 32

# One seed shared by the training and validation calls. `validation_split`
# partitions the (shuffled) file list, so both subsets must be built with the
# same seed; different seeds shuffle the files differently and let validation
# reviews leak into the training subset.
split_seed = 1337

# `keras.utils.text_dataset_from_directory` generates a labeled
# tf.data.Dataset from a set of text files on disk filed into
# class-specific folders.
raw_train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=split_seed,
)

raw_val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=split_seed,
)

# The test split lives in its own directory and is used in full
raw_test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)

print(f'Number of batches in raw_train_ds: {raw_train_ds.cardinality()}')
print(f'Number of batches in raw_val_ds: {raw_val_ds.cardinality()}')
print(f'Number of batches in raw_test_ds: {raw_test_ds.cardinality()}')

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


In [26]:
# Preview the first three reviews (and their labels) of one training batch
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of once per printed line
    reviews = text_batch.numpy()
    labels = label_batch.numpy()
    for position in range(1, 4):
        print(f'Review {position}: {reviews[position - 1]}')
        print(f'Label {position}: {labels[position - 1]}')

Review 1: b'I am very disappointed with "K-911." The original "good" quality of "K-9" doesn\'t exist any more. This is more like a sitcom! Some of casts from original movie returned and got some of my memory back. The captain of Dooley now loves to hit him like a scene from old comedy show. That was crazy. What\'s the deal with the change of Police? It seems like they are now LAPD! Not San Diego PD. It is a completely different movie from "'
Label 1: 0
Review 2: b"Giallo fans, seek out this rare film. It is well written, and full of all sorts of the usual low lifes that populate these films. I don't want to give anything away, so I wont even say anything about the plot. The whole movie creates a very bizarre atmosphere, and you don't know what to expect or who to suspect. Recommended! The only place I've seen to get this film in english is from European Trash Cinema, for $15."
Label 2: 1
Label 3: 1


## Data Preparation

In [27]:
## Let's remove <br /> tags
import string
import re

# Custom standardization used by the text vectorization layer
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies three steps in order: lowercase the text, replace every
    literal "<br />" tag with a single space, and delete every character
    listed in `string.punctuation`.

    Args:
        input_data: string tensor of raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    # Character class matching any single punctuation mark
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    cleaned = tf.strings.lower(input_data)
    cleaned = tf.strings.regex_replace(cleaned, "<br />", " ")
    cleaned = tf.strings.regex_replace(cleaned, punctuation_pattern, "")
    return cleaned

# Model constantans
max_features = 20000
embedding_dim = 128
sequence_length = 500

# Instantiate our text vectorization layer
vectorize_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Make a text only dataset (No labels)
text_ds = raw_train_ds.map(lambda x, y: x)

# Let's call adapt to build the vocabulary
vectorize_layer.adapt(text_ds)

## Vectorize the Data

In [28]:
# Mapping function that turns (raw text, label) into (token ids, label)
def vectorize_text(text, label):
    """Vectorize a batch of raw review strings, passing the labels through.

    Args:
        text: string tensor of raw reviews.
        label: labels that belong to `text`; returned unchanged.

    Returns:
        A `(token_ids, label)` tuple, where `token_ids` is the output of
        `vectorize_layer` for the given text.
    """
    # Add a trailing axis so every review becomes its own one-element row
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# For every split: vectorize the text, cache the vectorized batches, and
# prefetch asynchronously (buffer of 10 batches).
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

## Build the simple 1D convnet Model

In [29]:
# An integer input holding vocab indices
inputs = keras.Input(shape=(None,), dtype='int64')

# Map vocab indices into a space of dimensionality `embedding_dim`
x = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
x = layers.Dropout(rate=0.5)(x)

# Two identical strided Conv1D layers followed by global max pooling
for _ in range(2):
    x = layers.Conv1D(
        filters=128,
        kernel_size=7,
        strides=3,
        padding='valid',
        activation='relu',
    )(x)
x = layers.GlobalMaxPooling1D()(x)

# Fully connected hidden layer on top of the pooled features
x = layers.Dense(units=128, activation='relu')(x)
x = layers.Dropout(rate=0.5)(x)

# Single sigmoid unit: one probability-like score per review
predictions = layers.Dense(units=1, activation='sigmoid', name='predictions')(x)

model = keras.Model(inputs=inputs, outputs=predictions)

# Compile with binary cross-entropy loss and the Adam optimizer
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Train the Model

In [30]:
epochs = 10

# Train on the vectorized training set while monitoring the validation set
model.fit(x=train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78a5acb344f0>

## Evaluate the model on test dataset

In [31]:
# Returns [loss, accuracy] measured on the vectorized test set
model.evaluate(x=test_ds)



[0.8893537521362305, 0.8583999872207642]

## End-to-end Model
- Capable of preprocessing raw strings
- Creates a new model that reuses the trained weights of the model above

In [32]:
# The end-to-end model takes a raw string as input
inputs = keras.Input(shape=(1,), dtype='string')

# Strings -> vocab indices (vectorization layer) -> predictions (trained model)
indices = vectorize_layer(inputs)
outputs = model(indices)

# Wrap both steps in a single model; it reuses the trained `model` as a layer
end_to_end_model = keras.Model(inputs=inputs, outputs=outputs)
end_to_end_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Evaluate directly on the raw (un-vectorized) test dataset
end_to_end_model.evaluate(x=raw_test_ds)



[0.8893529176712036, 0.8583999872207642]