In [None]:
# Multi-class text classifier: predicts one of 4 classes for Stack Overflow questions
import matplotlib.pyplot as plt
import os 
import re
import shutil
import string
import numpy as np
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

# Report the installed TensorFlow version for reproducibility
print(tf.__version__)

2.7.0


In [None]:
# Download and extract the dataset from an online source.
# The archive unpacks its train/ and test/ folders straight into the cache
# directory (it has no top-level folder of its own), so extract into a
# dedicated stack_overflow/ directory and treat that directory as the root.
url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

dataset = tf.keras.utils.get_file("stack_overflow_16k", url, untar = True, cache_dir = 'stack_overflow', cache_subdir = '')

# get_file returns a path inside the cache directory; its parent is the dataset root
dataset_dir = os.path.dirname(dataset)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


In [None]:
# Inspect the extracted dataset. Only the last expression of a cell is
# displayed, so print the top-level listing explicitly.
print(os.listdir(dataset_dir))
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

FileNotFoundError: ignored

In [None]:
# Print one training example. Pick the first file of the first class folder
# instead of hard-coding a file name that may not exist in this dataset.
class_dirs = sorted(
  name for name in os.listdir(train_dir)
  if os.path.isdir(os.path.join(train_dir, name))
)
sample_class_dir = os.path.join(train_dir, class_dirs[0])
sample_file = os.path.join(sample_class_dir, sorted(os.listdir(sample_class_dir))[0])
# Read as UTF-8 explicitly so the result does not depend on the platform locale
with open(sample_file, encoding = 'utf-8') as f:
  print(f.read())

In [None]:
# Remove additional folders that are not class directories.
# Guarded so the cell is a no-op when the folder is absent (including re-runs),
# rather than raising FileNotFoundError.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [None]:
# Divide the dataset into train, validation, and test sets

batch_size = 32
seed = 42

# Read from train_dir (defined when the dataset was extracted) rather than a
# second hard-coded copy of the path, so the location is set in one place.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size = batch_size,
    validation_split = 0.2, # use 80% of the examples in the training folder for training
    subset = 'training',
    seed = seed # must match the seed of the validation subset so the two splits do not overlap
)




In [None]:
# Show the class name behind every integer label, not only label 0
for label_index, class_name in enumerate(raw_train_ds.class_names):
  print("Label", label_index, "corresponds to", class_name)

In [None]:
# Validation split: the remaining 20% of the training folder.
# Uses train_dir and the same seed/validation_split as the training subset so
# both subsets are carved from one consistent shuffle of the same files.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size = batch_size,
    validation_split = 0.2,
    subset = 'validation',
    seed = seed
)

In [None]:
# Held-out test set; the path is derived from dataset_dir like train_dir is,
# instead of repeating the dataset location as a literal.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size = batch_size
)

In [None]:
#Standardize, tokenize, and vectorize data
#All three steps are handled by the tf.keras.layers.TextVectorization layer.
#Its default standardization does not remove HTML tags, so a custom function is used instead.

def custom_standardization(input_data):
  """Lowercase the text, turn '<br />' into a space, and delete punctuation.

  Passed to TextVectorization as its `standardize` callable.
  """
  # Character class matching any single ASCII punctuation character
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [None]:
max_features = 1000    # vocabulary size cap passed to the layer as max_tokens
sequence_length = 250  # every example is padded/truncated to this many tokens

# Text -> integer token ids, using the custom standardization defined above
vectorize_layer = layers.TextVectorization(
    output_mode = 'int',
    output_sequence_length = sequence_length,
    max_tokens = max_features,
    standardize = custom_standardization
)


In [None]:
# Drop the labels to get a text-only view of the training data
train_text = raw_train_ds.map(lambda text, label: text)
# adapt() makes the layer build its string -> integer vocabulary index
vectorize_layer.adapt(train_text)

def vectorize_text(text, label):
  """Run `text` through vectorize_layer and return (token_ids, label).

  A trailing axis is added to `text` before the layer is applied; `label`
  is passed through unchanged.
  """
  expanded_text = tf.expand_dims(text, -1)
  token_ids = vectorize_layer(expanded_text)
  return token_ids, label

In [None]:
# Tokenize the first example of the first training batch
first_batch = next(iter(raw_train_ds))
text_batch, label_batch = first_batch
first_review = text_batch[0]
first_label = label_batch[0]

print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

In [None]:
# Apply the TextVectorization layer to the train, validation, and test datasets
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
# Cache each dataset after its first pass and prefetch upcoming batches;
# AUTOTUNE lets tf.data choose the prefetch buffer size
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache()
train_ds = train_ds.prefetch(buffer_size = AUTOTUNE)

val_ds = val_ds.cache()
val_ds = val_ds.prefetch(buffer_size = AUTOTUNE)

test_ds = test_ds.cache()
test_ds = test_ds.prefetch(buffer_size = AUTOTUNE)

In [None]:
# Build the model layer by layer
embedding_dim = 16

model = tf.keras.Sequential()
# One embedding vector per token id (max_features + 1 rows)
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))  # fraction of the input units to drop
# Average over the sequence axis
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
# Four raw outputs (logits); no activation is applied here
model.add(layers.Dense(4))

model.summary()

In [None]:
# Loss function and optimizer.
# from_logits = True because the final Dense layer has no activation.
model.compile(
    optimizer = 'adam',
    loss = losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = ['accuracy']
)

In [None]:
# Train the model, evaluating on the validation set after each epoch
epochs = 10
history = model.fit(
    train_ds,
    epochs = epochs,
    validation_data = val_ds
)

In [None]:
# Score the trained model on the held-out test set
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
# Prepare to plot accuracy and loss over time.
# model.fit() returns a History object whose `history` attribute is a dictionary
# of the values recorded during training.
history_dict = history.history
# List the recorded series; 'accuracy', 'val_accuracy', 'loss' and 'val_loss' are read from it below
history_dict.keys()

In [None]:
# Per-epoch curves recorded by model.fit()
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# x-axis: 1-based epoch numbers, one per recorded value
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
# "b" is for "solid blue line"
plt.plot(epochs, loss, 'bo', label = 'Training loss')
plt.plot(epochs, val_loss, 'b', label = 'Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
# Call show(); a bare `plt.show` only evaluates to the function object
plt.show()

In [None]:
# Accuracy per epoch: training as blue dots, validation as a solid blue line
plt.plot(epochs, acc, 'bo', label = 'Training acc')
plt.plot(epochs, val_acc, 'b', label = 'Validation acc')

plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation accuracy')

plt.legend(loc = 'lower right')
plt.show()

In [None]:
# Bundle the preprocessing layer with the trained model so the result accepts
# raw strings. `model` ends in Dense(4) and was trained with sparse categorical
# cross-entropy on logits, so convert the logits to class probabilities with
# softmax and score them with the same loss (from_logits = False, because the
# activation is now applied inside the model). A sigmoid + binary
# cross-entropy head only fits a single-output binary classifier.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('softmax')
])
export_model.compile(
    loss = losses.SparseCategoricalCrossentropy(from_logits = False), optimizer = "adam", metrics = ['accuracy']
)

# Evaluate on the raw (un-vectorized) test strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

In [None]:
# Inference on new data.
# The model was trained on Stack Overflow questions, so probe it with
# programming questions rather than movie reviews.
examples = [
  "how do I extract keys from a dict into a list?",
  "how do I add a click event listener to a button in the browser?",
  "why do I get a null pointer exception when calling a method on my object?"
]
# Wrap in a string tensor so predict() receives an unambiguous input type
predictions_ = export_model.predict(tf.constant(examples))
print(predictions_)

# Map each row of scores to the name of its highest-scoring class
predicted_ids = np.argmax(predictions_, axis = -1)
for example, class_id in zip(examples, predicted_ids):
  print(raw_train_ds.class_names[class_id], "<-", example)
