## Import

In [1]:
import tensorflow as tf

import numpy as np
import pandas as pd
import os

# Split data
from sklearn.model_selection import train_test_split

# Naive Bayes (build a non-DL baseline model)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

## Functions

In [12]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Computes accuracy, precision, recall and f1 score for a set of predictions.

  Precision, recall and f1 are aggregated with average="weighted".

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns:
      A dictionary with the keys "accuracy", "precision", "recall" and "f1".
      Note that accuracy is multiplied by 100, while the other three values
      are returned exactly as precision_recall_fscore_support produces them.
  """
  # Accuracy is reported on a 0-100 scale
  accuracy_pct = accuracy_score(y_true, y_pred) * 100

  # Only the first three entries are needed; the fourth entry is not used
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  precision, recall, f1 = weighted_scores[:3]

  return {
      "accuracy": accuracy_pct,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [2]:
import matplotlib.pyplot as plt

def plot_loss_curves(history):
  """
  Plots training and validation curves on two separate figures: one for loss, one for accuracy.

  Args:
    history: TensorFlow model History object (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History).
      Its `history` dict must contain the keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.

  Returns:
    None. The figures are drawn through matplotlib.pyplot.
  """
  metrics = history.history

  loss = metrics['loss']
  val_loss = metrics['val_loss']

  accuracy = metrics['accuracy']
  val_accuracy = metrics['val_accuracy']

  # One x-axis position per recorded epoch
  epochs = range(len(loss))

  # Plot loss on a figure of its own. Without an explicit new figure the curves
  # would be drawn onto whichever figure is currently active, e.g. the accuracy
  # figure left over from a previous call to this function in the same cell.
  plt.figure()
  plt.plot(epochs, loss, label='training_loss')
  plt.plot(epochs, val_loss, label='val_loss')
  plt.title('Loss')
  plt.xlabel('Epochs')
  plt.legend()

  # Plot accuracy on a second figure
  plt.figure()
  plt.plot(epochs, accuracy, label='training_accuracy')
  plt.plot(epochs, val_accuracy, label='val_accuracy')
  plt.title('Accuracy')
  plt.xlabel('Epochs')
  plt.legend()

## Download the Kaggle NLP-Getting-Started Dataset

Text samples of tweets, each labelled as either disaster or not disaster.

In [3]:
# Location of the zipped dataset
url = 'https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip'

# Download the archive and extract it; the returned local path is displayed below
zip_dir = tf.keras.utils.get_file(
    origin=url,
    extract=True,
)
zip_dir

'C:\\Users\\bruce\\.keras\\datasets\\nlp_getting_started.zip'

In [4]:
# The CSV files are read from the directory that contains the downloaded archive
base_dir = os.path.dirname(zip_dir)
train_csv = os.path.join(base_dir, 'train.csv')
test_csv = os.path.join(base_dir, 'test.csv')

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# value_counts() is indexed by the actual label values, so iterate over its
# (label, count) pairs rather than relying on enumerate() positions happening
# to coincide with the labels
for label, count in train_df.target.value_counts().items():
    print(f'There are {count} sentences labelled as {label} in Training dataset')

# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42) # frac=1 keeps 100% of the rows

# Split 80% Training data and 20% Validation data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled['text'].to_numpy(),
    train_df_shuffled['target'].to_numpy(),
    train_size=0.8,
    random_state=42
)

print('---')
print('Number of Training data:', len(train_sentences))
print('Number of Validation data:', len(val_sentences))
print('Number of Testing data:', len(test_df))

There are 4342 sentences labelled as 0 in Training dataset
There are 3271 sentences labelled as 1 in Training dataset
---
Number of Training data: 6090
Number of Validation data: 1523
Number of Testing data: 3263


## Tokenization/Vectorization

In [5]:
# Average number of whitespace-separated tokens per training tweet, rounded;
# used below as the fixed output sequence length
max_len = round(
    sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences)
)

max_vocab_len = 10000

text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_len, # maximum vocabulary size
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None, # no n-gram grouping
    output_mode='int', # map each token to an integer index
    output_sequence_length=max_len, # length of every output sequence
    pad_to_max_tokens=True
)

# Build the vocabulary from the training text only
text_vectorizer.adapt(train_sentences)

# Inspect both ends of the learned vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
print(f'The most common 5 words are: {[words_in_vocab[:5]]}')
print(f'The least common 5 words are: {[words_in_vocab[-5:]]}')

The most common 5 words are: [['', '[UNK]', 'the', 'a', 'in']]
The least common 5 words are: [['minded', 'mindblowing', 'milne', 'milledgeville', 'millcityio']]


## Build a non-DL baseline model with Sklearn

A Scikit-Learn Multinomial Naive Bayes classifier trained on TF-IDF features - https://scikit-learn.org/stable/modules/naive_bayes.html

In [7]:
# Two-step pipeline: TF-IDF text features followed by a Multinomial Naive Bayes classifier
baseline_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()), # turn raw text into TF-IDF features
    ('clf', MultinomialNB()), # classifier fitted on those features
])

# Train on the training split
baseline_model.fit(train_sentences, train_labels)

## Evaluate the model

In [8]:
# Score the fitted baseline on the validation split
baseline_eval = baseline_model.score(X=val_sentences, y=val_labels)
print('Accuracy:', baseline_eval)

Accuracy: 0.799080761654629


## Predictions

In [11]:
# Predict labels for the validation sentences and preview the first 20
baseline_preds = baseline_model.predict(X=val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [13]:
# Summarise the baseline predictions against the validation labels
baseline_results = calculate_results(y_true=val_labels, y_pred=baseline_preds)
baseline_results

{'accuracy': 79.9080761654629,
 'precision': 0.8146358812834972,
 'recall': 0.799080761654629,
 'f1': 0.7920155324845473}