<a href="https://colab.research.google.com/github/andy8744/tensorflow-certification-cheat-sheet/blob/main/02_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### EDA

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use("dark_background")

In [None]:
# Count how many rows carry each distinct `target` value.
train_df["target"].value_counts()

In [None]:
# Bar chart of the per-class row counts (the trailing `;` suppresses the cell's text output).
train_df["target"].value_counts().plot(kind="bar");

In [None]:
# Summary statistics for the `target` column.
train_df["target"].describe()

In [None]:
# Report the row count of each frame and their combined total, one per line.
print(
    f"Total training samples: {len(train_df)}",
    f"Total test samples: {len(test_df)}",
    f"Total samples: {len(test_df) + len(train_df)}",
    sep="\n",
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): `train_df_shuffled` is not defined in any visible cell —
# presumably a shuffled copy of `train_df`; confirm before running.
(train_sentences,
 val_sentences,
 train_labels,
 val_labels) = train_test_split(
    train_df_shuffled["text"].values,
    train_df_shuffled["target"].values,
    test_size=0.1,
    random_state=42,
)

Get the length (in words) of each sentence

In [None]:
# Number of whitespace-separated tokens in each training sentence, and their mean.
sent_lens = [len(text.split()) for text in train_sentences]
avg_sent_len = np.asarray(sent_lens).mean()

In [None]:
# Distribution of sentence lengths across 100 bins (`;` hides the returned arrays).
plt.hist(x=sent_lens, bins=100);

In [None]:
# Sentence length at the 95th percentile, truncated to an integer:
# 95% of the training sentences are at most about this long.
output_seq_len = int(np.percentile(sent_lens, q=95))
output_seq_len

### Encode labels as integers (if needed)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label mapping from the training targets, then reuse that same
# fitted encoder for the validation and test targets.
# NOTE(review): `val_df` is not defined in any visible cell — confirm where it
# comes from before running.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(frame["target"].to_numpy())
    for frame in (val_df, test_df)
)

In [None]:
# Classes seen by the fitted encoder, and how many there are.
class_names = label_encoder.classes_
num_classes = len(class_names)
num_classes, class_names

### Vectorizer layer

In [None]:
max_vocab_length = 10000  # passed to TextVectorization as `max_tokens`
max_length = 15  # passed to TextVectorization as `output_sequence_length`

text_vectorizer = layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
)

# Build the vocabulary from the training sentences.
text_vectorizer.adapt(train_sentences)

### Model (LSTM)

In [None]:
# Embedding table sized to the vectorizer's vocabulary cap, 128 values per token.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

# Functional-API pipeline: raw string -> token ids -> embeddings -> LSTM -> sigmoid.
inputs = layers.Input(shape=(1,), dtype="string")
token_ids = text_vectorizer(inputs)
embedded = embedding(token_ids)
encoded = layers.LSTM(64)(embedded)
outputs = layers.Dense(1, activation="sigmoid")(encoded)

model = keras.Model(inputs, outputs)

### Model (Conv1D)

In [None]:
# Embedding table sized to the vectorizer's vocabulary cap, 128 values per token.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

# Sequential pipeline: raw string -> token ids -> embeddings -> Conv1D ->
# global max pool -> dropout -> sigmoid.
model = keras.Sequential([
    layers.Input(shape=(1,), dtype="string"),
    text_vectorizer,
    embedding,  # was misspelled `emedding`, which raised NameError
    layers.Conv1D(32, 5, activation="relu"),
    layers.GlobalMaxPool1D(),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid"),
])

### Hub layer with Universal Sentence Encoder

In [None]:
# `hub` is used below but is not imported by any other visible cell.
import tensorflow_hub as hub

# Universal Sentence Encoder loaded from TF Hub; `trainable=False` keeps its
# weights frozen, so only the dense head below is trained.
hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    input_shape=[],
    dtype=tf.string,
    trainable=False,
)

# Two Dense -> BatchNorm -> Dropout stages followed by a sigmoid output unit.
model = keras.Sequential([
    hub_layer,

    layers.Dense(32, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.5),

    layers.Dense(10, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.5),

    layers.Dense(1, activation="sigmoid"),
])

In [None]:
# `hub` is used below but is not imported by any other visible cell.
import tensorflow_hub as hub

# Alternative TF Hub text encoder (NNLM); `trainable=True` lets its weights be
# updated during training.
hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/nnlm-en-dim128/2",
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

### Compile and fit

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import ModelCheckpoint

# Binary classification setup: binary cross-entropy loss, Adam optimizer,
# accuracy reported as the metric.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# All three callbacks monitor `val_loss`.
callbacks = [
    # Stop after 5 epochs without improvement and restore the best weights.
    EarlyStopping(patience=5, monitor='val_loss', restore_best_weights=True),
    # Multiply the learning rate by 0.1 after 2 epochs without improvement,
    # never going below 1e-7.
    ReduceLROnPlateau(monitor='val_loss', min_lr=1e-7, patience=2, mode='min', verbose=1, factor=0.1),
    # Save the model to disk only when `val_loss` improves.
    ModelCheckpoint(monitor='val_loss', filepath='./best_model.h5', save_best_only=True),
]

# `callbacks` is already a list, so pass it directly instead of wrapping it
# in a second list.
model.fit(
    train_sentences,
    train_labels,
    epochs=30,
    validation_data=(val_sentences, val_labels),
    callbacks=callbacks,
)

### Evaluation

In [None]:
def evaluate_model(ytrue, ypred, cm=False):
  """Compute classification metrics for a set of predictions.

  Args:
    ytrue: Ground-truth labels.
    ypred: Predicted labels, in the same order as `ytrue`.
    cm: When truthy, also plot a confusion matrix (blue colour map).

  Returns:
    A dict with the keys "accuracy", "precision", "recall" and "f1".
    Precision, recall and F1 use the "weighted" average.
  """
  # Imported here because the rest of the file only uses
  # `from sklearn.<module> import ...`, which never binds the name `sklearn`.
  import sklearn.metrics

  accuracy = sklearn.metrics.accuracy_score(ytrue, ypred)
  # The fourth returned value (support) is not needed.
  precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
      ytrue, ypred, average="weighted")

  if cm:
    confusion_matrix = sklearn.metrics.confusion_matrix(ytrue, ypred)
    disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
    disp.plot(cmap=plt.cm.Blues)

  return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Round the sigmoid outputs to 0/1 and drop the trailing unit dimension.
# Stored as `model_preds` so the trained `model` is not overwritten.
model_preds = tf.squeeze(tf.round(model.predict(val_sentences)))
model_results = evaluate_model(val_labels, model_preds)

model_results

### Text cleaning

TODO: find a way to apply this text cleaning inside a TensorFlow layer.

In [None]:
import neattext as nt

def preprocess_text(text):
    """Lower-case *text* and clean it with a chain of neattext steps.

    The steps run in this order: remove e-mails, URLs, emojis, punctuation,
    stop words and special characters, then fix contractions. The cleaned
    result is returned as a plain ``str``.
    """
    frame = nt.TextFrame(text.lower())
    frame = frame.remove_emails()
    frame = frame.remove_urls()
    frame = frame.remove_emojis()
    frame = frame.remove_puncts()
    frame = frame.remove_stopwords()
    frame = frame.remove_special_characters()
    # NOTE(review): contractions are fixed only after punctuation has been
    # removed — confirm apostrophes survive `remove_puncts`, otherwise this
    # step may have nothing left to expand.
    frame = frame.fix_contractions()
    return str(frame)

# Clean every training text in place, one row at a time.
train_df["text"] = train_df["text"].apply(preprocess_text)

In [None]:
### https://www.tensorflow.org/tutorials/load_data/text