<a href="https://colab.research.google.com/github/andy8744/tensorflow-certification-cheat-sheet/blob/main/00_Useful_Code_Snippets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset Manipulation

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples as the test set (test_size=0.1).
# random_state pins the shuffle so every run produces the same split,
# matching the seeded RandomOverSampler used later in this notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### Convert to One Hot Encoding



keras

In [None]:
# One-hot encode the class labels in y with Keras.
# NOTE: y is overwritten, so re-running this cell would encode the
# already-encoded array a second time.
from keras.utils import to_categorical
y = to_categorical(y)

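sklearn (`LabelEncoder` produces integer class ids, not one-hot vectors; follow it with a one-hot step such as `tf.one_hot` below)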

In [None]:
# LabelEncoder maps each distinct target value to an integer class id;
# it does not produce one-hot vectors by itself.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Fit the encoder on the "target" column and encode it in one step.
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())

tensorflow

In [None]:
# One-hot encode the integer labels with depth 3 and convert the tensor to a NumPy array.
test_labels = tf.one_hot(test_encoded_labels, 3).numpy() # 3 classes: neutral, negative, positive

pandas

In [None]:
# One-hot encode the categorical columns of df.
# The result is only displayed here, not assigned — capture it to keep it.
pd.get_dummies(df)

### Pandas Mapping

In [None]:
# Translate the "housing" column's yes/no strings to 1/0.
# Values that are missing from the mapping come back as NaN.
sample["housing"].map({"yes": 1, "no": 0})

### Combining features and labels to a tf dataset

In [None]:
# Wrap each array in its own dataset; from_tensor_slices yields one element
# per slice along the first axis of its argument.
train_features_dataset = tf.data.Dataset.from_tensor_slices(X_train)
train_labels_dataset = tf.data.Dataset.from_tensor_slices(Y_train)

test_features_dataset = tf.data.Dataset.from_tensor_slices(X_test)
test_labels_dataset = tf.data.Dataset.from_tensor_slices(Y_test)

# Pair features with labels element-by-element -> each element is (features, label).
train_dataset = tf.data.Dataset.zip((train_features_dataset, train_labels_dataset))
test_dataset = tf.data.Dataset.zip((test_features_dataset, test_labels_dataset))

### TensorFlow Datasets (TFDS)

In [None]:
# Load the train and test splits as (sample, label) tuples, plus the dataset metadata.
# NOTE(review): the 'subwords8k' config may no longer be available in recent
# TFDS releases — confirm before relying on it.
(train_data, test_data), info = tfds.load('imdb_reviews/subwords8k',
                                         split=(tfds.Split.TRAIN, tfds.Split.TEST),
                                         with_info=True, as_supervised=True)

In [None]:
# Load Food101; the "validation" split is bound to test_data here.
(train_data, test_data), ds_info = tfds.load(name="food101", # target dataset to get from TFDS
                                             split=["train", "validation"], # which splits to load; note: not all datasets have train, valid, test
                                             shuffle_files=True, # shuffle the order of the input files when reading
                                             as_supervised=True, # return elements as (sample, label) tuples, e.g. (image, label)
                                             with_info=True) # also return dataset metadata, so tfds.load() returns (data, ds_info)
# Human-readable class names taken from the label feature's metadata.
class_names = ds_info.features["label"].names

### Converting Dataset to Numpy

In [None]:
def dataset2numpy(dataset):
  """Materialise a dataset of (input, target) pairs as two NumPy arrays.

  The dataset is iterated once in full, so it must be finite and fit in
  memory. Each element must unpack into exactly two items.

  Args:
    dataset: iterable yielding (input, target) pairs.

  Returns:
    (inputs, targets) as NumPy arrays, each with all size-1 dimensions
    removed by tf.squeeze.
  """
  # Transpose the sequence of pairs into one tuple of inputs and one of targets.
  columns = tuple(zip(*dataset))
  input_column, target_column = columns
  squeezed_inputs = tf.squeeze(input_column)
  squeezed_targets = tf.squeeze(target_column)
  return np.array(squeezed_inputs), np.array(squeezed_targets)

### Normalization Layer

In [None]:
# Normalization layer that keeps separate statistics per entry of the last axis (axis=-1).
normalizer = tf.keras.layers.Normalization(axis=-1)
# Learn the normalization statistics from the training data only.
# NOTE(review): .values suggests X_train is a pandas object — confirm.
normalizer.adapt(X_train.values)

Normalizing for a tf dataset

In [None]:
# NOTE(review): `layers` is presumably tensorflow.keras.layers — confirm the import.
norm_layer = layers.Normalization()
# Drop the labels so adapt() only sees the features of each (x, y) element.
feature_ds = train_dataset.map(lambda x, y: x)
# Compute the normalization statistics from the feature-only dataset.
norm_layer.adapt(feature_ds)

## Dataset Loading

### Images with CSV Annotations

In [None]:
# Folder holding train.csv; the trailing slash matters because paths are built by concatenation.
directory = "./mnist_images_csv/"
df = pd.read_csv(directory + "train.csv")

# Columns read from the CSV: image file names and their labels.
file_paths = df["file_name"].values
labels = df["label"].values

# Dataset of (file_name, label) pairs; the image bytes are read in the map step below.
ds_train = tf.data.Dataset.from_tensor_slices((file_paths, labels))

def read_image(image_file, label):
  """Read one image listed in the CSV and pass its label through.

  Args:
    image_file: file name, appended to the module-level `directory`.
    label: label for the image; returned unchanged.

  Returns:
    (image, label), where image is decoded as single-channel float32.
  """
  raw_bytes = tf.io.read_file(directory + image_file)
  decoded = tf.image.decode_image(raw_bytes, channels=1, dtype=tf.float32)
  return decoded, label

# Decode each listed file with read_image, then group the elements into batches of 2.
ds_train = ds_train.map(read_image).batch(2)

### Images in folder with name annotations

In [None]:
import pathlib

# Folder of images whose file names carry the label.
directory = "/content/mnist_images_only"
# Dataset of file paths matching every .jpg directly inside the folder.
ds_train = tf.data.Dataset.list_files(str(pathlib.Path(directory+"/*.jpg")))
# Equivalent call without pathlib:
#ds_train = tf.data.Dataset.list_files(directory+"/*.jpg")

def process_path(file_path):
  """Load a grayscale JPEG and derive its label from the file name.

  The label is the first character of the file's base name, parsed as an
  int64, so files are expected to be named with a leading digit.

  Args:
    file_path: scalar string tensor with a "/"-separated path to a JPEG.

  Returns:
    (image, label): the decoded single-channel image and its int64 label.
  """
  image = tf.io.read_file(file_path)
  image = tf.image.decode_jpeg(image, channels=1)
  # Take the last path component instead of a fixed index, so the function
  # keeps working when the image folder sits at a different directory depth.
  file_name = tf.strings.split(file_path, "/")[-1]
  label = tf.strings.substr(file_name, pos=0, len=1)
  label = tf.strings.to_number(label, out_type=tf.int64)
  return image, label

# Decode every listed file and extract its label, then batch the results.
# NOTE(review): batch_size is not defined in this cell — set it before running.
ds_train = ds_train.map(process_path).batch(batch_size)

### Text Loading with tensorflow

In [None]:
# Build a labelled text dataset from `directory`; labels='inferred' derives
# the labels from the directory structure and label_mode='int' encodes them
# as integers.
# NOTE(review): the returned dataset is not assigned here — capture it before use.
tf.keras.utils.text_dataset_from_directory(
    directory, labels='inferred', label_mode='int',
    class_names=None, batch_size=32, max_length=None, shuffle=True, seed=None,
    validation_split=None, subset=None, follow_links=False
)

## Dataset Visualization

### 3x3 Matplotlib Subplots

In [None]:
# Show up to nine images from x_new in a 3x3 grid.
plt.figure(figsize=(10,10)) # overall figure size in inches

for i, image in enumerate(x_new):
    if i >= 9:              # a 3x3 grid has only nine slots; extra images would raise ValueError
        break
    plt.subplot(3,3,i+1)    # rows, columns, 1-based position within the grid
    plt.imshow(image)

plt.show()

### Plot loss and accuracy

In [None]:
# Plot every series recorded in history.history as one line per column.
pd.DataFrame(history.history).plot()
plt.xlabel("epochs")
# NOTE(review): the y-axis is labelled "loss" although all recorded metrics share the plot.
plt.ylabel("loss");

### Confusion Matrix

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
def plot_cm(labels, predictions, p=0.5):
  """Draw a confusion-matrix heatmap for predictions thresholded at p.

  Args:
    labels: ground-truth class labels.
    predictions: predicted scores; a sample counts as positive when its
      score is strictly greater than p. Presumably a NumPy array, since the
      comparison is applied element-wise — confirm against the caller.
    p: decision threshold, also shown in the plot title.
  """
  is_positive = predictions > p
  matrix = confusion_matrix(labels, is_positive)
  plt.figure(figsize=(5, 5))
  # fmt="d" prints the cell counts as integers.
  sns.heatmap(matrix, annot=True, fmt="d")
  plt.title(f'Confusion matrix @{p:.2f}')
  plt.xlabel('Predicted label')
  plt.ylabel('Actual label')

### TensorBoard Callback

In [None]:
def create_tensorboard_callback(dir_name, experiment_name):
  """Create a TensorBoard callback that logs to a timestamped directory.

  Logs go to "<dir_name>/<experiment_name>/<YYYYmmdd-HHMMSS>", so repeated
  runs of the same experiment do not overwrite each other.

  Args:
    dir_name: root directory for all TensorBoard logs.
    experiment_name: sub-directory name for this experiment.

  Returns:
    A tf.keras.callbacks.TensorBoard instance writing to that directory.
  """
  run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = "/".join((dir_name, experiment_name, run_stamp))
  callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
  print(f"Saving TensorBoard log files to: {log_dir}")
  return callback

## Class Imbalances

### Examine label imbalance

In [None]:
# Count the samples per class; unpacking into two names requires exactly
# two class ids (0 and 1) in the 'Class' column.
neg, pos = np.bincount(df['Class'])
total = neg + pos
# Report the dataset size and the share of positive samples.
print(f'Examples:\n    Total: {total}\n    Positive: {pos} ({100 * pos / total:.2f}% of total)\n')

### Metrics and early stopping

In [None]:
# Metrics for binary classification on imbalanced data: raw confusion-matrix
# counts plus accuracy, precision, recall and two AUC variants.
# NOTE(review): this list uses `keras.` while the callback below uses
# `tf.keras.` — both names must be imported for the cell to run.
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # area under the precision-recall curve
]

# Stop when the validation value of the 'prc' metric above has not improved
# for 10 epochs, then restore the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

### Class weights

In [None]:
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
# Weight each class inversely to its frequency; the total / 2 factor keeps
# the overall loss at a similar magnitude to unweighted training.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

# Keys are the class ids, values the per-class loss weights.
class_weight = {0: weight_for_0, 1: weight_for_1}

print(f'Weight for class 0: {weight_for_0:.2f}')
print(f'Weight for class 1: {weight_for_1:.2f}')

# Pass the weights to training with: model.fit(..., class_weight=class_weight)

### Oversampling minority class

In [None]:
# Randomly duplicate samples of the under-represented class(es) to balance
# the class counts; random_state makes the resampling repeatable.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
# Resample the training data only; leave validation/test data untouched.
X_res, y_res = ros.fit_resample(train_features, train_labels)

## Others

### Mixed Precision Training

In [None]:
# Make layers created after this call compute in float16 while keeping their variables in float32.
keras.mixed_precision.set_global_policy(policy="mixed_float16")
# Keep the model's final output (e.g. the last activation layer) in float32,
# for example by passing dtype="float32" to that layer, for numeric stability.

### L2 Regularization

In [None]:
# Add an L2 penalty (factor 0.01) on the convolution kernel weights to the loss.
# NOTE(review): Conv2D is not imported in this cell — presumably it comes
# from tensorflow.keras.layers.
from tensorflow.keras import regularizers
Conv2D(64, 3, padding="same", kernel_regularizer=regularizers.l2(0.01))