In [1]:
!pip install -q -U "tensorflow-text==2.11.*"

!pip install -q tf-models-official==2.11.0

import pandas as pd
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# DATASET HERE
csv_filename = 'new_test_set_0.csv'

data = pd.read_csv(csv_filename)

# Training files live under "<csv name without extension>/train".
# splitext removes only the trailing extension; str.replace('.csv', '') would
# also strip any other '.csv' occurrence inside the file name.
train_path = os.path.splitext(csv_filename)[0] + '/train'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m630.1/630.1 KB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ...

In [3]:
#  DATA FORMATTING
def folderClass(score):
  """Return the class directory (under train_path) for a label.

  Args:
    score: label value; truthy means "hit", falsy means "flop".

  Returns:
    './<train_path>/class_hit' for truthy scores, otherwise
    './<train_path>/class_flop'.
  """
  if bool(score):
    return f'./{train_path}/class_hit'
  return f'./{train_path}/class_flop'


# Write one text file per CSV row, named after the row index, into the folder
# of its class.
for index, row in data.iterrows():
  score = int(row['target'])
  # NOTE(review): str() turns a missing (NaN) lyric into the literal text
  # 'nan' -- confirm the CSV has no empty 'Lyrics' cells.
  lyric = str(row['Lyrics'])
  # Keep only the text that precedes the first '...'.
  lyric = lyric.split('...')[0]
  class_folder = folderClass(score)

  # Created lazily so that only classes that actually occur get a directory;
  # exist_ok makes the call safe to repeat for every row.
  os.makedirs(class_folder, exist_ok=True)

  file_path = os.path.join(class_folder, str(index)+".txt")
  # Explicit UTF-8: without it open() uses the platform locale encoding, which
  # can fail on or mis-encode non-ASCII characters in the lyrics.
  with open(file_path, 'w', encoding='utf-8') as train_file:
    train_file.write(lyric)

In [17]:
# DATA LOADING
# SET HYPERPARAMETERS
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# LOAD TRAINING DATA
# The training and validation subsets are read from the same directory with the
# same seed, shuffle flag and 0.2 validation fraction; only `subset` differs.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    directory=train_path,
    validation_split=0.2,
    subset='training',
    seed=seed,
    shuffle=True,
    batch_size=batch_size)

# Cache the batches and prefetch them while the model is busy.
train_ds = raw_train_ds.cache()
train_ds = train_ds.prefetch(AUTOTUNE)

# LOAD VALIDATION DATA
val_ds = tf.keras.utils.text_dataset_from_directory(
    directory=train_path,
    validation_split=0.2,
    subset='validation',
    seed=seed,
    shuffle=True,
    batch_size=batch_size).cache().prefetch(AUTOTUNE)

# Held-out test set loading is currently disabled:
# test_ds = tf.keras.utils.text_dataset_from_directory(
#     '/test',
#     batch_size=batch_size)

# test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# class_names = raw_train_ds.class_names

Found 521 files belonging to 2 classes.
Using 417 files for training.
Found 521 files belonging to 2 classes.
Using 104 files for validation.


In [10]:
# IMPORT BERT PREPROCESSOR AND ENCODER
# TF Hub handles: the uncased-English preprocessing model and the small BERT
# encoder (variant L-4_H-512_A-8, as encoded in the handle).
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

# Stand-alone Keras layers for each handle, created in the same order as listed.
bert_preprocess_model, bert_model = (
    hub.KerasLayer(handle)
    for handle in (tfhub_handle_preprocess, tfhub_handle_encoder)
)

In [18]:
# BUILD MODEL
def build_classifier_model(dropout_rate=0.1, hidden_units=64):
  """Build the (uncompiled) BERT-based binary text classifier.

  Pipeline: raw string input -> TF Hub preprocessing layer -> trainable BERT
  encoder -> pooled output -> dropout -> Dense(relu) -> Dense(1, sigmoid).

  Args:
    dropout_rate: dropout rate applied to the encoder's pooled output.
      Defaults to 0.1.
    hidden_units: width of the 'preclass' dense layer. Defaults to 64.

  Returns:
    A tf.keras.Model that maps a batch of strings to one sigmoid output per
    example.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Preprocess the raw text and feed the result to the encoder.
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)

  # trainable=True: the encoder weights are fine-tuned together with the head.
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)

  # Classification head on top of the pooled output.
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(dropout_rate)(net)
  net = tf.keras.layers.Dense(hidden_units, activation='relu', name='preclass')(net)
  net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [21]:
# LOAD HYPERPARAMETERS
epochs = 10
init_lr = 3e-5

# Length of the learning-rate schedule: batches per epoch times epochs, with
# the first 10% of the steps used for warmup.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

classifier_model = build_classifier_model()

# Binary cross-entropy loss and binary accuracy as the tracked metric.
loss = tf.keras.losses.BinaryCrossentropy()
metrics = tf.metrics.BinaryAccuracy()

# AdamW optimizer from the Model Garden helper, driven by the schedule above.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw')

classifier_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# TRAIN MODEL
# Fit on the training batches for `epochs` epochs, with the validation batches
# passed as validation data; the returned History object is kept in `history`.
history = classifier_model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 1/14 [=>............................] - ETA: 1:17 - loss: 0.3843 - binary_accuracy: 0.8438