# Training textual modality


In [None]:
from datetime import datetime

import numpy as np

import os

import pandas as pd

import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from time import time, gmtime, strftime

from bert.tokenization.bert_tokenization import FullTokenizer

from utils.models.modelUtils import calcAccTextModel

from final_models import create_text_model

from utils.callbacks.MyCallbacks import MyCallbacks

from utils.telegramUtils.telegram_bot import telegram_send_message
from utils.textUtils.commentsProcessing import FakeDetectionDataCommentsTest, FakeDetectionDataCommentsTrainVal

from utils.callbacks.callbackUtils import plotTimesPerEpoch

from utils.fileDirUtils.fileDirUtils import createDirIfNotExists

from sklearn.metrics import accuracy_score


In [None]:
# Run configuration for the text-only (BERT) model. Everything in this cell
# is a module-level setting read by the cells further down.

# Verbose settings:
verbose = False  # When True, model.summary() is printed before training.
TF_VERBOSE = 1  # Passed to model.fit: 1 = progress bar, 2 = one line per epoch only.
# NOTE(review): this only binds a Python name. Nothing visible in this file
# exports it as the TF_DETERMINISTIC_OPS environment variable, so as written
# it presumably has no effect on TensorFlow -- confirm before relying on
# deterministic GPU ops.
TF_DETERMINISTIC_OPS = 1

# Classes:
NUM_CLASS = 2  # FAKE | NO FAKE

# Hyperparameters
GLOBAL_BATCH_SIZE = 64
EPOCHS = 10

# Optimizer parameters (Adam):
LEARNING_RATE = 1e-5
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-8

# Pass every hyperparameter explicitly so the optimizer runs with exactly the
# values reported in CUSTOM_TEXT. Keras' own epsilon default is 1e-7, not
# 1e-8, so relying on the defaults would silently diverge from the report.
optimizer = Adam(
    learning_rate=LEARNING_RATE,
    beta_1=BETA_1,
    beta_2=BETA_2,
    epsilon=EPSILON,
)

# Bert Parameters
MAX_SEQUENCE_LENGTH = 32

# Human-readable summary of the run configuration, printed and sent via Telegram.
CUSTOM_TEXT = f'Batch Size: {GLOBAL_BATCH_SIZE}, Epochs: {EPOCHS}, Optimizer: Adam, Learning Rate; {LEARNING_RATE}, Beta_1: {BETA_1}, Beta_2: {BETA_2}, Epsilon: {EPSILON}, BERT Max sequence length: {MAX_SEQUENCE_LENGTH}'

# Announce the start of the run, then report the configuration on both
# stdout and Telegram.
telegram_send_message('-----------------START-----------------')
print('START')
print(CUSTOM_TEXT)
telegram_send_message(CUSTOM_TEXT)

In [None]:
# Path settings: absolute locations of the BERT checkpoint files, the
# dataset splits and the base checkpoint directory.
root = '/home/armin/repos/fkd-model-handling/'

# Pre-trained multilingual cased BERT; vocab, checkpoint and config all live
# side by side in the same directory.
bert_model_dir = os.path.join(root, 'multi_cased_L-12_H-768_A-12')
pathToBertVocabFile, bert_ckpt_file, bert_config_file = (
    os.path.join(bert_model_dir, bert_file)
    for bert_file in ('vocab.txt', 'bert_model.ckpt', 'bert_config.json')
)

# Dataset splits: one file per split, all following the same naming scheme.
pathToTextLabelFiles = '/home/armin/repos/FKD-Dataset/008_text_image_meta_label/'
trainTextFile, testTextFile, valTextFile = (
    os.path.join(pathToTextLabelFiles, f'{split}_text_image_meta_label.csv')
    for split in ('train', 'test', 'val')
)

# Base directory for checkpoints.
checkpointDir = '/home/armin/repos/FKD-Dataset/011_checkpoints/'

In [None]:
# Other settings: run identification and checkpoint naming.

# Minute-resolution local timestamp; it becomes part of the checkpoint
# directory name, so runs started in different minutes get different
# directories.
current_time = datetime.now().strftime("%Y-%m-%d_%H:%M")

# Run label: model family plus the BERT sequence length in use.
checkpoint_name = f'bert_only_{MAX_SEQUENCE_LENGTH}'

# Narrow the base checkpoint directory down to '<base>/<name>_<timestamp>'.
checkpointDir = os.path.join(checkpointDir, f'{checkpoint_name}_{current_time}')

# The {epoch}/{val_accuracy} placeholders are intentionally left unformatted
# here; presumably the checkpoint callback built from filePath fills them in
# per epoch -- confirm in MyCallbacks.
fileName = "weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
filePath = os.path.join(checkpointDir, fileName)

In [None]:
# Load the three dataset splits. The files carry a .csv extension but are
# parsed as tab-separated, with the first row used as the header.
df_train, df_test, df_val = (
    pd.read_csv(split_file, header=0, sep='\t')
    for split_file in (trainTextFile, testTextFile, valTextFile)
)

# Debug aid: uncomment to work on only the first 1024 rows of each split.
# df_train = df_train[:1024]
# df_test = df_test[:1024]
# df_val = df_val[:1024]

In [None]:
# Creating BERT compatible data.

# A single tokenizer, built from the BERT vocabulary file, is shared by all
# splits.
tokenizer = FullTokenizer(vocab_file=pathToBertVocabFile)

# Train and validation frames are handed over together; the resulting object
# exposes train_x/train_y and val_x/val_y, which the training and evaluation
# cells read. [0, 1] is presumably the list of class labels -- confirm in
# commentsProcessing.
text_data_train = FakeDetectionDataCommentsTrainVal(
    df_train,
    df_val,
    tokenizer,
    [0, 1],
    MAX_SEQUENCE_LENGTH,
)

# The test split is prepared separately and exposes test_x/test_y.
text_data_test = FakeDetectionDataCommentsTest(
    df_test,
    tokenizer,
    [0, 1],
    MAX_SEQUENCE_LENGTH,
)

In [None]:
# Callback handling: make sure the output directories exist, then build the
# list of callbacks handed to model.fit.
tensorboardDir = os.path.join(checkpointDir, 'tensorboard')

# NOTE(review): the nested tensorboard directory is created before its parent
# checkpoint directory; this assumes createDirIfNotExists also creates missing
# intermediate directories -- confirm in fileDirUtils.
for run_dir in (tensorboardDir, checkpointDir):
    createDirIfNotExists(run_dir)

callbacks_list = MyCallbacks(
    tensorboardDir,
    filePath,
    earlyStopping=True,
).createCheckpoints()

In [None]:
# Notify via Telegram that training is about to start and where this run's
# checkpoints are written.
telegram_send_message(f'Starting with Text Model, checkpoints can be found in {checkpointDir} ')

In [None]:
# Wall-clock reference for the total-time report; note that it is taken
# before the model is built, so the reported time includes model creation
# and compilation as well as fitting.
start = time()

# Distribution strategy created with default arguments.
mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():
    # Build the text model from the pre-trained BERT checkpoint and config.
    model = create_text_model(
        MAX_SEQUENCE_LENGTH,
        bert_ckpt_file,
        bert_config_file,
        NUM_CLASS,
    )

    if verbose:
        model.summary()

    # Sparse categorical cross-entropy with default arguments; presumably the
    # labels are integer class ids and the model outputs probabilities --
    # confirm against create_text_model.
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(),
        optimizer=optimizer,
        metrics=['accuracy'],
    )

    # Train on the training split and validate on the validation split after
    # each epoch; the callbacks come from the callback-handling cell.
    history = model.fit(
        text_data_train.train_x,
        text_data_train.train_y,
        validation_data=(text_data_train.val_x, text_data_train.val_y),
        callbacks=callbacks_list,
        batch_size=GLOBAL_BATCH_SIZE,
        epochs=EPOCHS,
        verbose=TF_VERBOSE,
    )

In [None]:
# Report how long the training cell took, in minutes, on stdout and Telegram.
end = time()
elapsed_seconds = end - start
timeProceed = elapsed_seconds / 60
print(f'It took {timeProceed} minutes to train everything')
telegram_send_message(f'Total time of training: {timeProceed}')

In [None]:
# Plot the per-epoch timings from the callbacks used during training
# (presumably recorded by one of the callbacks in the list -- confirm in
# callbackUtils).
plotTimesPerEpoch(callbacks_list)

In [None]:
# Evaluate the trained model on the validation split first, then on the
# held-out test split, using the same batch size as during training.
for split_label, split_x, split_y in (
    ('Val', text_data_train.val_x, text_data_train.val_y),
    ('Test', text_data_test.test_x, text_data_test.test_y),
):
    calcAccTextModel(model, split_x, split_y, split_label, GLOBAL_BATCH_SIZE)

In [None]:
# Signal the end of the run on Telegram and on stdout.
telegram_send_message('-----------------DONE-----------------')
print('done')