<a href="https://colab.research.google.com/github/alamhanz/the-one-with-friends/blob/master/notebooks/bert_model/.ipynb_checkpoints/20200803_training_bert-checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Friends Classification Text

This is classification of dialogue in FRIENDS TV Series.

In [None]:
import pandas as pd
import re
import numpy as np
import time

In [None]:
PATH_DATA = '../../data/'

## Importing Data

In [None]:
df_dlg_seen = pd.read_csv('../../data/friends_seen_season.csv')
df_dlg_unseen = pd.read_csv('../../data/friends_unseen_season.csv')

In [None]:
data_train = df_dlg_seen[df_dlg_seen.is_train == True][['text','label']]
data_test = df_dlg_seen[df_dlg_seen.is_train == False][['text','label']]
data_unseen = df_dlg_unseen[['text','label']]

In [None]:
data_train.shape

(35123, 2)

In [None]:
data_test.shape

(11708, 2)

In [None]:
data_unseen.shape

(11207, 2)

## Tokenizer

In [None]:
from transformers import BertTokenizer
import tensorflow as tf

bert_token = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_LEN = 30
BATCH_SIZE = 30
NUMB_CLASS = 7

# Tokenizer use bert
def text_to_feature(x,tokenizer = bert_token):
    text_token = tokenizer.encode_plus(
                    x,                      
                    add_special_tokens = True, # add [CLS], [SEP]
                    max_length = MAX_LEN, # max length of the text that can go to BERT
                    pad_to_max_length = True, # add [PAD] tokens
                    return_attention_mask = True,
                    truncation=True,# add attention mask to not focus on pad tokens
                  )
    return text_token

# map to the expected input to TFBertForSequenceClassification, see here 
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
    }, label


def label_encode(i, numb_cls):
    X = np.zeros(numb_cls)
    X[i] = 1
    return list(X)

def encode_dataset(ds, limit=-1):
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if (limit > 0):
        ds = ds.head(limit)
    
    for text, label in ds.values:
        bert_input = text_to_feature(text)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append(label_encode(label, NUMB_CLASS))
        
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [None]:
L1 = time.time()
input_train = encode_dataset(data_train).batch(BATCH_SIZE)
finish_time = str(round((time.time()-L1)/60,3))
print('done in '+finish_time)

L1 = time.time()
input_test = encode_dataset(data_test).batch(BATCH_SIZE)
finish_time = str(round((time.time()-L1)/60,3))
print('done in '+finish_time)

L1 = time.time()
input_unseen = encode_dataset(data_unseen).batch(BATCH_SIZE)
finish_time = str(round((time.time()-L1)/60,3))
print('done in '+finish_time)


done in 0.415
done in 0.126
done in 0.13


## Training Model

In [None]:
from transformers import TFBertForSequenceClassification

In [None]:
# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
LR = 3e-5
EPOCHS = 25

# model initialization
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUMB_CLASS)
# classifier Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)

# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
loss = tf.keras.losses.CategoricalCrossentropy()
metric = tf.keras.metrics.CategoricalCrossentropy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_75', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
bert_history = model.fit(input_train, epochs=EPOCHS, validation_data=input_test)

Epoch 1/25
  15/1171 [..............................] - ETA: 2:03:39 - loss: 6.8291 - accuracy: 6.8291

In [1]:
test

NameError: ignored