# Friends Classification Text

This is classification of dialogue in FRIENDS TV Series.

<!-- https://stackoverflow.com/questions/61000500/tensorflow-keras-bert-multiclass-text-classification-accuracy -->

In [1]:
import pandas as pd
import re
import numpy as np
import time

import os
import pprint
import tensorflow as tf

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# help(drive.mount)

In [3]:
# PATH_DATA = 'gdrive/'

In [4]:
os.listdir()

['.config', 'gdrive', 'sample_data']

## Checking Machine

In [5]:
if 'COLAB_TPU_ADDR' not in os.environ:
  print('ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!')
else:
  tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  print ('TPU address is', tpu_address)

TPU address is grpc://10.103.117.34:8470


## Importing Data

In [6]:
%cd gdrive/My Drive/Repository/Data/

/content/gdrive/My Drive/Repository/Data


In [7]:
os.listdir()

['friends_seen_season.csv', 'friends_unseen_season.csv']

In [8]:
df_dlg_seen = pd.read_csv('friends_seen_season.csv')
df_dlg_unseen = pd.read_csv('friends_unseen_season.csv')

In [9]:
data_train = df_dlg_seen[(df_dlg_seen.is_train == True)&(df_dlg_seen.label!=0)][['text','label']]
data_train['label'] = data_train['label']-1
data_test = df_dlg_seen[(df_dlg_seen.is_train == False)&(df_dlg_seen.label!=0)][['text','label']]
data_test['label'] = data_test['label']-1
data_unseen = df_dlg_unseen[(df_dlg_unseen.label!=0)][['text','label']]
data_unseen['label'] = data_unseen['label']-1

In [10]:
data_train.shape

(25091, 2)

In [11]:
data_test.shape

(8320, 2)

In [12]:
data_unseen.shape

(8159, 2)

In [13]:
XX = data_test.label.value_counts()
XX/XX.sum()

1    0.180889
0    0.178005
3    0.170793
2    0.166466
4    0.161779
5    0.142067
Name: label, dtype: float64

## Tokenizer

In [14]:
# !pip install transformers

In [15]:
# from transformers import BertTokenizer

# bert_token = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# MAX_LEN = 25
# NUMB_CLASS = 7

# # Tokenizer use bert
# def text_to_feature(x,tokenizer = bert_token):
#     text_token = tokenizer.encode_plus(
#                     x,                      
#                     add_special_tokens = True, # add [CLS], [SEP]
#                     max_length = MAX_LEN, # max length of the text that can go to BERT
#                     pad_to_max_length = True, # add [PAD] tokens
#                     return_attention_mask = True,
#                     truncation=True,# add attention mask to not focus on pad tokens
#                   )
#     return text_token

# # map to the expected input to TFBertForSequenceClassification, see here 
# def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
#     return {
#       "input_ids": input_ids,
#       "token_type_ids": token_type_ids,
#       "attention_mask": attention_masks,
#     }, label


# def label_encode(i, numb_cls):
#     X = np.zeros(numb_cls)
#     X[i] = 1
#     return list(X)

# def encode_dataset(ds, limit=-1):
#     # prepare list, so that we can build up final TensorFlow dataset from slices.
#     input_ids_list = []
#     token_type_ids_list = []
#     attention_mask_list = []
#     label_list = []
#     if (limit > 0):
#         ds = ds.head(limit)
    
#     for text, label in ds.values:
#         bert_input = text_to_feature(text)
#         input_ids_list.append(bert_input['input_ids'])
#         token_type_ids_list.append(bert_input['token_type_ids'])
#         attention_mask_list.append(bert_input['attention_mask'])
        
#     return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [16]:
from transformers import BertTokenizer

bert_token = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenizer use bert
def text_to_feature(x,max_seq,tokenizer = bert_token):
    text_token = tokenizer.encode_plus(
                    x,                      
                    add_special_tokens = True, # add [CLS], [SEP]
                    max_length = max_seq, # max length of the text that can go to BERT
                    pad_to_max_length = True, # add [PAD] tokens
                    return_attention_mask = True,
                    truncation=True,# add attention mask to not focus on pad tokens

                    return_tensors='tf',
                  )
    return text_token

def label_encode(i, numb_cls):
    X = np.zeros(numb_cls)
    X[i] = 1
    return list(X)

def encode_dataset(ds, max_seq_len, limit=-1):
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if (limit > 0):
        ds = ds.head(limit)
    
    for text, label in ds.values:
        bert_input = text_to_feature(text,max_seq = max_seq_len)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append(label_encode(label, NUMB_CLASS))

    input_ids = tf.convert_to_tensor(input_ids_list)
    attention_masks = tf.convert_to_tensor(attention_mask_list)
    token_type_ids = tf.convert_to_tensor(token_type_ids_list)

    ids = tf.reshape(input_ids, (-1, max_seq_len))
    print("Input ids shape: ", ids.shape)
    masks = tf.reshape(attention_masks, (-1, max_seq_len))
    print("Input Masks shape: ", masks.shape)
    token_types = tf.reshape(token_type_ids, (-1, max_seq_len))
    print("Token type ids shape: ", token_types.shape)

    ids=ids.numpy()
    masks = masks.numpy()
    token_types = token_types.numpy()
    y = np.array(label_list)
        
    return [ids, masks, token_types, y]


In [17]:
# ALL_TEST = encode_dataset(data_test, MAX_LEN)
# ALL_TEST[:3],ALL_TEST[3].shape

## Training Model

In [18]:
from transformers import TFBertForSequenceClassification
from keras import Model
from keras.layers import Dense, Dropout, Flatten, Input
from keras import regularizers

In [19]:
# def create_model(L_RATE, NUMB_CLASS_TARGET):
#   model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUMB_CLASS_TARGET)
#   optimizer = tf.keras.optimizers.Adam(learning_rate=L_RATE, epsilon=1e-08)

#   loss = tf.keras.losses.CategoricalCrossentropy()
#   metric = tf.keras.metrics.CategoricalCrossentropy('accuracy')

#   model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

#   return model


def create_model(L_RATE,max_seq_len,NUMB_CLASS_TARGET):
  base_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', trainable=True, num_labels=NUMB_CLASS_TARGET)

  input_ids_layer = Input(shape=(max_seq_len, ), dtype=np.int32)
  input_mask_layer = Input(shape=(max_seq_len, ), dtype=np.int32)
  input_token_type_layer = Input(shape=(max_seq_len,), dtype=np.int32)

  bert_layer = base_model([input_ids_layer, input_mask_layer, input_token_type_layer])[0]
  flat_layer = Flatten()(bert_layer)
  # dropout_1= Dropout(0.1)(flat_layer)
  # dense_1 = Dense(NUMB_CLASS_TARGET, activation='relu', 
  #                 bias_regularizer=regularizers.l2(0.01),
  #                 activity_regularizer=regularizers.l2(0.02))(dropout_1)

  # dropout_2= Dropout(0.1)(dense_1)
  # dense_2 = Dense(2*NUMB_CLASS_TARGET, activation='relu',
  #                 bias_regularizer=regularizers.l2(0.01),
  #                 activity_regularizer=regularizers.l2(0.01))(dropout_2)

  dense_output = Dense(NUMB_CLASS_TARGET, activation='softmax',
                  bias_regularizer=regularizers.l2(0.02))(flat_layer)

  model_ = Model(inputs=[input_ids_layer, input_mask_layer, input_token_type_layer], outputs=dense_output)

  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=L_RATE,
    decay_steps=10000000,
    decay_rate=0.15)
  
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=5e-08)
  model_.compile(optimizer=optimizer,
              loss=['categorical_crossentropy'],
              metrics=['accuracy'])
  
  #loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  #metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
  #model_.compile(optimizer=optimizer, loss=loss, metrics=[metric])
  return model_


In [20]:
# mm = create_model(LR, MAX_LEN , NUMB_CLASS)

In [21]:
print(tf.__version__)

2.3.0


In [22]:
# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
LR = 1e-5
EPOCHS = 250
BATCH_SIZE = 250
MAX_LEN = 8
NUMB_CLASS = 6

use_tpu = 'COLAB_TPU_ADDR' in os.environ
if use_tpu:
    print('USING TPU')
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        bert_model = create_model(LR, MAX_LEN , NUMB_CLASS)
else:
    print('NOT USING TPU')
    bert_model = create_model(LR, MAX_LEN, NUMB_CLASS)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


USING TPU
INFO:tensorflow:Initializing the TPU system: grpc://10.103.117.34:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.103.117.34:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to b

In [23]:
bert_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
tf_bert_for_sequence_classifica ((None, 6),)         109486854   input_1[0][0]                    
                                                                 input_2[0][0]         

In [24]:
# XX = encode_dataset(data_test)
# YY = data_test['label'].tolist()
# ALL_TEST = convert_inputs_to_tf_dataset(XX,YY, MAX_LEN)
# ALL_TEST[:3],ALL_TEST[3].shape


In [25]:
L1 = time.time()
all_train = encode_dataset(data_train, MAX_LEN)
X_train,y_train = all_train[:3],all_train[3]
finish_time = str(round((time.time()-L1)/60,3))
print('done in '+finish_time)

L1 = time.time()
all_val = encode_dataset(data_test, MAX_LEN)
X_val,y_val = all_val[:3],all_val[3]
finish_time = str(round((time.time()-L1)/60,3))
print('done in '+finish_time)

L1 = time.time()
all_unseen = encode_dataset(data_unseen, MAX_LEN)
X_unseen,y_unseen = all_unseen[:3],all_unseen[3]
finish_time = str(round((time.time()-L1)/60,3))
print('done in '+finish_time)




Input ids shape:  (25091, 8)
Input Masks shape:  (25091, 8)
Token type ids shape:  (25091, 8)
done in 0.383
Input ids shape:  (8320, 8)
Input Masks shape:  (8320, 8)
Token type ids shape:  (8320, 8)
done in 0.146
Input ids shape:  (8159, 8)
Input Masks shape:  (8159, 8)
Token type ids shape:  (8159, 8)
done in 0.132


In [26]:
# X_val

In [27]:
# optimizer = tf.keras.optimizers.Adam(learning_rate = 2e-5, epsilon=1e-08)
# bert_model.compile(optimizer=optimizer,
#               loss=['categorical_crossentropy'],
#               metrics=['accuracy'])



In [28]:
# bert_history = bert_model.fit(input_train, epochs=EPOCHS, validation_data=input_test)

bert_history = bert_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size = BATCH_SIZE, verbose=1)


Epoch 1/250
Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.


Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.










Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 7

In [None]:
os.listdir('../../model')

In [None]:
# bert_model.save('../../model/friends_model.h5')

In [None]:
# model.save('cnn.h5')
# loaded_model = tf.keras.models.load_model('cnn.h5')

In [29]:
type(bert_model)

tensorflow.python.keras.engine.functional.Functional

In [30]:
# bert_model.save('../../model/friends_model.h5',save_format="tf")
bert_model.save_weights('friends_model_weights1.h5')

In [37]:
# loaded_model = tf.keras.models.load_model('cnn.h5')

test_load_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
test_load_model

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_113', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<transformers.modeling_tf_bert.TFBertForSequenceClassification at 0x7f414ec79a58>

In [38]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-9, epsilon=1e-08)
loss = tf.keras.losses.CategoricalCrossentropy()
metric = tf.keras.metrics.CategoricalCrossentropy('accuracy')

test_load_model.load_weights('friends_model_weights1.h5')
test_load_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

ValueError: ignored

In [None]:
bert_model.evaluate(input_test)

In [None]:
test_load_model.evaluate(input_test)

## Testing

In [34]:
def map_to_dict_testing(input_ids, attention_masks, token_type_ids):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
    }

def encode_test_dataset(ds, limit=-1):
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if (limit > 0):
        ds = ds[limit-1:limit]
        print(ds)
    
    for text in ds.values:
        bert_input = text_to_feature(text)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
    
    print(len(ds.values))
        
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list)).map(map_to_dict_testing)

In [35]:
data_test_sentence = encode_test_dataset(data_test['text'],5)

14    and they weren't looking at you before
Name: text, dtype: object


TypeError: ignored

In [36]:
data_test.head()

Unnamed: 0,text,label
1,c'mon you're going out with the guy there's go...,4
3,wait does he eat chalk,5
9,then i look down and i realize there's a phone...,3
11,never had that dream,4
14,and they weren't looking at you before,2


In [None]:
data_test_sentence

<MapDataset shapes: {input_ids: (25,), token_type_ids: (25,), attention_mask: (25,)}, types: {input_ids: tf.int32, token_type_ids: tf.int32, attention_mask: tf.int32}>

In [None]:
XX = test_load_model.predict(data_test_sentence)

In [None]:
XX[0][-1]

array([-0.42746884, -0.1117845 ,  0.1798155 , -0.21867178,  0.31221217,
        0.2932364 , -0.16892028], dtype=float32)