In [1]:
import os
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import datasets
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

In [3]:
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
# One seed shared by TensorFlow and NumPy so repeated runs are comparable.
random_seed = 42
tf.random.set_seed(random_seed)
np.random.seed(random_seed)

# **Data**

In [5]:
def load_train_test_val_data(name_dataset : str):
    """Fetch one configuration of the ``silicone`` dataset as three DataFrames.

    Args:
        name_dataset: configuration name handed to ``load_dataset``
            (for example ``"meld_s"``).

    Returns:
        ``(train, test, val)`` DataFrames built from the ``'train'``,
        ``'test'`` and ``'validation'`` splits, in that order.
    """
    splits = load_dataset('silicone', name_dataset)
    train_df, test_df, val_df = (
        pd.DataFrame(splits[part]) for part in ('train', 'test', 'validation')
    )
    return train_df, test_df, val_df

In [6]:
# Load the "meld_s" configuration of the silicone dataset as train/test/validation DataFrames.
meld_s_train, meld_s_test, meld_s_val = load_train_test_val_data("meld_s")

Reusing dataset silicone (C:\Users\gguaquiere\.cache\huggingface\datasets\silicone\meld_s\1.0.0\af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)
100%|██████████| 3/3 [00:00<00:00, 598.56it/s]


In [7]:
# Display the training split to inspect its columns and label values.
meld_s_train

Unnamed: 0,Utterance,Speaker,Sentiment,Dialogue_ID,Utterance_ID,Label,Idx
0,also I was the point person on my company 's t...,Chandler,neutral,0,0,1,0
1,You must 've had your hands full .,The Interviewer,neutral,0,1,1,1
2,That I did . That I did .,Chandler,neutral,0,2,1,2
3,So let 's talk a little bit about your duties .,The Interviewer,neutral,0,3,1,3
4,My duties ? All right .,Chandler,positive,0,4,2,4
...,...,...,...,...,...,...,...
9984,You or me ?,Chandler,neutral,1038,13,1,9984
9985,"I got it . Uh , Joey , women do n't have Adam ...",Ross,neutral,1038,14,1,9985
9986,"You guys are messing with me , right ?",Joey,positive,1038,15,2,9986
9987,Yeah .,All,neutral,1038,16,1,9987


In [8]:
# Keep only the model input (utterance text) and the target (sentiment label)
# from each split.
train, test, val = (
    split[['Utterance', 'Sentiment']]
    for split in (meld_s_train, meld_s_test, meld_s_val)
)

In [9]:
# Checkpoint layout: every path below is derived from the model name, so
# switching to another checkpoint only requires changing `bert_model_name`.
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = "model/" + bert_model_name
bert_ckpt_file = bert_ckpt_dir + "/bert_model.ckpt"
bert_config_file = bert_ckpt_dir + "/bert_config.json"

# Make sure the top-level model folder exists.
os.makedirs("model", exist_ok=True)

# **Input Text Preparation**

In [10]:
class DataPreparation:
    """Tokenize utterances and encode labels into fixed-length arrays.

    After construction the instance exposes ``train_x`` / ``test_x`` (2-D
    arrays of token ids, zero-padded to ``max_seq_len``) and ``train_y`` /
    ``test_y`` (1-D arrays of indices into ``classes``).
    """

    text_column = "Utterance"
    label_column = "Sentiment"

    def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
        """Tokenize both splits, then pad them to a common length.

        Args:
            train: DataFrame containing ``text_column`` and ``label_column``.
            test: DataFrame with the same two columns.
            tokenizer: object providing ``tokenize`` and
                ``convert_tokens_to_ids``.
            classes: list of label values; a label's position in the list is
                its class index.
            max_seq_len: upper bound on the padded sequence length. The
                effective length is the smaller of this bound and the longest
                tokenized sequence seen in ``train`` and ``test``.
        """
        self.tokenizer = tokenizer
        self.max_seq_len = 0
        self.classes = classes

        (self.train_x, self.train_y), (self.test_x, self.test_y) = map(
            self.prepare_data, [train, test]
        )

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self.data_padding, [self.train_x, self.test_x])

    def prepare_data(self, df):
        """Convert a DataFrame into token-id sequences and class indices.

        Each text is tokenized and wrapped in ``[CLS]`` ... ``[SEP]``.
        ``self.max_seq_len`` is raised to the longest sequence encountered.

        Returns:
            ``(x, y)`` where ``x`` is a list of variable-length lists of token
            ids and ``y`` is a 1-D array of class indices.

        Raises:
            ValueError: if a label is not present in ``self.classes``.
        """
        x, y = [], []

        texts = df[DataPreparation.text_column]
        labels = df[DataPreparation.label_column]
        for text, label in tqdm(zip(texts, labels), total=len(df)):
            tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(self.classes.index(label))

        # The sequences are ragged until data_padding runs, so they stay a
        # plain list: building an ndarray from ragged lists is rejected by
        # recent NumPy releases.
        return x, np.array(y)

    def data_padding(self, ids):
        """Truncate or zero-pad every sequence to exactly ``self.max_seq_len``.

        A sequence longer than the limit is cut to the limit and its last
        position is overwritten with the sequence's original final id, so the
        terminating marker appended by ``prepare_data`` is kept. Sequences
        that already fit are left intact and padded with zeros on the right.

        Returns:
            2-D array with one row of ``self.max_seq_len`` ids per sequence.
        """
        limit = self.max_seq_len
        x = []
        for input_ids in ids:
            row = list(input_ids[:limit])
            if len(input_ids) > limit and row:
                row[-1] = input_ids[-1]
            row += [0] * (limit - len(row))
            x.append(row)
        return np.array(x)

In [11]:
# Build the BERT tokenizer from the vocab.txt stored in the checkpoint directory.
tokenizer = FullTokenizer(vocab_file=bert_ckpt_dir+"/vocab.txt")

# **Model**

In [12]:
def model_defination(max_seq_len, bert_ckpt_file, num_classes=None, config_file=None):
    """Build a BERT-based sequence classifier and load pretrained weights.

    Args:
        max_seq_len: fixed length of the token-id input.
        bert_ckpt_file: checkpoint path passed to ``load_stock_weights``.
        num_classes: number of units in the output layer. When omitted, falls
            back to ``len(classes)`` of the notebook-level ``classes`` list.
        config_file: path of the BERT config JSON. When omitted, falls back
            to the notebook-level ``bert_config_file``.

    Returns:
        A ``keras.Model`` taking ``(batch, max_seq_len)`` int32 token ids and
        returning softmax probabilities over the classes. Because the head
        ends in a softmax, pair it with a loss that expects probabilities.
    """
    # Explicit arguments win; the globals are only a backward-compatible default.
    if num_classes is None:
        num_classes = len(classes)
    if config_file is None:
        config_file = bert_config_file

    with tf.io.gfile.GFile(config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
    bert_params = map_stock_config_to_params(bc)
    bert_params.adapter_size = None
    # Named `bert_layer` so the imported `bert` module is not shadowed.
    bert_layer = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
    bert_output = bert_layer(input_ids)

    print("bert shape", bert_output.shape)

    # Use the output at position 0 (the [CLS] slot) as the sequence summary.
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=num_classes, activation="softmax")(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # Weights are loaded after the graph is built so the layer's variables exist.
    load_stock_weights(bert_layer, bert_ckpt_file)

    return model

In [13]:
# Class index = position in this list (order of first appearance in the
# training split).
classes = train["Sentiment"].unique().tolist()

data = DataPreparation(
    train=train,
    test=test,
    tokenizer=tokenizer,
    classes=classes,
    max_seq_len=128,
)

9989it [00:02, 4886.98it/s]
2610it [00:00, 4990.38it/s]


max seq_len 97


In [14]:
# Build the classifier for the sequence length computed during data preparation
# and load the pretrained checkpoint weights into its BERT layer.
model = model_defination(data.max_seq_len, bert_ckpt_file)

bert shape (None, 97, 768)
Done loading 196 BERT weights from: model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x00000248AD8819E8> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


In [15]:
# Show the layers with their output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 97)]              0         
_________________________________________________________________
bert (BertModelLayer)        (None, 97, 768)           108890112 
_________________________________________________________________
lambda (Lambda)              (None, 768)               0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 768)               590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 2307  

In [16]:
# The model's final Dense layer already applies a softmax, so the loss receives
# probabilities; from_logits must therefore be False, otherwise softmax is
# effectively applied twice and the gradients are distorted.
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [17]:
# Fine-tune for three epochs, holding out 10% of the training arrays for validation.
history = model.fit(
  data.train_x,
  data.train_y,
  batch_size=16,
  epochs=3,
  shuffle=True,
  validation_split=0.1,
)

Train on 8990 samples, validate on 999 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
