In [94]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# this training run in colab with gpu

In [95]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [96]:
df = pd.read_csv('chatbot.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16973 entries, 0 to 16972
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16973 non-null  object
 1   label   16973 non-null  object
dtypes: object(2)
memory usage: 265.3+ KB


In [97]:
# Create a new column 'category_id' with encoded categories 
df['category_id'] = df['label'].factorize()[0]
category_id_df = df[['label', 'category_id']].drop_duplicates()

# Dictionaries for future use
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'label']].values)

# New dataframe
df.head()

Unnamed: 0,text,label,category_id
0,"I don't have an online account, what do I have...",ACCOUNT,0
1,can you tell me if i can regisger two accounts...,ACCOUNT,0
2,"I have no online account, open one, please",ACCOUNT,0
3,"could you ask an agent how to open an account,...",ACCOUNT,0
4,"i want an online account, create one",ACCOUNT,0


In [98]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [99]:
token = tokenizer.encode_plus(
    df['text'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [100]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [101]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['text'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [102]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [103]:
labels = np.zeros((len(df), 5))
labels.shape

(16973, 5)

In [104]:
X_input_ids.shape

(16973, 256)

In [105]:
X_attn_masks.shape

(16973, 256)

In [106]:
labels[np.arange(len(df)), df['category_id'].values] = 1 # one-hot encoded target tensor

In [107]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [108]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [109]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [110]:
dataset = dataset.shuffle(10000).batch(1, drop_remainder=True) # batch size, drop any left out tensor

In [111]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(1, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(1, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(1, 5), dtype=tf.float64, name=None))>

In [112]:
p = 0.8
train_size = int(len(df)*p) 

In [113]:
train_size

13578

In [114]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [115]:
len(train_dataset)

13578

In [116]:
from transformers import TFBertModel

In [117]:
model = TFBertModel.from_pretrained('bert-base-uncased') # bert base model with pretrained weights

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [118]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [119]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [120]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [121]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)



In [122]:
sentiment_model.save('sentiment_model')



In [129]:
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json')

In [123]:
category_id_df

Unnamed: 0,label,category_id
0,ACCOUNT,0
4557,CONTACT,1
7638,INVOICES,2
10081,ORDER,3
12337,PAYMENT,4


In [124]:
sentiment_model = tf.keras.models.load_model('sentiment_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [125]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['ACCOUNT', 'CONTACT', 'INVOICES', 'ORDER', 'PAYMENT']):
    probs = model.predict(processed_data)[0]
    print(probs)
    return classes[np.argmax(probs)]

In [126]:
# input_text = 'can you help me to get in touch with somebody'
input_text = 'money'
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(sentiment_model, processed_data=processed_data)
print(f"Predicted Sentiment: {result}")

[7.3660829e-04 3.6039910e-05 1.0279577e-03 4.4407556e-03 9.9375868e-01]
Predicted Sentiment: PAYMENT


In [127]:
# !zip -r /content/sentiment_model_zip /content/sentiment_model

updating: content/sentiment_model/ (stored 0%)
updating: content/sentiment_model/assets/ (stored 0%)
updating: content/sentiment_model/saved_model.pb (deflated 92%)
updating: content/sentiment_model/fingerprint.pb (stored 0%)
updating: content/sentiment_model/variables/ (stored 0%)
updating: content/sentiment_model/variables/variables.data-00000-of-00001 (deflated 21%)
updating: content/sentiment_model/variables/variables.index (deflated 79%)
updating: content/sentiment_model/keras_metadata.pb (deflated 96%)


In [93]:
# from google.colab import files
# files.download("/content/sentiment_model_zip.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>