In [63]:
import pandas as pd
import json
import requests
# Source: JSON dump of Star Trek dialogue, nested as
# series -> episode -> character -> list of spoken lines.
url = 'https://raw.githubusercontent.com/brearenee/NLP-Project/main/dataset/StarTrekDialogue_v2.json'

# `requests` has no default timeout, so set one to keep a stalled connection
# from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)

# Stop here on a failed download. Merely printing the status would leave `df`
# undefined and make the next cell fail with a misleading NameError.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve data. Status code: {response.status_code}")

json_data = response.json()

lines = []
characters = []
episodes = []  # collected alongside each line, but not placed in the DataFrame below

# Flatten only the "TNG" series into parallel lists (one entry per spoken line).
# A missing "TNG" key yields empty lists, exactly as the original scan did.
for episode_name, episode_data in json_data.get("TNG", {}).items():
    for character_name, character_lines in episode_data.items():
        for line_text in character_lines:
            lines.append(line_text)
            characters.append(character_name)
            episodes.append(episode_name)

# Build the frame, drop repeated line texts (keeping the first occurrence so the
# original order is preserved), and renumber the rows from 0.
df = (
    pd.DataFrame({
        'Line': lines,
        'Character': characters,
    })
    .drop_duplicates(subset='Line', keep='first')
    .reset_index(drop=True)
)
    
    
##Remove Outliers (Characters with less than 1000 lines)
# Count lines per speaker, pick out the speakers below the 1000-line cutoff,
# then keep only the rows whose speaker is not in that set.
character_counts = df['Character'].value_counts()
characters_to_remove = character_counts.loc[lambda counts: counts < 1000].index
is_rare_speaker = df['Character'].isin(characters_to_remove)
df = df.loc[~is_rare_speaker]

##Print Value Count. 
remaining_counts = df['Character'].value_counts()
print(remaining_counts)


Character
PICARD     10798
RIKER       6454
DATA        5699
LAFORGE     4111
WORF        3185
CRUSHER     2944
TROI        2856
WESLEY      1206
Name: count, dtype: int64


# BERT

In [64]:
#https://www.analyticsvidhya.com/blog/2021/12/multiclass-classification-using-transformers/


#Split the data 
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

#Converting our Character column into Categorical data
encoded_dict = {'PICARD':0,'RIKER':1, 'DATA':2, 'LAFORGE':3, 
                'WORF':4, 'CRUSHER':5, 'TROI':6,'WESLEY':7}

# Use .assign() to build new frames instead of writing a column into the
# frames returned by train_test_split; that in-place write can raise
# SettingWithCopyWarning.
train_df = train_df.assign(Character=train_df['Character'].map(encoded_dict))
val_df = val_df.assign(Character=val_df['Character'].map(encoded_dict))

# .map() turns any character missing from encoded_dict into NaN, which would
# silently corrupt the labels later on. Fail here instead.
assert train_df['Character'].notna().all() and val_df['Character'].notna().all(), \
    "Found characters that are not present in encoded_dict"

train_df.head()




Unnamed: 0,Line,Character
41033,The upper portion of the apparatus seems\r to ...,2
19647,how long before they cross over into Federatio...,1
20640,"You know, this might work. We can't change the...",3
16952,And you conclude because of this that I am imp...,2
11530,I was thinking the same thing about you. In al...,1


In [65]:
# Preview the first rows of the validation split (Character was mapped to integer labels above).
val_df.head()

Unnamed: 0,Line,Character
48560,"If you had to give this feeling a name, what w...",6
47262,I'm sure you all understand that in light of w...,5
7425,"Somehow, and there is limited information on t...",2
29414,But our sensors were malfunctioning. Our probe...,1
33815,The signal ended abruptly at oh four five five...,1


In [66]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer character labels.
# num_classes is passed explicitly because to_categorical otherwise infers the
# width from the largest label present. If the rarest (highest-numbered) class
# were missing from one split, y_train and y_test would have different widths.
num_classes = len(encoded_dict)
y_train = to_categorical(train_df.Character, num_classes=num_classes)
y_test = to_categorical(val_df.Character, num_classes=num_classes)

#We have successfully processed our Character column (target); 
#now, it's time to process our input text data using a tokenizer.

In [67]:
import transformers

#Loading Model and Tokenizer from the transformers package 

from transformers import AutoTokenizer,TFBertModel
#bert-base-uncased is another possible one
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
#TFBertModel = pretrained BERT model for Tensor Flow
bert = TFBertModel.from_pretrained('bert-base-cased')

#Input Data Modeling

#Before training, we need to convert the input textual data into 
#BERT's input data format using a tokenizer.
#Since we have loaded bert-base-cased, the tokenizer is bert-base-cased too.

# The Keras model declares fixed-width inputs of 70 tokens, so every split must
# be encoded to exactly that width:
#   * one shared max_length (the validation split previously used 50, which
#     does not match the 70-wide model input);
#   * padding='max_length' instead of padding=True, which only pads up to the
#     longest sequence in the call and therefore does not guarantee the width.
max_len = 70
tokenizer_kwargs = dict(
    add_special_tokens=True,        # add [CLS] / [SEP]
    max_length=max_len,
    truncation=True,                # cut lines longer than max_len
    padding='max_length',           # pad shorter lines up to max_len
    return_tensors='tf',
    return_token_type_ids=False,    # not passed to the model
    return_attention_mask=True,
    verbose=True)

# Tokenize the input (takes some time)
x_train = tokenizer(text=train_df.Line.tolist(), **tokenizer_kwargs)
x_test = tokenizer(text=val_df.Line.tolist(), **tokenizer_kwargs)


#After tokenization, the tokenizer returns a dictionary-like object (x_train)
#with the keys 'input_ids' and 'attention_mask' for their respective data.

input_ids = x_train['input_ids']
attention_mask = x_train['attention_mask']

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

# Model Building

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

# Fixed sequence width expected by the model; must match the tokenizer's max_length.
max_len = 70
# One output unit per character class (8 in encoded_dict). The article this was
# adapted from used 6 because its dataset had six labels.
num_classes = len(encoded_dict)

# Two symbolic inputs, named after the tokenizer's output keys.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Index [0] of the BERT output is the last hidden state (one vector per token).
embeddings = bert(input_ids,attention_mask = input_mask)[0] 

# Classification head: max-pool over the token axis, then a small feed-forward stack.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)

# Single-label multiclass problem -> softmax, so the outputs form one probability
# distribution over the characters. Because the outputs are probabilities, the
# loss must be built with from_logits=False.
y = Dense(num_classes, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

# Fine-tune BERT together with the head. Referencing the layer object directly
# avoids depending on its position (index 2) in model.layers.
bert.trainable = True

# Model Compilation

Defining learning parameters and compiling the model.

In [None]:


This article was published as a part of the Data Science Blogathon
Introduction

In the last article, we have discussed implementing the BERT model using the TensorFlow hub; you can read it here. Implementing BERT using the TensorFlow hub was tedious since we had to perform every step from scratch. First, we build our tokenizer, then design a function to process our data, and then develop our model for training.

Here, the Hugging Face Transformers package makes the implementation easier.

This article will discuss the latest method to implement BERT or any other state of art model in the most accessible steps using the Transformers library

For a detailed explanation of BERT, read my last article.
Implementation of BERT using hugging face transformers library

hugging face is an NLP-focused startup that provides a wide variety of solutions in NLP for TensorFlow and PyTorch.

The Transformers library contains more than 30 pre-trained models and 100 languages, along with 8 major architectures for natural language understanding (NLU) and natural language generation (NLG):
DataHour: Democratising AI Deployment

Date: 7 Dec   Time: 7 PM – 8 PM IST

    BERT (from Google);
    GPT-2 (from OpenAI);
    GPT (from OpenAI);
    Transformer-XL (from Google/CMU);
    XLNet (from Google/CMU);
    RoBERTa (from Facebook);
    XLM (from Facebook);
    DistilBERT (from HuggingFace).

The hugging face Transformers library required TensorFlow or PyTorch to load models, and it can train SOTA models in only a few lines of code and pre-process our data in only a few lines of code. The hugging face transformers library gives you the benefit of using pre-trained language models without requiring a vast and costly computational infrastructure and with simple implementation. Most State-of-the-Art models(SOTA) are provided directly and made available in the library in PyTorch and TensorFlow transparently and interchangeably. It works as an API in some sense.

    Loading the Dataset
    Pre-processing the raw data
    Getting BERT Pre-trained model and its tokenizer
    Training and evaluation
    Prediction Pipeline

Loading the Dataset

The dataset we are using is the Emotions dataset for NLP.

This dataset contains text and their respective emotions, and it has train-data, test-data, and validation data.

‘i was feeling listless from the need of new things, something different; sadness.’

Python Code:
Loading the dataset | Multiclass Classification Using Transformers
Source: Local
Converting our Sentiment column into Categorical data

Mapping sentiments label with some numbers using a python dictionary and then convert them into a categorical column using to_categorical.

encoded_dict = {‘anger’:0,’fear’:1, ‘joy’:2, ‘love’:3, ‘sadness’:4, ‘surprise’:5}
df_train[‘Sentiment’] = df_train.Sentiment.map(encoded_dict)
df_test[‘Sentiment’] = df_test.Sentiment.map(encoded_dict)

importing to_categorical class from utils:

from tensorflow.keras.utils import to_categorical

converting our integer coded Sentiment column into categorical data(matrix)

y_train = to_categorical(df_train.Sentiment)
y_test = to_categorical(df_test.Sentiment)

Converting our Sentiment column into Categorical data
Source: Local

We have successfully processed our Sentiment column( target); now, it’s time to process our input text data using a tokenizer.
Getting transformers Package

you need to install the transformers package and then import it.

!pip install transformers
import transformers

Loading Model and Tokenizer from the transformers package 

from transformers import AutoTokenizer,TFBertModel
tokenizer = AutoTokenizer.from_pretrained(‘bert-base-cased’)
bert = TFBertModel.from_pretrained(‘bert-base-cased’)

We need a tokenizer to convert the input text’s word into tokens.

The class AutoTokenizer contains various types of tokenizers.

TFBertModel pre-trained Bert model for TensorFlow.

Here we are loading the bert-base-cased model.

 
Bert-Based-Case
Source: Local
Input Data Modeling

Before training, we need to convert the input textual data into BERT’s input data format using a tokenizer.

Since we have loaded bert-base-cased, so tokenizer will also be Bert-base-cased.

# Tokenize the input (takes some time) 
# here tokenizer using from bert-base-cased
x_train = tokenizer(
    text=df_train.Input.tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
x_test = tokenizer(
    text=df_test.Input.tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

Tokenizer takes all the necessary parameters and returns tensor in the same format Bert accepts.

    return_token_type_ids = False: token_type_ids is not necessary for our training in this case.
    return_attention_mask = True we want to include attention_mask in our input.
    return_tensors=’tf’: we want our input tensor for the TensorFlow model.
    max_length=70:
    we want the maximum length of each sentence to be 70; if a sentence is
    bigger than this, it will be trimmed if a sentence is smaller than
    70 then it will be padded.
    add_special_tokens=True, CLS, SEP token will be added in the tokenization.

After data modelling, the tokenizer returns a dictionary (x_train) containing ‘input_ids’ and ‘attention_mask’ as keys for their respective
data.

input_ids = x_train[‘input_ids’]
attention_mask = x_train['attention_mask']

 
Multiclass Classification Using Transformers
Source: Local
Model Building

Importing necessary libraries.

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

We are using functional API to design our model.

max_len = 70
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
embeddings = bert(input_ids,attention_mask = input_mask)[0] 
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
y = Dense(6,activation = 'sigmoid')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

Bert layers accept three input arrays, input_ids, attention_mask, token_type_ids

input_ids means our input words encoding, then attention mask,

token_type_ids is necessary for the question-answering model; in this case, we will not pass token_type_ids.

    For the Bert layer, we need two input layers, in this case, input_ids, attention_mask.
    Embeddings contain hidden states of the Bert layer.
    We apply
    GlobalMaxPool1D and then dense layers on top of the hidden
    states of Bert. These layers (a pooled feed-forward head, not convolutional layers) will yield our output.

bert[0] is the last hidden state, bert[1] is the
pooler_output; for building the classification layers on top of the BERT layer, we have
used Bert’s last hidden state.
Model Compilation

Defining learning parameters and compiling the model.

# Learning-rate schedule.
# The legacy `decay=0.01` optimizer argument meant lr_t = lr / (1 + decay * step).
# Current Keras optimizers reject that argument, so the same schedule is written
# explicitly with InverseTimeDecay (decay_steps=1 reproduces the per-step formula).
learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=5e-05,  # this learning rate is for bert model , taken from huggingface website
    decay_steps=1,
    decay_rate=0.01)

optimizer = Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
    clipnorm=1.0)

# Set loss and metrics
# The model's last layer already applies an activation, so its outputs are
# probabilities rather than raw logits and from_logits must be False. With
# from_logits=True the loss would apply a second softmax on top of them.
loss = CategoricalCrossentropy(from_logits=False)

# A list, not the accidental one-element tuple created by a trailing comma.
# NOTE: despite the name, this is plain categorical accuracy, not class-balanced
# accuracy. The name is kept so history/monitor keys stay the same.
metric = [CategoricalAccuracy('balanced_accuracy')]

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

