<a href="https://colab.research.google.com/github/bryanbayup/Machine-Learning/blob/main/Untitled19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow==2.11.0
!pip install seqeval
!pip install tensorflow-addons

Collecting seqeval
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=f41906617780cd55d0c203bcb99de118db677a8b703d75d9580bc053203de2de
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [9]:
# Import Libraries
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout
from tensorflow.keras.models import Model
import tensorflow_addons as tfa
import pickle
import matplotlib.pyplot as plt
from tensorflow_addons.text import crf_log_likelihood, crf_decode

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load Dataset from dataaa.json
with open('dataaa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Display first few entries
print(df.head())

                                          utterances               intent  \
0  Anjing saya mengalami gatal-gatal terus meneru...  medical_inquiry_dog   
1  Anjing saya terlihat sering muntah dan kehilan...  medical_inquiry_dog   
2  Anjing saya terlihat lesu, demam, dan tidak ma...  medical_inquiry_dog   
3  Anjing saya mengalami batuk kering dan nafasny...  medical_inquiry_dog   
4  Anjing saya mengalami luka pada kulit yang men...  medical_inquiry_dog   

                                            entities  \
0  [{'entity': 'animal', 'value': 'Anjing', 'star...   
1  [{'entity': 'animal', 'value': 'Anjing', 'star...   
2  [{'entity': 'animal', 'value': 'Anjing', 'star...   
3  [{'entity': 'animal', 'value': 'Anjing', 'star...   
4  [{'entity': 'animal', 'value': 'Anjing', 'star...   

                                           responses  
0  Gunakan sampo hipoalergenik, oleskan salep hid...  
1  Berikan cairan elektrolit untuk mencegah dehid...  
2  Pastikan anjing tetap terhidrasi

In [4]:
# Preprocessing Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [5]:
# Apply Cleaning to Utterances
df['clean_utterances'] = df['utterances'].apply(clean_text)

# Encode Intents
label_encoder = LabelEncoder()
df['intent_label'] = label_encoder.fit_transform(df['intent'])
num_intent_classes = len(label_encoder.classes_)

# Prepare Entities for NER
all_labels = set(['O'])
for label_list in df['entities']:
    for ent in label_list:
        all_labels.add('B-' + ent['entity'])
        all_labels.add('I-' + ent['entity'])

# Create tag2idx and idx2tag mappings
tag2idx = {tag: idx for idx, tag in enumerate(sorted(all_labels))}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
num_tags = len(tag2idx)

# Initialize Tokenizer
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_utterances'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 for padding

# Convert Utterances to Sequences
sequences = tokenizer.texts_to_sequences(df['clean_utterances'])

# Determine Maximum Sequence Length
max_seq_length = max(len(seq) for seq in sequences)

# Pad Sequences
X = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# Prepare NER Labels
def prepare_ner_labels(entities, tokens, tag2idx):
    labels = ['O'] * len(tokens)
    for ent in entities:
        entity = ent['entity']
        value = ent['value']
        # Tokenize entity value
        ent_text_clean = clean_text(value)
        ent_tokens = ent_text_clean.split()
        ent_length = len(ent_tokens)
        for i in range(len(tokens) - ent_length + 1):
            if tokens[i:i+ent_length] == ent_tokens:
                labels[i] = 'B-' + entity
                for j in range(1, ent_length):
                    labels[i + j] = 'I-' + entity
                break  # Avoid overlapping
    # Convert labels to indices and pad
    label_ids = [tag2idx.get(label, tag2idx['O']) for label in labels]
    label_ids += [tag2idx['O']] * (max_seq_length - len(label_ids))
    return label_ids[:max_seq_length]

# Apply to All Entries
y_ner = []
for idx, row in df.iterrows():
    tokens = df['clean_utterances'].iloc[idx].split()
    label_ids = prepare_ner_labels(row['entities'], tokens, tag2idx)
    y_ner.append(label_ids)

y_ner = np.array(y_ner)

In [6]:
# Split Data into Train and Test for Intent Classification
X_train_intent, X_test_intent, y_train_intent, y_test_intent = train_test_split(
    X, df['intent_label'], test_size=0.1, random_state=42, stratify=df['intent_label']
)

# Split Data into Train and Test for NER
X_train_ner, X_test_ner, y_train_ner, y_test_ner = train_test_split(
    X, y_ner, test_size=0.1, random_state=42, stratify=df['intent_label']
)

print("Intent Classification:")
print("Train:", X_train_intent.shape, y_train_intent.shape)
print("Test:", X_test_intent.shape, y_test_intent.shape)

print("\nNER:")
print("Train:", X_train_ner.shape, y_train_ner.shape)
print("Test:", X_test_ner.shape, y_test_ner.shape)

Intent Classification:
Train: (511, 19) (511,)
Test: (57, 19) (57,)

NER:
Train: (511, 19) (511, 19)
Test: (57, 19) (57, 19)


In [12]:
from tensorflow.keras.optimizers import Adam

# Define Intent Classification Model
def build_intent_model(vocab_size, embedding_dim, lstm_units, num_classes, max_length):
    inputs = Input(shape=(max_length,), dtype='int32')
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, mask_zero=True)(inputs)
    bilstm = Bidirectional(LSTM(units=lstm_units))(embedding)
    dropout = Dropout(0.5)(bilstm)
    dense = Dense(64, activation='relu')(dropout)
    outputs = Dense(num_classes, activation='softmax')(dense)

    model = Model(inputs, outputs)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# Hyperparameters
embedding_dim_intent = 128
lstm_units_intent = 64

# Build Intent Model
model_intent = build_intent_model(vocab_size, embedding_dim_intent, lstm_units_intent, num_intent_classes, max_seq_length)
model_intent.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 19)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 19, 128)           98816     
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 16)                1040      
                                                           

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow_addons.text import crf_log_likelihood, crf_decode
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np

In [15]:
# Custom CRF Layer
class CRFLayer(tf.keras.layers.Layer):
    def __init__(self, num_tags, name="crf_layer", **kwargs):
        super().__init__(name=name, **kwargs)
        self.num_tags = num_tags
        self.transition_params = tf.Variable(
            tf.random.uniform(shape=(num_tags, num_tags)), trainable=True
        )

    def call(self, inputs, seq_lengths):
        # Decode the highest scoring sequence
        decoded_sequence, _ = crf_decode(
            inputs, self.transition_params, seq_lengths
        )
        return decoded_sequence

    def crf_loss(self, y_true, y_pred, seq_lengths):
        log_likelihood, _ = crf_log_likelihood(
            y_pred, y_true, seq_lengths, self.transition_params
        )
        return -tf.reduce_mean(log_likelihood)


# Define NER Model
def build_ner_model(vocab_size, embedding_dim, lstm_units, num_tags, max_length):
    inputs = Input(shape=(max_length,), dtype="int32")
    seq_lengths = tf.reduce_sum(tf.cast(inputs != 0, tf.int32), axis=-1)

    # Embedding layer
    embedding = Embedding(
        input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, mask_zero=True
    )(inputs)

    # BiLSTM layer
    bilstm = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedding)
    dropout = Dropout(0.5)(bilstm)
    dense = TimeDistributed(Dense(num_tags))(dropout)

    # CRF Layer
    crf = CRFLayer(num_tags)
    outputs = crf(dense, seq_lengths)

    model = Model(inputs=inputs, outputs=outputs)

    # Compile with custom loss
    def custom_loss(y_true, y_pred):
        return crf.crf_loss(y_true, y_pred, seq_lengths)

    model.compile(optimizer="adam", loss=custom_loss, metrics=["accuracy"])
    return model


# Hyperparameters
embedding_dim_ner = 128
lstm_units_ner = 64

# Build NER Model
model_ner = build_ner_model(vocab_size, embedding_dim_ner, lstm_units_ner, num_tags, max_seq_length)
model_ner.summary()

# Callbacks
callbacks_ner = [
    EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    ModelCheckpoint(filepath="best_ner_model.h5", save_best_only=True, save_weights_only=True),
]

# Training the model
history_ner = model_ner.fit(
    X_train_ner,
    y_train_ner,
    validation_data=(X_test_ner, y_test_ner),
    epochs=20,
    batch_size=32,
    callbacks=callbacks_ner,
)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 19, 128)      98816       ['input_6[0][0]']                
                                                                                                  
 bidirectional_5 (Bidirectional  (None, 19, 128)     98816       ['embedding_5[0][0]']            
 )                                                                                                
                                                                                                  
 tf.__operators__.ne_2 (TFOpLam  (None, 19)          0           ['input_6[0][0]']          

StagingError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "<ipython-input-15-980f3ff868e4>", line 47, in custom_loss  *
        return crf.crf_loss(y_true, y_pred, seq_lengths)
    File "<ipython-input-15-980f3ff868e4>", line 18, in crf_loss  *
        log_likelihood, _ = crf_log_likelihood(
    File "/usr/local/lib/python3.10/dist-packages/tensorflow_addons/text/crf.py", line 228, in crf_log_likelihood  *
        num_tags = inputs.shape[2]

    IndexError: tuple index out of range


In [None]:
# Callbacks for training
callbacks_ner = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint(filepath='best_model_ner.h5', save_best_only=True, save_weights_only=True)
]

# Training the model
history_ner = model_ner.fit(
    X_train_ner, y_train_ner,
    validation_data=(X_test_ner, y_test_ner),
    epochs=20,
    batch_size=32,
    callbacks=callbacks_ner
)